diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,46440 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 6630, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 219.80169607321315, + "learning_rate": 5.025125628140704e-08, + "loss": 1.4166, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 68.93088300407445, + "learning_rate": 1.0050251256281409e-07, + "loss": 1.3768, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 54.936783109940805, + "learning_rate": 1.5075376884422112e-07, + "loss": 1.35, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 11.147084785627328, + "learning_rate": 2.0100502512562817e-07, + "loss": 1.1854, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 56.07248625611662, + "learning_rate": 2.512562814070352e-07, + "loss": 1.3999, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 37.91675848222748, + "learning_rate": 3.0150753768844224e-07, + "loss": 1.2615, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 57.208281128557715, + "learning_rate": 3.5175879396984927e-07, + "loss": 1.2629, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 348.3970313714995, + "learning_rate": 4.0201005025125634e-07, + "loss": 1.2685, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 52.50230647732762, + "learning_rate": 4.5226130653266337e-07, + "loss": 1.3959, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 45.244460400126854, + "learning_rate": 5.025125628140704e-07, + "loss": 1.3011, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 46.12149197339764, + "learning_rate": 5.527638190954775e-07, + "loss": 1.1406, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 41.97396330351521, + "learning_rate": 6.030150753768845e-07, + "loss": 1.0228, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 45.40047132020229, + "learning_rate": 6.532663316582916e-07, + "loss": 1.0994, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 47.357890256244474, + "learning_rate": 7.035175879396985e-07, + "loss": 1.0124, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 45.92016237398398, + "learning_rate": 7.537688442211055e-07, + "loss": 0.958, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 36.41115963246688, + "learning_rate": 8.040201005025127e-07, + "loss": 0.9093, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 19.859992493272486, + "learning_rate": 8.542713567839197e-07, + "loss": 0.923, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 18.60349269113141, + "learning_rate": 9.045226130653267e-07, + "loss": 0.8999, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 57.136740770546616, + "learning_rate": 9.547738693467337e-07, + "loss": 0.8842, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 20.535094238126142, + "learning_rate": 1.0050251256281409e-06, + "loss": 0.9231, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 24.3905464446835, + "learning_rate": 1.0552763819095479e-06, + "loss": 0.8877, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 24.573896367893912, + "learning_rate": 1.105527638190955e-06, + "loss": 0.7757, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 132.19813272395143, + "learning_rate": 1.155778894472362e-06, + "loss": 0.7601, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 22.63715442823188, + "learning_rate": 1.206030150753769e-06, + "loss": 0.7715, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 21.10455964482046, + "learning_rate": 1.256281407035176e-06, + "loss": 0.8811, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 19.53217334448488, + "learning_rate": 1.3065326633165831e-06, + "loss": 0.7143, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 18.515790932608283, + "learning_rate": 1.35678391959799e-06, + "loss": 0.7327, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 109.51013540871278, + "learning_rate": 1.407035175879397e-06, + "loss": 0.6849, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 33.71827134253702, + "learning_rate": 1.457286432160804e-06, + "loss": 0.6873, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 29.189223156503477, + "learning_rate": 1.507537688442211e-06, + "loss": 0.8473, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 35.796275909570355, + "learning_rate": 1.5577889447236184e-06, + "loss": 0.6991, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 23.283695261880045, + "learning_rate": 1.6080402010050254e-06, + "loss": 0.7264, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 29.31591340941401, + "learning_rate": 1.6582914572864323e-06, + "loss": 0.7455, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 26.707554037725632, + "learning_rate": 1.7085427135678393e-06, + "loss": 0.7368, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 37.09136708374297, + "learning_rate": 1.7587939698492465e-06, + "loss": 0.6374, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 40.854322655846346, + "learning_rate": 1.8090452261306535e-06, + "loss": 0.7454, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 26.064397979559658, + "learning_rate": 1.8592964824120604e-06, + "loss": 0.6418, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 26.509150666716433, + "learning_rate": 1.9095477386934674e-06, + "loss": 0.6377, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 125.62839699992506, + "learning_rate": 1.9597989949748746e-06, + "loss": 0.7271, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 48.21997249089312, + "learning_rate": 2.0100502512562818e-06, + "loss": 0.7225, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 88.40507848284909, + "learning_rate": 2.0603015075376885e-06, + "loss": 0.8219, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 27.714556764927426, + "learning_rate": 2.1105527638190957e-06, + "loss": 0.7235, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 25.23501982065819, + "learning_rate": 2.1608040201005025e-06, + "loss": 0.6816, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 29.398637269021645, + "learning_rate": 2.21105527638191e-06, + "loss": 0.7052, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 45.86690931113663, + "learning_rate": 2.261306532663317e-06, + "loss": 0.7121, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 54.91948007458609, + "learning_rate": 2.311557788944724e-06, + "loss": 0.7021, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 31.58202896907573, + "learning_rate": 2.3618090452261308e-06, + "loss": 0.6895, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 32.674805702378215, + "learning_rate": 2.412060301507538e-06, + "loss": 0.6169, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 46.388720790134435, + "learning_rate": 2.462311557788945e-06, + "loss": 0.6477, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 79.95580591349393, + "learning_rate": 2.512562814070352e-06, + "loss": 0.6572, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 35.90023142018094, + "learning_rate": 2.562814070351759e-06, + "loss": 0.6494, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 53.69914568614483, + "learning_rate": 2.6130653266331663e-06, + "loss": 0.6284, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 28.33405194150982, + "learning_rate": 2.663316582914573e-06, + "loss": 0.6885, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 27.997754632046586, + "learning_rate": 2.71356783919598e-06, + "loss": 0.5837, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 45.2371368322122, + "learning_rate": 2.763819095477387e-06, + "loss": 0.7025, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 6.389732964600146, + "learning_rate": 2.814070351758794e-06, + "loss": 0.7481, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 84.75442800691671, + "learning_rate": 2.8643216080402013e-06, + "loss": 0.6395, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 54.724916319146274, + "learning_rate": 2.914572864321608e-06, + "loss": 0.6724, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 36.79325989363227, + "learning_rate": 2.9648241206030153e-06, + "loss": 0.6405, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 35.726624372962526, + "learning_rate": 3.015075376884422e-06, + "loss": 0.6721, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 41.99486819155758, + "learning_rate": 3.065326633165829e-06, + "loss": 0.6044, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 36.66938168093264, + "learning_rate": 3.115577889447237e-06, + "loss": 0.6386, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 82.94525180209143, + "learning_rate": 3.165829145728643e-06, + "loss": 0.6665, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 46.23617089605336, + "learning_rate": 3.2160804020100507e-06, + "loss": 0.6014, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 62.928348783059285, + "learning_rate": 3.266331658291458e-06, + "loss": 0.6793, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 40.400025749615885, + "learning_rate": 3.3165829145728647e-06, + "loss": 0.5949, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 43.274373268410514, + "learning_rate": 3.366834170854272e-06, + "loss": 0.6838, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 41.17740099944803, + "learning_rate": 3.4170854271356786e-06, + "loss": 0.6863, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 36.223954122007875, + "learning_rate": 3.467336683417086e-06, + "loss": 0.7148, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 42.239556001754615, + "learning_rate": 3.517587939698493e-06, + "loss": 0.5889, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 72.26281145058014, + "learning_rate": 3.5678391959798997e-06, + "loss": 0.6382, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 51.064421074962404, + "learning_rate": 3.618090452261307e-06, + "loss": 0.6399, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 46.13746945750372, + "learning_rate": 3.6683417085427137e-06, + "loss": 0.5724, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 61.34816004780313, + "learning_rate": 3.718592964824121e-06, + "loss": 0.5273, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 43.20589193064767, + "learning_rate": 3.768844221105528e-06, + "loss": 0.5118, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 30.10497917287366, + "learning_rate": 3.819095477386935e-06, + "loss": 0.561, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 31.968533737951724, + "learning_rate": 3.869346733668342e-06, + "loss": 0.6049, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 44.26432209974167, + "learning_rate": 3.919597989949749e-06, + "loss": 0.6455, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 39.68079476845012, + "learning_rate": 3.969849246231156e-06, + "loss": 0.542, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 30.715115917294153, + "learning_rate": 4.0201005025125635e-06, + "loss": 0.5455, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 45.51910069229239, + "learning_rate": 4.07035175879397e-06, + "loss": 0.5784, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 59.678584465472404, + "learning_rate": 4.120603015075377e-06, + "loss": 0.6253, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 38.605599627687425, + "learning_rate": 4.170854271356784e-06, + "loss": 0.5777, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 29.301353927752366, + "learning_rate": 4.221105527638191e-06, + "loss": 0.5665, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.6523416831756592, + "learning_rate": 4.271356783919598e-06, + "loss": 0.4574, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 28.83187389327923, + "learning_rate": 4.321608040201005e-06, + "loss": 0.6209, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 21.872822130085265, + "learning_rate": 4.3718592964824125e-06, + "loss": 0.5853, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 56.360847037427874, + "learning_rate": 4.42211055276382e-06, + "loss": 0.5734, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 48.68148087515708, + "learning_rate": 4.472361809045226e-06, + "loss": 0.5491, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 30.65555739079307, + "learning_rate": 4.522613065326634e-06, + "loss": 0.5757, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 29.256091037601923, + "learning_rate": 4.57286432160804e-06, + "loss": 0.5914, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 27.8410115332911, + "learning_rate": 4.623115577889448e-06, + "loss": 0.6077, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 49.21321807169021, + "learning_rate": 4.673366834170855e-06, + "loss": 0.5944, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 40.55362164051215, + "learning_rate": 4.7236180904522615e-06, + "loss": 0.674, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 51.61931029001503, + "learning_rate": 4.773869346733669e-06, + "loss": 0.6142, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 27.6470664763442, + "learning_rate": 4.824120603015076e-06, + "loss": 0.6173, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 34.293180346262666, + "learning_rate": 4.874371859296483e-06, + "loss": 0.5773, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 38.65818714603588, + "learning_rate": 4.92462311557789e-06, + "loss": 0.657, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 21.35105429407032, + "learning_rate": 4.974874371859297e-06, + "loss": 0.5465, + "step": 99 + }, + { + "epoch": 0.02, + "grad_norm": 30.012254126098302, + "learning_rate": 5.025125628140704e-06, + "loss": 0.6993, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 28.289640193861494, + "learning_rate": 5.0753768844221105e-06, + "loss": 0.5553, + "step": 101 + }, + { + "epoch": 0.02, + "grad_norm": 102.01334548254385, + "learning_rate": 5.125628140703518e-06, + "loss": 0.6439, + "step": 102 + }, + { + "epoch": 0.02, + "grad_norm": 34.36520299347059, + "learning_rate": 5.175879396984925e-06, + "loss": 0.5853, + "step": 103 + }, + { + "epoch": 0.02, + "grad_norm": 25.425230295497396, + "learning_rate": 5.2261306532663325e-06, + "loss": 0.6361, + "step": 104 + }, + { + "epoch": 0.02, + "grad_norm": 34.76338183402994, + "learning_rate": 5.2763819095477384e-06, + "loss": 0.5541, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 46.525530607258666, + "learning_rate": 5.326633165829146e-06, + "loss": 0.5394, + "step": 106 + }, + { + "epoch": 0.02, + "grad_norm": 27.541364641367817, + "learning_rate": 5.376884422110553e-06, + "loss": 0.6212, + "step": 107 + }, + { + "epoch": 0.02, + "grad_norm": 25.842033740196463, + "learning_rate": 5.42713567839196e-06, + "loss": 0.6432, + "step": 108 + }, + { + "epoch": 0.02, + "grad_norm": 35.818253585216006, + "learning_rate": 5.477386934673368e-06, + "loss": 0.5838, + "step": 109 + }, + { + "epoch": 0.02, + "grad_norm": 36.4882165209498, + "learning_rate": 5.527638190954774e-06, + "loss": 0.5933, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 37.5355979339956, + "learning_rate": 5.577889447236181e-06, + "loss": 0.6544, + "step": 111 + }, + { + "epoch": 0.02, + "grad_norm": 41.39378007342803, + "learning_rate": 5.628140703517588e-06, + "loss": 0.5863, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 111.1860664374623, + "learning_rate": 5.678391959798996e-06, + "loss": 0.4986, + "step": 113 + }, + { + "epoch": 0.02, + "grad_norm": 33.26851464218771, + "learning_rate": 5.728643216080403e-06, + "loss": 0.5632, + "step": 114 + }, + { + "epoch": 0.02, + "grad_norm": 21.775705044101347, + "learning_rate": 5.778894472361809e-06, + "loss": 0.6209, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 27.742539018435583, + "learning_rate": 5.829145728643216e-06, + "loss": 0.5511, + "step": 116 + }, + { + "epoch": 0.02, + "grad_norm": 22.962555775332277, + "learning_rate": 5.879396984924624e-06, + "loss": 0.7013, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 22.126038778253104, + "learning_rate": 5.9296482412060305e-06, + "loss": 0.5704, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 34.81754750465778, + "learning_rate": 5.979899497487438e-06, + "loss": 0.6602, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 20.722454958381157, + "learning_rate": 6.030150753768844e-06, + "loss": 0.6041, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 1.851637449637002, + "learning_rate": 6.080402010050252e-06, + "loss": 0.493, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 21.763183463124886, + "learning_rate": 6.130653266331658e-06, + "loss": 0.598, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 40.89684016180527, + "learning_rate": 6.180904522613066e-06, + "loss": 0.5659, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 21.181290876327587, + "learning_rate": 6.231155778894474e-06, + "loss": 0.6299, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 20.56180954584001, + "learning_rate": 6.28140703517588e-06, + "loss": 0.5623, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 75.32726603198796, + "learning_rate": 6.331658291457286e-06, + "loss": 0.5382, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 20.981367869000625, + "learning_rate": 6.381909547738694e-06, + "loss": 0.6155, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 37.67101203691849, + "learning_rate": 6.4321608040201015e-06, + "loss": 0.6474, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 24.158030044125677, + "learning_rate": 6.482412060301508e-06, + "loss": 0.5876, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 20.280132606259215, + "learning_rate": 6.532663316582916e-06, + "loss": 0.5876, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 17.6756733665952, + "learning_rate": 6.582914572864322e-06, + "loss": 0.5944, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 16.17184035828278, + "learning_rate": 6.633165829145729e-06, + "loss": 0.6571, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 14.491604437792551, + "learning_rate": 6.683417085427136e-06, + "loss": 0.5428, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 107.3445942141567, + "learning_rate": 6.733668341708544e-06, + "loss": 0.597, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 12.41658507054546, + "learning_rate": 6.7839195979899505e-06, + "loss": 0.5548, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 14.919543575370547, + "learning_rate": 6.834170854271357e-06, + "loss": 0.6018, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 9.76074869020652, + "learning_rate": 6.884422110552764e-06, + "loss": 0.4914, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 17.592539228768032, + "learning_rate": 6.934673366834172e-06, + "loss": 0.5318, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 8.846351562760573, + "learning_rate": 6.984924623115578e-06, + "loss": 0.5191, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 12.298590056914627, + "learning_rate": 7.035175879396986e-06, + "loss": 0.5853, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 13.331932940756646, + "learning_rate": 7.085427135678392e-06, + "loss": 0.592, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 12.640841634849922, + "learning_rate": 7.1356783919597995e-06, + "loss": 0.577, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 14.627934085382863, + "learning_rate": 7.185929648241206e-06, + "loss": 0.6412, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 13.041339890524648, + "learning_rate": 7.236180904522614e-06, + "loss": 0.6223, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 12.080668821939677, + "learning_rate": 7.2864321608040215e-06, + "loss": 0.5403, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 28.59118964978665, + "learning_rate": 7.336683417085427e-06, + "loss": 0.5672, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 22.512062985869857, + "learning_rate": 7.386934673366835e-06, + "loss": 0.572, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 14.611894722368069, + "learning_rate": 7.437185929648242e-06, + "loss": 0.5913, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 12.908800658783578, + "learning_rate": 7.487437185929649e-06, + "loss": 0.5541, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 12.572046015264789, + "learning_rate": 7.537688442211056e-06, + "loss": 0.5756, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 4.076290379152406, + "learning_rate": 7.587939698492463e-06, + "loss": 0.7968, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 11.142987672905392, + "learning_rate": 7.63819095477387e-06, + "loss": 0.5895, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 10.219279862892433, + "learning_rate": 7.688442211055276e-06, + "loss": 0.6431, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 82.51032088105623, + "learning_rate": 7.738693467336685e-06, + "loss": 0.6039, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 25.521460456044846, + "learning_rate": 7.788944723618092e-06, + "loss": 0.5741, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 15.66496154723627, + "learning_rate": 7.839195979899498e-06, + "loss": 0.5678, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 14.600602524240477, + "learning_rate": 7.889447236180905e-06, + "loss": 0.6139, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 10.507798186601256, + "learning_rate": 7.939698492462312e-06, + "loss": 0.569, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 10.306639039986711, + "learning_rate": 7.989949748743719e-06, + "loss": 0.541, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 7.342281037687586, + "learning_rate": 8.040201005025127e-06, + "loss": 0.53, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 13.30352598042552, + "learning_rate": 8.090452261306532e-06, + "loss": 0.6573, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 9.30032789353799, + "learning_rate": 8.14070351758794e-06, + "loss": 0.5715, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 7.5080432563917014, + "learning_rate": 8.190954773869347e-06, + "loss": 0.6203, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 10.212160451857507, + "learning_rate": 8.241206030150754e-06, + "loss": 0.5741, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 1.8364686825594059, + "learning_rate": 8.291457286432163e-06, + "loss": 0.6082, + "step": 165 + }, + { + "epoch": 0.03, + "grad_norm": 12.04524440603669, + "learning_rate": 8.341708542713568e-06, + "loss": 0.5691, + "step": 166 + }, + { + "epoch": 0.03, + "grad_norm": 27.457121977496122, + "learning_rate": 8.391959798994976e-06, + "loss": 0.5014, + "step": 167 + }, + { + "epoch": 0.03, + "grad_norm": 29.12754737938816, + "learning_rate": 8.442211055276383e-06, + "loss": 0.6045, + "step": 168 + }, + { + "epoch": 0.03, + "grad_norm": 1.2743047049679155, + "learning_rate": 8.49246231155779e-06, + "loss": 0.5052, + "step": 169 + }, + { + "epoch": 0.03, + "grad_norm": 12.939720455650788, + "learning_rate": 8.542713567839196e-06, + "loss": 0.5941, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 13.847402650316074, + "learning_rate": 8.592964824120603e-06, + "loss": 0.6353, + "step": 171 + }, + { + "epoch": 0.03, + "grad_norm": 12.69994930362305, + "learning_rate": 8.64321608040201e-06, + "loss": 0.5709, + "step": 172 + }, + { + "epoch": 0.03, + "grad_norm": 14.852017450906747, + "learning_rate": 8.693467336683418e-06, + "loss": 0.5974, + "step": 173 + }, + { + "epoch": 0.03, + "grad_norm": 10.282759603287758, + "learning_rate": 8.743718592964825e-06, + "loss": 0.6649, + "step": 174 + }, + { + "epoch": 0.03, + "grad_norm": 8.705648615815905, + "learning_rate": 8.793969849246232e-06, + "loss": 0.6479, + "step": 175 + }, + { + "epoch": 0.03, + "grad_norm": 17.378780226562494, + "learning_rate": 8.84422110552764e-06, + "loss": 0.6284, + "step": 176 + }, + { + "epoch": 0.03, + "grad_norm": 10.485875767852814, + "learning_rate": 8.894472361809045e-06, + "loss": 0.6334, + "step": 177 + }, + { + "epoch": 0.03, + "grad_norm": 28.493604154824265, + "learning_rate": 8.944723618090452e-06, + "loss": 0.57, + "step": 178 + }, + { + "epoch": 0.03, + "grad_norm": 11.02569682051414, + "learning_rate": 8.99497487437186e-06, + "loss": 0.6414, + "step": 179 + }, + { + "epoch": 0.03, + "grad_norm": 23.94964353367036, + "learning_rate": 9.045226130653267e-06, + "loss": 0.5752, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 9.201892832647694, + "learning_rate": 9.095477386934674e-06, + "loss": 0.6497, + "step": 181 + }, + { + "epoch": 0.03, + "grad_norm": 34.56725707129777, + "learning_rate": 9.14572864321608e-06, + "loss": 0.6286, + "step": 182 + }, + { + "epoch": 0.03, + "grad_norm": 8.365885747574012, + "learning_rate": 9.195979899497488e-06, + "loss": 0.5054, + "step": 183 + }, + { + "epoch": 0.03, + "grad_norm": 7.315640561207363, + "learning_rate": 9.246231155778896e-06, + "loss": 0.4883, + "step": 184 + }, + { + "epoch": 0.03, + "grad_norm": 12.234484347412982, + "learning_rate": 9.296482412060303e-06, + "loss": 0.5716, + "step": 185 + }, + { + "epoch": 0.03, + "grad_norm": 10.013635084470593, + "learning_rate": 9.34673366834171e-06, + "loss": 0.5924, + "step": 186 + }, + { + "epoch": 0.03, + "grad_norm": 14.630225264034479, + "learning_rate": 9.396984924623116e-06, + "loss": 0.4601, + "step": 187 + }, + { + "epoch": 0.03, + "grad_norm": 11.696098457183469, + "learning_rate": 9.447236180904523e-06, + "loss": 0.577, + "step": 188 + }, + { + "epoch": 0.03, + "grad_norm": 6.884969530672969, + "learning_rate": 9.49748743718593e-06, + "loss": 0.5676, + "step": 189 + }, + { + "epoch": 0.03, + "grad_norm": 8.670258798400683, + "learning_rate": 9.547738693467338e-06, + "loss": 0.5952, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 8.71830253774252, + "learning_rate": 9.597989949748745e-06, + "loss": 0.6696, + "step": 191 + }, + { + "epoch": 0.03, + "grad_norm": 9.352870366648284, + "learning_rate": 9.648241206030152e-06, + "loss": 0.5667, + "step": 192 + }, + { + "epoch": 0.03, + "grad_norm": 13.676226977398779, + "learning_rate": 9.698492462311559e-06, + "loss": 0.6238, + "step": 193 + }, + { + "epoch": 0.03, + "grad_norm": 10.293976772997915, + "learning_rate": 9.748743718592965e-06, + "loss": 0.6028, + "step": 194 + }, + { + "epoch": 0.03, + "grad_norm": 22.435553917528523, + "learning_rate": 9.798994974874372e-06, + "loss": 0.5388, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 7.757970695912968, + "learning_rate": 9.84924623115578e-06, + "loss": 0.6014, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 1.7090923697651277, + "learning_rate": 9.899497487437186e-06, + "loss": 0.5893, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 11.88306428787245, + "learning_rate": 9.949748743718594e-06, + "loss": 0.5427, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 20.61088049278884, + "learning_rate": 1e-05, + "loss": 0.5998, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 8.016234517570497, + "learning_rate": 9.99999940340072e-06, + "loss": 0.706, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 11.800441976899956, + "learning_rate": 9.999997613603016e-06, + "loss": 0.6313, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 5.668359423329318, + "learning_rate": 9.999994630607318e-06, + "loss": 0.5998, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 29.442454035777118, + "learning_rate": 9.999990454414338e-06, + "loss": 0.547, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 9.067155539653964, + "learning_rate": 9.999985085025072e-06, + "loss": 0.6207, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 32.68446451394774, + "learning_rate": 9.999978522440803e-06, + "loss": 0.5438, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 13.281531163040034, + "learning_rate": 9.999970766663094e-06, + "loss": 0.5807, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 12.477145410296334, + "learning_rate": 9.999961817693799e-06, + "loss": 0.5582, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 2.35353289465063, + "learning_rate": 9.99995167553505e-06, + "loss": 0.6438, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 6.9351982301113475, + "learning_rate": 9.999940340189272e-06, + "loss": 0.5542, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 8.046577237746433, + "learning_rate": 9.999927811659165e-06, + "loss": 0.5875, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 8.878968149846823, + "learning_rate": 9.999914089947723e-06, + "loss": 0.6, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 7.169102821627069, + "learning_rate": 9.999899175058218e-06, + "loss": 0.5599, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 8.50112104914739, + "learning_rate": 9.999883066994212e-06, + "loss": 0.5484, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 138.92279398438325, + "learning_rate": 9.999865765759545e-06, + "loss": 0.6151, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 8.162819961893684, + "learning_rate": 9.999847271358347e-06, + "loss": 0.5758, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 9.284957897146134, + "learning_rate": 9.999827583795034e-06, + "loss": 0.596, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 27.69215261996493, + "learning_rate": 9.999806703074302e-06, + "loss": 0.5723, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 26.909067557024127, + "learning_rate": 9.999784629201135e-06, + "loss": 0.5338, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 6.64861562521606, + "learning_rate": 9.9997613621808e-06, + "loss": 0.6206, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 9.976521658151706, + "learning_rate": 9.99973690201885e-06, + "loss": 0.5941, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 12.777655350160853, + "learning_rate": 9.999711248721123e-06, + "loss": 0.5976, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 11.745974814050962, + "learning_rate": 9.99968440229374e-06, + "loss": 0.6298, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 11.291761308239028, + "learning_rate": 9.999656362743105e-06, + "loss": 0.6064, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 9.79596931623397, + "learning_rate": 9.999627130075913e-06, + "loss": 0.6407, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 10.600729461315963, + "learning_rate": 9.99959670429914e-06, + "loss": 0.5889, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 8.51993555567592, + "learning_rate": 9.999565085420043e-06, + "loss": 0.5285, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 11.439883313988274, + "learning_rate": 9.999532273446171e-06, + "loss": 0.609, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 7.302480184408054, + "learning_rate": 9.999498268385355e-06, + "loss": 0.6047, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 29.287480372108416, + "learning_rate": 9.999463070245708e-06, + "loss": 0.6137, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 13.267635474103606, + "learning_rate": 9.999426679035628e-06, + "loss": 0.5282, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 16.826599376523802, + "learning_rate": 9.999389094763803e-06, + "loss": 0.5646, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 10.873440477203618, + "learning_rate": 9.9993503174392e-06, + "loss": 0.5036, + "step": 232 + }, + { + "epoch": 0.04, + "grad_norm": 19.65972465759151, + "learning_rate": 9.999310347071074e-06, + "loss": 0.5581, + "step": 233 + }, + { + "epoch": 0.04, + "grad_norm": 26.12962130035789, + "learning_rate": 9.999269183668962e-06, + "loss": 0.5532, + "step": 234 + }, + { + "epoch": 0.04, + "grad_norm": 10.372674692441636, + "learning_rate": 9.99922682724269e-06, + "loss": 0.5737, + "step": 235 + }, + { + "epoch": 0.04, + "grad_norm": 9.42407999298219, + "learning_rate": 9.999183277802362e-06, + "loss": 0.6267, + "step": 236 + }, + { + "epoch": 0.04, + "grad_norm": 36.611472240219435, + "learning_rate": 9.999138535358373e-06, + "loss": 0.5672, + "step": 237 + }, + { + "epoch": 0.04, + "grad_norm": 8.825855345960257, + "learning_rate": 9.9990925999214e-06, + "loss": 0.5957, + "step": 238 + }, + { + "epoch": 0.04, + "grad_norm": 19.323100350984195, + "learning_rate": 9.999045471502406e-06, + "loss": 0.5767, + "step": 239 + }, + { + "epoch": 0.04, + "grad_norm": 11.610436189144876, + "learning_rate": 9.998997150112636e-06, + "loss": 0.5068, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 13.675084247743643, + "learning_rate": 9.998947635763622e-06, + "loss": 0.4517, + "step": 241 + }, + { + "epoch": 0.04, + "grad_norm": 19.84468761589128, + "learning_rate": 9.99889692846718e-06, + "loss": 0.5475, + "step": 242 + }, + { + "epoch": 0.04, + "grad_norm": 12.197872980064998, + "learning_rate": 9.998845028235411e-06, + "loss": 0.5209, + "step": 243 + }, + { + "epoch": 0.04, + "grad_norm": 10.83477683089179, + "learning_rate": 9.998791935080701e-06, + "loss": 0.6186, + "step": 244 + }, + { + "epoch": 0.04, + "grad_norm": 11.4993681294156, + "learning_rate": 9.998737649015719e-06, + "loss": 0.6267, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 31.252219660148725, + "learning_rate": 9.99868217005342e-06, + "loss": 0.5104, + "step": 246 + }, + { + "epoch": 0.04, + "grad_norm": 13.15122959085679, + "learning_rate": 9.998625498207044e-06, + "loss": 0.5399, + "step": 247 + }, + { + "epoch": 0.04, + "grad_norm": 23.82904805404931, + "learning_rate": 9.998567633490115e-06, + "loss": 0.5158, + "step": 248 + }, + { + "epoch": 0.04, + "grad_norm": 7.669607222407733, + "learning_rate": 9.998508575916442e-06, + "loss": 0.5174, + "step": 249 + }, + { + "epoch": 0.04, + "grad_norm": 7.15240988221442, + "learning_rate": 9.998448325500118e-06, + "loss": 0.5424, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 5.809479933372809, + "learning_rate": 9.998386882255522e-06, + "loss": 0.4983, + "step": 251 + }, + { + "epoch": 0.04, + "grad_norm": 7.343704750718917, + "learning_rate": 9.998324246197315e-06, + "loss": 0.5393, + "step": 252 + }, + { + "epoch": 0.04, + "grad_norm": 6.267931091924353, + "learning_rate": 9.998260417340447e-06, + "loss": 0.5441, + "step": 253 + }, + { + "epoch": 0.04, + "grad_norm": 1.68703472201826, + "learning_rate": 9.998195395700149e-06, + "loss": 0.5243, + "step": 254 + }, + { + "epoch": 0.04, + "grad_norm": 4.463114207456857, + "learning_rate": 9.998129181291937e-06, + "loss": 0.5313, + "step": 255 + }, + { + "epoch": 0.04, + "grad_norm": 8.69128090124307, + "learning_rate": 9.998061774131613e-06, + "loss": 0.5809, + "step": 256 + }, + { + "epoch": 0.04, + "grad_norm": 8.123272230025881, + "learning_rate": 9.997993174235262e-06, + "loss": 0.5827, + "step": 257 + }, + { + "epoch": 0.04, + "grad_norm": 5.612474846339437, + "learning_rate": 9.997923381619257e-06, + "loss": 0.5972, + "step": 258 + }, + { + "epoch": 0.04, + "grad_norm": 8.091467988522647, + "learning_rate": 9.997852396300251e-06, + "loss": 0.5681, + "step": 259 + }, + { + "epoch": 0.04, + "grad_norm": 7.9099777000855305, + "learning_rate": 9.997780218295186e-06, + "loss": 0.5507, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 7.08624697672159, + "learning_rate": 9.997706847621283e-06, + "loss": 0.5729, + "step": 261 + }, + { + "epoch": 0.04, + "grad_norm": 14.13594760982386, + "learning_rate": 9.997632284296055e-06, + "loss": 0.5495, + "step": 262 + }, + { + "epoch": 0.04, + "grad_norm": 8.334469223874015, + "learning_rate": 9.997556528337294e-06, + "loss": 0.5871, + "step": 263 + }, + { + "epoch": 0.04, + "grad_norm": 7.107314307562153, + "learning_rate": 9.99747957976308e-06, + "loss": 0.5658, + "step": 264 + }, + { + "epoch": 0.04, + "grad_norm": 29.624877151160735, + "learning_rate": 9.997401438591772e-06, + "loss": 0.6171, + "step": 265 + }, + { + "epoch": 0.04, + "grad_norm": 10.840800982792306, + "learning_rate": 9.997322104842022e-06, + "loss": 0.5909, + "step": 266 + }, + { + "epoch": 0.04, + "grad_norm": 1.7693448468099302, + "learning_rate": 9.99724157853276e-06, + "loss": 0.599, + "step": 267 + }, + { + "epoch": 0.04, + "grad_norm": 9.148578111689773, + "learning_rate": 9.997159859683202e-06, + "loss": 0.5699, + "step": 268 + }, + { + "epoch": 0.04, + "grad_norm": 5.720413406550486, + "learning_rate": 9.997076948312854e-06, + "loss": 0.5882, + "step": 269 + }, + { + "epoch": 0.04, + "grad_norm": 6.479917639919372, + "learning_rate": 9.996992844441495e-06, + "loss": 0.5104, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 9.40208619326653, + "learning_rate": 9.9969075480892e-06, + "loss": 0.5415, + "step": 271 + }, + { + "epoch": 0.04, + "grad_norm": 7.261932920368522, + "learning_rate": 9.996821059276324e-06, + "loss": 0.6097, + "step": 272 + }, + { + "epoch": 0.04, + "grad_norm": 7.995186867538704, + "learning_rate": 9.996733378023507e-06, + "loss": 0.5487, + "step": 273 + }, + { + "epoch": 0.04, + "grad_norm": 5.766011602556588, + "learning_rate": 9.99664450435167e-06, + "loss": 0.6235, + "step": 274 + }, + { + "epoch": 0.04, + "grad_norm": 10.245219985860349, + "learning_rate": 9.996554438282022e-06, + "loss": 0.5595, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 5.232355023753634, + "learning_rate": 9.99646317983606e-06, + "loss": 0.5108, + "step": 276 + }, + { + "epoch": 0.04, + "grad_norm": 5.324443517614879, + "learning_rate": 9.996370729035562e-06, + "loss": 0.562, + "step": 277 + }, + { + "epoch": 0.04, + "grad_norm": 6.675708072644669, + "learning_rate": 9.996277085902587e-06, + "loss": 0.5871, + "step": 278 + }, + { + "epoch": 0.04, + "grad_norm": 4.202930918073277, + "learning_rate": 9.996182250459482e-06, + "loss": 0.6279, + "step": 279 + }, + { + "epoch": 0.04, + "grad_norm": 12.543966258135516, + "learning_rate": 9.99608622272888e-06, + "loss": 0.5181, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 5.0054898237369505, + "learning_rate": 9.9959890027337e-06, + "loss": 0.4936, + "step": 281 + }, + { + "epoch": 0.04, + "grad_norm": 5.641326956964122, + "learning_rate": 9.995890590497137e-06, + "loss": 0.5829, + "step": 282 + }, + { + "epoch": 0.04, + "grad_norm": 4.288768190807301, + "learning_rate": 9.995790986042678e-06, + "loss": 0.5789, + "step": 283 + }, + { + "epoch": 0.04, + "grad_norm": 3.836510171047746, + "learning_rate": 9.995690189394095e-06, + "loss": 0.6051, + "step": 284 + }, + { + "epoch": 0.04, + "grad_norm": 3.9540427198366, + "learning_rate": 9.995588200575439e-06, + "loss": 0.5723, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 4.4835679648994535, + "learning_rate": 9.99548501961105e-06, + "loss": 0.6063, + "step": 286 + }, + { + "epoch": 0.04, + "grad_norm": 3.4688368532425877, + "learning_rate": 9.995380646525552e-06, + "loss": 0.6053, + "step": 287 + }, + { + "epoch": 0.04, + "grad_norm": 3.2781985653694194, + "learning_rate": 9.99527508134385e-06, + "loss": 0.5661, + "step": 288 + }, + { + "epoch": 0.04, + "grad_norm": 8.057395199344807, + "learning_rate": 9.995168324091138e-06, + "loss": 0.6176, + "step": 289 + }, + { + "epoch": 0.04, + "grad_norm": 11.287009804062839, + "learning_rate": 9.995060374792891e-06, + "loss": 0.5701, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 4.0065877122942695, + "learning_rate": 9.994951233474873e-06, + "loss": 0.6082, + "step": 291 + }, + { + "epoch": 0.04, + "grad_norm": 6.649890075906451, + "learning_rate": 9.994840900163127e-06, + "loss": 0.5821, + "step": 292 + }, + { + "epoch": 0.04, + "grad_norm": 3.8046918025983905, + "learning_rate": 9.994729374883982e-06, + "loss": 0.6069, + "step": 293 + }, + { + "epoch": 0.04, + "grad_norm": 4.293678993348817, + "learning_rate": 9.994616657664056e-06, + "loss": 0.5658, + "step": 294 + }, + { + "epoch": 0.04, + "grad_norm": 5.534239074937565, + "learning_rate": 9.994502748530243e-06, + "loss": 0.6102, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 3.4955175456292853, + "learning_rate": 9.994387647509732e-06, + "loss": 0.5602, + "step": 296 + }, + { + "epoch": 0.04, + "grad_norm": 4.674216498033882, + "learning_rate": 9.994271354629985e-06, + "loss": 0.5333, + "step": 297 + }, + { + "epoch": 0.04, + "grad_norm": 6.799211361048443, + "learning_rate": 9.994153869918757e-06, + "loss": 0.5951, + "step": 298 + }, + { + "epoch": 0.05, + "grad_norm": 5.317241033094687, + "learning_rate": 9.994035193404083e-06, + "loss": 0.5362, + "step": 299 + }, + { + "epoch": 0.05, + "grad_norm": 12.277217311250674, + "learning_rate": 9.993915325114286e-06, + "loss": 0.5365, + "step": 300 + }, + { + "epoch": 0.05, + "grad_norm": 8.71239699421881, + "learning_rate": 9.993794265077973e-06, + "loss": 0.4997, + "step": 301 + }, + { + "epoch": 0.05, + "grad_norm": 6.2808155530701795, + "learning_rate": 9.993672013324027e-06, + "loss": 0.5376, + "step": 302 + }, + { + "epoch": 0.05, + "grad_norm": 5.5451084889740905, + "learning_rate": 9.993548569881629e-06, + "loss": 0.5484, + "step": 303 + }, + { + "epoch": 0.05, + "grad_norm": 4.315428158656323, + "learning_rate": 9.993423934780234e-06, + "loss": 0.4811, + "step": 304 + }, + { + "epoch": 0.05, + "grad_norm": 5.035551424142229, + "learning_rate": 9.993298108049582e-06, + "loss": 0.5156, + "step": 305 + }, + { + "epoch": 0.05, + "grad_norm": 5.18865103806143, + "learning_rate": 9.993171089719709e-06, + "loss": 0.6081, + "step": 306 + }, + { + "epoch": 0.05, + "grad_norm": 3.7861730299504783, + "learning_rate": 9.993042879820919e-06, + "loss": 0.5759, + "step": 307 + }, + { + "epoch": 0.05, + "grad_norm": 4.2473187991770525, + "learning_rate": 9.99291347838381e-06, + "loss": 0.6178, + "step": 308 + }, + { + "epoch": 0.05, + "grad_norm": 7.460983528798518, + "learning_rate": 9.992782885439264e-06, + "loss": 0.6129, + "step": 309 + }, + { + "epoch": 0.05, + "grad_norm": 3.2504658221365204, + "learning_rate": 9.992651101018445e-06, + "loss": 0.6251, + "step": 310 + }, + { + "epoch": 0.05, + "grad_norm": 4.5471115468633485, + "learning_rate": 9.9925181251528e-06, + "loss": 0.6206, + "step": 311 + }, + { + "epoch": 0.05, + "grad_norm": 3.559711856128409, + "learning_rate": 9.992383957874064e-06, + "loss": 0.4806, + "step": 312 + }, + { + "epoch": 0.05, + "grad_norm": 3.3691960256501114, + "learning_rate": 9.992248599214253e-06, + "loss": 0.5963, + "step": 313 + }, + { + "epoch": 0.05, + "grad_norm": 2.968602774473347, + "learning_rate": 9.992112049205674e-06, + "loss": 0.558, + "step": 314 + }, + { + "epoch": 0.05, + "grad_norm": 4.0786187339854365, + "learning_rate": 9.991974307880907e-06, + "loss": 0.5641, + "step": 315 + }, + { + "epoch": 0.05, + "grad_norm": 4.138642343520191, + "learning_rate": 9.991835375272825e-06, + "loss": 0.6231, + "step": 316 + }, + { + "epoch": 0.05, + "grad_norm": 3.483354275951371, + "learning_rate": 9.991695251414584e-06, + "loss": 0.5677, + "step": 317 + }, + { + "epoch": 0.05, + "grad_norm": 1.4553910964484875, + "learning_rate": 9.991553936339621e-06, + "loss": 0.5458, + "step": 318 + }, + { + "epoch": 0.05, + "grad_norm": 3.541641123752431, + "learning_rate": 9.99141143008166e-06, + "loss": 0.5739, + "step": 319 + }, + { + "epoch": 0.05, + "grad_norm": 3.3400238903826867, + "learning_rate": 9.991267732674712e-06, + "loss": 0.6116, + "step": 320 + }, + { + "epoch": 0.05, + "grad_norm": 5.142675903895313, + "learning_rate": 9.991122844153064e-06, + "loss": 0.5393, + "step": 321 + }, + { + "epoch": 0.05, + "grad_norm": 4.787845243793807, + "learning_rate": 9.990976764551295e-06, + "loss": 0.5931, + "step": 322 + }, + { + "epoch": 0.05, + "grad_norm": 6.145015952912078, + "learning_rate": 9.990829493904263e-06, + "loss": 0.4538, + "step": 323 + }, + { + "epoch": 0.05, + "grad_norm": 3.363975419405047, + "learning_rate": 9.990681032247116e-06, + "loss": 0.5752, + "step": 324 + }, + { + "epoch": 0.05, + "grad_norm": 3.896382169356612, + "learning_rate": 9.99053137961528e-06, + "loss": 0.5102, + "step": 325 + }, + { + "epoch": 0.05, + "grad_norm": 5.220431137645542, + "learning_rate": 9.99038053604447e-06, + "loss": 0.6411, + "step": 326 + }, + { + "epoch": 0.05, + "grad_norm": 3.368212409738737, + "learning_rate": 9.990228501570683e-06, + "loss": 0.5237, + "step": 327 + }, + { + "epoch": 0.05, + "grad_norm": 5.545368920692188, + "learning_rate": 9.990075276230198e-06, + "loss": 0.5525, + "step": 328 + }, + { + "epoch": 0.05, + "grad_norm": 4.526793870549361, + "learning_rate": 9.989920860059583e-06, + "loss": 0.6496, + "step": 329 + }, + { + "epoch": 0.05, + "grad_norm": 4.39358924922123, + "learning_rate": 9.989765253095687e-06, + "loss": 0.5744, + "step": 330 + }, + { + "epoch": 0.05, + "grad_norm": 4.892514219804371, + "learning_rate": 9.989608455375646e-06, + "loss": 0.5263, + "step": 331 + }, + { + "epoch": 0.05, + "grad_norm": 4.899152726944619, + "learning_rate": 9.989450466936875e-06, + "loss": 0.5814, + "step": 332 + }, + { + "epoch": 0.05, + "grad_norm": 9.51124859640849, + "learning_rate": 9.989291287817077e-06, + "loss": 0.55, + "step": 333 + }, + { + "epoch": 0.05, + "grad_norm": 3.6007962146419947, + "learning_rate": 9.989130918054242e-06, + "loss": 0.5491, + "step": 334 + }, + { + "epoch": 0.05, + "grad_norm": 7.625736061192414, + "learning_rate": 9.988969357686636e-06, + "loss": 0.6003, + "step": 335 + }, + { + "epoch": 0.05, + "grad_norm": 4.681743846070172, + "learning_rate": 9.988806606752817e-06, + "loss": 0.6247, + "step": 336 + }, + { + "epoch": 0.05, + "grad_norm": 5.738862838490249, + "learning_rate": 9.98864266529162e-06, + "loss": 0.5547, + "step": 337 + }, + { + "epoch": 0.05, + "grad_norm": 6.794891102562757, + "learning_rate": 9.98847753334217e-06, + "loss": 0.5886, + "step": 338 + }, + { + "epoch": 0.05, + "grad_norm": 5.345375679451182, + "learning_rate": 9.988311210943878e-06, + "loss": 0.6024, + "step": 339 + }, + { + "epoch": 0.05, + "grad_norm": 4.346876133934082, + "learning_rate": 9.98814369813643e-06, + "loss": 0.5577, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 5.821343446390748, + "learning_rate": 9.987974994959803e-06, + "loss": 0.6299, + "step": 341 + }, + { + "epoch": 0.05, + "grad_norm": 4.68804329700119, + "learning_rate": 9.987805101454255e-06, + "loss": 0.6043, + "step": 342 + }, + { + "epoch": 0.05, + "grad_norm": 6.888239691622705, + "learning_rate": 9.98763401766033e-06, + "loss": 0.6072, + "step": 343 + }, + { + "epoch": 0.05, + "grad_norm": 3.775970261307909, + "learning_rate": 9.987461743618858e-06, + "loss": 0.5334, + "step": 344 + }, + { + "epoch": 0.05, + "grad_norm": 6.066907007927615, + "learning_rate": 9.987288279370945e-06, + "loss": 0.54, + "step": 345 + }, + { + "epoch": 0.05, + "grad_norm": 7.543418185542019, + "learning_rate": 9.987113624957993e-06, + "loss": 0.586, + "step": 346 + }, + { + "epoch": 0.05, + "grad_norm": 4.269728838035235, + "learning_rate": 9.986937780421675e-06, + "loss": 0.5388, + "step": 347 + }, + { + "epoch": 0.05, + "grad_norm": 5.427212880401611, + "learning_rate": 9.986760745803961e-06, + "loss": 0.5772, + "step": 348 + }, + { + "epoch": 0.05, + "grad_norm": 5.75785623464889, + "learning_rate": 9.986582521147093e-06, + "loss": 0.5383, + "step": 349 + }, + { + "epoch": 0.05, + "grad_norm": 4.917303082957225, + "learning_rate": 9.986403106493603e-06, + "loss": 0.5156, + "step": 350 + }, + { + "epoch": 0.05, + "grad_norm": 5.298446282361764, + "learning_rate": 9.986222501886311e-06, + "loss": 0.5486, + "step": 351 + }, + { + "epoch": 0.05, + "grad_norm": 1.764076443994568, + "learning_rate": 9.986040707368312e-06, + "loss": 0.5904, + "step": 352 + }, + { + "epoch": 0.05, + "grad_norm": 1.4753588553302286, + "learning_rate": 9.985857722982991e-06, + "loss": 0.5751, + "step": 353 + }, + { + "epoch": 0.05, + "grad_norm": 4.206012679524437, + "learning_rate": 9.985673548774015e-06, + "loss": 0.491, + "step": 354 + }, + { + "epoch": 0.05, + "grad_norm": 4.609916604632889, + "learning_rate": 9.985488184785337e-06, + "loss": 0.567, + "step": 355 + }, + { + "epoch": 0.05, + "grad_norm": 5.8057742610396685, + "learning_rate": 9.985301631061188e-06, + "loss": 0.6369, + "step": 356 + }, + { + "epoch": 0.05, + "grad_norm": 4.233662085062618, + "learning_rate": 9.985113887646093e-06, + "loss": 0.571, + "step": 357 + }, + { + "epoch": 0.05, + "grad_norm": 5.763738084839761, + "learning_rate": 9.98492495458485e-06, + "loss": 0.561, + "step": 358 + }, + { + "epoch": 0.05, + "grad_norm": 4.287610144203753, + "learning_rate": 9.984734831922546e-06, + "loss": 0.5604, + "step": 359 + }, + { + "epoch": 0.05, + "grad_norm": 4.8853425282602165, + "learning_rate": 9.984543519704558e-06, + "loss": 0.5798, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 1.9998892982915681, + "learning_rate": 9.984351017976533e-06, + "loss": 0.6312, + "step": 361 + }, + { + "epoch": 0.05, + "grad_norm": 4.005855156759343, + "learning_rate": 9.984157326784414e-06, + "loss": 0.612, + "step": 362 + }, + { + "epoch": 0.05, + "grad_norm": 4.421510220434119, + "learning_rate": 9.983962446174423e-06, + "loss": 0.5596, + "step": 363 + }, + { + "epoch": 0.05, + "grad_norm": 5.420281715177006, + "learning_rate": 9.983766376193063e-06, + "loss": 0.5929, + "step": 364 + }, + { + "epoch": 0.06, + "grad_norm": 4.577327704606224, + "learning_rate": 9.983569116887129e-06, + "loss": 0.6193, + "step": 365 + }, + { + "epoch": 0.06, + "grad_norm": 5.549613676857449, + "learning_rate": 9.983370668303691e-06, + "loss": 0.5266, + "step": 366 + }, + { + "epoch": 0.06, + "grad_norm": 7.176602900555059, + "learning_rate": 9.983171030490108e-06, + "loss": 0.5386, + "step": 367 + }, + { + "epoch": 0.06, + "grad_norm": 4.053423992976838, + "learning_rate": 9.982970203494023e-06, + "loss": 0.6397, + "step": 368 + }, + { + "epoch": 0.06, + "grad_norm": 1.6027947669367055, + "learning_rate": 9.982768187363359e-06, + "loss": 0.5973, + "step": 369 + }, + { + "epoch": 0.06, + "grad_norm": 4.995753317876349, + "learning_rate": 9.982564982146329e-06, + "loss": 0.6276, + "step": 370 + }, + { + "epoch": 0.06, + "grad_norm": 1.3903018175244346, + "learning_rate": 9.982360587891418e-06, + "loss": 0.5643, + "step": 371 + }, + { + "epoch": 0.06, + "grad_norm": 3.972729191772038, + "learning_rate": 9.982155004647412e-06, + "loss": 0.5165, + "step": 372 + }, + { + "epoch": 0.06, + "grad_norm": 8.13611535744167, + "learning_rate": 9.981948232463364e-06, + "loss": 0.5269, + "step": 373 + }, + { + "epoch": 0.06, + "grad_norm": 4.120440940286446, + "learning_rate": 9.981740271388622e-06, + "loss": 0.5889, + "step": 374 + }, + { + "epoch": 0.06, + "grad_norm": 4.31747381169351, + "learning_rate": 9.981531121472811e-06, + "loss": 0.5407, + "step": 375 + }, + { + "epoch": 0.06, + "grad_norm": 5.435769878856964, + "learning_rate": 9.981320782765847e-06, + "loss": 0.5202, + "step": 376 + }, + { + "epoch": 0.06, + "grad_norm": 4.281763425719071, + "learning_rate": 9.98110925531792e-06, + "loss": 0.5953, + "step": 377 + }, + { + "epoch": 0.06, + "grad_norm": 4.96231008950992, + "learning_rate": 9.980896539179512e-06, + "loss": 0.4878, + "step": 378 + }, + { + "epoch": 0.06, + "grad_norm": 5.227847515340797, + "learning_rate": 9.980682634401386e-06, + "loss": 0.5803, + "step": 379 + }, + { + "epoch": 0.06, + "grad_norm": 5.851879314521378, + "learning_rate": 9.980467541034584e-06, + "loss": 0.5999, + "step": 380 + }, + { + "epoch": 0.06, + "grad_norm": 7.300392466987065, + "learning_rate": 9.98025125913044e-06, + "loss": 0.5141, + "step": 381 + }, + { + "epoch": 0.06, + "grad_norm": 5.747166873032592, + "learning_rate": 9.980033788740568e-06, + "loss": 0.5022, + "step": 382 + }, + { + "epoch": 0.06, + "grad_norm": 5.556397550568694, + "learning_rate": 9.979815129916861e-06, + "loss": 0.5584, + "step": 383 + }, + { + "epoch": 0.06, + "grad_norm": 1.998458221640642, + "learning_rate": 9.979595282711503e-06, + "loss": 0.6028, + "step": 384 + }, + { + "epoch": 0.06, + "grad_norm": 5.17924806581665, + "learning_rate": 9.979374247176955e-06, + "loss": 0.5783, + "step": 385 + }, + { + "epoch": 0.06, + "grad_norm": 17.97507330314053, + "learning_rate": 9.97915202336597e-06, + "loss": 0.5192, + "step": 386 + }, + { + "epoch": 0.06, + "grad_norm": 76.40610787653104, + "learning_rate": 9.978928611331576e-06, + "loss": 0.7398, + "step": 387 + }, + { + "epoch": 0.06, + "grad_norm": 3.9553664677850966, + "learning_rate": 9.978704011127086e-06, + "loss": 0.5868, + "step": 388 + }, + { + "epoch": 0.06, + "grad_norm": 8.433542875032972, + "learning_rate": 9.978478222806102e-06, + "loss": 0.5241, + "step": 389 + }, + { + "epoch": 0.06, + "grad_norm": 1.4497632830171998, + "learning_rate": 9.978251246422506e-06, + "loss": 0.5404, + "step": 390 + }, + { + "epoch": 0.06, + "grad_norm": 4.423266334199798, + "learning_rate": 9.978023082030462e-06, + "loss": 0.6168, + "step": 391 + }, + { + "epoch": 0.06, + "grad_norm": 5.691295365122946, + "learning_rate": 9.97779372968442e-06, + "loss": 0.5628, + "step": 392 + }, + { + "epoch": 0.06, + "grad_norm": 5.393704924655381, + "learning_rate": 9.977563189439113e-06, + "loss": 0.6399, + "step": 393 + }, + { + "epoch": 0.06, + "grad_norm": 6.2426386253161965, + "learning_rate": 9.977331461349553e-06, + "loss": 0.5966, + "step": 394 + }, + { + "epoch": 0.06, + "grad_norm": 5.795146200147255, + "learning_rate": 9.977098545471047e-06, + "loss": 0.5219, + "step": 395 + }, + { + "epoch": 0.06, + "grad_norm": 4.2474805325938485, + "learning_rate": 9.976864441859171e-06, + "loss": 0.5436, + "step": 396 + }, + { + "epoch": 0.06, + "grad_norm": 4.453237967535938, + "learning_rate": 9.976629150569794e-06, + "loss": 0.5401, + "step": 397 + }, + { + "epoch": 0.06, + "grad_norm": 4.727000388294842, + "learning_rate": 9.976392671659068e-06, + "loss": 0.5593, + "step": 398 + }, + { + "epoch": 0.06, + "grad_norm": 6.017743922902207, + "learning_rate": 9.976155005183423e-06, + "loss": 0.5012, + "step": 399 + }, + { + "epoch": 0.06, + "grad_norm": 7.922427043907916, + "learning_rate": 9.975916151199578e-06, + "loss": 0.5488, + "step": 400 + }, + { + "epoch": 0.06, + "grad_norm": 5.193971439556997, + "learning_rate": 9.97567610976453e-06, + "loss": 0.5601, + "step": 401 + }, + { + "epoch": 0.06, + "grad_norm": 6.183013893789854, + "learning_rate": 9.975434880935567e-06, + "loss": 0.5536, + "step": 402 + }, + { + "epoch": 0.06, + "grad_norm": 7.29503952869622, + "learning_rate": 9.975192464770253e-06, + "loss": 0.6191, + "step": 403 + }, + { + "epoch": 0.06, + "grad_norm": 24.980136083194964, + "learning_rate": 9.974948861326436e-06, + "loss": 0.5004, + "step": 404 + }, + { + "epoch": 0.06, + "grad_norm": 4.823715892383603, + "learning_rate": 9.974704070662254e-06, + "loss": 0.5701, + "step": 405 + }, + { + "epoch": 0.06, + "grad_norm": 4.669098554552117, + "learning_rate": 9.974458092836121e-06, + "loss": 0.626, + "step": 406 + }, + { + "epoch": 0.06, + "grad_norm": 7.403529086133226, + "learning_rate": 9.974210927906737e-06, + "loss": 0.6251, + "step": 407 + }, + { + "epoch": 0.06, + "grad_norm": 19.085284398792762, + "learning_rate": 9.973962575933087e-06, + "loss": 0.5815, + "step": 408 + }, + { + "epoch": 0.06, + "grad_norm": 5.675478786974509, + "learning_rate": 9.973713036974438e-06, + "loss": 0.56, + "step": 409 + }, + { + "epoch": 0.06, + "grad_norm": 24.41717595984447, + "learning_rate": 9.973462311090336e-06, + "loss": 0.5049, + "step": 410 + }, + { + "epoch": 0.06, + "grad_norm": 3.9275479334233583, + "learning_rate": 9.973210398340619e-06, + "loss": 0.5306, + "step": 411 + }, + { + "epoch": 0.06, + "grad_norm": 5.740054884165736, + "learning_rate": 9.972957298785397e-06, + "loss": 0.5695, + "step": 412 + }, + { + "epoch": 0.06, + "grad_norm": 3.886581944733458, + "learning_rate": 9.972703012485078e-06, + "loss": 0.5416, + "step": 413 + }, + { + "epoch": 0.06, + "grad_norm": 3.451230677600706, + "learning_rate": 9.97244753950034e-06, + "loss": 0.5513, + "step": 414 + }, + { + "epoch": 0.06, + "grad_norm": 4.4106616327947785, + "learning_rate": 9.972190879892147e-06, + "loss": 0.5703, + "step": 415 + }, + { + "epoch": 0.06, + "grad_norm": 6.2227033393436075, + "learning_rate": 9.971933033721753e-06, + "loss": 0.4569, + "step": 416 + }, + { + "epoch": 0.06, + "grad_norm": 1.7364138065420724, + "learning_rate": 9.971674001050687e-06, + "loss": 0.6168, + "step": 417 + }, + { + "epoch": 0.06, + "grad_norm": 3.8710080238062123, + "learning_rate": 9.971413781940765e-06, + "loss": 0.5538, + "step": 418 + }, + { + "epoch": 0.06, + "grad_norm": 4.318556608870995, + "learning_rate": 9.971152376454087e-06, + "loss": 0.5485, + "step": 419 + }, + { + "epoch": 0.06, + "grad_norm": 4.719307350175902, + "learning_rate": 9.970889784653034e-06, + "loss": 0.5245, + "step": 420 + }, + { + "epoch": 0.06, + "grad_norm": 4.7330916466222375, + "learning_rate": 9.97062600660027e-06, + "loss": 0.6126, + "step": 421 + }, + { + "epoch": 0.06, + "grad_norm": 5.08925692657126, + "learning_rate": 9.970361042358744e-06, + "loss": 0.5518, + "step": 422 + }, + { + "epoch": 0.06, + "grad_norm": 10.16970228245275, + "learning_rate": 9.970094891991687e-06, + "loss": 0.5307, + "step": 423 + }, + { + "epoch": 0.06, + "grad_norm": 3.8627664718348136, + "learning_rate": 9.969827555562613e-06, + "loss": 0.4506, + "step": 424 + }, + { + "epoch": 0.06, + "grad_norm": 1.2947397616707408, + "learning_rate": 9.969559033135319e-06, + "loss": 0.5967, + "step": 425 + }, + { + "epoch": 0.06, + "grad_norm": 3.6201928864767106, + "learning_rate": 9.969289324773883e-06, + "loss": 0.5736, + "step": 426 + }, + { + "epoch": 0.06, + "grad_norm": 4.815248401333858, + "learning_rate": 9.969018430542671e-06, + "loss": 0.5802, + "step": 427 + }, + { + "epoch": 0.06, + "grad_norm": 4.004485568750944, + "learning_rate": 9.968746350506329e-06, + "loss": 0.5775, + "step": 428 + }, + { + "epoch": 0.06, + "grad_norm": 7.704335190330221, + "learning_rate": 9.968473084729784e-06, + "loss": 0.5376, + "step": 429 + }, + { + "epoch": 0.06, + "grad_norm": 4.9861569760139535, + "learning_rate": 9.96819863327825e-06, + "loss": 0.6122, + "step": 430 + }, + { + "epoch": 0.07, + "grad_norm": 4.425122906174862, + "learning_rate": 9.967922996217221e-06, + "loss": 0.5331, + "step": 431 + }, + { + "epoch": 0.07, + "grad_norm": 5.041352599640688, + "learning_rate": 9.967646173612474e-06, + "loss": 0.5604, + "step": 432 + }, + { + "epoch": 0.07, + "grad_norm": 4.676873179811385, + "learning_rate": 9.967368165530073e-06, + "loss": 0.5242, + "step": 433 + }, + { + "epoch": 0.07, + "grad_norm": 1.5229882995413666, + "learning_rate": 9.967088972036359e-06, + "loss": 0.5934, + "step": 434 + }, + { + "epoch": 0.07, + "grad_norm": 7.468033042033904, + "learning_rate": 9.966808593197959e-06, + "loss": 0.5668, + "step": 435 + }, + { + "epoch": 0.07, + "grad_norm": 4.660991670922934, + "learning_rate": 9.966527029081784e-06, + "loss": 0.5903, + "step": 436 + }, + { + "epoch": 0.07, + "grad_norm": 6.027582230620482, + "learning_rate": 9.966244279755025e-06, + "loss": 0.5132, + "step": 437 + }, + { + "epoch": 0.07, + "grad_norm": 3.5369347748644797, + "learning_rate": 9.965960345285159e-06, + "loss": 0.5666, + "step": 438 + }, + { + "epoch": 0.07, + "grad_norm": 4.919423928073976, + "learning_rate": 9.965675225739942e-06, + "loss": 0.4819, + "step": 439 + }, + { + "epoch": 0.07, + "grad_norm": 3.4651246232001522, + "learning_rate": 9.965388921187414e-06, + "loss": 0.6452, + "step": 440 + }, + { + "epoch": 0.07, + "grad_norm": 1.388608256521621, + "learning_rate": 9.9651014316959e-06, + "loss": 0.5998, + "step": 441 + }, + { + "epoch": 0.07, + "grad_norm": 4.053078232940991, + "learning_rate": 9.964812757334007e-06, + "loss": 0.4974, + "step": 442 + }, + { + "epoch": 0.07, + "grad_norm": 4.123262357535189, + "learning_rate": 9.964522898170626e-06, + "loss": 0.5628, + "step": 443 + }, + { + "epoch": 0.07, + "grad_norm": 5.539523724774265, + "learning_rate": 9.964231854274927e-06, + "loss": 0.5831, + "step": 444 + }, + { + "epoch": 0.07, + "grad_norm": 3.9495433062629757, + "learning_rate": 9.963939625716362e-06, + "loss": 0.5828, + "step": 445 + }, + { + "epoch": 0.07, + "grad_norm": 4.135710610667281, + "learning_rate": 9.963646212564671e-06, + "loss": 0.6071, + "step": 446 + }, + { + "epoch": 0.07, + "grad_norm": 4.32678915276538, + "learning_rate": 9.963351614889873e-06, + "loss": 0.5225, + "step": 447 + }, + { + "epoch": 0.07, + "grad_norm": 4.612260867483246, + "learning_rate": 9.963055832762271e-06, + "loss": 0.4865, + "step": 448 + }, + { + "epoch": 0.07, + "grad_norm": 1.3523418741101116, + "learning_rate": 9.962758866252452e-06, + "loss": 0.5919, + "step": 449 + }, + { + "epoch": 0.07, + "grad_norm": 4.143943921614427, + "learning_rate": 9.962460715431284e-06, + "loss": 0.5693, + "step": 450 + }, + { + "epoch": 0.07, + "grad_norm": 5.128468274509935, + "learning_rate": 9.962161380369914e-06, + "loss": 0.5716, + "step": 451 + }, + { + "epoch": 0.07, + "grad_norm": 5.788152077789797, + "learning_rate": 9.96186086113978e-06, + "loss": 0.5488, + "step": 452 + }, + { + "epoch": 0.07, + "grad_norm": 4.080940897498937, + "learning_rate": 9.961559157812595e-06, + "loss": 0.5785, + "step": 453 + }, + { + "epoch": 0.07, + "grad_norm": 4.472735791366464, + "learning_rate": 9.961256270460356e-06, + "loss": 0.4859, + "step": 454 + }, + { + "epoch": 0.07, + "grad_norm": 3.903287430622969, + "learning_rate": 9.960952199155347e-06, + "loss": 0.5579, + "step": 455 + }, + { + "epoch": 0.07, + "grad_norm": 5.984449748097196, + "learning_rate": 9.960646943970129e-06, + "loss": 0.5159, + "step": 456 + }, + { + "epoch": 0.07, + "grad_norm": 3.966278326217929, + "learning_rate": 9.960340504977551e-06, + "loss": 0.5584, + "step": 457 + }, + { + "epoch": 0.07, + "grad_norm": 5.650412649797251, + "learning_rate": 9.960032882250738e-06, + "loss": 0.5129, + "step": 458 + }, + { + "epoch": 0.07, + "grad_norm": 4.39622849374572, + "learning_rate": 9.959724075863104e-06, + "loss": 0.5258, + "step": 459 + }, + { + "epoch": 0.07, + "grad_norm": 9.82484788426156, + "learning_rate": 9.959414085888341e-06, + "loss": 0.5558, + "step": 460 + }, + { + "epoch": 0.07, + "grad_norm": 3.517670970751855, + "learning_rate": 9.959102912400424e-06, + "loss": 0.5234, + "step": 461 + }, + { + "epoch": 0.07, + "grad_norm": 5.241218754059866, + "learning_rate": 9.958790555473613e-06, + "loss": 0.575, + "step": 462 + }, + { + "epoch": 0.07, + "grad_norm": 5.189469831714092, + "learning_rate": 9.95847701518245e-06, + "loss": 0.518, + "step": 463 + }, + { + "epoch": 0.07, + "grad_norm": 3.7829893942251958, + "learning_rate": 9.958162291601756e-06, + "loss": 0.5785, + "step": 464 + }, + { + "epoch": 0.07, + "grad_norm": 15.14139924867607, + "learning_rate": 9.957846384806637e-06, + "loss": 0.578, + "step": 465 + }, + { + "epoch": 0.07, + "grad_norm": 5.371977540454606, + "learning_rate": 9.95752929487248e-06, + "loss": 0.5256, + "step": 466 + }, + { + "epoch": 0.07, + "grad_norm": 3.91306727990095, + "learning_rate": 9.957211021874956e-06, + "loss": 0.5204, + "step": 467 + }, + { + "epoch": 0.07, + "grad_norm": 11.762184003956692, + "learning_rate": 9.956891565890018e-06, + "loss": 0.5162, + "step": 468 + }, + { + "epoch": 0.07, + "grad_norm": 37.036080451162356, + "learning_rate": 9.956570926993901e-06, + "loss": 0.6089, + "step": 469 + }, + { + "epoch": 0.07, + "grad_norm": 4.176496312557425, + "learning_rate": 9.95624910526312e-06, + "loss": 0.5389, + "step": 470 + }, + { + "epoch": 0.07, + "grad_norm": 4.7558850724783435, + "learning_rate": 9.95592610077448e-06, + "loss": 0.5073, + "step": 471 + }, + { + "epoch": 0.07, + "grad_norm": 4.022910849061221, + "learning_rate": 9.955601913605056e-06, + "loss": 0.569, + "step": 472 + }, + { + "epoch": 0.07, + "grad_norm": 5.875416740436149, + "learning_rate": 9.955276543832215e-06, + "loss": 0.5834, + "step": 473 + }, + { + "epoch": 0.07, + "grad_norm": 3.587277400956277, + "learning_rate": 9.954949991533604e-06, + "loss": 0.5381, + "step": 474 + }, + { + "epoch": 0.07, + "grad_norm": 5.592422517923324, + "learning_rate": 9.95462225678715e-06, + "loss": 0.5224, + "step": 475 + }, + { + "epoch": 0.07, + "grad_norm": 4.5018458506338, + "learning_rate": 9.954293339671064e-06, + "loss": 0.4911, + "step": 476 + }, + { + "epoch": 0.07, + "grad_norm": 9.7353185470682, + "learning_rate": 9.953963240263837e-06, + "loss": 0.5976, + "step": 477 + }, + { + "epoch": 0.07, + "grad_norm": 8.194882887644598, + "learning_rate": 9.953631958644248e-06, + "loss": 0.5558, + "step": 478 + }, + { + "epoch": 0.07, + "grad_norm": 7.27504738362757, + "learning_rate": 9.95329949489135e-06, + "loss": 0.6246, + "step": 479 + }, + { + "epoch": 0.07, + "grad_norm": 9.69488845848846, + "learning_rate": 9.952965849084483e-06, + "loss": 0.5306, + "step": 480 + }, + { + "epoch": 0.07, + "grad_norm": 9.416052859208481, + "learning_rate": 9.95263102130327e-06, + "loss": 0.5838, + "step": 481 + }, + { + "epoch": 0.07, + "grad_norm": 21.10202108481693, + "learning_rate": 9.95229501162761e-06, + "loss": 0.5465, + "step": 482 + }, + { + "epoch": 0.07, + "grad_norm": 7.238713550155264, + "learning_rate": 9.951957820137694e-06, + "loss": 0.5924, + "step": 483 + }, + { + "epoch": 0.07, + "grad_norm": 5.8693431581359015, + "learning_rate": 9.951619446913985e-06, + "loss": 0.4921, + "step": 484 + }, + { + "epoch": 0.07, + "grad_norm": 4.442278284934623, + "learning_rate": 9.951279892037234e-06, + "loss": 0.5607, + "step": 485 + }, + { + "epoch": 0.07, + "grad_norm": 6.303385560990992, + "learning_rate": 9.95093915558847e-06, + "loss": 0.4815, + "step": 486 + }, + { + "epoch": 0.07, + "grad_norm": 5.889007228039267, + "learning_rate": 9.95059723764901e-06, + "loss": 0.5399, + "step": 487 + }, + { + "epoch": 0.07, + "grad_norm": 6.484839171670657, + "learning_rate": 9.950254138300447e-06, + "loss": 0.5257, + "step": 488 + }, + { + "epoch": 0.07, + "grad_norm": 5.2829726469629525, + "learning_rate": 9.949909857624658e-06, + "loss": 0.5247, + "step": 489 + }, + { + "epoch": 0.07, + "grad_norm": 1.3916596577356257, + "learning_rate": 9.949564395703803e-06, + "loss": 0.5987, + "step": 490 + }, + { + "epoch": 0.07, + "grad_norm": 4.54294780997308, + "learning_rate": 9.949217752620322e-06, + "loss": 0.4398, + "step": 491 + }, + { + "epoch": 0.07, + "grad_norm": 5.430483003061391, + "learning_rate": 9.94886992845694e-06, + "loss": 0.4554, + "step": 492 + }, + { + "epoch": 0.07, + "grad_norm": 9.17501578410412, + "learning_rate": 9.948520923296656e-06, + "loss": 0.5612, + "step": 493 + }, + { + "epoch": 0.07, + "grad_norm": 4.210116717247629, + "learning_rate": 9.948170737222763e-06, + "loss": 0.4608, + "step": 494 + }, + { + "epoch": 0.07, + "grad_norm": 4.839233500167328, + "learning_rate": 9.947819370318825e-06, + "loss": 0.5149, + "step": 495 + }, + { + "epoch": 0.07, + "grad_norm": 6.312165482182679, + "learning_rate": 9.947466822668696e-06, + "loss": 0.5531, + "step": 496 + }, + { + "epoch": 0.07, + "grad_norm": 6.142061200802827, + "learning_rate": 9.947113094356505e-06, + "loss": 0.5297, + "step": 497 + }, + { + "epoch": 0.08, + "grad_norm": 5.761292045141527, + "learning_rate": 9.946758185466666e-06, + "loss": 0.4934, + "step": 498 + }, + { + "epoch": 0.08, + "grad_norm": 10.403910455524825, + "learning_rate": 9.946402096083874e-06, + "loss": 0.5157, + "step": 499 + }, + { + "epoch": 0.08, + "grad_norm": 4.405641677429245, + "learning_rate": 9.946044826293106e-06, + "loss": 0.5634, + "step": 500 + }, + { + "epoch": 0.08, + "grad_norm": 3.8194310098991187, + "learning_rate": 9.945686376179624e-06, + "loss": 0.5445, + "step": 501 + }, + { + "epoch": 0.08, + "grad_norm": 38.425864109082276, + "learning_rate": 9.945326745828963e-06, + "loss": 0.5513, + "step": 502 + }, + { + "epoch": 0.08, + "grad_norm": 4.9515881552304775, + "learning_rate": 9.94496593532695e-06, + "loss": 0.5305, + "step": 503 + }, + { + "epoch": 0.08, + "grad_norm": 4.493571976371911, + "learning_rate": 9.944603944759685e-06, + "loss": 0.574, + "step": 504 + }, + { + "epoch": 0.08, + "grad_norm": 3.82799872516371, + "learning_rate": 9.944240774213557e-06, + "loss": 0.5904, + "step": 505 + }, + { + "epoch": 0.08, + "grad_norm": 5.2276612841893595, + "learning_rate": 9.94387642377523e-06, + "loss": 0.5128, + "step": 506 + }, + { + "epoch": 0.08, + "grad_norm": 5.702210728139473, + "learning_rate": 9.943510893531652e-06, + "loss": 0.5199, + "step": 507 + }, + { + "epoch": 0.08, + "grad_norm": 4.794873692784183, + "learning_rate": 9.943144183570058e-06, + "loss": 0.5198, + "step": 508 + }, + { + "epoch": 0.08, + "grad_norm": 3.4072298689375455, + "learning_rate": 9.942776293977951e-06, + "loss": 0.4993, + "step": 509 + }, + { + "epoch": 0.08, + "grad_norm": 3.332745324166785, + "learning_rate": 9.942407224843133e-06, + "loss": 0.5545, + "step": 510 + }, + { + "epoch": 0.08, + "grad_norm": 5.55004954134649, + "learning_rate": 9.942036976253672e-06, + "loss": 0.5997, + "step": 511 + }, + { + "epoch": 0.08, + "grad_norm": 4.206188009168057, + "learning_rate": 9.94166554829793e-06, + "loss": 0.5935, + "step": 512 + }, + { + "epoch": 0.08, + "grad_norm": 1.4444611292914382, + "learning_rate": 9.941292941064537e-06, + "loss": 0.6056, + "step": 513 + }, + { + "epoch": 0.08, + "grad_norm": 4.7204819579365225, + "learning_rate": 9.94091915464242e-06, + "loss": 0.4414, + "step": 514 + }, + { + "epoch": 0.08, + "grad_norm": 1.402768022663327, + "learning_rate": 9.940544189120771e-06, + "loss": 0.584, + "step": 515 + }, + { + "epoch": 0.08, + "grad_norm": 4.35147570407809, + "learning_rate": 9.940168044589077e-06, + "loss": 0.5615, + "step": 516 + }, + { + "epoch": 0.08, + "grad_norm": 6.49686249047898, + "learning_rate": 9.939790721137102e-06, + "loss": 0.5217, + "step": 517 + }, + { + "epoch": 0.08, + "grad_norm": 6.925563072629076, + "learning_rate": 9.939412218854888e-06, + "loss": 0.5512, + "step": 518 + }, + { + "epoch": 0.08, + "grad_norm": 3.9920214507875516, + "learning_rate": 9.93903253783276e-06, + "loss": 0.5575, + "step": 519 + }, + { + "epoch": 0.08, + "grad_norm": 3.6918470588438503, + "learning_rate": 9.938651678161326e-06, + "loss": 0.5752, + "step": 520 + }, + { + "epoch": 0.08, + "grad_norm": 4.177063291589921, + "learning_rate": 9.938269639931475e-06, + "loss": 0.4518, + "step": 521 + }, + { + "epoch": 0.08, + "grad_norm": 3.2388400852759354, + "learning_rate": 9.937886423234375e-06, + "loss": 0.5069, + "step": 522 + }, + { + "epoch": 0.08, + "grad_norm": 5.231148903388963, + "learning_rate": 9.937502028161477e-06, + "loss": 0.565, + "step": 523 + }, + { + "epoch": 0.08, + "grad_norm": 2.741163756474268, + "learning_rate": 9.937116454804514e-06, + "loss": 0.5567, + "step": 524 + }, + { + "epoch": 0.08, + "grad_norm": 7.810736561247086, + "learning_rate": 9.936729703255498e-06, + "loss": 0.5338, + "step": 525 + }, + { + "epoch": 0.08, + "grad_norm": 3.9573468748875085, + "learning_rate": 9.936341773606723e-06, + "loss": 0.5974, + "step": 526 + }, + { + "epoch": 0.08, + "grad_norm": 3.1815857805896552, + "learning_rate": 9.935952665950767e-06, + "loss": 0.5748, + "step": 527 + }, + { + "epoch": 0.08, + "grad_norm": 7.2078161301120725, + "learning_rate": 9.935562380380484e-06, + "loss": 0.6156, + "step": 528 + }, + { + "epoch": 0.08, + "grad_norm": 2.885381572037831, + "learning_rate": 9.935170916989013e-06, + "loss": 0.5052, + "step": 529 + }, + { + "epoch": 0.08, + "grad_norm": 4.010733199988919, + "learning_rate": 9.93477827586977e-06, + "loss": 0.5554, + "step": 530 + }, + { + "epoch": 0.08, + "grad_norm": 6.906623802002172, + "learning_rate": 9.934384457116457e-06, + "loss": 0.5387, + "step": 531 + }, + { + "epoch": 0.08, + "grad_norm": 5.742285914933382, + "learning_rate": 9.933989460823056e-06, + "loss": 0.5041, + "step": 532 + }, + { + "epoch": 0.08, + "grad_norm": 8.579152600869417, + "learning_rate": 9.933593287083827e-06, + "loss": 0.4479, + "step": 533 + }, + { + "epoch": 0.08, + "grad_norm": 3.94041457861351, + "learning_rate": 9.933195935993312e-06, + "loss": 0.4963, + "step": 534 + }, + { + "epoch": 0.08, + "grad_norm": 2.1186838038708187, + "learning_rate": 9.932797407646337e-06, + "loss": 0.6729, + "step": 535 + }, + { + "epoch": 0.08, + "grad_norm": 4.756980529013317, + "learning_rate": 9.932397702138006e-06, + "loss": 0.5748, + "step": 536 + }, + { + "epoch": 0.08, + "grad_norm": 3.8141388610014735, + "learning_rate": 9.931996819563702e-06, + "loss": 0.5499, + "step": 537 + }, + { + "epoch": 0.08, + "grad_norm": 5.927671633192791, + "learning_rate": 9.931594760019096e-06, + "loss": 0.5137, + "step": 538 + }, + { + "epoch": 0.08, + "grad_norm": 5.027226445048273, + "learning_rate": 9.931191523600133e-06, + "loss": 0.5611, + "step": 539 + }, + { + "epoch": 0.08, + "grad_norm": 5.139606981287645, + "learning_rate": 9.930787110403041e-06, + "loss": 0.5127, + "step": 540 + }, + { + "epoch": 0.08, + "grad_norm": 4.8469075994415, + "learning_rate": 9.930381520524328e-06, + "loss": 0.4482, + "step": 541 + }, + { + "epoch": 0.08, + "grad_norm": 4.573927185515536, + "learning_rate": 9.929974754060787e-06, + "loss": 0.5306, + "step": 542 + }, + { + "epoch": 0.08, + "grad_norm": 8.778324482664255, + "learning_rate": 9.929566811109486e-06, + "loss": 0.527, + "step": 543 + }, + { + "epoch": 0.08, + "grad_norm": 4.368202506906533, + "learning_rate": 9.929157691767778e-06, + "loss": 0.586, + "step": 544 + }, + { + "epoch": 0.08, + "grad_norm": 1.5497527803665485, + "learning_rate": 9.928747396133293e-06, + "loss": 0.5883, + "step": 545 + }, + { + "epoch": 0.08, + "grad_norm": 4.601161511377555, + "learning_rate": 9.928335924303947e-06, + "loss": 0.6103, + "step": 546 + }, + { + "epoch": 0.08, + "grad_norm": 5.333273134443389, + "learning_rate": 9.927923276377931e-06, + "loss": 0.5505, + "step": 547 + }, + { + "epoch": 0.08, + "grad_norm": 7.572097925871523, + "learning_rate": 9.927509452453719e-06, + "loss": 0.5195, + "step": 548 + }, + { + "epoch": 0.08, + "grad_norm": 6.007479301700096, + "learning_rate": 9.927094452630069e-06, + "loss": 0.5682, + "step": 549 + }, + { + "epoch": 0.08, + "grad_norm": 8.43888721008406, + "learning_rate": 9.92667827700601e-06, + "loss": 0.5791, + "step": 550 + }, + { + "epoch": 0.08, + "grad_norm": 5.669373539947307, + "learning_rate": 9.926260925680867e-06, + "loss": 0.5931, + "step": 551 + }, + { + "epoch": 0.08, + "grad_norm": 4.35161640969527, + "learning_rate": 9.925842398754228e-06, + "loss": 0.5498, + "step": 552 + }, + { + "epoch": 0.08, + "grad_norm": 4.821031768997714, + "learning_rate": 9.925422696325976e-06, + "loss": 0.4989, + "step": 553 + }, + { + "epoch": 0.08, + "grad_norm": 4.803093896117229, + "learning_rate": 9.925001818496265e-06, + "loss": 0.495, + "step": 554 + }, + { + "epoch": 0.08, + "grad_norm": 3.7372122948306594, + "learning_rate": 9.924579765365536e-06, + "loss": 0.4642, + "step": 555 + }, + { + "epoch": 0.08, + "grad_norm": 7.22790687064754, + "learning_rate": 9.924156537034506e-06, + "loss": 0.5745, + "step": 556 + }, + { + "epoch": 0.08, + "grad_norm": 6.885752461260323, + "learning_rate": 9.923732133604176e-06, + "loss": 0.5384, + "step": 557 + }, + { + "epoch": 0.08, + "grad_norm": 5.185110929736898, + "learning_rate": 9.923306555175823e-06, + "loss": 0.5681, + "step": 558 + }, + { + "epoch": 0.08, + "grad_norm": 5.598286716656606, + "learning_rate": 9.922879801851006e-06, + "loss": 0.5891, + "step": 559 + }, + { + "epoch": 0.08, + "grad_norm": 5.810599452571767, + "learning_rate": 9.92245187373157e-06, + "loss": 0.5219, + "step": 560 + }, + { + "epoch": 0.08, + "grad_norm": 5.724038116973454, + "learning_rate": 9.92202277091963e-06, + "loss": 0.5384, + "step": 561 + }, + { + "epoch": 0.08, + "grad_norm": 4.441695506213235, + "learning_rate": 9.921592493517594e-06, + "loss": 0.5176, + "step": 562 + }, + { + "epoch": 0.08, + "grad_norm": 4.924128962116078, + "learning_rate": 9.921161041628136e-06, + "loss": 0.5723, + "step": 563 + }, + { + "epoch": 0.09, + "grad_norm": 12.46739574351716, + "learning_rate": 9.92072841535422e-06, + "loss": 0.5612, + "step": 564 + }, + { + "epoch": 0.09, + "grad_norm": 4.427351452341228, + "learning_rate": 9.92029461479909e-06, + "loss": 0.5887, + "step": 565 + }, + { + "epoch": 0.09, + "grad_norm": 7.532083896994779, + "learning_rate": 9.919859640066267e-06, + "loss": 0.5178, + "step": 566 + }, + { + "epoch": 0.09, + "grad_norm": 5.1371605887183955, + "learning_rate": 9.919423491259551e-06, + "loss": 0.5823, + "step": 567 + }, + { + "epoch": 0.09, + "grad_norm": 5.184819561892654, + "learning_rate": 9.918986168483027e-06, + "loss": 0.5019, + "step": 568 + }, + { + "epoch": 0.09, + "grad_norm": 7.210768141780824, + "learning_rate": 9.918547671841057e-06, + "loss": 0.4505, + "step": 569 + }, + { + "epoch": 0.09, + "grad_norm": 3.9585646183745014, + "learning_rate": 9.918108001438283e-06, + "loss": 0.524, + "step": 570 + }, + { + "epoch": 0.09, + "grad_norm": 5.23145460828766, + "learning_rate": 9.91766715737963e-06, + "loss": 0.5179, + "step": 571 + }, + { + "epoch": 0.09, + "grad_norm": 7.109189185774883, + "learning_rate": 9.917225139770296e-06, + "loss": 0.5237, + "step": 572 + }, + { + "epoch": 0.09, + "grad_norm": 4.53740489178462, + "learning_rate": 9.91678194871577e-06, + "loss": 0.5456, + "step": 573 + }, + { + "epoch": 0.09, + "grad_norm": 3.625836491341964, + "learning_rate": 9.916337584321811e-06, + "loss": 0.5439, + "step": 574 + }, + { + "epoch": 0.09, + "grad_norm": 4.5869139235426974, + "learning_rate": 9.915892046694464e-06, + "loss": 0.4693, + "step": 575 + }, + { + "epoch": 0.09, + "grad_norm": 5.531480559045545, + "learning_rate": 9.91544533594005e-06, + "loss": 0.4974, + "step": 576 + }, + { + "epoch": 0.09, + "grad_norm": 7.3007762464713, + "learning_rate": 9.914997452165174e-06, + "loss": 0.5189, + "step": 577 + }, + { + "epoch": 0.09, + "grad_norm": 5.544427727020317, + "learning_rate": 9.914548395476716e-06, + "loss": 0.4669, + "step": 578 + }, + { + "epoch": 0.09, + "grad_norm": 2.8801104744521044, + "learning_rate": 9.914098165981842e-06, + "loss": 0.5696, + "step": 579 + }, + { + "epoch": 0.09, + "grad_norm": 7.675778780433523, + "learning_rate": 9.913646763787993e-06, + "loss": 0.4808, + "step": 580 + }, + { + "epoch": 0.09, + "grad_norm": 3.1657480701779526, + "learning_rate": 9.913194189002892e-06, + "loss": 0.4516, + "step": 581 + }, + { + "epoch": 0.09, + "grad_norm": 6.6081283291210084, + "learning_rate": 9.912740441734541e-06, + "loss": 0.5338, + "step": 582 + }, + { + "epoch": 0.09, + "grad_norm": 2.745027646474163, + "learning_rate": 9.912285522091223e-06, + "loss": 0.4911, + "step": 583 + }, + { + "epoch": 0.09, + "grad_norm": 11.626144645448106, + "learning_rate": 9.911829430181496e-06, + "loss": 0.6033, + "step": 584 + }, + { + "epoch": 0.09, + "grad_norm": 5.651174341401112, + "learning_rate": 9.911372166114208e-06, + "loss": 0.5535, + "step": 585 + }, + { + "epoch": 0.09, + "grad_norm": 2.7779520848639043, + "learning_rate": 9.910913729998477e-06, + "loss": 0.534, + "step": 586 + }, + { + "epoch": 0.09, + "grad_norm": 4.423725218344855, + "learning_rate": 9.910454121943702e-06, + "loss": 0.5701, + "step": 587 + }, + { + "epoch": 0.09, + "grad_norm": 2.5759692673081775, + "learning_rate": 9.909993342059567e-06, + "loss": 0.5489, + "step": 588 + }, + { + "epoch": 0.09, + "grad_norm": 5.332349137902474, + "learning_rate": 9.909531390456033e-06, + "loss": 0.5432, + "step": 589 + }, + { + "epoch": 0.09, + "grad_norm": 6.766215498944896, + "learning_rate": 9.909068267243336e-06, + "loss": 0.5496, + "step": 590 + }, + { + "epoch": 0.09, + "grad_norm": 4.083394180119648, + "learning_rate": 9.908603972531998e-06, + "loss": 0.4667, + "step": 591 + }, + { + "epoch": 0.09, + "grad_norm": 3.4094821376927027, + "learning_rate": 9.908138506432818e-06, + "loss": 0.4751, + "step": 592 + }, + { + "epoch": 0.09, + "grad_norm": 4.266811046766004, + "learning_rate": 9.907671869056877e-06, + "loss": 0.4794, + "step": 593 + }, + { + "epoch": 0.09, + "grad_norm": 6.5505237603017274, + "learning_rate": 9.907204060515527e-06, + "loss": 0.5629, + "step": 594 + }, + { + "epoch": 0.09, + "grad_norm": 3.5817292625362755, + "learning_rate": 9.906735080920414e-06, + "loss": 0.4803, + "step": 595 + }, + { + "epoch": 0.09, + "grad_norm": 5.6517680254580345, + "learning_rate": 9.906264930383448e-06, + "loss": 0.5156, + "step": 596 + }, + { + "epoch": 0.09, + "grad_norm": 4.343118404434846, + "learning_rate": 9.905793609016828e-06, + "loss": 0.5109, + "step": 597 + }, + { + "epoch": 0.09, + "grad_norm": 4.609198127219048, + "learning_rate": 9.905321116933031e-06, + "loss": 0.4311, + "step": 598 + }, + { + "epoch": 0.09, + "grad_norm": 5.5812105911319945, + "learning_rate": 9.904847454244812e-06, + "loss": 0.4916, + "step": 599 + }, + { + "epoch": 0.09, + "grad_norm": 3.925080604783244, + "learning_rate": 9.904372621065206e-06, + "loss": 0.5354, + "step": 600 + }, + { + "epoch": 0.09, + "grad_norm": 4.13565805887717, + "learning_rate": 9.903896617507526e-06, + "loss": 0.4798, + "step": 601 + }, + { + "epoch": 0.09, + "grad_norm": 4.125308376097846, + "learning_rate": 9.903419443685366e-06, + "loss": 0.493, + "step": 602 + }, + { + "epoch": 0.09, + "grad_norm": 4.084159587359883, + "learning_rate": 9.902941099712597e-06, + "loss": 0.5659, + "step": 603 + }, + { + "epoch": 0.09, + "grad_norm": 4.323557550991385, + "learning_rate": 9.902461585703375e-06, + "loss": 0.4837, + "step": 604 + }, + { + "epoch": 0.09, + "grad_norm": 4.535405732848617, + "learning_rate": 9.901980901772128e-06, + "loss": 0.6851, + "step": 605 + }, + { + "epoch": 0.09, + "grad_norm": 9.262480316198713, + "learning_rate": 9.901499048033565e-06, + "loss": 0.5067, + "step": 606 + }, + { + "epoch": 0.09, + "grad_norm": 4.34390922034351, + "learning_rate": 9.901016024602678e-06, + "loss": 0.4621, + "step": 607 + }, + { + "epoch": 0.09, + "grad_norm": 4.715226457537914, + "learning_rate": 9.900531831594736e-06, + "loss": 0.5174, + "step": 608 + }, + { + "epoch": 0.09, + "grad_norm": 2.7075486442286483, + "learning_rate": 9.900046469125282e-06, + "loss": 0.4409, + "step": 609 + }, + { + "epoch": 0.09, + "grad_norm": 4.904796834259952, + "learning_rate": 9.899559937310148e-06, + "loss": 0.5131, + "step": 610 + }, + { + "epoch": 0.09, + "grad_norm": 7.509641059876668, + "learning_rate": 9.89907223626544e-06, + "loss": 0.532, + "step": 611 + }, + { + "epoch": 0.09, + "grad_norm": 4.209679464684351, + "learning_rate": 9.898583366107539e-06, + "loss": 0.4677, + "step": 612 + }, + { + "epoch": 0.09, + "grad_norm": 5.933183484830707, + "learning_rate": 9.89809332695311e-06, + "loss": 0.4909, + "step": 613 + }, + { + "epoch": 0.09, + "grad_norm": 4.893680000922045, + "learning_rate": 9.897602118919098e-06, + "loss": 0.5391, + "step": 614 + }, + { + "epoch": 0.09, + "grad_norm": 6.656170294091131, + "learning_rate": 9.897109742122721e-06, + "loss": 0.5014, + "step": 615 + }, + { + "epoch": 0.09, + "grad_norm": 3.2500005642874297, + "learning_rate": 9.896616196681484e-06, + "loss": 0.4587, + "step": 616 + }, + { + "epoch": 0.09, + "grad_norm": 7.073729219471031, + "learning_rate": 9.896121482713164e-06, + "loss": 0.5421, + "step": 617 + }, + { + "epoch": 0.09, + "grad_norm": 3.646425931337879, + "learning_rate": 9.895625600335818e-06, + "loss": 0.5574, + "step": 618 + }, + { + "epoch": 0.09, + "grad_norm": 7.770649998881895, + "learning_rate": 9.895128549667787e-06, + "loss": 0.551, + "step": 619 + }, + { + "epoch": 0.09, + "grad_norm": 4.40653331681264, + "learning_rate": 9.894630330827687e-06, + "loss": 0.5449, + "step": 620 + }, + { + "epoch": 0.09, + "grad_norm": 9.047124561041556, + "learning_rate": 9.894130943934407e-06, + "loss": 0.5449, + "step": 621 + }, + { + "epoch": 0.09, + "grad_norm": 3.3979120619217724, + "learning_rate": 9.893630389107126e-06, + "loss": 0.5053, + "step": 622 + }, + { + "epoch": 0.09, + "grad_norm": 3.24392975104173, + "learning_rate": 9.893128666465294e-06, + "loss": 0.5408, + "step": 623 + }, + { + "epoch": 0.09, + "grad_norm": 4.312126626133832, + "learning_rate": 9.892625776128643e-06, + "loss": 0.5006, + "step": 624 + }, + { + "epoch": 0.09, + "grad_norm": 4.624726989020237, + "learning_rate": 9.892121718217182e-06, + "loss": 0.5051, + "step": 625 + }, + { + "epoch": 0.09, + "grad_norm": 3.5084826780752922, + "learning_rate": 9.8916164928512e-06, + "loss": 0.5617, + "step": 626 + }, + { + "epoch": 0.09, + "grad_norm": 6.08802545580933, + "learning_rate": 9.891110100151263e-06, + "loss": 0.4669, + "step": 627 + }, + { + "epoch": 0.09, + "grad_norm": 1.7991357765738936, + "learning_rate": 9.890602540238217e-06, + "loss": 0.6139, + "step": 628 + }, + { + "epoch": 0.09, + "grad_norm": 7.106317324372554, + "learning_rate": 9.890093813233186e-06, + "loss": 0.5279, + "step": 629 + }, + { + "epoch": 0.1, + "grad_norm": 7.388096239970439, + "learning_rate": 9.88958391925757e-06, + "loss": 0.5153, + "step": 630 + }, + { + "epoch": 0.1, + "grad_norm": 5.6288708344242675, + "learning_rate": 9.889072858433055e-06, + "loss": 0.5134, + "step": 631 + }, + { + "epoch": 0.1, + "grad_norm": 6.695461982141519, + "learning_rate": 9.888560630881595e-06, + "loss": 0.5242, + "step": 632 + }, + { + "epoch": 0.1, + "grad_norm": 30.0131411060209, + "learning_rate": 9.888047236725432e-06, + "loss": 0.581, + "step": 633 + }, + { + "epoch": 0.1, + "grad_norm": 6.606268323912727, + "learning_rate": 9.88753267608708e-06, + "loss": 0.4976, + "step": 634 + }, + { + "epoch": 0.1, + "grad_norm": 9.353250058895615, + "learning_rate": 9.887016949089334e-06, + "loss": 0.5704, + "step": 635 + }, + { + "epoch": 0.1, + "grad_norm": 7.635169200376543, + "learning_rate": 9.886500055855268e-06, + "loss": 0.5162, + "step": 636 + }, + { + "epoch": 0.1, + "grad_norm": 5.13072867816422, + "learning_rate": 9.885981996508231e-06, + "loss": 0.5224, + "step": 637 + }, + { + "epoch": 0.1, + "grad_norm": 16.54032089434867, + "learning_rate": 9.885462771171855e-06, + "loss": 0.492, + "step": 638 + }, + { + "epoch": 0.1, + "grad_norm": 10.758542400369512, + "learning_rate": 9.884942379970045e-06, + "loss": 0.4997, + "step": 639 + }, + { + "epoch": 0.1, + "grad_norm": 5.292338012969868, + "learning_rate": 9.88442082302699e-06, + "loss": 0.5088, + "step": 640 + }, + { + "epoch": 0.1, + "grad_norm": 4.176996531959761, + "learning_rate": 9.883898100467154e-06, + "loss": 0.508, + "step": 641 + }, + { + "epoch": 0.1, + "grad_norm": 6.170658827741615, + "learning_rate": 9.883374212415275e-06, + "loss": 0.5805, + "step": 642 + }, + { + "epoch": 0.1, + "grad_norm": 16.08527631282028, + "learning_rate": 9.882849158996379e-06, + "loss": 0.5335, + "step": 643 + }, + { + "epoch": 0.1, + "grad_norm": 3.008707277307974, + "learning_rate": 9.882322940335763e-06, + "loss": 0.5142, + "step": 644 + }, + { + "epoch": 0.1, + "grad_norm": 5.623512705310939, + "learning_rate": 9.881795556558999e-06, + "loss": 0.5146, + "step": 645 + }, + { + "epoch": 0.1, + "grad_norm": 9.562073534618671, + "learning_rate": 9.88126700779195e-06, + "loss": 0.4305, + "step": 646 + }, + { + "epoch": 0.1, + "grad_norm": 8.690223869286257, + "learning_rate": 9.880737294160742e-06, + "loss": 0.5352, + "step": 647 + }, + { + "epoch": 0.1, + "grad_norm": 6.053045750281363, + "learning_rate": 9.88020641579179e-06, + "loss": 0.5143, + "step": 648 + }, + { + "epoch": 0.1, + "grad_norm": 6.601959324167704, + "learning_rate": 9.87967437281178e-06, + "loss": 0.601, + "step": 649 + }, + { + "epoch": 0.1, + "grad_norm": 11.558555880657527, + "learning_rate": 9.87914116534768e-06, + "loss": 0.5101, + "step": 650 + }, + { + "epoch": 0.1, + "grad_norm": 12.502418823271269, + "learning_rate": 9.878606793526734e-06, + "loss": 0.5924, + "step": 651 + }, + { + "epoch": 0.1, + "grad_norm": 9.04391956463187, + "learning_rate": 9.878071257476461e-06, + "loss": 0.5547, + "step": 652 + }, + { + "epoch": 0.1, + "grad_norm": 14.458689757815653, + "learning_rate": 9.877534557324667e-06, + "loss": 0.5453, + "step": 653 + }, + { + "epoch": 0.1, + "grad_norm": 20.770908660485713, + "learning_rate": 9.876996693199427e-06, + "loss": 0.5064, + "step": 654 + }, + { + "epoch": 0.1, + "grad_norm": 18.346453583587707, + "learning_rate": 9.876457665229097e-06, + "loss": 0.4713, + "step": 655 + }, + { + "epoch": 0.1, + "grad_norm": 18.4140593359916, + "learning_rate": 9.875917473542312e-06, + "loss": 0.5388, + "step": 656 + }, + { + "epoch": 0.1, + "grad_norm": 9.173438515812045, + "learning_rate": 9.87537611826798e-06, + "loss": 0.4746, + "step": 657 + }, + { + "epoch": 0.1, + "grad_norm": 8.30619313890996, + "learning_rate": 9.874833599535291e-06, + "loss": 0.5464, + "step": 658 + }, + { + "epoch": 0.1, + "grad_norm": 18.13300877582824, + "learning_rate": 9.874289917473714e-06, + "loss": 0.5992, + "step": 659 + }, + { + "epoch": 0.1, + "grad_norm": 11.320432046519848, + "learning_rate": 9.87374507221299e-06, + "loss": 0.5648, + "step": 660 + }, + { + "epoch": 0.1, + "grad_norm": 13.883790827000775, + "learning_rate": 9.873199063883143e-06, + "loss": 0.5144, + "step": 661 + }, + { + "epoch": 0.1, + "grad_norm": 10.45884275378918, + "learning_rate": 9.87265189261447e-06, + "loss": 0.4594, + "step": 662 + }, + { + "epoch": 0.1, + "grad_norm": 63.491235740310195, + "learning_rate": 9.87210355853755e-06, + "loss": 0.5031, + "step": 663 + }, + { + "epoch": 0.1, + "grad_norm": 9.430148460386038, + "learning_rate": 9.871554061783237e-06, + "loss": 0.5022, + "step": 664 + }, + { + "epoch": 0.1, + "grad_norm": 6.686431902582455, + "learning_rate": 9.871003402482659e-06, + "loss": 0.5012, + "step": 665 + }, + { + "epoch": 0.1, + "grad_norm": 2.0250635275888587, + "learning_rate": 9.870451580767231e-06, + "loss": 0.5774, + "step": 666 + }, + { + "epoch": 0.1, + "grad_norm": 8.979856779461892, + "learning_rate": 9.869898596768636e-06, + "loss": 0.5581, + "step": 667 + }, + { + "epoch": 0.1, + "grad_norm": 5.80957149558432, + "learning_rate": 9.86934445061884e-06, + "loss": 0.4554, + "step": 668 + }, + { + "epoch": 0.1, + "grad_norm": 5.988047936466621, + "learning_rate": 9.868789142450082e-06, + "loss": 0.483, + "step": 669 + }, + { + "epoch": 0.1, + "grad_norm": 9.979814305779161, + "learning_rate": 9.868232672394881e-06, + "loss": 0.5401, + "step": 670 + }, + { + "epoch": 0.1, + "grad_norm": 1.9421015399324086, + "learning_rate": 9.867675040586035e-06, + "loss": 0.6044, + "step": 671 + }, + { + "epoch": 0.1, + "grad_norm": 9.010016322414344, + "learning_rate": 9.867116247156614e-06, + "loss": 0.5579, + "step": 672 + }, + { + "epoch": 0.1, + "grad_norm": 5.781902249603746, + "learning_rate": 9.866556292239971e-06, + "loss": 0.5244, + "step": 673 + }, + { + "epoch": 0.1, + "grad_norm": 13.891502023559045, + "learning_rate": 9.86599517596973e-06, + "loss": 0.5633, + "step": 674 + }, + { + "epoch": 0.1, + "grad_norm": 13.743974019919886, + "learning_rate": 9.8654328984798e-06, + "loss": 0.6186, + "step": 675 + }, + { + "epoch": 0.1, + "grad_norm": 5.625371747978907, + "learning_rate": 9.864869459904363e-06, + "loss": 0.5263, + "step": 676 + }, + { + "epoch": 0.1, + "grad_norm": 8.043557936978345, + "learning_rate": 9.86430486037787e-06, + "loss": 0.5079, + "step": 677 + }, + { + "epoch": 0.1, + "grad_norm": 6.116079680258187, + "learning_rate": 9.863739100035067e-06, + "loss": 0.527, + "step": 678 + }, + { + "epoch": 0.1, + "grad_norm": 6.481538300642201, + "learning_rate": 9.863172179010961e-06, + "loss": 0.5342, + "step": 679 + }, + { + "epoch": 0.1, + "grad_norm": 6.961600898164992, + "learning_rate": 9.862604097440844e-06, + "loss": 0.5557, + "step": 680 + }, + { + "epoch": 0.1, + "grad_norm": 1.3280503018656313, + "learning_rate": 9.86203485546028e-06, + "loss": 0.5895, + "step": 681 + }, + { + "epoch": 0.1, + "grad_norm": 3.7210374051949073, + "learning_rate": 9.861464453205116e-06, + "loss": 0.5743, + "step": 682 + }, + { + "epoch": 0.1, + "grad_norm": 5.68813229866644, + "learning_rate": 9.860892890811471e-06, + "loss": 0.534, + "step": 683 + }, + { + "epoch": 0.1, + "grad_norm": 7.68134022468961, + "learning_rate": 9.860320168415743e-06, + "loss": 0.557, + "step": 684 + }, + { + "epoch": 0.1, + "grad_norm": 8.426027011654915, + "learning_rate": 9.859746286154607e-06, + "loss": 0.5494, + "step": 685 + }, + { + "epoch": 0.1, + "grad_norm": 8.475755033871641, + "learning_rate": 9.859171244165012e-06, + "loss": 0.5052, + "step": 686 + }, + { + "epoch": 0.1, + "grad_norm": 8.58400149547184, + "learning_rate": 9.858595042584187e-06, + "loss": 0.5089, + "step": 687 + }, + { + "epoch": 0.1, + "grad_norm": 5.313122634967122, + "learning_rate": 9.85801768154964e-06, + "loss": 0.4508, + "step": 688 + }, + { + "epoch": 0.1, + "grad_norm": 4.720476563801436, + "learning_rate": 9.857439161199145e-06, + "loss": 0.54, + "step": 689 + }, + { + "epoch": 0.1, + "grad_norm": 4.813813923597374, + "learning_rate": 9.856859481670764e-06, + "loss": 0.5166, + "step": 690 + }, + { + "epoch": 0.1, + "grad_norm": 4.684267968911751, + "learning_rate": 9.856278643102832e-06, + "loss": 0.5784, + "step": 691 + }, + { + "epoch": 0.1, + "grad_norm": 4.029274110439998, + "learning_rate": 9.855696645633961e-06, + "loss": 0.5453, + "step": 692 + }, + { + "epoch": 0.1, + "grad_norm": 5.8839013487100775, + "learning_rate": 9.855113489403037e-06, + "loss": 0.4918, + "step": 693 + }, + { + "epoch": 0.1, + "grad_norm": 3.436241031380622, + "learning_rate": 9.854529174549223e-06, + "loss": 0.4878, + "step": 694 + }, + { + "epoch": 0.1, + "grad_norm": 15.02819330236376, + "learning_rate": 9.853943701211963e-06, + "loss": 0.4293, + "step": 695 + }, + { + "epoch": 0.1, + "grad_norm": 5.124831595735831, + "learning_rate": 9.853357069530971e-06, + "loss": 0.5196, + "step": 696 + }, + { + "epoch": 0.11, + "grad_norm": 10.52263192661969, + "learning_rate": 9.852769279646244e-06, + "loss": 0.4525, + "step": 697 + }, + { + "epoch": 0.11, + "grad_norm": 3.930252106173418, + "learning_rate": 9.852180331698049e-06, + "loss": 0.607, + "step": 698 + }, + { + "epoch": 0.11, + "grad_norm": 3.0712702208275253, + "learning_rate": 9.851590225826932e-06, + "loss": 0.5198, + "step": 699 + }, + { + "epoch": 0.11, + "grad_norm": 3.033243356219366, + "learning_rate": 9.85099896217372e-06, + "loss": 0.5763, + "step": 700 + }, + { + "epoch": 0.11, + "grad_norm": 9.71111428920958, + "learning_rate": 9.850406540879507e-06, + "loss": 0.5813, + "step": 701 + }, + { + "epoch": 0.11, + "grad_norm": 7.7961348610146874, + "learning_rate": 9.849812962085671e-06, + "loss": 0.5163, + "step": 702 + }, + { + "epoch": 0.11, + "grad_norm": 4.362292165708552, + "learning_rate": 9.849218225933863e-06, + "loss": 0.5099, + "step": 703 + }, + { + "epoch": 0.11, + "grad_norm": 3.2801737711113885, + "learning_rate": 9.84862233256601e-06, + "loss": 0.5309, + "step": 704 + }, + { + "epoch": 0.11, + "grad_norm": 6.220140805904522, + "learning_rate": 9.848025282124318e-06, + "loss": 0.5631, + "step": 705 + }, + { + "epoch": 0.11, + "grad_norm": 5.2775615041512305, + "learning_rate": 9.847427074751263e-06, + "loss": 0.5167, + "step": 706 + }, + { + "epoch": 0.11, + "grad_norm": 4.19893490380027, + "learning_rate": 9.846827710589603e-06, + "loss": 0.5367, + "step": 707 + }, + { + "epoch": 0.11, + "grad_norm": 4.4200877440363975, + "learning_rate": 9.846227189782373e-06, + "loss": 0.4694, + "step": 708 + }, + { + "epoch": 0.11, + "grad_norm": 4.620338343514876, + "learning_rate": 9.845625512472878e-06, + "loss": 0.6533, + "step": 709 + }, + { + "epoch": 0.11, + "grad_norm": 6.061990276536358, + "learning_rate": 9.845022678804701e-06, + "loss": 0.4643, + "step": 710 + }, + { + "epoch": 0.11, + "grad_norm": 5.207194936014409, + "learning_rate": 9.844418688921706e-06, + "loss": 0.4875, + "step": 711 + }, + { + "epoch": 0.11, + "grad_norm": 4.233681285735088, + "learning_rate": 9.843813542968026e-06, + "loss": 0.5611, + "step": 712 + }, + { + "epoch": 0.11, + "grad_norm": 5.616369076571687, + "learning_rate": 9.84320724108807e-06, + "loss": 0.6273, + "step": 713 + }, + { + "epoch": 0.11, + "grad_norm": 3.8340648055401685, + "learning_rate": 9.842599783426535e-06, + "loss": 0.5299, + "step": 714 + }, + { + "epoch": 0.11, + "grad_norm": 7.525619751874274, + "learning_rate": 9.841991170128374e-06, + "loss": 0.5209, + "step": 715 + }, + { + "epoch": 0.11, + "grad_norm": 6.314254955355116, + "learning_rate": 9.841381401338834e-06, + "loss": 0.5193, + "step": 716 + }, + { + "epoch": 0.11, + "grad_norm": 4.18568175879109, + "learning_rate": 9.840770477203425e-06, + "loss": 0.5226, + "step": 717 + }, + { + "epoch": 0.11, + "grad_norm": 4.677135827470922, + "learning_rate": 9.840158397867941e-06, + "loss": 0.4774, + "step": 718 + }, + { + "epoch": 0.11, + "grad_norm": 7.753441855106366, + "learning_rate": 9.839545163478447e-06, + "loss": 0.5458, + "step": 719 + }, + { + "epoch": 0.11, + "grad_norm": 3.203843810911018, + "learning_rate": 9.838930774181285e-06, + "loss": 0.4839, + "step": 720 + }, + { + "epoch": 0.11, + "grad_norm": 7.892317856373333, + "learning_rate": 9.838315230123075e-06, + "loss": 0.4867, + "step": 721 + }, + { + "epoch": 0.11, + "grad_norm": 8.31786553547229, + "learning_rate": 9.837698531450706e-06, + "loss": 0.5101, + "step": 722 + }, + { + "epoch": 0.11, + "grad_norm": 5.443977045226496, + "learning_rate": 9.83708067831135e-06, + "loss": 0.5572, + "step": 723 + }, + { + "epoch": 0.11, + "grad_norm": 5.16533402252394, + "learning_rate": 9.83646167085245e-06, + "loss": 0.5338, + "step": 724 + }, + { + "epoch": 0.11, + "grad_norm": 8.887288758018006, + "learning_rate": 9.835841509221726e-06, + "loss": 0.6768, + "step": 725 + }, + { + "epoch": 0.11, + "grad_norm": 3.594369636005012, + "learning_rate": 9.835220193567174e-06, + "loss": 0.5006, + "step": 726 + }, + { + "epoch": 0.11, + "grad_norm": 7.51957434757884, + "learning_rate": 9.834597724037064e-06, + "loss": 0.5504, + "step": 727 + }, + { + "epoch": 0.11, + "grad_norm": 4.454462066223111, + "learning_rate": 9.833974100779941e-06, + "loss": 0.5405, + "step": 728 + }, + { + "epoch": 0.11, + "grad_norm": 5.064262643929269, + "learning_rate": 9.83334932394463e-06, + "loss": 0.4458, + "step": 729 + }, + { + "epoch": 0.11, + "grad_norm": 3.752775898192587, + "learning_rate": 9.832723393680222e-06, + "loss": 0.4601, + "step": 730 + }, + { + "epoch": 0.11, + "grad_norm": 6.936468302399266, + "learning_rate": 9.832096310136092e-06, + "loss": 0.5311, + "step": 731 + }, + { + "epoch": 0.11, + "grad_norm": 4.360525655282427, + "learning_rate": 9.831468073461887e-06, + "loss": 0.4816, + "step": 732 + }, + { + "epoch": 0.11, + "grad_norm": 4.0176685371297935, + "learning_rate": 9.830838683807528e-06, + "loss": 0.5417, + "step": 733 + }, + { + "epoch": 0.11, + "grad_norm": 5.514836400874859, + "learning_rate": 9.830208141323213e-06, + "loss": 0.5128, + "step": 734 + }, + { + "epoch": 0.11, + "grad_norm": 4.758510369096224, + "learning_rate": 9.829576446159417e-06, + "loss": 0.4935, + "step": 735 + }, + { + "epoch": 0.11, + "grad_norm": 1.5013410566723069, + "learning_rate": 9.828943598466883e-06, + "loss": 0.5248, + "step": 736 + }, + { + "epoch": 0.11, + "grad_norm": 8.348990979609816, + "learning_rate": 9.828309598396637e-06, + "loss": 0.516, + "step": 737 + }, + { + "epoch": 0.11, + "grad_norm": 29.129971138645967, + "learning_rate": 9.827674446099976e-06, + "loss": 0.5207, + "step": 738 + }, + { + "epoch": 0.11, + "grad_norm": 4.526371152545476, + "learning_rate": 9.827038141728468e-06, + "loss": 0.5335, + "step": 739 + }, + { + "epoch": 0.11, + "grad_norm": 1.6670033126500097, + "learning_rate": 9.826400685433968e-06, + "loss": 0.6124, + "step": 740 + }, + { + "epoch": 0.11, + "grad_norm": 4.373677727252735, + "learning_rate": 9.825762077368595e-06, + "loss": 0.466, + "step": 741 + }, + { + "epoch": 0.11, + "grad_norm": 6.00457018880989, + "learning_rate": 9.825122317684745e-06, + "loss": 0.5213, + "step": 742 + }, + { + "epoch": 0.11, + "grad_norm": 7.733369309715239, + "learning_rate": 9.824481406535091e-06, + "loss": 0.5042, + "step": 743 + }, + { + "epoch": 0.11, + "grad_norm": 7.161045734813862, + "learning_rate": 9.823839344072582e-06, + "loss": 0.5707, + "step": 744 + }, + { + "epoch": 0.11, + "grad_norm": 6.77282703489, + "learning_rate": 9.823196130450435e-06, + "loss": 0.4855, + "step": 745 + }, + { + "epoch": 0.11, + "grad_norm": 4.2276626171490035, + "learning_rate": 9.82255176582215e-06, + "loss": 0.471, + "step": 746 + }, + { + "epoch": 0.11, + "grad_norm": 7.1738098659931495, + "learning_rate": 9.821906250341495e-06, + "loss": 0.4817, + "step": 747 + }, + { + "epoch": 0.11, + "grad_norm": 1.4556155074130797, + "learning_rate": 9.82125958416252e-06, + "loss": 0.5673, + "step": 748 + }, + { + "epoch": 0.11, + "grad_norm": 6.456034104175465, + "learning_rate": 9.820611767439544e-06, + "loss": 0.574, + "step": 749 + }, + { + "epoch": 0.11, + "grad_norm": 5.506052462033091, + "learning_rate": 9.819962800327156e-06, + "loss": 0.5002, + "step": 750 + }, + { + "epoch": 0.11, + "grad_norm": 5.337344732370875, + "learning_rate": 9.819312682980233e-06, + "loss": 0.4941, + "step": 751 + }, + { + "epoch": 0.11, + "grad_norm": 1.4983573784444206, + "learning_rate": 9.818661415553916e-06, + "loss": 0.538, + "step": 752 + }, + { + "epoch": 0.11, + "grad_norm": 5.616459918764729, + "learning_rate": 9.818008998203622e-06, + "loss": 0.4665, + "step": 753 + }, + { + "epoch": 0.11, + "grad_norm": 7.253474267847464, + "learning_rate": 9.817355431085046e-06, + "loss": 0.5577, + "step": 754 + }, + { + "epoch": 0.11, + "grad_norm": 1.373331948133087, + "learning_rate": 9.816700714354152e-06, + "loss": 0.5312, + "step": 755 + }, + { + "epoch": 0.11, + "grad_norm": 8.112520842056831, + "learning_rate": 9.816044848167183e-06, + "loss": 0.492, + "step": 756 + }, + { + "epoch": 0.11, + "grad_norm": 8.15500594420659, + "learning_rate": 9.815387832680657e-06, + "loss": 0.4916, + "step": 757 + }, + { + "epoch": 0.11, + "grad_norm": 5.081118709276497, + "learning_rate": 9.814729668051359e-06, + "loss": 0.5086, + "step": 758 + }, + { + "epoch": 0.11, + "grad_norm": 6.663504968063285, + "learning_rate": 9.814070354436359e-06, + "loss": 0.5734, + "step": 759 + }, + { + "epoch": 0.11, + "grad_norm": 4.387022808134051, + "learning_rate": 9.813409891992988e-06, + "loss": 0.529, + "step": 760 + }, + { + "epoch": 0.11, + "grad_norm": 5.252531397331252, + "learning_rate": 9.812748280878868e-06, + "loss": 0.5126, + "step": 761 + }, + { + "epoch": 0.11, + "grad_norm": 7.054063325891665, + "learning_rate": 9.812085521251878e-06, + "loss": 0.6161, + "step": 762 + }, + { + "epoch": 0.12, + "grad_norm": 8.017717325337717, + "learning_rate": 9.811421613270182e-06, + "loss": 0.5249, + "step": 763 + }, + { + "epoch": 0.12, + "grad_norm": 6.291713515871051, + "learning_rate": 9.810756557092216e-06, + "loss": 0.5483, + "step": 764 + }, + { + "epoch": 0.12, + "grad_norm": 6.713839365567865, + "learning_rate": 9.810090352876684e-06, + "loss": 0.5649, + "step": 765 + }, + { + "epoch": 0.12, + "grad_norm": 4.044709804666683, + "learning_rate": 9.809423000782575e-06, + "loss": 0.5536, + "step": 766 + }, + { + "epoch": 0.12, + "grad_norm": 9.171019087009112, + "learning_rate": 9.808754500969142e-06, + "loss": 0.5182, + "step": 767 + }, + { + "epoch": 0.12, + "grad_norm": 12.221848222519958, + "learning_rate": 9.808084853595914e-06, + "loss": 0.4465, + "step": 768 + }, + { + "epoch": 0.12, + "grad_norm": 1.9743160236815003, + "learning_rate": 9.8074140588227e-06, + "loss": 0.6468, + "step": 769 + }, + { + "epoch": 0.12, + "grad_norm": 4.737601455245831, + "learning_rate": 9.806742116809576e-06, + "loss": 0.4127, + "step": 770 + }, + { + "epoch": 0.12, + "grad_norm": 7.004688670159399, + "learning_rate": 9.806069027716891e-06, + "loss": 0.4685, + "step": 771 + }, + { + "epoch": 0.12, + "grad_norm": 8.905781085608231, + "learning_rate": 9.805394791705276e-06, + "loss": 0.5359, + "step": 772 + }, + { + "epoch": 0.12, + "grad_norm": 6.164173924814828, + "learning_rate": 9.80471940893563e-06, + "loss": 0.5359, + "step": 773 + }, + { + "epoch": 0.12, + "grad_norm": 12.121106440426251, + "learning_rate": 9.804042879569122e-06, + "loss": 0.5244, + "step": 774 + }, + { + "epoch": 0.12, + "grad_norm": 9.412524198999384, + "learning_rate": 9.803365203767202e-06, + "loss": 0.5314, + "step": 775 + }, + { + "epoch": 0.12, + "grad_norm": 32.74738974475791, + "learning_rate": 9.802686381691588e-06, + "loss": 0.525, + "step": 776 + }, + { + "epoch": 0.12, + "grad_norm": 7.456815522964599, + "learning_rate": 9.802006413504278e-06, + "loss": 0.5605, + "step": 777 + }, + { + "epoch": 0.12, + "grad_norm": 10.31379052838033, + "learning_rate": 9.801325299367536e-06, + "loss": 0.5107, + "step": 778 + }, + { + "epoch": 0.12, + "grad_norm": 1.3119613034360489, + "learning_rate": 9.800643039443903e-06, + "loss": 0.629, + "step": 779 + }, + { + "epoch": 0.12, + "grad_norm": 9.190082028040901, + "learning_rate": 9.799959633896194e-06, + "loss": 0.4646, + "step": 780 + }, + { + "epoch": 0.12, + "grad_norm": 11.322629782131484, + "learning_rate": 9.799275082887498e-06, + "loss": 0.4406, + "step": 781 + }, + { + "epoch": 0.12, + "grad_norm": 7.719262487190556, + "learning_rate": 9.798589386581175e-06, + "loss": 0.5009, + "step": 782 + }, + { + "epoch": 0.12, + "grad_norm": 19.88652721381345, + "learning_rate": 9.797902545140859e-06, + "loss": 0.5211, + "step": 783 + }, + { + "epoch": 0.12, + "grad_norm": 11.623433813198425, + "learning_rate": 9.797214558730457e-06, + "loss": 0.5036, + "step": 784 + }, + { + "epoch": 0.12, + "grad_norm": 22.255435855220746, + "learning_rate": 9.79652542751415e-06, + "loss": 0.4925, + "step": 785 + }, + { + "epoch": 0.12, + "grad_norm": 6.725349884499909, + "learning_rate": 9.795835151656395e-06, + "loss": 0.4857, + "step": 786 + }, + { + "epoch": 0.12, + "grad_norm": 30.34059618481862, + "learning_rate": 9.795143731321914e-06, + "loss": 0.4589, + "step": 787 + }, + { + "epoch": 0.12, + "grad_norm": 8.393322313239448, + "learning_rate": 9.794451166675713e-06, + "loss": 0.5366, + "step": 788 + }, + { + "epoch": 0.12, + "grad_norm": 8.598511781248794, + "learning_rate": 9.793757457883062e-06, + "loss": 0.5555, + "step": 789 + }, + { + "epoch": 0.12, + "grad_norm": 7.347974641150477, + "learning_rate": 9.79306260510951e-06, + "loss": 0.5219, + "step": 790 + }, + { + "epoch": 0.12, + "grad_norm": 16.725451309826198, + "learning_rate": 9.792366608520872e-06, + "loss": 0.4513, + "step": 791 + }, + { + "epoch": 0.12, + "grad_norm": 7.870827706252377, + "learning_rate": 9.791669468283244e-06, + "loss": 0.5367, + "step": 792 + }, + { + "epoch": 0.12, + "grad_norm": 11.74922967282771, + "learning_rate": 9.790971184562991e-06, + "loss": 0.5266, + "step": 793 + }, + { + "epoch": 0.12, + "grad_norm": 13.050385361963912, + "learning_rate": 9.79027175752675e-06, + "loss": 0.5682, + "step": 794 + }, + { + "epoch": 0.12, + "grad_norm": 9.582984528746119, + "learning_rate": 9.789571187341434e-06, + "loss": 0.4129, + "step": 795 + }, + { + "epoch": 0.12, + "grad_norm": 11.747618613602189, + "learning_rate": 9.788869474174223e-06, + "loss": 0.4877, + "step": 796 + }, + { + "epoch": 0.12, + "grad_norm": 13.38932581880064, + "learning_rate": 9.78816661819258e-06, + "loss": 0.4044, + "step": 797 + }, + { + "epoch": 0.12, + "grad_norm": 11.899976226323329, + "learning_rate": 9.787462619564228e-06, + "loss": 0.4963, + "step": 798 + }, + { + "epoch": 0.12, + "grad_norm": 11.027969300357336, + "learning_rate": 9.786757478457173e-06, + "loss": 0.4799, + "step": 799 + }, + { + "epoch": 0.12, + "grad_norm": 20.999007283605376, + "learning_rate": 9.786051195039689e-06, + "loss": 0.5413, + "step": 800 + }, + { + "epoch": 0.12, + "grad_norm": 22.9481995690256, + "learning_rate": 9.785343769480322e-06, + "loss": 0.4714, + "step": 801 + }, + { + "epoch": 0.12, + "grad_norm": 1.584374020889029, + "learning_rate": 9.784635201947891e-06, + "loss": 0.603, + "step": 802 + }, + { + "epoch": 0.12, + "grad_norm": 22.273286484600998, + "learning_rate": 9.783925492611492e-06, + "loss": 0.5038, + "step": 803 + }, + { + "epoch": 0.12, + "grad_norm": 6.476164935957038, + "learning_rate": 9.783214641640486e-06, + "loss": 0.4806, + "step": 804 + }, + { + "epoch": 0.12, + "grad_norm": 11.262811763396229, + "learning_rate": 9.782502649204513e-06, + "loss": 0.5485, + "step": 805 + }, + { + "epoch": 0.12, + "grad_norm": 1.385920451633879, + "learning_rate": 9.78178951547348e-06, + "loss": 0.552, + "step": 806 + }, + { + "epoch": 0.12, + "grad_norm": 9.138994710984688, + "learning_rate": 9.781075240617573e-06, + "loss": 0.416, + "step": 807 + }, + { + "epoch": 0.12, + "grad_norm": 15.148131841525059, + "learning_rate": 9.780359824807241e-06, + "loss": 0.4693, + "step": 808 + }, + { + "epoch": 0.12, + "grad_norm": 8.558825406708767, + "learning_rate": 9.779643268213216e-06, + "loss": 0.5794, + "step": 809 + }, + { + "epoch": 0.12, + "grad_norm": 13.577652929895075, + "learning_rate": 9.778925571006494e-06, + "loss": 0.5525, + "step": 810 + }, + { + "epoch": 0.12, + "grad_norm": 9.145771969980245, + "learning_rate": 9.778206733358346e-06, + "loss": 0.5393, + "step": 811 + }, + { + "epoch": 0.12, + "grad_norm": 16.462393852171243, + "learning_rate": 9.777486755440316e-06, + "loss": 0.4696, + "step": 812 + }, + { + "epoch": 0.12, + "grad_norm": 14.643324113786015, + "learning_rate": 9.77676563742422e-06, + "loss": 0.548, + "step": 813 + }, + { + "epoch": 0.12, + "grad_norm": 7.601726630725164, + "learning_rate": 9.776043379482143e-06, + "loss": 0.548, + "step": 814 + }, + { + "epoch": 0.12, + "grad_norm": 6.6499128862694095, + "learning_rate": 9.775319981786446e-06, + "loss": 0.4983, + "step": 815 + }, + { + "epoch": 0.12, + "grad_norm": 5.6881479643495805, + "learning_rate": 9.774595444509761e-06, + "loss": 0.5441, + "step": 816 + }, + { + "epoch": 0.12, + "grad_norm": 8.549244892714569, + "learning_rate": 9.773869767824989e-06, + "loss": 0.4928, + "step": 817 + }, + { + "epoch": 0.12, + "grad_norm": 7.218888994788077, + "learning_rate": 9.773142951905307e-06, + "loss": 0.4887, + "step": 818 + }, + { + "epoch": 0.12, + "grad_norm": 1.348482936814411, + "learning_rate": 9.772414996924165e-06, + "loss": 0.543, + "step": 819 + }, + { + "epoch": 0.12, + "grad_norm": 15.358053921681456, + "learning_rate": 9.771685903055277e-06, + "loss": 0.5248, + "step": 820 + }, + { + "epoch": 0.12, + "grad_norm": 12.646699743772382, + "learning_rate": 9.770955670472636e-06, + "loss": 0.4715, + "step": 821 + }, + { + "epoch": 0.12, + "grad_norm": 10.287930497249548, + "learning_rate": 9.770224299350504e-06, + "loss": 0.4916, + "step": 822 + }, + { + "epoch": 0.12, + "grad_norm": 5.787758581611556, + "learning_rate": 9.769491789863416e-06, + "loss": 0.5444, + "step": 823 + }, + { + "epoch": 0.12, + "grad_norm": 6.0488161310080715, + "learning_rate": 9.768758142186177e-06, + "loss": 0.4586, + "step": 824 + }, + { + "epoch": 0.12, + "grad_norm": 5.766785899769031, + "learning_rate": 9.768023356493865e-06, + "loss": 0.5112, + "step": 825 + }, + { + "epoch": 0.12, + "grad_norm": 18.672668225849293, + "learning_rate": 9.76728743296183e-06, + "loss": 0.479, + "step": 826 + }, + { + "epoch": 0.12, + "grad_norm": 5.61017933385382, + "learning_rate": 9.766550371765688e-06, + "loss": 0.4546, + "step": 827 + }, + { + "epoch": 0.12, + "grad_norm": 5.376036842210855, + "learning_rate": 9.765812173081337e-06, + "loss": 0.544, + "step": 828 + }, + { + "epoch": 0.13, + "grad_norm": 14.256196337499162, + "learning_rate": 9.765072837084939e-06, + "loss": 0.4662, + "step": 829 + }, + { + "epoch": 0.13, + "grad_norm": 7.197074234680791, + "learning_rate": 9.764332363952927e-06, + "loss": 0.5129, + "step": 830 + }, + { + "epoch": 0.13, + "grad_norm": 7.921441077837376, + "learning_rate": 9.763590753862007e-06, + "loss": 0.4645, + "step": 831 + }, + { + "epoch": 0.13, + "grad_norm": 5.133830855777482, + "learning_rate": 9.762848006989159e-06, + "loss": 0.517, + "step": 832 + }, + { + "epoch": 0.13, + "grad_norm": 10.994832116651285, + "learning_rate": 9.76210412351163e-06, + "loss": 0.505, + "step": 833 + }, + { + "epoch": 0.13, + "grad_norm": 27.034126879001196, + "learning_rate": 9.761359103606943e-06, + "loss": 0.4556, + "step": 834 + }, + { + "epoch": 0.13, + "grad_norm": 12.458462896324026, + "learning_rate": 9.760612947452885e-06, + "loss": 0.5222, + "step": 835 + }, + { + "epoch": 0.13, + "grad_norm": 9.892885878951086, + "learning_rate": 9.75986565522752e-06, + "loss": 0.5285, + "step": 836 + }, + { + "epoch": 0.13, + "grad_norm": 13.191607037361615, + "learning_rate": 9.759117227109185e-06, + "loss": 0.4993, + "step": 837 + }, + { + "epoch": 0.13, + "grad_norm": 7.837371776396669, + "learning_rate": 9.75836766327648e-06, + "loss": 0.549, + "step": 838 + }, + { + "epoch": 0.13, + "grad_norm": 2.0790649656089317, + "learning_rate": 9.757616963908283e-06, + "loss": 0.6071, + "step": 839 + }, + { + "epoch": 0.13, + "grad_norm": 1.5902221088784447, + "learning_rate": 9.756865129183742e-06, + "loss": 0.5115, + "step": 840 + }, + { + "epoch": 0.13, + "grad_norm": 8.221573041381752, + "learning_rate": 9.756112159282271e-06, + "loss": 0.4789, + "step": 841 + }, + { + "epoch": 0.13, + "grad_norm": 5.345982771928576, + "learning_rate": 9.75535805438356e-06, + "loss": 0.5225, + "step": 842 + }, + { + "epoch": 0.13, + "grad_norm": 7.97226195601666, + "learning_rate": 9.754602814667571e-06, + "loss": 0.5191, + "step": 843 + }, + { + "epoch": 0.13, + "grad_norm": 10.719039718718786, + "learning_rate": 9.753846440314532e-06, + "loss": 0.5494, + "step": 844 + }, + { + "epoch": 0.13, + "grad_norm": 4.995888783078644, + "learning_rate": 9.753088931504943e-06, + "loss": 0.5176, + "step": 845 + }, + { + "epoch": 0.13, + "grad_norm": 5.477839636132996, + "learning_rate": 9.752330288419577e-06, + "loss": 0.5162, + "step": 846 + }, + { + "epoch": 0.13, + "grad_norm": 6.937141881908445, + "learning_rate": 9.751570511239476e-06, + "loss": 0.5366, + "step": 847 + }, + { + "epoch": 0.13, + "grad_norm": 7.6499242068346485, + "learning_rate": 9.750809600145955e-06, + "loss": 0.4225, + "step": 848 + }, + { + "epoch": 0.13, + "grad_norm": 10.470924163046707, + "learning_rate": 9.750047555320592e-06, + "loss": 0.4744, + "step": 849 + }, + { + "epoch": 0.13, + "grad_norm": 20.729489309591322, + "learning_rate": 9.749284376945248e-06, + "loss": 0.5235, + "step": 850 + }, + { + "epoch": 0.13, + "grad_norm": 8.206104445753413, + "learning_rate": 9.748520065202045e-06, + "loss": 0.549, + "step": 851 + }, + { + "epoch": 0.13, + "grad_norm": 9.996024416678358, + "learning_rate": 9.747754620273375e-06, + "loss": 0.5207, + "step": 852 + }, + { + "epoch": 0.13, + "grad_norm": 8.756509478971665, + "learning_rate": 9.746988042341907e-06, + "loss": 0.4873, + "step": 853 + }, + { + "epoch": 0.13, + "grad_norm": 1.6667577910229412, + "learning_rate": 9.746220331590576e-06, + "loss": 0.6032, + "step": 854 + }, + { + "epoch": 0.13, + "grad_norm": 11.321144216107207, + "learning_rate": 9.74545148820259e-06, + "loss": 0.5646, + "step": 855 + }, + { + "epoch": 0.13, + "grad_norm": 10.756525752198923, + "learning_rate": 9.744681512361422e-06, + "loss": 0.5588, + "step": 856 + }, + { + "epoch": 0.13, + "grad_norm": 15.060069574836112, + "learning_rate": 9.743910404250822e-06, + "loss": 0.5201, + "step": 857 + }, + { + "epoch": 0.13, + "grad_norm": 10.79484498077948, + "learning_rate": 9.743138164054805e-06, + "loss": 0.5617, + "step": 858 + }, + { + "epoch": 0.13, + "grad_norm": 9.992815571518326, + "learning_rate": 9.742364791957662e-06, + "loss": 0.5799, + "step": 859 + }, + { + "epoch": 0.13, + "grad_norm": 29.121403884187195, + "learning_rate": 9.741590288143943e-06, + "loss": 0.5623, + "step": 860 + }, + { + "epoch": 0.13, + "grad_norm": 10.69987443015057, + "learning_rate": 9.740814652798482e-06, + "loss": 0.477, + "step": 861 + }, + { + "epoch": 0.13, + "grad_norm": 8.742826130276134, + "learning_rate": 9.740037886106375e-06, + "loss": 0.488, + "step": 862 + }, + { + "epoch": 0.13, + "grad_norm": 8.547782809654253, + "learning_rate": 9.739259988252987e-06, + "loss": 0.5529, + "step": 863 + }, + { + "epoch": 0.13, + "grad_norm": 13.819987930209724, + "learning_rate": 9.738480959423957e-06, + "loss": 0.4328, + "step": 864 + }, + { + "epoch": 0.13, + "grad_norm": 24.107409132585683, + "learning_rate": 9.737700799805192e-06, + "loss": 0.5356, + "step": 865 + }, + { + "epoch": 0.13, + "grad_norm": 10.393584479374104, + "learning_rate": 9.73691950958287e-06, + "loss": 0.5123, + "step": 866 + }, + { + "epoch": 0.13, + "grad_norm": 12.584286152000336, + "learning_rate": 9.736137088943434e-06, + "loss": 0.6059, + "step": 867 + }, + { + "epoch": 0.13, + "grad_norm": 10.856495424862972, + "learning_rate": 9.735353538073607e-06, + "loss": 0.4831, + "step": 868 + }, + { + "epoch": 0.13, + "grad_norm": 12.593703950680343, + "learning_rate": 9.734568857160369e-06, + "loss": 0.5229, + "step": 869 + }, + { + "epoch": 0.13, + "grad_norm": 8.152324176950547, + "learning_rate": 9.733783046390982e-06, + "loss": 0.5149, + "step": 870 + }, + { + "epoch": 0.13, + "grad_norm": 13.167941468052158, + "learning_rate": 9.732996105952967e-06, + "loss": 0.4853, + "step": 871 + }, + { + "epoch": 0.13, + "grad_norm": 10.086034172593884, + "learning_rate": 9.732208036034122e-06, + "loss": 0.5591, + "step": 872 + }, + { + "epoch": 0.13, + "grad_norm": 21.855545261396625, + "learning_rate": 9.731418836822508e-06, + "loss": 0.4302, + "step": 873 + }, + { + "epoch": 0.13, + "grad_norm": 1.5717775569278518, + "learning_rate": 9.730628508506463e-06, + "loss": 0.6106, + "step": 874 + }, + { + "epoch": 0.13, + "grad_norm": 11.55489582713876, + "learning_rate": 9.729837051274591e-06, + "loss": 0.5106, + "step": 875 + }, + { + "epoch": 0.13, + "grad_norm": 10.580850307203784, + "learning_rate": 9.729044465315762e-06, + "loss": 0.4854, + "step": 876 + }, + { + "epoch": 0.13, + "grad_norm": 7.766631456738243, + "learning_rate": 9.728250750819121e-06, + "loss": 0.5297, + "step": 877 + }, + { + "epoch": 0.13, + "grad_norm": 7.776418950003466, + "learning_rate": 9.72745590797408e-06, + "loss": 0.5164, + "step": 878 + }, + { + "epoch": 0.13, + "grad_norm": 10.545259173516069, + "learning_rate": 9.726659936970317e-06, + "loss": 0.5216, + "step": 879 + }, + { + "epoch": 0.13, + "grad_norm": 15.778574263862074, + "learning_rate": 9.725862837997786e-06, + "loss": 0.5039, + "step": 880 + }, + { + "epoch": 0.13, + "grad_norm": 6.968476918856887, + "learning_rate": 9.725064611246704e-06, + "loss": 0.5432, + "step": 881 + }, + { + "epoch": 0.13, + "grad_norm": 10.438577548261287, + "learning_rate": 9.724265256907561e-06, + "loss": 0.457, + "step": 882 + }, + { + "epoch": 0.13, + "grad_norm": 7.190698739460536, + "learning_rate": 9.723464775171115e-06, + "loss": 0.5612, + "step": 883 + }, + { + "epoch": 0.13, + "grad_norm": 20.214417182139748, + "learning_rate": 9.72266316622839e-06, + "loss": 0.4754, + "step": 884 + }, + { + "epoch": 0.13, + "grad_norm": 10.852847370402287, + "learning_rate": 9.721860430270685e-06, + "loss": 0.5359, + "step": 885 + }, + { + "epoch": 0.13, + "grad_norm": 9.630754206135213, + "learning_rate": 9.721056567489564e-06, + "loss": 0.4577, + "step": 886 + }, + { + "epoch": 0.13, + "grad_norm": 10.369630242698918, + "learning_rate": 9.720251578076857e-06, + "loss": 0.5232, + "step": 887 + }, + { + "epoch": 0.13, + "grad_norm": 16.18115998435004, + "learning_rate": 9.719445462224673e-06, + "loss": 0.4429, + "step": 888 + }, + { + "epoch": 0.13, + "grad_norm": 7.448366621332923, + "learning_rate": 9.718638220125377e-06, + "loss": 0.4862, + "step": 889 + }, + { + "epoch": 0.13, + "grad_norm": 17.207228858508813, + "learning_rate": 9.717829851971612e-06, + "loss": 0.4875, + "step": 890 + }, + { + "epoch": 0.13, + "grad_norm": 13.88082084059105, + "learning_rate": 9.717020357956286e-06, + "loss": 0.5104, + "step": 891 + }, + { + "epoch": 0.13, + "grad_norm": 9.014011088821487, + "learning_rate": 9.716209738272578e-06, + "loss": 0.4785, + "step": 892 + }, + { + "epoch": 0.13, + "grad_norm": 13.712477947897227, + "learning_rate": 9.715397993113934e-06, + "loss": 0.5376, + "step": 893 + }, + { + "epoch": 0.13, + "grad_norm": 18.752971357216747, + "learning_rate": 9.714585122674065e-06, + "loss": 0.4823, + "step": 894 + }, + { + "epoch": 0.13, + "grad_norm": 9.735169507732005, + "learning_rate": 9.713771127146956e-06, + "loss": 0.4684, + "step": 895 + }, + { + "epoch": 0.14, + "grad_norm": 6.96261836252534, + "learning_rate": 9.712956006726861e-06, + "loss": 0.6214, + "step": 896 + }, + { + "epoch": 0.14, + "grad_norm": 13.43002093592202, + "learning_rate": 9.712139761608299e-06, + "loss": 0.5111, + "step": 897 + }, + { + "epoch": 0.14, + "grad_norm": 23.889816647359666, + "learning_rate": 9.711322391986055e-06, + "loss": 0.5114, + "step": 898 + }, + { + "epoch": 0.14, + "grad_norm": 15.662217436735633, + "learning_rate": 9.710503898055191e-06, + "loss": 0.5261, + "step": 899 + }, + { + "epoch": 0.14, + "grad_norm": 45.3605296309955, + "learning_rate": 9.70968428001103e-06, + "loss": 0.5732, + "step": 900 + }, + { + "epoch": 0.14, + "grad_norm": 50.852085798689444, + "learning_rate": 9.708863538049165e-06, + "loss": 0.4737, + "step": 901 + }, + { + "epoch": 0.14, + "grad_norm": 43.715623857255395, + "learning_rate": 9.708041672365456e-06, + "loss": 0.5511, + "step": 902 + }, + { + "epoch": 0.14, + "grad_norm": 37.60185586232048, + "learning_rate": 9.707218683156036e-06, + "loss": 0.5121, + "step": 903 + }, + { + "epoch": 0.14, + "grad_norm": 16.816524086331565, + "learning_rate": 9.706394570617304e-06, + "loss": 0.4939, + "step": 904 + }, + { + "epoch": 0.14, + "grad_norm": 16.210929530145666, + "learning_rate": 9.70556933494592e-06, + "loss": 0.4645, + "step": 905 + }, + { + "epoch": 0.14, + "grad_norm": 18.689524963072557, + "learning_rate": 9.704742976338824e-06, + "loss": 0.5059, + "step": 906 + }, + { + "epoch": 0.14, + "grad_norm": 9.841447352543542, + "learning_rate": 9.703915494993215e-06, + "loss": 0.5622, + "step": 907 + }, + { + "epoch": 0.14, + "grad_norm": 17.75242783926932, + "learning_rate": 9.703086891106564e-06, + "loss": 0.5275, + "step": 908 + }, + { + "epoch": 0.14, + "grad_norm": 11.167141033512646, + "learning_rate": 9.702257164876607e-06, + "loss": 0.5325, + "step": 909 + }, + { + "epoch": 0.14, + "grad_norm": 31.15109101486402, + "learning_rate": 9.701426316501353e-06, + "loss": 0.467, + "step": 910 + }, + { + "epoch": 0.14, + "grad_norm": 1.695246263745162, + "learning_rate": 9.70059434617907e-06, + "loss": 0.5439, + "step": 911 + }, + { + "epoch": 0.14, + "grad_norm": 7.536012405342726, + "learning_rate": 9.699761254108303e-06, + "loss": 0.4676, + "step": 912 + }, + { + "epoch": 0.14, + "grad_norm": 21.668794105735902, + "learning_rate": 9.698927040487863e-06, + "loss": 0.4858, + "step": 913 + }, + { + "epoch": 0.14, + "grad_norm": 12.878296454502214, + "learning_rate": 9.698091705516823e-06, + "loss": 0.5017, + "step": 914 + }, + { + "epoch": 0.14, + "grad_norm": 24.274165668464406, + "learning_rate": 9.697255249394528e-06, + "loss": 0.548, + "step": 915 + }, + { + "epoch": 0.14, + "grad_norm": 9.425196288348717, + "learning_rate": 9.696417672320588e-06, + "loss": 0.4773, + "step": 916 + }, + { + "epoch": 0.14, + "grad_norm": 8.772683510988001, + "learning_rate": 9.695578974494886e-06, + "loss": 0.5596, + "step": 917 + }, + { + "epoch": 0.14, + "grad_norm": 7.8902602475043535, + "learning_rate": 9.694739156117565e-06, + "loss": 0.4668, + "step": 918 + }, + { + "epoch": 0.14, + "grad_norm": 9.582449753055394, + "learning_rate": 9.693898217389039e-06, + "loss": 0.5162, + "step": 919 + }, + { + "epoch": 0.14, + "grad_norm": 9.566688566259629, + "learning_rate": 9.693056158509992e-06, + "loss": 0.5132, + "step": 920 + }, + { + "epoch": 0.14, + "grad_norm": 11.461780628858964, + "learning_rate": 9.692212979681373e-06, + "loss": 0.5222, + "step": 921 + }, + { + "epoch": 0.14, + "grad_norm": 14.430196224308371, + "learning_rate": 9.691368681104395e-06, + "loss": 0.4588, + "step": 922 + }, + { + "epoch": 0.14, + "grad_norm": 1.565735553215955, + "learning_rate": 9.690523262980543e-06, + "loss": 0.585, + "step": 923 + }, + { + "epoch": 0.14, + "grad_norm": 1.5233216678417496, + "learning_rate": 9.689676725511567e-06, + "loss": 0.5545, + "step": 924 + }, + { + "epoch": 0.14, + "grad_norm": 8.075990643732144, + "learning_rate": 9.688829068899483e-06, + "loss": 0.5755, + "step": 925 + }, + { + "epoch": 0.14, + "grad_norm": 10.754480999082649, + "learning_rate": 9.687980293346578e-06, + "loss": 0.4018, + "step": 926 + }, + { + "epoch": 0.14, + "grad_norm": 1.7243256636212985, + "learning_rate": 9.687130399055402e-06, + "loss": 0.602, + "step": 927 + }, + { + "epoch": 0.14, + "grad_norm": 11.197052395718842, + "learning_rate": 9.686279386228774e-06, + "loss": 0.575, + "step": 928 + }, + { + "epoch": 0.14, + "grad_norm": 12.57692873688182, + "learning_rate": 9.68542725506978e-06, + "loss": 0.4828, + "step": 929 + }, + { + "epoch": 0.14, + "grad_norm": 9.902770721370302, + "learning_rate": 9.684574005781772e-06, + "loss": 0.5751, + "step": 930 + }, + { + "epoch": 0.14, + "grad_norm": 8.858692541245226, + "learning_rate": 9.683719638568368e-06, + "loss": 0.5199, + "step": 931 + }, + { + "epoch": 0.14, + "grad_norm": 9.473739643251799, + "learning_rate": 9.682864153633455e-06, + "loss": 0.509, + "step": 932 + }, + { + "epoch": 0.14, + "grad_norm": 6.181492014016197, + "learning_rate": 9.682007551181187e-06, + "loss": 0.5463, + "step": 933 + }, + { + "epoch": 0.14, + "grad_norm": 11.600867816040104, + "learning_rate": 9.681149831415979e-06, + "loss": 0.4749, + "step": 934 + }, + { + "epoch": 0.14, + "grad_norm": 35.214183659181856, + "learning_rate": 9.680290994542523e-06, + "loss": 0.5401, + "step": 935 + }, + { + "epoch": 0.14, + "grad_norm": 13.442532256555586, + "learning_rate": 9.679431040765767e-06, + "loss": 0.4456, + "step": 936 + }, + { + "epoch": 0.14, + "grad_norm": 8.342347588278624, + "learning_rate": 9.678569970290931e-06, + "loss": 0.4761, + "step": 937 + }, + { + "epoch": 0.14, + "grad_norm": 6.615485266272987, + "learning_rate": 9.677707783323503e-06, + "loss": 0.4151, + "step": 938 + }, + { + "epoch": 0.14, + "grad_norm": 25.390426672881006, + "learning_rate": 9.676844480069232e-06, + "loss": 0.4715, + "step": 939 + }, + { + "epoch": 0.14, + "grad_norm": 15.493109965089273, + "learning_rate": 9.675980060734138e-06, + "loss": 0.5052, + "step": 940 + }, + { + "epoch": 0.14, + "grad_norm": 2.100292914432225, + "learning_rate": 9.675114525524506e-06, + "loss": 0.6824, + "step": 941 + }, + { + "epoch": 0.14, + "grad_norm": 7.483162995018544, + "learning_rate": 9.674247874646885e-06, + "loss": 0.4175, + "step": 942 + }, + { + "epoch": 0.14, + "grad_norm": 12.187145777319943, + "learning_rate": 9.673380108308095e-06, + "loss": 0.5702, + "step": 943 + }, + { + "epoch": 0.14, + "grad_norm": 9.744519195935023, + "learning_rate": 9.672511226715219e-06, + "loss": 0.479, + "step": 944 + }, + { + "epoch": 0.14, + "grad_norm": 7.111245724851894, + "learning_rate": 9.671641230075605e-06, + "loss": 0.4795, + "step": 945 + }, + { + "epoch": 0.14, + "grad_norm": 7.99869051160868, + "learning_rate": 9.670770118596871e-06, + "loss": 0.5166, + "step": 946 + }, + { + "epoch": 0.14, + "grad_norm": 6.508050497310736, + "learning_rate": 9.669897892486896e-06, + "loss": 0.4977, + "step": 947 + }, + { + "epoch": 0.14, + "grad_norm": 10.606441881501707, + "learning_rate": 9.66902455195383e-06, + "loss": 0.4531, + "step": 948 + }, + { + "epoch": 0.14, + "grad_norm": 6.376672411208604, + "learning_rate": 9.668150097206085e-06, + "loss": 0.4468, + "step": 949 + }, + { + "epoch": 0.14, + "grad_norm": 13.894571283434995, + "learning_rate": 9.667274528452343e-06, + "loss": 0.4533, + "step": 950 + }, + { + "epoch": 0.14, + "grad_norm": 10.612885542180615, + "learning_rate": 9.66639784590155e-06, + "loss": 0.4775, + "step": 951 + }, + { + "epoch": 0.14, + "grad_norm": 11.436885072264781, + "learning_rate": 9.665520049762913e-06, + "loss": 0.5424, + "step": 952 + }, + { + "epoch": 0.14, + "grad_norm": 7.655806164435728, + "learning_rate": 9.664641140245913e-06, + "loss": 0.5136, + "step": 953 + }, + { + "epoch": 0.14, + "grad_norm": 9.848970929784373, + "learning_rate": 9.663761117560291e-06, + "loss": 0.5345, + "step": 954 + }, + { + "epoch": 0.14, + "grad_norm": 10.698417495603636, + "learning_rate": 9.662879981916054e-06, + "loss": 0.5222, + "step": 955 + }, + { + "epoch": 0.14, + "grad_norm": 15.421797293380854, + "learning_rate": 9.66199773352348e-06, + "loss": 0.5241, + "step": 956 + }, + { + "epoch": 0.14, + "grad_norm": 9.115822593036338, + "learning_rate": 9.661114372593103e-06, + "loss": 0.4524, + "step": 957 + }, + { + "epoch": 0.14, + "grad_norm": 7.571896761568674, + "learning_rate": 9.660229899335733e-06, + "loss": 0.4617, + "step": 958 + }, + { + "epoch": 0.14, + "grad_norm": 10.417452189767106, + "learning_rate": 9.65934431396244e-06, + "loss": 0.4642, + "step": 959 + }, + { + "epoch": 0.14, + "grad_norm": 9.052499318627627, + "learning_rate": 9.658457616684556e-06, + "loss": 0.4959, + "step": 960 + }, + { + "epoch": 0.14, + "grad_norm": 5.975110874813396, + "learning_rate": 9.657569807713685e-06, + "loss": 0.5237, + "step": 961 + }, + { + "epoch": 0.15, + "grad_norm": 5.308322530252969, + "learning_rate": 9.656680887261693e-06, + "loss": 0.549, + "step": 962 + }, + { + "epoch": 0.15, + "grad_norm": 5.746257772958202, + "learning_rate": 9.655790855540711e-06, + "loss": 0.5246, + "step": 963 + }, + { + "epoch": 0.15, + "grad_norm": 6.414572721844497, + "learning_rate": 9.654899712763138e-06, + "loss": 0.5269, + "step": 964 + }, + { + "epoch": 0.15, + "grad_norm": 8.740559291017155, + "learning_rate": 9.654007459141634e-06, + "loss": 0.5955, + "step": 965 + }, + { + "epoch": 0.15, + "grad_norm": 6.7349335479764045, + "learning_rate": 9.653114094889128e-06, + "loss": 0.4796, + "step": 966 + }, + { + "epoch": 0.15, + "grad_norm": 6.17517218862611, + "learning_rate": 9.65221962021881e-06, + "loss": 0.4851, + "step": 967 + }, + { + "epoch": 0.15, + "grad_norm": 8.561026067474659, + "learning_rate": 9.651324035344138e-06, + "loss": 0.4725, + "step": 968 + }, + { + "epoch": 0.15, + "grad_norm": 9.469554318308493, + "learning_rate": 9.650427340478834e-06, + "loss": 0.3884, + "step": 969 + }, + { + "epoch": 0.15, + "grad_norm": 6.654500014453655, + "learning_rate": 9.649529535836887e-06, + "loss": 0.5929, + "step": 970 + }, + { + "epoch": 0.15, + "grad_norm": 5.545698394961423, + "learning_rate": 9.648630621632547e-06, + "loss": 0.5285, + "step": 971 + }, + { + "epoch": 0.15, + "grad_norm": 7.390090082124935, + "learning_rate": 9.64773059808033e-06, + "loss": 0.5096, + "step": 972 + }, + { + "epoch": 0.15, + "grad_norm": 7.671101828562971, + "learning_rate": 9.646829465395017e-06, + "loss": 0.506, + "step": 973 + }, + { + "epoch": 0.15, + "grad_norm": 6.651252983049962, + "learning_rate": 9.645927223791655e-06, + "loss": 0.5248, + "step": 974 + }, + { + "epoch": 0.15, + "grad_norm": 38.001153156132474, + "learning_rate": 9.645023873485557e-06, + "loss": 0.5017, + "step": 975 + }, + { + "epoch": 0.15, + "grad_norm": 7.768413452609935, + "learning_rate": 9.644119414692294e-06, + "loss": 0.4405, + "step": 976 + }, + { + "epoch": 0.15, + "grad_norm": 10.614448652239094, + "learning_rate": 9.64321384762771e-06, + "loss": 0.506, + "step": 977 + }, + { + "epoch": 0.15, + "grad_norm": 1.2978786037526, + "learning_rate": 9.642307172507907e-06, + "loss": 0.5895, + "step": 978 + }, + { + "epoch": 0.15, + "grad_norm": 6.117243271611956, + "learning_rate": 9.641399389549253e-06, + "loss": 0.5168, + "step": 979 + }, + { + "epoch": 0.15, + "grad_norm": 8.378478358941672, + "learning_rate": 9.640490498968383e-06, + "loss": 0.4806, + "step": 980 + }, + { + "epoch": 0.15, + "grad_norm": 11.386725899629303, + "learning_rate": 9.639580500982192e-06, + "loss": 0.438, + "step": 981 + }, + { + "epoch": 0.15, + "grad_norm": 13.497164177411026, + "learning_rate": 9.638669395807844e-06, + "loss": 0.5074, + "step": 982 + }, + { + "epoch": 0.15, + "grad_norm": 9.950546208800471, + "learning_rate": 9.637757183662762e-06, + "loss": 0.4606, + "step": 983 + }, + { + "epoch": 0.15, + "grad_norm": 8.121221602143258, + "learning_rate": 9.63684386476464e-06, + "loss": 0.5092, + "step": 984 + }, + { + "epoch": 0.15, + "grad_norm": 7.691420671511301, + "learning_rate": 9.635929439331429e-06, + "loss": 0.4891, + "step": 985 + }, + { + "epoch": 0.15, + "grad_norm": 6.732427775191928, + "learning_rate": 9.63501390758135e-06, + "loss": 0.4732, + "step": 986 + }, + { + "epoch": 0.15, + "grad_norm": 14.880137884713431, + "learning_rate": 9.63409726973288e-06, + "loss": 0.5272, + "step": 987 + }, + { + "epoch": 0.15, + "grad_norm": 26.117316451470153, + "learning_rate": 9.633179526004772e-06, + "loss": 0.4863, + "step": 988 + }, + { + "epoch": 0.15, + "grad_norm": 7.956529232036459, + "learning_rate": 9.632260676616032e-06, + "loss": 0.4752, + "step": 989 + }, + { + "epoch": 0.15, + "grad_norm": 7.884891082586968, + "learning_rate": 9.631340721785934e-06, + "loss": 0.5239, + "step": 990 + }, + { + "epoch": 0.15, + "grad_norm": 5.772862650165771, + "learning_rate": 9.630419661734018e-06, + "loss": 0.4499, + "step": 991 + }, + { + "epoch": 0.15, + "grad_norm": 1.3421716897356615, + "learning_rate": 9.629497496680083e-06, + "loss": 0.6071, + "step": 992 + }, + { + "epoch": 0.15, + "grad_norm": 8.8740654849553, + "learning_rate": 9.628574226844194e-06, + "loss": 0.5145, + "step": 993 + }, + { + "epoch": 0.15, + "grad_norm": 7.454950418961442, + "learning_rate": 9.627649852446684e-06, + "loss": 0.5368, + "step": 994 + }, + { + "epoch": 0.15, + "grad_norm": 8.864703255481245, + "learning_rate": 9.62672437370814e-06, + "loss": 0.4865, + "step": 995 + }, + { + "epoch": 0.15, + "grad_norm": 14.206685489584821, + "learning_rate": 9.625797790849422e-06, + "loss": 0.5421, + "step": 996 + }, + { + "epoch": 0.15, + "grad_norm": 5.930551276420024, + "learning_rate": 9.624870104091646e-06, + "loss": 0.5104, + "step": 997 + }, + { + "epoch": 0.15, + "grad_norm": 18.32397406226489, + "learning_rate": 9.6239413136562e-06, + "loss": 0.5106, + "step": 998 + }, + { + "epoch": 0.15, + "grad_norm": 7.391498741158296, + "learning_rate": 9.623011419764725e-06, + "loss": 0.5157, + "step": 999 + }, + { + "epoch": 0.15, + "grad_norm": 15.5777223798632, + "learning_rate": 9.622080422639133e-06, + "loss": 0.4748, + "step": 1000 + }, + { + "epoch": 0.15, + "grad_norm": 17.28888564039094, + "learning_rate": 9.621148322501597e-06, + "loss": 0.4778, + "step": 1001 + }, + { + "epoch": 0.15, + "grad_norm": 10.073931568560706, + "learning_rate": 9.620215119574553e-06, + "loss": 0.5489, + "step": 1002 + }, + { + "epoch": 0.15, + "grad_norm": 10.09411490613066, + "learning_rate": 9.619280814080699e-06, + "loss": 0.5796, + "step": 1003 + }, + { + "epoch": 0.15, + "grad_norm": 6.777832177074167, + "learning_rate": 9.618345406242999e-06, + "loss": 0.5531, + "step": 1004 + }, + { + "epoch": 0.15, + "grad_norm": 8.2771209435661, + "learning_rate": 9.617408896284678e-06, + "loss": 0.4164, + "step": 1005 + }, + { + "epoch": 0.15, + "grad_norm": 8.89916495437376, + "learning_rate": 9.616471284429224e-06, + "loss": 0.5341, + "step": 1006 + }, + { + "epoch": 0.15, + "grad_norm": 8.704181983550628, + "learning_rate": 9.61553257090039e-06, + "loss": 0.5798, + "step": 1007 + }, + { + "epoch": 0.15, + "grad_norm": 11.99159650690447, + "learning_rate": 9.614592755922188e-06, + "loss": 0.4858, + "step": 1008 + }, + { + "epoch": 0.15, + "grad_norm": 6.712328416918565, + "learning_rate": 9.613651839718896e-06, + "loss": 0.4904, + "step": 1009 + }, + { + "epoch": 0.15, + "grad_norm": 5.167676535860376, + "learning_rate": 9.612709822515055e-06, + "loss": 0.5351, + "step": 1010 + }, + { + "epoch": 0.15, + "grad_norm": 9.193673219410297, + "learning_rate": 9.611766704535467e-06, + "loss": 0.493, + "step": 1011 + }, + { + "epoch": 0.15, + "grad_norm": 7.396400025521136, + "learning_rate": 9.610822486005197e-06, + "loss": 0.4715, + "step": 1012 + }, + { + "epoch": 0.15, + "grad_norm": 25.390671045662877, + "learning_rate": 9.609877167149573e-06, + "loss": 0.4448, + "step": 1013 + }, + { + "epoch": 0.15, + "grad_norm": 6.643329683236608, + "learning_rate": 9.608930748194188e-06, + "loss": 0.4433, + "step": 1014 + }, + { + "epoch": 0.15, + "grad_norm": 5.913269817334127, + "learning_rate": 9.607983229364891e-06, + "loss": 0.5051, + "step": 1015 + }, + { + "epoch": 0.15, + "grad_norm": 1.4933239854141311, + "learning_rate": 9.607034610887801e-06, + "loss": 0.5946, + "step": 1016 + }, + { + "epoch": 0.15, + "grad_norm": 7.439564321220899, + "learning_rate": 9.606084892989293e-06, + "loss": 0.4593, + "step": 1017 + }, + { + "epoch": 0.15, + "grad_norm": 6.855778868291502, + "learning_rate": 9.605134075896012e-06, + "loss": 0.5164, + "step": 1018 + }, + { + "epoch": 0.15, + "grad_norm": 7.31609909203796, + "learning_rate": 9.604182159834856e-06, + "loss": 0.5474, + "step": 1019 + }, + { + "epoch": 0.15, + "grad_norm": 4.8263714222077665, + "learning_rate": 9.603229145032994e-06, + "loss": 0.4895, + "step": 1020 + }, + { + "epoch": 0.15, + "grad_norm": 9.09627909077371, + "learning_rate": 9.60227503171785e-06, + "loss": 0.4989, + "step": 1021 + }, + { + "epoch": 0.15, + "grad_norm": 7.129461801891572, + "learning_rate": 9.601319820117114e-06, + "loss": 0.5318, + "step": 1022 + }, + { + "epoch": 0.15, + "grad_norm": 6.945517291440395, + "learning_rate": 9.600363510458738e-06, + "loss": 0.565, + "step": 1023 + }, + { + "epoch": 0.15, + "grad_norm": 9.565288045092654, + "learning_rate": 9.599406102970936e-06, + "loss": 0.4606, + "step": 1024 + }, + { + "epoch": 0.15, + "grad_norm": 8.363612497499618, + "learning_rate": 9.598447597882181e-06, + "loss": 0.471, + "step": 1025 + }, + { + "epoch": 0.15, + "grad_norm": 5.732012732417542, + "learning_rate": 9.597487995421214e-06, + "loss": 0.5527, + "step": 1026 + }, + { + "epoch": 0.15, + "grad_norm": 4.544149608971028, + "learning_rate": 9.596527295817032e-06, + "loss": 0.5231, + "step": 1027 + }, + { + "epoch": 0.16, + "grad_norm": 9.294403608037943, + "learning_rate": 9.595565499298896e-06, + "loss": 0.5598, + "step": 1028 + }, + { + "epoch": 0.16, + "grad_norm": 7.357992762543305, + "learning_rate": 9.59460260609633e-06, + "loss": 0.5294, + "step": 1029 + }, + { + "epoch": 0.16, + "grad_norm": 8.575525972538733, + "learning_rate": 9.593638616439118e-06, + "loss": 0.553, + "step": 1030 + }, + { + "epoch": 0.16, + "grad_norm": 6.427907302232283, + "learning_rate": 9.592673530557306e-06, + "loss": 0.4554, + "step": 1031 + }, + { + "epoch": 0.16, + "grad_norm": 4.845801017811169, + "learning_rate": 9.5917073486812e-06, + "loss": 0.4894, + "step": 1032 + }, + { + "epoch": 0.16, + "grad_norm": 5.095422199586442, + "learning_rate": 9.590740071041374e-06, + "loss": 0.5325, + "step": 1033 + }, + { + "epoch": 0.16, + "grad_norm": 8.03481310098649, + "learning_rate": 9.589771697868655e-06, + "loss": 0.5044, + "step": 1034 + }, + { + "epoch": 0.16, + "grad_norm": 6.572077976385639, + "learning_rate": 9.588802229394136e-06, + "loss": 0.4879, + "step": 1035 + }, + { + "epoch": 0.16, + "grad_norm": 5.273984375834355, + "learning_rate": 9.587831665849172e-06, + "loss": 0.546, + "step": 1036 + }, + { + "epoch": 0.16, + "grad_norm": 3.575272755532328, + "learning_rate": 9.586860007465378e-06, + "loss": 0.5081, + "step": 1037 + }, + { + "epoch": 0.16, + "grad_norm": 4.0968051920280555, + "learning_rate": 9.585887254474629e-06, + "loss": 0.4548, + "step": 1038 + }, + { + "epoch": 0.16, + "grad_norm": 8.801712233242212, + "learning_rate": 9.584913407109062e-06, + "loss": 0.5557, + "step": 1039 + }, + { + "epoch": 0.16, + "grad_norm": 5.493174438591285, + "learning_rate": 9.583938465601076e-06, + "loss": 0.5228, + "step": 1040 + }, + { + "epoch": 0.16, + "grad_norm": 7.402832559180858, + "learning_rate": 9.582962430183332e-06, + "loss": 0.4996, + "step": 1041 + }, + { + "epoch": 0.16, + "grad_norm": 8.938401594696955, + "learning_rate": 9.58198530108875e-06, + "loss": 0.4577, + "step": 1042 + }, + { + "epoch": 0.16, + "grad_norm": 5.487981594398974, + "learning_rate": 9.581007078550513e-06, + "loss": 0.5064, + "step": 1043 + }, + { + "epoch": 0.16, + "grad_norm": 6.484219086972032, + "learning_rate": 9.580027762802062e-06, + "loss": 0.473, + "step": 1044 + }, + { + "epoch": 0.16, + "grad_norm": 9.315061161357885, + "learning_rate": 9.5790473540771e-06, + "loss": 0.5334, + "step": 1045 + }, + { + "epoch": 0.16, + "grad_norm": 6.5322895729665005, + "learning_rate": 9.578065852609595e-06, + "loss": 0.529, + "step": 1046 + }, + { + "epoch": 0.16, + "grad_norm": 13.207571896179806, + "learning_rate": 9.577083258633769e-06, + "loss": 0.5557, + "step": 1047 + }, + { + "epoch": 0.16, + "grad_norm": 6.521964515889232, + "learning_rate": 9.576099572384109e-06, + "loss": 0.5242, + "step": 1048 + }, + { + "epoch": 0.16, + "grad_norm": 5.448904821309215, + "learning_rate": 9.57511479409536e-06, + "loss": 0.5384, + "step": 1049 + }, + { + "epoch": 0.16, + "grad_norm": 7.43064147369242, + "learning_rate": 9.574128924002533e-06, + "loss": 0.4747, + "step": 1050 + }, + { + "epoch": 0.16, + "grad_norm": 15.261634531459844, + "learning_rate": 9.573141962340893e-06, + "loss": 0.5531, + "step": 1051 + }, + { + "epoch": 0.16, + "grad_norm": 1.594404980946875, + "learning_rate": 9.572153909345969e-06, + "loss": 0.5579, + "step": 1052 + }, + { + "epoch": 0.16, + "grad_norm": 1.2951880160872553, + "learning_rate": 9.571164765253548e-06, + "loss": 0.5905, + "step": 1053 + }, + { + "epoch": 0.16, + "grad_norm": 9.967959098582858, + "learning_rate": 9.570174530299682e-06, + "loss": 0.449, + "step": 1054 + }, + { + "epoch": 0.16, + "grad_norm": 9.209229361710845, + "learning_rate": 9.569183204720677e-06, + "loss": 0.5469, + "step": 1055 + }, + { + "epoch": 0.16, + "grad_norm": 1.5991579082735123, + "learning_rate": 9.568190788753106e-06, + "loss": 0.5855, + "step": 1056 + }, + { + "epoch": 0.16, + "grad_norm": 12.485225456393009, + "learning_rate": 9.567197282633797e-06, + "loss": 0.5404, + "step": 1057 + }, + { + "epoch": 0.16, + "grad_norm": 21.618086859322997, + "learning_rate": 9.566202686599842e-06, + "loss": 0.5854, + "step": 1058 + }, + { + "epoch": 0.16, + "grad_norm": 6.657913890183312, + "learning_rate": 9.565207000888587e-06, + "loss": 0.482, + "step": 1059 + }, + { + "epoch": 0.16, + "grad_norm": 11.128946752106357, + "learning_rate": 9.564210225737646e-06, + "loss": 0.5495, + "step": 1060 + }, + { + "epoch": 0.16, + "grad_norm": 10.616417458629353, + "learning_rate": 9.563212361384889e-06, + "loss": 0.4993, + "step": 1061 + }, + { + "epoch": 0.16, + "grad_norm": 7.669566267554182, + "learning_rate": 9.562213408068443e-06, + "loss": 0.5414, + "step": 1062 + }, + { + "epoch": 0.16, + "grad_norm": 14.12899816958299, + "learning_rate": 9.5612133660267e-06, + "loss": 0.52, + "step": 1063 + }, + { + "epoch": 0.16, + "grad_norm": 11.8725420281941, + "learning_rate": 9.560212235498312e-06, + "loss": 0.5401, + "step": 1064 + }, + { + "epoch": 0.16, + "grad_norm": 13.348498480165699, + "learning_rate": 9.559210016722184e-06, + "loss": 0.4909, + "step": 1065 + }, + { + "epoch": 0.16, + "grad_norm": 12.084312049489514, + "learning_rate": 9.558206709937487e-06, + "loss": 0.5346, + "step": 1066 + }, + { + "epoch": 0.16, + "grad_norm": 20.37280723756399, + "learning_rate": 9.55720231538365e-06, + "loss": 0.5183, + "step": 1067 + }, + { + "epoch": 0.16, + "grad_norm": 19.11220351762379, + "learning_rate": 9.556196833300362e-06, + "loss": 0.5342, + "step": 1068 + }, + { + "epoch": 0.16, + "grad_norm": 1.4490967555159884, + "learning_rate": 9.555190263927573e-06, + "loss": 0.5941, + "step": 1069 + }, + { + "epoch": 0.16, + "grad_norm": 8.421074464349934, + "learning_rate": 9.554182607505484e-06, + "loss": 0.4688, + "step": 1070 + }, + { + "epoch": 0.16, + "grad_norm": 22.886920966496817, + "learning_rate": 9.553173864274567e-06, + "loss": 0.4872, + "step": 1071 + }, + { + "epoch": 0.16, + "grad_norm": 11.404152740919663, + "learning_rate": 9.552164034475547e-06, + "loss": 0.5532, + "step": 1072 + }, + { + "epoch": 0.16, + "grad_norm": 8.000119071858734, + "learning_rate": 9.55115311834941e-06, + "loss": 0.5006, + "step": 1073 + }, + { + "epoch": 0.16, + "grad_norm": 9.872974933987493, + "learning_rate": 9.5501411161374e-06, + "loss": 0.5558, + "step": 1074 + }, + { + "epoch": 0.16, + "grad_norm": 14.205960834080486, + "learning_rate": 9.54912802808102e-06, + "loss": 0.5421, + "step": 1075 + }, + { + "epoch": 0.16, + "grad_norm": 8.35192458399684, + "learning_rate": 9.548113854422036e-06, + "loss": 0.4714, + "step": 1076 + }, + { + "epoch": 0.16, + "grad_norm": 8.953015845208357, + "learning_rate": 9.547098595402466e-06, + "loss": 0.4549, + "step": 1077 + }, + { + "epoch": 0.16, + "grad_norm": 10.749068825555604, + "learning_rate": 9.546082251264595e-06, + "loss": 0.4494, + "step": 1078 + }, + { + "epoch": 0.16, + "grad_norm": 11.85196381058159, + "learning_rate": 9.545064822250961e-06, + "loss": 0.4887, + "step": 1079 + }, + { + "epoch": 0.16, + "grad_norm": 9.257674441147294, + "learning_rate": 9.544046308604365e-06, + "loss": 0.53, + "step": 1080 + }, + { + "epoch": 0.16, + "grad_norm": 13.110346898007494, + "learning_rate": 9.54302671056786e-06, + "loss": 0.5329, + "step": 1081 + }, + { + "epoch": 0.16, + "grad_norm": 8.55354975885972, + "learning_rate": 9.542006028384768e-06, + "loss": 0.6032, + "step": 1082 + }, + { + "epoch": 0.16, + "grad_norm": 9.50876178795718, + "learning_rate": 9.540984262298662e-06, + "loss": 0.4947, + "step": 1083 + }, + { + "epoch": 0.16, + "grad_norm": 19.730498895592994, + "learning_rate": 9.539961412553375e-06, + "loss": 0.4252, + "step": 1084 + }, + { + "epoch": 0.16, + "grad_norm": 14.340339860583967, + "learning_rate": 9.538937479393002e-06, + "loss": 0.4132, + "step": 1085 + }, + { + "epoch": 0.16, + "grad_norm": 16.063053486422145, + "learning_rate": 9.53791246306189e-06, + "loss": 0.5392, + "step": 1086 + }, + { + "epoch": 0.16, + "grad_norm": 14.06749962551076, + "learning_rate": 9.536886363804656e-06, + "loss": 0.5039, + "step": 1087 + }, + { + "epoch": 0.16, + "grad_norm": 29.67291797370467, + "learning_rate": 9.535859181866159e-06, + "loss": 0.5462, + "step": 1088 + }, + { + "epoch": 0.16, + "grad_norm": 15.329685803565182, + "learning_rate": 9.534830917491532e-06, + "loss": 0.4985, + "step": 1089 + }, + { + "epoch": 0.16, + "grad_norm": 19.075324096842866, + "learning_rate": 9.533801570926157e-06, + "loss": 0.5013, + "step": 1090 + }, + { + "epoch": 0.16, + "grad_norm": 1.5235331613291045, + "learning_rate": 9.532771142415678e-06, + "loss": 0.6327, + "step": 1091 + }, + { + "epoch": 0.16, + "grad_norm": 10.445688998289315, + "learning_rate": 9.531739632205995e-06, + "loss": 0.5291, + "step": 1092 + }, + { + "epoch": 0.16, + "grad_norm": 12.102119506325907, + "learning_rate": 9.530707040543269e-06, + "loss": 0.4105, + "step": 1093 + }, + { + "epoch": 0.17, + "grad_norm": 13.881835638067553, + "learning_rate": 9.529673367673915e-06, + "loss": 0.5223, + "step": 1094 + }, + { + "epoch": 0.17, + "grad_norm": 21.332438633665447, + "learning_rate": 9.52863861384461e-06, + "loss": 0.5049, + "step": 1095 + }, + { + "epoch": 0.17, + "grad_norm": 15.306241186982106, + "learning_rate": 9.527602779302288e-06, + "loss": 0.5417, + "step": 1096 + }, + { + "epoch": 0.17, + "grad_norm": 12.880054907370333, + "learning_rate": 9.526565864294138e-06, + "loss": 0.5006, + "step": 1097 + }, + { + "epoch": 0.17, + "grad_norm": 12.092983785015301, + "learning_rate": 9.525527869067611e-06, + "loss": 0.542, + "step": 1098 + }, + { + "epoch": 0.17, + "grad_norm": 40.13844163699883, + "learning_rate": 9.524488793870412e-06, + "loss": 0.507, + "step": 1099 + }, + { + "epoch": 0.17, + "grad_norm": 17.48252409675383, + "learning_rate": 9.523448638950509e-06, + "loss": 0.5615, + "step": 1100 + }, + { + "epoch": 0.17, + "grad_norm": 17.310083534129436, + "learning_rate": 9.52240740455612e-06, + "loss": 0.4874, + "step": 1101 + }, + { + "epoch": 0.17, + "grad_norm": 8.913650280714354, + "learning_rate": 9.52136509093573e-06, + "loss": 0.512, + "step": 1102 + }, + { + "epoch": 0.17, + "grad_norm": 70.98404672064694, + "learning_rate": 9.52032169833807e-06, + "loss": 0.5529, + "step": 1103 + }, + { + "epoch": 0.17, + "grad_norm": 1.3362033235559758, + "learning_rate": 9.519277227012142e-06, + "loss": 0.5562, + "step": 1104 + }, + { + "epoch": 0.17, + "grad_norm": 10.634363072721465, + "learning_rate": 9.518231677207193e-06, + "loss": 0.5373, + "step": 1105 + }, + { + "epoch": 0.17, + "grad_norm": 7.570022377384805, + "learning_rate": 9.517185049172734e-06, + "loss": 0.5299, + "step": 1106 + }, + { + "epoch": 0.17, + "grad_norm": 5.484255226655126, + "learning_rate": 9.516137343158534e-06, + "loss": 0.4519, + "step": 1107 + }, + { + "epoch": 0.17, + "grad_norm": 9.75876132800781, + "learning_rate": 9.515088559414614e-06, + "loss": 0.4879, + "step": 1108 + }, + { + "epoch": 0.17, + "grad_norm": 10.31111043536622, + "learning_rate": 9.514038698191257e-06, + "loss": 0.564, + "step": 1109 + }, + { + "epoch": 0.17, + "grad_norm": 18.094655990258737, + "learning_rate": 9.512987759739004e-06, + "loss": 0.5401, + "step": 1110 + }, + { + "epoch": 0.17, + "grad_norm": 27.808618569493124, + "learning_rate": 9.511935744308647e-06, + "loss": 0.5109, + "step": 1111 + }, + { + "epoch": 0.17, + "grad_norm": 10.374757585984023, + "learning_rate": 9.510882652151239e-06, + "loss": 0.494, + "step": 1112 + }, + { + "epoch": 0.17, + "grad_norm": 10.348347471059297, + "learning_rate": 9.50982848351809e-06, + "loss": 0.4842, + "step": 1113 + }, + { + "epoch": 0.17, + "grad_norm": 21.78758314573042, + "learning_rate": 9.508773238660771e-06, + "loss": 0.4963, + "step": 1114 + }, + { + "epoch": 0.17, + "grad_norm": 16.570675836274084, + "learning_rate": 9.5077169178311e-06, + "loss": 0.5098, + "step": 1115 + }, + { + "epoch": 0.17, + "grad_norm": 12.043897593336926, + "learning_rate": 9.506659521281156e-06, + "loss": 0.4684, + "step": 1116 + }, + { + "epoch": 0.17, + "grad_norm": 24.500947650440683, + "learning_rate": 9.50560104926328e-06, + "loss": 0.4908, + "step": 1117 + }, + { + "epoch": 0.17, + "grad_norm": 16.199625921416168, + "learning_rate": 9.504541502030064e-06, + "loss": 0.4788, + "step": 1118 + }, + { + "epoch": 0.17, + "grad_norm": 11.41063748046671, + "learning_rate": 9.503480879834359e-06, + "loss": 0.503, + "step": 1119 + }, + { + "epoch": 0.17, + "grad_norm": 9.888600469787498, + "learning_rate": 9.50241918292927e-06, + "loss": 0.479, + "step": 1120 + }, + { + "epoch": 0.17, + "grad_norm": 8.679295591106444, + "learning_rate": 9.50135641156816e-06, + "loss": 0.5706, + "step": 1121 + }, + { + "epoch": 0.17, + "grad_norm": 5.950250466484968, + "learning_rate": 9.50029256600465e-06, + "loss": 0.5513, + "step": 1122 + }, + { + "epoch": 0.17, + "grad_norm": 1.4317481671964705, + "learning_rate": 9.499227646492613e-06, + "loss": 0.6126, + "step": 1123 + }, + { + "epoch": 0.17, + "grad_norm": 9.622349336745607, + "learning_rate": 9.498161653286185e-06, + "loss": 0.4738, + "step": 1124 + }, + { + "epoch": 0.17, + "grad_norm": 16.76586830578393, + "learning_rate": 9.49709458663975e-06, + "loss": 0.5394, + "step": 1125 + }, + { + "epoch": 0.17, + "grad_norm": 12.214051552427792, + "learning_rate": 9.496026446807955e-06, + "loss": 0.4995, + "step": 1126 + }, + { + "epoch": 0.17, + "grad_norm": 6.834554774643155, + "learning_rate": 9.4949572340457e-06, + "loss": 0.5204, + "step": 1127 + }, + { + "epoch": 0.17, + "grad_norm": 1.3391901161989683, + "learning_rate": 9.493886948608145e-06, + "loss": 0.5949, + "step": 1128 + }, + { + "epoch": 0.17, + "grad_norm": 7.965429750362886, + "learning_rate": 9.492815590750696e-06, + "loss": 0.4818, + "step": 1129 + }, + { + "epoch": 0.17, + "grad_norm": 8.638768552658153, + "learning_rate": 9.491743160729025e-06, + "loss": 0.4746, + "step": 1130 + }, + { + "epoch": 0.17, + "grad_norm": 20.998917418809803, + "learning_rate": 9.490669658799059e-06, + "loss": 0.458, + "step": 1131 + }, + { + "epoch": 0.17, + "grad_norm": 10.772483885751827, + "learning_rate": 9.489595085216974e-06, + "loss": 0.5029, + "step": 1132 + }, + { + "epoch": 0.17, + "grad_norm": 1.2976514267690746, + "learning_rate": 9.488519440239207e-06, + "loss": 0.601, + "step": 1133 + }, + { + "epoch": 0.17, + "grad_norm": 1.2704440715181649, + "learning_rate": 9.487442724122451e-06, + "loss": 0.548, + "step": 1134 + }, + { + "epoch": 0.17, + "grad_norm": 4.7014949776266475, + "learning_rate": 9.486364937123652e-06, + "loss": 0.5297, + "step": 1135 + }, + { + "epoch": 0.17, + "grad_norm": 6.887373602383863, + "learning_rate": 9.485286079500014e-06, + "loss": 0.4901, + "step": 1136 + }, + { + "epoch": 0.17, + "grad_norm": 5.13660300183015, + "learning_rate": 9.484206151508993e-06, + "loss": 0.5638, + "step": 1137 + }, + { + "epoch": 0.17, + "grad_norm": 4.384853044396187, + "learning_rate": 9.483125153408305e-06, + "loss": 0.5568, + "step": 1138 + }, + { + "epoch": 0.17, + "grad_norm": 5.124788597510183, + "learning_rate": 9.482043085455918e-06, + "loss": 0.4945, + "step": 1139 + }, + { + "epoch": 0.17, + "grad_norm": 4.082399764504842, + "learning_rate": 9.480959947910056e-06, + "loss": 0.4598, + "step": 1140 + }, + { + "epoch": 0.17, + "grad_norm": 5.430666870642827, + "learning_rate": 9.4798757410292e-06, + "loss": 0.4738, + "step": 1141 + }, + { + "epoch": 0.17, + "grad_norm": 9.278841849403527, + "learning_rate": 9.478790465072084e-06, + "loss": 0.5093, + "step": 1142 + }, + { + "epoch": 0.17, + "grad_norm": 4.188161386664499, + "learning_rate": 9.477704120297698e-06, + "loss": 0.5447, + "step": 1143 + }, + { + "epoch": 0.17, + "grad_norm": 4.78192165695814, + "learning_rate": 9.476616706965288e-06, + "loss": 0.4969, + "step": 1144 + }, + { + "epoch": 0.17, + "grad_norm": 5.404634921523575, + "learning_rate": 9.47552822533435e-06, + "loss": 0.5253, + "step": 1145 + }, + { + "epoch": 0.17, + "grad_norm": 6.603742876299739, + "learning_rate": 9.474438675664644e-06, + "loss": 0.439, + "step": 1146 + }, + { + "epoch": 0.17, + "grad_norm": 10.754442333008274, + "learning_rate": 9.473348058216176e-06, + "loss": 0.6074, + "step": 1147 + }, + { + "epoch": 0.17, + "grad_norm": 1.4845035373501168, + "learning_rate": 9.472256373249214e-06, + "loss": 0.5831, + "step": 1148 + }, + { + "epoch": 0.17, + "grad_norm": 14.84216765317522, + "learning_rate": 9.471163621024276e-06, + "loss": 0.4571, + "step": 1149 + }, + { + "epoch": 0.17, + "grad_norm": 5.476034281613416, + "learning_rate": 9.470069801802134e-06, + "loss": 0.4339, + "step": 1150 + }, + { + "epoch": 0.17, + "grad_norm": 4.158037781109141, + "learning_rate": 9.468974915843821e-06, + "loss": 0.5052, + "step": 1151 + }, + { + "epoch": 0.17, + "grad_norm": 13.808539573827598, + "learning_rate": 9.467878963410617e-06, + "loss": 0.4187, + "step": 1152 + }, + { + "epoch": 0.17, + "grad_norm": 6.732525064777297, + "learning_rate": 9.466781944764061e-06, + "loss": 0.5091, + "step": 1153 + }, + { + "epoch": 0.17, + "grad_norm": 13.449421927175727, + "learning_rate": 9.465683860165944e-06, + "loss": 0.4832, + "step": 1154 + }, + { + "epoch": 0.17, + "grad_norm": 12.040813943991704, + "learning_rate": 9.464584709878313e-06, + "loss": 0.4905, + "step": 1155 + }, + { + "epoch": 0.17, + "grad_norm": 5.197044034350128, + "learning_rate": 9.463484494163471e-06, + "loss": 0.5736, + "step": 1156 + }, + { + "epoch": 0.17, + "grad_norm": 7.1008694563348635, + "learning_rate": 9.46238321328397e-06, + "loss": 0.5175, + "step": 1157 + }, + { + "epoch": 0.17, + "grad_norm": 10.381097566024325, + "learning_rate": 9.46128086750262e-06, + "loss": 0.5122, + "step": 1158 + }, + { + "epoch": 0.17, + "grad_norm": 19.49755797574385, + "learning_rate": 9.460177457082487e-06, + "loss": 0.4547, + "step": 1159 + }, + { + "epoch": 0.17, + "grad_norm": 5.487516320315352, + "learning_rate": 9.459072982286886e-06, + "loss": 0.539, + "step": 1160 + }, + { + "epoch": 0.18, + "grad_norm": 7.4708071861341665, + "learning_rate": 9.45796744337939e-06, + "loss": 0.5097, + "step": 1161 + }, + { + "epoch": 0.18, + "grad_norm": 5.382554656801205, + "learning_rate": 9.456860840623823e-06, + "loss": 0.4783, + "step": 1162 + }, + { + "epoch": 0.18, + "grad_norm": 8.415161911444637, + "learning_rate": 9.455753174284266e-06, + "loss": 0.4847, + "step": 1163 + }, + { + "epoch": 0.18, + "grad_norm": 3.6552989318957696, + "learning_rate": 9.454644444625053e-06, + "loss": 0.5033, + "step": 1164 + }, + { + "epoch": 0.18, + "grad_norm": 6.762539319774012, + "learning_rate": 9.453534651910766e-06, + "loss": 0.4729, + "step": 1165 + }, + { + "epoch": 0.18, + "grad_norm": 5.663151663527047, + "learning_rate": 9.452423796406251e-06, + "loss": 0.5824, + "step": 1166 + }, + { + "epoch": 0.18, + "grad_norm": 7.496513900587899, + "learning_rate": 9.4513118783766e-06, + "loss": 0.5202, + "step": 1167 + }, + { + "epoch": 0.18, + "grad_norm": 5.562112292268546, + "learning_rate": 9.450198898087161e-06, + "loss": 0.5058, + "step": 1168 + }, + { + "epoch": 0.18, + "grad_norm": 16.00484709337884, + "learning_rate": 9.449084855803535e-06, + "loss": 0.5562, + "step": 1169 + }, + { + "epoch": 0.18, + "grad_norm": 4.186457089968793, + "learning_rate": 9.447969751791577e-06, + "loss": 0.4983, + "step": 1170 + }, + { + "epoch": 0.18, + "grad_norm": 3.8674554583478824, + "learning_rate": 9.446853586317394e-06, + "loss": 0.4991, + "step": 1171 + }, + { + "epoch": 0.18, + "grad_norm": 4.278365093353962, + "learning_rate": 9.44573635964735e-06, + "loss": 0.4157, + "step": 1172 + }, + { + "epoch": 0.18, + "grad_norm": 4.504340407671717, + "learning_rate": 9.44461807204806e-06, + "loss": 0.4415, + "step": 1173 + }, + { + "epoch": 0.18, + "grad_norm": 5.772964990203814, + "learning_rate": 9.443498723786385e-06, + "loss": 0.4992, + "step": 1174 + }, + { + "epoch": 0.18, + "grad_norm": 3.371786659077868, + "learning_rate": 9.442378315129456e-06, + "loss": 0.4924, + "step": 1175 + }, + { + "epoch": 0.18, + "grad_norm": 4.772384794343003, + "learning_rate": 9.44125684634464e-06, + "loss": 0.5338, + "step": 1176 + }, + { + "epoch": 0.18, + "grad_norm": 7.0027632949778145, + "learning_rate": 9.440134317699566e-06, + "loss": 0.5139, + "step": 1177 + }, + { + "epoch": 0.18, + "grad_norm": 7.5320730609913555, + "learning_rate": 9.439010729462114e-06, + "loss": 0.49, + "step": 1178 + }, + { + "epoch": 0.18, + "grad_norm": 15.528473681485258, + "learning_rate": 9.437886081900415e-06, + "loss": 0.5454, + "step": 1179 + }, + { + "epoch": 0.18, + "grad_norm": 5.779623624851191, + "learning_rate": 9.436760375282858e-06, + "loss": 0.477, + "step": 1180 + }, + { + "epoch": 0.18, + "grad_norm": 4.465588216824085, + "learning_rate": 9.43563360987808e-06, + "loss": 0.5556, + "step": 1181 + }, + { + "epoch": 0.18, + "grad_norm": 3.9901778560404204, + "learning_rate": 9.43450578595497e-06, + "loss": 0.4973, + "step": 1182 + }, + { + "epoch": 0.18, + "grad_norm": 7.369657437048644, + "learning_rate": 9.433376903782674e-06, + "loss": 0.484, + "step": 1183 + }, + { + "epoch": 0.18, + "grad_norm": 5.573164597079299, + "learning_rate": 9.432246963630587e-06, + "loss": 0.509, + "step": 1184 + }, + { + "epoch": 0.18, + "grad_norm": 5.718706189079023, + "learning_rate": 9.431115965768358e-06, + "loss": 0.5407, + "step": 1185 + }, + { + "epoch": 0.18, + "grad_norm": 8.587573465877018, + "learning_rate": 9.429983910465887e-06, + "loss": 0.5279, + "step": 1186 + }, + { + "epoch": 0.18, + "grad_norm": 5.56803726156193, + "learning_rate": 9.42885079799333e-06, + "loss": 0.4432, + "step": 1187 + }, + { + "epoch": 0.18, + "grad_norm": 8.408489394442615, + "learning_rate": 9.427716628621089e-06, + "loss": 0.488, + "step": 1188 + }, + { + "epoch": 0.18, + "grad_norm": 6.216768419757177, + "learning_rate": 9.426581402619824e-06, + "loss": 0.4735, + "step": 1189 + }, + { + "epoch": 0.18, + "grad_norm": 4.690796080628148, + "learning_rate": 9.425445120260445e-06, + "loss": 0.5133, + "step": 1190 + }, + { + "epoch": 0.18, + "grad_norm": 5.500302447006344, + "learning_rate": 9.424307781814112e-06, + "loss": 0.4458, + "step": 1191 + }, + { + "epoch": 0.18, + "grad_norm": 6.140438425923382, + "learning_rate": 9.423169387552244e-06, + "loss": 0.4466, + "step": 1192 + }, + { + "epoch": 0.18, + "grad_norm": 109.35465265953393, + "learning_rate": 9.422029937746501e-06, + "loss": 0.4934, + "step": 1193 + }, + { + "epoch": 0.18, + "grad_norm": 5.428846090829191, + "learning_rate": 9.420889432668805e-06, + "loss": 0.4307, + "step": 1194 + }, + { + "epoch": 0.18, + "grad_norm": 9.178199227168074, + "learning_rate": 9.419747872591325e-06, + "loss": 0.5049, + "step": 1195 + }, + { + "epoch": 0.18, + "grad_norm": 10.580007863227983, + "learning_rate": 9.418605257786483e-06, + "loss": 0.5324, + "step": 1196 + }, + { + "epoch": 0.18, + "grad_norm": 8.440081274596023, + "learning_rate": 9.417461588526951e-06, + "loss": 0.5224, + "step": 1197 + }, + { + "epoch": 0.18, + "grad_norm": 4.688288925680701, + "learning_rate": 9.416316865085652e-06, + "loss": 0.5391, + "step": 1198 + }, + { + "epoch": 0.18, + "grad_norm": 26.749340384732633, + "learning_rate": 9.415171087735767e-06, + "loss": 0.6168, + "step": 1199 + }, + { + "epoch": 0.18, + "grad_norm": 4.054109476209597, + "learning_rate": 9.414024256750723e-06, + "loss": 0.4629, + "step": 1200 + }, + { + "epoch": 0.18, + "grad_norm": 8.746156797597203, + "learning_rate": 9.412876372404199e-06, + "loss": 0.4176, + "step": 1201 + }, + { + "epoch": 0.18, + "grad_norm": 8.681696287132947, + "learning_rate": 9.411727434970121e-06, + "loss": 0.5633, + "step": 1202 + }, + { + "epoch": 0.18, + "grad_norm": 10.28539852767563, + "learning_rate": 9.41057744472268e-06, + "loss": 0.4613, + "step": 1203 + }, + { + "epoch": 0.18, + "grad_norm": 13.711917720746209, + "learning_rate": 9.409426401936301e-06, + "loss": 0.4442, + "step": 1204 + }, + { + "epoch": 0.18, + "grad_norm": 5.5023204341207625, + "learning_rate": 9.408274306885675e-06, + "loss": 0.4755, + "step": 1205 + }, + { + "epoch": 0.18, + "grad_norm": 6.821841677189189, + "learning_rate": 9.407121159845731e-06, + "loss": 0.591, + "step": 1206 + }, + { + "epoch": 0.18, + "grad_norm": 8.015376271851194, + "learning_rate": 9.405966961091661e-06, + "loss": 0.4842, + "step": 1207 + }, + { + "epoch": 0.18, + "grad_norm": 5.891568675672287, + "learning_rate": 9.404811710898901e-06, + "loss": 0.4926, + "step": 1208 + }, + { + "epoch": 0.18, + "grad_norm": 4.398481246278861, + "learning_rate": 9.40365540954314e-06, + "loss": 0.4738, + "step": 1209 + }, + { + "epoch": 0.18, + "grad_norm": 11.027631774660325, + "learning_rate": 9.402498057300318e-06, + "loss": 0.4855, + "step": 1210 + }, + { + "epoch": 0.18, + "grad_norm": 5.146261185798257, + "learning_rate": 9.401339654446623e-06, + "loss": 0.517, + "step": 1211 + }, + { + "epoch": 0.18, + "grad_norm": 4.719155777515191, + "learning_rate": 9.400180201258498e-06, + "loss": 0.4783, + "step": 1212 + }, + { + "epoch": 0.18, + "grad_norm": 4.847939979981822, + "learning_rate": 9.399019698012633e-06, + "loss": 0.4748, + "step": 1213 + }, + { + "epoch": 0.18, + "grad_norm": 5.0608940120737875, + "learning_rate": 9.397858144985971e-06, + "loss": 0.5065, + "step": 1214 + }, + { + "epoch": 0.18, + "grad_norm": 4.169037447311178, + "learning_rate": 9.396695542455706e-06, + "loss": 0.5056, + "step": 1215 + }, + { + "epoch": 0.18, + "grad_norm": 4.836362224048936, + "learning_rate": 9.395531890699276e-06, + "loss": 0.4778, + "step": 1216 + }, + { + "epoch": 0.18, + "grad_norm": 3.8270914611718325, + "learning_rate": 9.394367189994382e-06, + "loss": 0.4475, + "step": 1217 + }, + { + "epoch": 0.18, + "grad_norm": 3.930380586962998, + "learning_rate": 9.393201440618963e-06, + "loss": 0.4487, + "step": 1218 + }, + { + "epoch": 0.18, + "grad_norm": 3.894721021732552, + "learning_rate": 9.392034642851214e-06, + "loss": 0.4866, + "step": 1219 + }, + { + "epoch": 0.18, + "grad_norm": 3.9290797705479696, + "learning_rate": 9.390866796969577e-06, + "loss": 0.5473, + "step": 1220 + }, + { + "epoch": 0.18, + "grad_norm": 6.551576807084982, + "learning_rate": 9.389697903252753e-06, + "loss": 0.5245, + "step": 1221 + }, + { + "epoch": 0.18, + "grad_norm": 1.4580537146312427, + "learning_rate": 9.38852796197968e-06, + "loss": 0.5506, + "step": 1222 + }, + { + "epoch": 0.18, + "grad_norm": 4.637019427426143, + "learning_rate": 9.387356973429556e-06, + "loss": 0.5582, + "step": 1223 + }, + { + "epoch": 0.18, + "grad_norm": 133.17072410620233, + "learning_rate": 9.386184937881823e-06, + "loss": 0.5031, + "step": 1224 + }, + { + "epoch": 0.18, + "grad_norm": 4.289333595540723, + "learning_rate": 9.385011855616177e-06, + "loss": 0.5461, + "step": 1225 + }, + { + "epoch": 0.18, + "grad_norm": 1.1518179433045899, + "learning_rate": 9.383837726912562e-06, + "loss": 0.5717, + "step": 1226 + }, + { + "epoch": 0.19, + "grad_norm": 6.415458624396937, + "learning_rate": 9.382662552051169e-06, + "loss": 0.5198, + "step": 1227 + }, + { + "epoch": 0.19, + "grad_norm": 7.791004043249624, + "learning_rate": 9.381486331312445e-06, + "loss": 0.5071, + "step": 1228 + }, + { + "epoch": 0.19, + "grad_norm": 5.516965112775349, + "learning_rate": 9.380309064977082e-06, + "loss": 0.5226, + "step": 1229 + }, + { + "epoch": 0.19, + "grad_norm": 4.983982392867575, + "learning_rate": 9.379130753326021e-06, + "loss": 0.5467, + "step": 1230 + }, + { + "epoch": 0.19, + "grad_norm": 14.968260400206024, + "learning_rate": 9.377951396640455e-06, + "loss": 0.5243, + "step": 1231 + }, + { + "epoch": 0.19, + "grad_norm": 5.951433375725924, + "learning_rate": 9.376770995201826e-06, + "loss": 0.485, + "step": 1232 + }, + { + "epoch": 0.19, + "grad_norm": 4.308378916802709, + "learning_rate": 9.375589549291824e-06, + "loss": 0.4047, + "step": 1233 + }, + { + "epoch": 0.19, + "grad_norm": 6.387222935729364, + "learning_rate": 9.374407059192387e-06, + "loss": 0.5099, + "step": 1234 + }, + { + "epoch": 0.19, + "grad_norm": 6.39090304980783, + "learning_rate": 9.373223525185708e-06, + "loss": 0.4994, + "step": 1235 + }, + { + "epoch": 0.19, + "grad_norm": 4.792575889175409, + "learning_rate": 9.372038947554223e-06, + "loss": 0.491, + "step": 1236 + }, + { + "epoch": 0.19, + "grad_norm": 5.890886024495494, + "learning_rate": 9.370853326580619e-06, + "loss": 0.5356, + "step": 1237 + }, + { + "epoch": 0.19, + "grad_norm": 4.544413370834462, + "learning_rate": 9.369666662547833e-06, + "loss": 0.537, + "step": 1238 + }, + { + "epoch": 0.19, + "grad_norm": 4.944888106340925, + "learning_rate": 9.368478955739048e-06, + "loss": 0.5212, + "step": 1239 + }, + { + "epoch": 0.19, + "grad_norm": 8.611748915546245, + "learning_rate": 9.367290206437703e-06, + "loss": 0.5074, + "step": 1240 + }, + { + "epoch": 0.19, + "grad_norm": 6.066969104008949, + "learning_rate": 9.366100414927477e-06, + "loss": 0.4506, + "step": 1241 + }, + { + "epoch": 0.19, + "grad_norm": 4.358228570081846, + "learning_rate": 9.364909581492301e-06, + "loss": 0.4557, + "step": 1242 + }, + { + "epoch": 0.19, + "grad_norm": 4.596153540626799, + "learning_rate": 9.363717706416355e-06, + "loss": 0.5476, + "step": 1243 + }, + { + "epoch": 0.19, + "grad_norm": 7.11737375065248, + "learning_rate": 9.362524789984072e-06, + "loss": 0.496, + "step": 1244 + }, + { + "epoch": 0.19, + "grad_norm": 6.601918223107908, + "learning_rate": 9.361330832480125e-06, + "loss": 0.492, + "step": 1245 + }, + { + "epoch": 0.19, + "grad_norm": 4.880687324489626, + "learning_rate": 9.36013583418944e-06, + "loss": 0.5879, + "step": 1246 + }, + { + "epoch": 0.19, + "grad_norm": 3.901975176886403, + "learning_rate": 9.358939795397193e-06, + "loss": 0.5006, + "step": 1247 + }, + { + "epoch": 0.19, + "grad_norm": 19.719699031692258, + "learning_rate": 9.357742716388806e-06, + "loss": 0.5052, + "step": 1248 + }, + { + "epoch": 0.19, + "grad_norm": 5.8108688984226795, + "learning_rate": 9.356544597449947e-06, + "loss": 0.4733, + "step": 1249 + }, + { + "epoch": 0.19, + "grad_norm": 10.044041620409056, + "learning_rate": 9.355345438866538e-06, + "loss": 0.5132, + "step": 1250 + }, + { + "epoch": 0.19, + "grad_norm": 6.631959109480548, + "learning_rate": 9.354145240924746e-06, + "loss": 0.4502, + "step": 1251 + }, + { + "epoch": 0.19, + "grad_norm": 4.402779074828729, + "learning_rate": 9.352944003910982e-06, + "loss": 0.5161, + "step": 1252 + }, + { + "epoch": 0.19, + "grad_norm": 8.23455391436101, + "learning_rate": 9.351741728111911e-06, + "loss": 0.5736, + "step": 1253 + }, + { + "epoch": 0.19, + "grad_norm": 14.325165586992664, + "learning_rate": 9.350538413814447e-06, + "loss": 0.4566, + "step": 1254 + }, + { + "epoch": 0.19, + "grad_norm": 8.618766106100697, + "learning_rate": 9.349334061305744e-06, + "loss": 0.479, + "step": 1255 + }, + { + "epoch": 0.19, + "grad_norm": 6.136575730529583, + "learning_rate": 9.34812867087321e-06, + "loss": 0.5113, + "step": 1256 + }, + { + "epoch": 0.19, + "grad_norm": 7.318632817107247, + "learning_rate": 9.346922242804499e-06, + "loss": 0.46, + "step": 1257 + }, + { + "epoch": 0.19, + "grad_norm": 27.942137813601036, + "learning_rate": 9.345714777387512e-06, + "loss": 0.4353, + "step": 1258 + }, + { + "epoch": 0.19, + "grad_norm": 6.889311760078206, + "learning_rate": 9.3445062749104e-06, + "loss": 0.5285, + "step": 1259 + }, + { + "epoch": 0.19, + "grad_norm": 6.633695497243732, + "learning_rate": 9.343296735661559e-06, + "loss": 0.5377, + "step": 1260 + }, + { + "epoch": 0.19, + "grad_norm": 7.353230421451321, + "learning_rate": 9.342086159929629e-06, + "loss": 0.5278, + "step": 1261 + }, + { + "epoch": 0.19, + "grad_norm": 10.570593972695121, + "learning_rate": 9.34087454800351e-06, + "loss": 0.5418, + "step": 1262 + }, + { + "epoch": 0.19, + "grad_norm": 7.885950918833172, + "learning_rate": 9.339661900172331e-06, + "loss": 0.4488, + "step": 1263 + }, + { + "epoch": 0.19, + "grad_norm": 8.087655818071346, + "learning_rate": 9.338448216725487e-06, + "loss": 0.5166, + "step": 1264 + }, + { + "epoch": 0.19, + "grad_norm": 7.002019785944465, + "learning_rate": 9.337233497952605e-06, + "loss": 0.5122, + "step": 1265 + }, + { + "epoch": 0.19, + "grad_norm": 6.347678680075843, + "learning_rate": 9.336017744143567e-06, + "loss": 0.4832, + "step": 1266 + }, + { + "epoch": 0.19, + "grad_norm": 5.315419658681437, + "learning_rate": 9.3348009555885e-06, + "loss": 0.5239, + "step": 1267 + }, + { + "epoch": 0.19, + "grad_norm": 7.1751391191781915, + "learning_rate": 9.333583132577777e-06, + "loss": 0.4924, + "step": 1268 + }, + { + "epoch": 0.19, + "grad_norm": 13.288391393508343, + "learning_rate": 9.332364275402021e-06, + "loss": 0.5311, + "step": 1269 + }, + { + "epoch": 0.19, + "grad_norm": 14.379948110651485, + "learning_rate": 9.3311443843521e-06, + "loss": 0.4869, + "step": 1270 + }, + { + "epoch": 0.19, + "grad_norm": 9.008173431188924, + "learning_rate": 9.329923459719126e-06, + "loss": 0.5492, + "step": 1271 + }, + { + "epoch": 0.19, + "grad_norm": 5.77186653032091, + "learning_rate": 9.328701501794461e-06, + "loss": 0.4814, + "step": 1272 + }, + { + "epoch": 0.19, + "grad_norm": 6.506693441736103, + "learning_rate": 9.327478510869714e-06, + "loss": 0.5048, + "step": 1273 + }, + { + "epoch": 0.19, + "grad_norm": 5.553521406030673, + "learning_rate": 9.32625448723674e-06, + "loss": 0.5693, + "step": 1274 + }, + { + "epoch": 0.19, + "grad_norm": 12.25930794171845, + "learning_rate": 9.325029431187635e-06, + "loss": 0.5286, + "step": 1275 + }, + { + "epoch": 0.19, + "grad_norm": 8.648351646028999, + "learning_rate": 9.323803343014749e-06, + "loss": 0.5684, + "step": 1276 + }, + { + "epoch": 0.19, + "grad_norm": 4.9545806803530255, + "learning_rate": 9.322576223010677e-06, + "loss": 0.5414, + "step": 1277 + }, + { + "epoch": 0.19, + "grad_norm": 4.825722571190724, + "learning_rate": 9.321348071468255e-06, + "loss": 0.5144, + "step": 1278 + }, + { + "epoch": 0.19, + "grad_norm": 4.3784505295665435, + "learning_rate": 9.320118888680571e-06, + "loss": 0.4771, + "step": 1279 + }, + { + "epoch": 0.19, + "grad_norm": 3.5405349256888465, + "learning_rate": 9.318888674940959e-06, + "loss": 0.4757, + "step": 1280 + }, + { + "epoch": 0.19, + "grad_norm": 5.342385898634505, + "learning_rate": 9.31765743054299e-06, + "loss": 0.5489, + "step": 1281 + }, + { + "epoch": 0.19, + "grad_norm": 7.415000434486711, + "learning_rate": 9.316425155780495e-06, + "loss": 0.5, + "step": 1282 + }, + { + "epoch": 0.19, + "grad_norm": 4.156838461066658, + "learning_rate": 9.315191850947541e-06, + "loss": 0.4618, + "step": 1283 + }, + { + "epoch": 0.19, + "grad_norm": 8.825326923772595, + "learning_rate": 9.31395751633844e-06, + "loss": 0.4166, + "step": 1284 + }, + { + "epoch": 0.19, + "grad_norm": 8.87659249356095, + "learning_rate": 9.31272215224776e-06, + "loss": 0.4669, + "step": 1285 + }, + { + "epoch": 0.19, + "grad_norm": 4.922852404300426, + "learning_rate": 9.311485758970302e-06, + "loss": 0.5317, + "step": 1286 + }, + { + "epoch": 0.19, + "grad_norm": 1.4768638154291598, + "learning_rate": 9.310248336801122e-06, + "loss": 0.648, + "step": 1287 + }, + { + "epoch": 0.19, + "grad_norm": 11.38832393080718, + "learning_rate": 9.309009886035518e-06, + "loss": 0.5366, + "step": 1288 + }, + { + "epoch": 0.19, + "grad_norm": 4.239412145060281, + "learning_rate": 9.307770406969032e-06, + "loss": 0.5133, + "step": 1289 + }, + { + "epoch": 0.19, + "grad_norm": 15.805255782105718, + "learning_rate": 9.306529899897452e-06, + "loss": 0.4593, + "step": 1290 + }, + { + "epoch": 0.19, + "grad_norm": 6.796085656656423, + "learning_rate": 9.305288365116816e-06, + "loss": 0.4784, + "step": 1291 + }, + { + "epoch": 0.19, + "grad_norm": 6.274255387475334, + "learning_rate": 9.3040458029234e-06, + "loss": 0.5172, + "step": 1292 + }, + { + "epoch": 0.2, + "grad_norm": 5.798273524576829, + "learning_rate": 9.302802213613729e-06, + "loss": 0.4621, + "step": 1293 + }, + { + "epoch": 0.2, + "grad_norm": 7.288690697391453, + "learning_rate": 9.301557597484576e-06, + "loss": 0.4842, + "step": 1294 + }, + { + "epoch": 0.2, + "grad_norm": 3.8770426847690396, + "learning_rate": 9.300311954832952e-06, + "loss": 0.5147, + "step": 1295 + }, + { + "epoch": 0.2, + "grad_norm": 4.047214225580251, + "learning_rate": 9.299065285956119e-06, + "loss": 0.4828, + "step": 1296 + }, + { + "epoch": 0.2, + "grad_norm": 3.3005555788526566, + "learning_rate": 9.297817591151581e-06, + "loss": 0.4923, + "step": 1297 + }, + { + "epoch": 0.2, + "grad_norm": 3.4169056299224767, + "learning_rate": 9.296568870717087e-06, + "loss": 0.5853, + "step": 1298 + }, + { + "epoch": 0.2, + "grad_norm": 5.253705735502439, + "learning_rate": 9.295319124950632e-06, + "loss": 0.5037, + "step": 1299 + }, + { + "epoch": 0.2, + "grad_norm": 4.209245106970996, + "learning_rate": 9.294068354150456e-06, + "loss": 0.4758, + "step": 1300 + }, + { + "epoch": 0.2, + "grad_norm": 1.179265305096352, + "learning_rate": 9.292816558615041e-06, + "loss": 0.5894, + "step": 1301 + }, + { + "epoch": 0.2, + "grad_norm": 5.132730942693958, + "learning_rate": 9.291563738643116e-06, + "loss": 0.5405, + "step": 1302 + }, + { + "epoch": 0.2, + "grad_norm": 4.909888067177052, + "learning_rate": 9.290309894533653e-06, + "loss": 0.4801, + "step": 1303 + }, + { + "epoch": 0.2, + "grad_norm": 4.6696918608622395, + "learning_rate": 9.289055026585868e-06, + "loss": 0.5368, + "step": 1304 + }, + { + "epoch": 0.2, + "grad_norm": 4.910752022917877, + "learning_rate": 9.287799135099224e-06, + "loss": 0.5216, + "step": 1305 + }, + { + "epoch": 0.2, + "grad_norm": 7.591670736943786, + "learning_rate": 9.286542220373427e-06, + "loss": 0.4645, + "step": 1306 + }, + { + "epoch": 0.2, + "grad_norm": 4.001286122705727, + "learning_rate": 9.285284282708424e-06, + "loss": 0.4849, + "step": 1307 + }, + { + "epoch": 0.2, + "grad_norm": 5.457533753209876, + "learning_rate": 9.284025322404412e-06, + "loss": 0.5403, + "step": 1308 + }, + { + "epoch": 0.2, + "grad_norm": 10.59171730433206, + "learning_rate": 9.282765339761828e-06, + "loss": 0.4629, + "step": 1309 + }, + { + "epoch": 0.2, + "grad_norm": 5.260917557912551, + "learning_rate": 9.281504335081353e-06, + "loss": 0.5182, + "step": 1310 + }, + { + "epoch": 0.2, + "grad_norm": 5.035950094463464, + "learning_rate": 9.280242308663913e-06, + "loss": 0.5026, + "step": 1311 + }, + { + "epoch": 0.2, + "grad_norm": 23.57997518406638, + "learning_rate": 9.278979260810678e-06, + "loss": 0.5071, + "step": 1312 + }, + { + "epoch": 0.2, + "grad_norm": 9.805392352865697, + "learning_rate": 9.277715191823063e-06, + "loss": 0.5124, + "step": 1313 + }, + { + "epoch": 0.2, + "grad_norm": 4.094834601072279, + "learning_rate": 9.276450102002723e-06, + "loss": 0.469, + "step": 1314 + }, + { + "epoch": 0.2, + "grad_norm": 3.2480198835074745, + "learning_rate": 9.275183991651557e-06, + "loss": 0.5067, + "step": 1315 + }, + { + "epoch": 0.2, + "grad_norm": 6.680331829212345, + "learning_rate": 9.273916861071715e-06, + "loss": 0.477, + "step": 1316 + }, + { + "epoch": 0.2, + "grad_norm": 5.814176784875257, + "learning_rate": 9.272648710565576e-06, + "loss": 0.4616, + "step": 1317 + }, + { + "epoch": 0.2, + "grad_norm": 6.846380753115313, + "learning_rate": 9.27137954043578e-06, + "loss": 0.4458, + "step": 1318 + }, + { + "epoch": 0.2, + "grad_norm": 5.2327803151047485, + "learning_rate": 9.270109350985197e-06, + "loss": 0.4799, + "step": 1319 + }, + { + "epoch": 0.2, + "grad_norm": 7.059192562362302, + "learning_rate": 9.268838142516943e-06, + "loss": 0.4759, + "step": 1320 + }, + { + "epoch": 0.2, + "grad_norm": 6.171157650721041, + "learning_rate": 9.267565915334382e-06, + "loss": 0.4414, + "step": 1321 + }, + { + "epoch": 0.2, + "grad_norm": 9.583272495093906, + "learning_rate": 9.266292669741118e-06, + "loss": 0.4254, + "step": 1322 + }, + { + "epoch": 0.2, + "grad_norm": 1.389893165740359, + "learning_rate": 9.265018406040996e-06, + "loss": 0.5756, + "step": 1323 + }, + { + "epoch": 0.2, + "grad_norm": 5.575761734631283, + "learning_rate": 9.263743124538107e-06, + "loss": 0.4619, + "step": 1324 + }, + { + "epoch": 0.2, + "grad_norm": 6.25550130146345, + "learning_rate": 9.262466825536783e-06, + "loss": 0.5331, + "step": 1325 + }, + { + "epoch": 0.2, + "grad_norm": 3.5416439321992543, + "learning_rate": 9.2611895093416e-06, + "loss": 0.6031, + "step": 1326 + }, + { + "epoch": 0.2, + "grad_norm": 432.8468035549292, + "learning_rate": 9.259911176257376e-06, + "loss": 0.5404, + "step": 1327 + }, + { + "epoch": 0.2, + "grad_norm": 4.9738607073254535, + "learning_rate": 9.258631826589175e-06, + "loss": 0.4947, + "step": 1328 + }, + { + "epoch": 0.2, + "grad_norm": 5.808425389380071, + "learning_rate": 9.257351460642295e-06, + "loss": 0.4982, + "step": 1329 + }, + { + "epoch": 0.2, + "grad_norm": 4.206610541934726, + "learning_rate": 9.256070078722287e-06, + "loss": 0.4533, + "step": 1330 + }, + { + "epoch": 0.2, + "grad_norm": 19.938371406381457, + "learning_rate": 9.254787681134939e-06, + "loss": 0.5684, + "step": 1331 + }, + { + "epoch": 0.2, + "grad_norm": 9.969815014557337, + "learning_rate": 9.253504268186279e-06, + "loss": 0.4971, + "step": 1332 + }, + { + "epoch": 0.2, + "grad_norm": 5.817235283433717, + "learning_rate": 9.252219840182583e-06, + "loss": 0.5319, + "step": 1333 + }, + { + "epoch": 0.2, + "grad_norm": 5.009054167956684, + "learning_rate": 9.250934397430367e-06, + "loss": 0.5103, + "step": 1334 + }, + { + "epoch": 0.2, + "grad_norm": 4.111311389869556, + "learning_rate": 9.249647940236386e-06, + "loss": 0.4554, + "step": 1335 + }, + { + "epoch": 0.2, + "grad_norm": 4.560099183024048, + "learning_rate": 9.248360468907643e-06, + "loss": 0.4995, + "step": 1336 + }, + { + "epoch": 0.2, + "grad_norm": 3.3007882277831535, + "learning_rate": 9.247071983751377e-06, + "loss": 0.512, + "step": 1337 + }, + { + "epoch": 0.2, + "grad_norm": 3.3980511252078824, + "learning_rate": 9.245782485075072e-06, + "loss": 0.5961, + "step": 1338 + }, + { + "epoch": 0.2, + "grad_norm": 5.210564132326873, + "learning_rate": 9.244491973186455e-06, + "loss": 0.4883, + "step": 1339 + }, + { + "epoch": 0.2, + "grad_norm": 3.877034351394793, + "learning_rate": 9.243200448393492e-06, + "loss": 0.5011, + "step": 1340 + }, + { + "epoch": 0.2, + "grad_norm": 4.618686010126334, + "learning_rate": 9.241907911004394e-06, + "loss": 0.4622, + "step": 1341 + }, + { + "epoch": 0.2, + "grad_norm": 4.134539077221495, + "learning_rate": 9.24061436132761e-06, + "loss": 0.5205, + "step": 1342 + }, + { + "epoch": 0.2, + "grad_norm": 5.978256327467286, + "learning_rate": 9.239319799671834e-06, + "loss": 0.5275, + "step": 1343 + }, + { + "epoch": 0.2, + "grad_norm": 5.88898782930768, + "learning_rate": 9.238024226345998e-06, + "loss": 0.4374, + "step": 1344 + }, + { + "epoch": 0.2, + "grad_norm": 5.630642165713237, + "learning_rate": 9.236727641659277e-06, + "loss": 0.433, + "step": 1345 + }, + { + "epoch": 0.2, + "grad_norm": 7.166345229500069, + "learning_rate": 9.23543004592109e-06, + "loss": 0.4965, + "step": 1346 + }, + { + "epoch": 0.2, + "grad_norm": 4.413745135294686, + "learning_rate": 9.234131439441095e-06, + "loss": 0.4838, + "step": 1347 + }, + { + "epoch": 0.2, + "grad_norm": 4.133048250121696, + "learning_rate": 9.232831822529188e-06, + "loss": 0.5104, + "step": 1348 + }, + { + "epoch": 0.2, + "grad_norm": 6.409959052970513, + "learning_rate": 9.231531195495509e-06, + "loss": 0.5096, + "step": 1349 + }, + { + "epoch": 0.2, + "grad_norm": 5.184602726058528, + "learning_rate": 9.230229558650443e-06, + "loss": 0.5189, + "step": 1350 + }, + { + "epoch": 0.2, + "grad_norm": 6.878181493119966, + "learning_rate": 9.22892691230461e-06, + "loss": 0.4721, + "step": 1351 + }, + { + "epoch": 0.2, + "grad_norm": 1.4103836403338035, + "learning_rate": 9.227623256768872e-06, + "loss": 0.5871, + "step": 1352 + }, + { + "epoch": 0.2, + "grad_norm": 4.842425968646203, + "learning_rate": 9.226318592354336e-06, + "loss": 0.4418, + "step": 1353 + }, + { + "epoch": 0.2, + "grad_norm": 4.684193545252756, + "learning_rate": 9.225012919372344e-06, + "loss": 0.4831, + "step": 1354 + }, + { + "epoch": 0.2, + "grad_norm": 11.97009100245864, + "learning_rate": 9.223706238134485e-06, + "loss": 0.5447, + "step": 1355 + }, + { + "epoch": 0.2, + "grad_norm": 6.009236065615713, + "learning_rate": 9.22239854895258e-06, + "loss": 0.5551, + "step": 1356 + }, + { + "epoch": 0.2, + "grad_norm": 12.52474315220209, + "learning_rate": 9.2210898521387e-06, + "loss": 0.5544, + "step": 1357 + }, + { + "epoch": 0.2, + "grad_norm": 6.703976585154395, + "learning_rate": 9.21978014800515e-06, + "loss": 0.4802, + "step": 1358 + }, + { + "epoch": 0.2, + "grad_norm": 6.407600571493279, + "learning_rate": 9.218469436864476e-06, + "loss": 0.4556, + "step": 1359 + }, + { + "epoch": 0.21, + "grad_norm": 6.450928062456577, + "learning_rate": 9.21715771902947e-06, + "loss": 0.4431, + "step": 1360 + }, + { + "epoch": 0.21, + "grad_norm": 12.78456757442652, + "learning_rate": 9.215844994813155e-06, + "loss": 0.4795, + "step": 1361 + }, + { + "epoch": 0.21, + "grad_norm": 9.175992059230888, + "learning_rate": 9.2145312645288e-06, + "loss": 0.5194, + "step": 1362 + }, + { + "epoch": 0.21, + "grad_norm": 1.4169141164909331, + "learning_rate": 9.21321652848992e-06, + "loss": 0.5349, + "step": 1363 + }, + { + "epoch": 0.21, + "grad_norm": 4.635598628320136, + "learning_rate": 9.211900787010254e-06, + "loss": 0.5161, + "step": 1364 + }, + { + "epoch": 0.21, + "grad_norm": 6.548005769959011, + "learning_rate": 9.210584040403794e-06, + "loss": 0.4893, + "step": 1365 + }, + { + "epoch": 0.21, + "grad_norm": 8.08315589592544, + "learning_rate": 9.209266288984769e-06, + "loss": 0.4958, + "step": 1366 + }, + { + "epoch": 0.21, + "grad_norm": 10.442191153430398, + "learning_rate": 9.207947533067647e-06, + "loss": 0.5006, + "step": 1367 + }, + { + "epoch": 0.21, + "grad_norm": 8.585391293094771, + "learning_rate": 9.20662777296713e-06, + "loss": 0.5233, + "step": 1368 + }, + { + "epoch": 0.21, + "grad_norm": 5.204065675044848, + "learning_rate": 9.205307008998176e-06, + "loss": 0.4624, + "step": 1369 + }, + { + "epoch": 0.21, + "grad_norm": 1.2806049031540891, + "learning_rate": 9.203985241475962e-06, + "loss": 0.595, + "step": 1370 + }, + { + "epoch": 0.21, + "grad_norm": 5.352622672113265, + "learning_rate": 9.202662470715918e-06, + "loss": 0.5225, + "step": 1371 + }, + { + "epoch": 0.21, + "grad_norm": 8.334240719838116, + "learning_rate": 9.201338697033709e-06, + "loss": 0.5094, + "step": 1372 + }, + { + "epoch": 0.21, + "grad_norm": 8.632978657724268, + "learning_rate": 9.200013920745241e-06, + "loss": 0.5751, + "step": 1373 + }, + { + "epoch": 0.21, + "grad_norm": 7.489048249694532, + "learning_rate": 9.198688142166656e-06, + "loss": 0.5159, + "step": 1374 + }, + { + "epoch": 0.21, + "grad_norm": 7.565864612282921, + "learning_rate": 9.19736136161434e-06, + "loss": 0.5048, + "step": 1375 + }, + { + "epoch": 0.21, + "grad_norm": 5.3780326025128735, + "learning_rate": 9.196033579404914e-06, + "loss": 0.4955, + "step": 1376 + }, + { + "epoch": 0.21, + "grad_norm": 11.028199694992104, + "learning_rate": 9.19470479585524e-06, + "loss": 0.5177, + "step": 1377 + }, + { + "epoch": 0.21, + "grad_norm": 42.70731937340343, + "learning_rate": 9.193375011282419e-06, + "loss": 0.4977, + "step": 1378 + }, + { + "epoch": 0.21, + "grad_norm": 8.283314069450698, + "learning_rate": 9.19204422600379e-06, + "loss": 0.472, + "step": 1379 + }, + { + "epoch": 0.21, + "grad_norm": 6.118304235174843, + "learning_rate": 9.190712440336928e-06, + "loss": 0.5267, + "step": 1380 + }, + { + "epoch": 0.21, + "grad_norm": 11.789021112676851, + "learning_rate": 9.189379654599657e-06, + "loss": 0.4493, + "step": 1381 + }, + { + "epoch": 0.21, + "grad_norm": 46.48333479375923, + "learning_rate": 9.188045869110026e-06, + "loss": 0.5548, + "step": 1382 + }, + { + "epoch": 0.21, + "grad_norm": 6.390426175962068, + "learning_rate": 9.186711084186333e-06, + "loss": 0.5132, + "step": 1383 + }, + { + "epoch": 0.21, + "grad_norm": 32.556600225291454, + "learning_rate": 9.185375300147111e-06, + "loss": 0.5115, + "step": 1384 + }, + { + "epoch": 0.21, + "grad_norm": 135.4053692137494, + "learning_rate": 9.184038517311127e-06, + "loss": 0.4669, + "step": 1385 + }, + { + "epoch": 0.21, + "grad_norm": 11.909026970033961, + "learning_rate": 9.182700735997395e-06, + "loss": 0.5414, + "step": 1386 + }, + { + "epoch": 0.21, + "grad_norm": 11.70572219305469, + "learning_rate": 9.181361956525159e-06, + "loss": 0.483, + "step": 1387 + }, + { + "epoch": 0.21, + "grad_norm": 8.65336434885517, + "learning_rate": 9.180022179213908e-06, + "loss": 0.5283, + "step": 1388 + }, + { + "epoch": 0.21, + "grad_norm": 13.297981626923628, + "learning_rate": 9.178681404383365e-06, + "loss": 0.5025, + "step": 1389 + }, + { + "epoch": 0.21, + "grad_norm": 7.384550788634928, + "learning_rate": 9.177339632353493e-06, + "loss": 0.4998, + "step": 1390 + }, + { + "epoch": 0.21, + "grad_norm": 9.64499420238818, + "learning_rate": 9.175996863444489e-06, + "loss": 0.5028, + "step": 1391 + }, + { + "epoch": 0.21, + "grad_norm": 8.943869543564151, + "learning_rate": 9.174653097976794e-06, + "loss": 0.5442, + "step": 1392 + }, + { + "epoch": 0.21, + "grad_norm": 8.121333353354556, + "learning_rate": 9.173308336271083e-06, + "loss": 0.4628, + "step": 1393 + }, + { + "epoch": 0.21, + "grad_norm": 23.292822590467964, + "learning_rate": 9.171962578648268e-06, + "loss": 0.5015, + "step": 1394 + }, + { + "epoch": 0.21, + "grad_norm": 5.9757121130550495, + "learning_rate": 9.170615825429503e-06, + "loss": 0.4969, + "step": 1395 + }, + { + "epoch": 0.21, + "grad_norm": 17.249292714427384, + "learning_rate": 9.169268076936175e-06, + "loss": 0.4712, + "step": 1396 + }, + { + "epoch": 0.21, + "grad_norm": 1.2734440972042236, + "learning_rate": 9.167919333489912e-06, + "loss": 0.6234, + "step": 1397 + }, + { + "epoch": 0.21, + "grad_norm": 5.782569494948452, + "learning_rate": 9.166569595412576e-06, + "loss": 0.404, + "step": 1398 + }, + { + "epoch": 0.21, + "grad_norm": 1.4479841334576389, + "learning_rate": 9.165218863026268e-06, + "loss": 0.5553, + "step": 1399 + }, + { + "epoch": 0.21, + "grad_norm": 9.92983083804337, + "learning_rate": 9.163867136653327e-06, + "loss": 0.5269, + "step": 1400 + }, + { + "epoch": 0.21, + "grad_norm": 8.155494255078668, + "learning_rate": 9.162514416616329e-06, + "loss": 0.4839, + "step": 1401 + }, + { + "epoch": 0.21, + "grad_norm": 12.35721953235503, + "learning_rate": 9.161160703238085e-06, + "loss": 0.4795, + "step": 1402 + }, + { + "epoch": 0.21, + "grad_norm": 12.941667281890753, + "learning_rate": 9.15980599684165e-06, + "loss": 0.4851, + "step": 1403 + }, + { + "epoch": 0.21, + "grad_norm": 14.503102980911349, + "learning_rate": 9.158450297750303e-06, + "loss": 0.4826, + "step": 1404 + }, + { + "epoch": 0.21, + "grad_norm": 9.211804589954603, + "learning_rate": 9.157093606287573e-06, + "loss": 0.4227, + "step": 1405 + }, + { + "epoch": 0.21, + "grad_norm": 27.628510822181873, + "learning_rate": 9.155735922777219e-06, + "loss": 0.4102, + "step": 1406 + }, + { + "epoch": 0.21, + "grad_norm": 17.349167792670606, + "learning_rate": 9.154377247543236e-06, + "loss": 0.5061, + "step": 1407 + }, + { + "epoch": 0.21, + "grad_norm": 8.707863982326407, + "learning_rate": 9.153017580909863e-06, + "loss": 0.4977, + "step": 1408 + }, + { + "epoch": 0.21, + "grad_norm": 7.346047472364499, + "learning_rate": 9.151656923201566e-06, + "loss": 0.527, + "step": 1409 + }, + { + "epoch": 0.21, + "grad_norm": 6.923685917646882, + "learning_rate": 9.150295274743054e-06, + "loss": 0.5107, + "step": 1410 + }, + { + "epoch": 0.21, + "grad_norm": 9.143608794943074, + "learning_rate": 9.148932635859269e-06, + "loss": 0.5016, + "step": 1411 + }, + { + "epoch": 0.21, + "grad_norm": 10.373911371978839, + "learning_rate": 9.147569006875392e-06, + "loss": 0.4922, + "step": 1412 + }, + { + "epoch": 0.21, + "grad_norm": 19.063058102702172, + "learning_rate": 9.146204388116839e-06, + "loss": 0.493, + "step": 1413 + }, + { + "epoch": 0.21, + "grad_norm": 14.428239187320592, + "learning_rate": 9.14483877990926e-06, + "loss": 0.4821, + "step": 1414 + }, + { + "epoch": 0.21, + "grad_norm": 9.338104783299679, + "learning_rate": 9.143472182578548e-06, + "loss": 0.4802, + "step": 1415 + }, + { + "epoch": 0.21, + "grad_norm": 21.121469105400216, + "learning_rate": 9.142104596450823e-06, + "loss": 0.4995, + "step": 1416 + }, + { + "epoch": 0.21, + "grad_norm": 6.063409727002606, + "learning_rate": 9.140736021852446e-06, + "loss": 0.5163, + "step": 1417 + }, + { + "epoch": 0.21, + "grad_norm": 17.601861145528414, + "learning_rate": 9.139366459110016e-06, + "loss": 0.5531, + "step": 1418 + }, + { + "epoch": 0.21, + "grad_norm": 7.249191826667966, + "learning_rate": 9.137995908550363e-06, + "loss": 0.4765, + "step": 1419 + }, + { + "epoch": 0.21, + "grad_norm": 8.801433064129984, + "learning_rate": 9.136624370500555e-06, + "loss": 0.4782, + "step": 1420 + }, + { + "epoch": 0.21, + "grad_norm": 8.178305509487918, + "learning_rate": 9.135251845287895e-06, + "loss": 0.534, + "step": 1421 + }, + { + "epoch": 0.21, + "grad_norm": 5.799888631283141, + "learning_rate": 9.133878333239923e-06, + "loss": 0.5095, + "step": 1422 + }, + { + "epoch": 0.21, + "grad_norm": 7.7539426481581994, + "learning_rate": 9.132503834684412e-06, + "loss": 0.5573, + "step": 1423 + }, + { + "epoch": 0.21, + "grad_norm": 14.248648711886878, + "learning_rate": 9.131128349949375e-06, + "loss": 0.5536, + "step": 1424 + }, + { + "epoch": 0.21, + "grad_norm": 6.205449838269684, + "learning_rate": 9.129751879363053e-06, + "loss": 0.4763, + "step": 1425 + }, + { + "epoch": 0.22, + "grad_norm": 5.1855540235029896, + "learning_rate": 9.12837442325393e-06, + "loss": 0.4613, + "step": 1426 + }, + { + "epoch": 0.22, + "grad_norm": 6.312733988116978, + "learning_rate": 9.126995981950719e-06, + "loss": 0.4672, + "step": 1427 + }, + { + "epoch": 0.22, + "grad_norm": 7.969378071207799, + "learning_rate": 9.125616555782374e-06, + "loss": 0.4944, + "step": 1428 + }, + { + "epoch": 0.22, + "grad_norm": 13.560939353860247, + "learning_rate": 9.124236145078079e-06, + "loss": 0.4893, + "step": 1429 + }, + { + "epoch": 0.22, + "grad_norm": 6.727310868847268, + "learning_rate": 9.122854750167255e-06, + "loss": 0.5298, + "step": 1430 + }, + { + "epoch": 0.22, + "grad_norm": 16.87160167062892, + "learning_rate": 9.121472371379559e-06, + "loss": 0.4133, + "step": 1431 + }, + { + "epoch": 0.22, + "grad_norm": 23.57780058146975, + "learning_rate": 9.120089009044876e-06, + "loss": 0.4868, + "step": 1432 + }, + { + "epoch": 0.22, + "grad_norm": 5.680148847443938, + "learning_rate": 9.11870466349334e-06, + "loss": 0.5909, + "step": 1433 + }, + { + "epoch": 0.22, + "grad_norm": 5.731360824656868, + "learning_rate": 9.117319335055304e-06, + "loss": 0.4931, + "step": 1434 + }, + { + "epoch": 0.22, + "grad_norm": 8.812148184396323, + "learning_rate": 9.115933024061366e-06, + "loss": 0.4759, + "step": 1435 + }, + { + "epoch": 0.22, + "grad_norm": 6.985552656479306, + "learning_rate": 9.114545730842352e-06, + "loss": 0.5127, + "step": 1436 + }, + { + "epoch": 0.22, + "grad_norm": 13.059552253300287, + "learning_rate": 9.113157455729326e-06, + "loss": 0.5119, + "step": 1437 + }, + { + "epoch": 0.22, + "grad_norm": 7.334915407973871, + "learning_rate": 9.111768199053588e-06, + "loss": 0.4913, + "step": 1438 + }, + { + "epoch": 0.22, + "grad_norm": 14.119529773665557, + "learning_rate": 9.110377961146668e-06, + "loss": 0.566, + "step": 1439 + }, + { + "epoch": 0.22, + "grad_norm": 4.846150868201904, + "learning_rate": 9.108986742340331e-06, + "loss": 0.5034, + "step": 1440 + }, + { + "epoch": 0.22, + "grad_norm": 9.08892686335765, + "learning_rate": 9.10759454296658e-06, + "loss": 0.5015, + "step": 1441 + }, + { + "epoch": 0.22, + "grad_norm": 23.45613332108298, + "learning_rate": 9.106201363357645e-06, + "loss": 0.489, + "step": 1442 + }, + { + "epoch": 0.22, + "grad_norm": 19.827004993588893, + "learning_rate": 9.104807203845997e-06, + "loss": 0.5275, + "step": 1443 + }, + { + "epoch": 0.22, + "grad_norm": 12.511753529328168, + "learning_rate": 9.103412064764338e-06, + "loss": 0.503, + "step": 1444 + }, + { + "epoch": 0.22, + "grad_norm": 12.107480886845112, + "learning_rate": 9.102015946445602e-06, + "loss": 0.5262, + "step": 1445 + }, + { + "epoch": 0.22, + "grad_norm": 8.878004419485872, + "learning_rate": 9.10061884922296e-06, + "loss": 0.5212, + "step": 1446 + }, + { + "epoch": 0.22, + "grad_norm": 28.116267478321486, + "learning_rate": 9.099220773429812e-06, + "loss": 0.5442, + "step": 1447 + }, + { + "epoch": 0.22, + "grad_norm": 11.443314897524829, + "learning_rate": 9.097821719399795e-06, + "loss": 0.4505, + "step": 1448 + }, + { + "epoch": 0.22, + "grad_norm": 16.647970507174, + "learning_rate": 9.096421687466782e-06, + "loss": 0.4994, + "step": 1449 + }, + { + "epoch": 0.22, + "grad_norm": 8.233590711197776, + "learning_rate": 9.095020677964874e-06, + "loss": 0.55, + "step": 1450 + }, + { + "epoch": 0.22, + "grad_norm": 8.899181100425423, + "learning_rate": 9.093618691228407e-06, + "loss": 0.4766, + "step": 1451 + }, + { + "epoch": 0.22, + "grad_norm": 40.80967423407616, + "learning_rate": 9.092215727591952e-06, + "loss": 0.4784, + "step": 1452 + }, + { + "epoch": 0.22, + "grad_norm": 6.42039925823618, + "learning_rate": 9.090811787390313e-06, + "loss": 0.4862, + "step": 1453 + }, + { + "epoch": 0.22, + "grad_norm": 5.785200126996847, + "learning_rate": 9.089406870958522e-06, + "loss": 0.4558, + "step": 1454 + }, + { + "epoch": 0.22, + "grad_norm": 6.511216865150638, + "learning_rate": 9.08800097863185e-06, + "loss": 0.5197, + "step": 1455 + }, + { + "epoch": 0.22, + "grad_norm": 8.224894069969645, + "learning_rate": 9.0865941107458e-06, + "loss": 0.529, + "step": 1456 + }, + { + "epoch": 0.22, + "grad_norm": 12.899155665057501, + "learning_rate": 9.085186267636103e-06, + "loss": 0.5472, + "step": 1457 + }, + { + "epoch": 0.22, + "grad_norm": 10.877000259025259, + "learning_rate": 9.08377744963873e-06, + "loss": 0.6346, + "step": 1458 + }, + { + "epoch": 0.22, + "grad_norm": 8.915888978915937, + "learning_rate": 9.08236765708988e-06, + "loss": 0.4406, + "step": 1459 + }, + { + "epoch": 0.22, + "grad_norm": 25.72000585285448, + "learning_rate": 9.080956890325985e-06, + "loss": 0.493, + "step": 1460 + }, + { + "epoch": 0.22, + "grad_norm": 10.194341404954631, + "learning_rate": 9.079545149683708e-06, + "loss": 0.5405, + "step": 1461 + }, + { + "epoch": 0.22, + "grad_norm": 5.408975067927627, + "learning_rate": 9.078132435499951e-06, + "loss": 0.5278, + "step": 1462 + }, + { + "epoch": 0.22, + "grad_norm": 8.360398106935369, + "learning_rate": 9.07671874811184e-06, + "loss": 0.4664, + "step": 1463 + }, + { + "epoch": 0.22, + "grad_norm": 4.896914999939522, + "learning_rate": 9.075304087856738e-06, + "loss": 0.4741, + "step": 1464 + }, + { + "epoch": 0.22, + "grad_norm": 8.094189745324796, + "learning_rate": 9.073888455072239e-06, + "loss": 0.4509, + "step": 1465 + }, + { + "epoch": 0.22, + "grad_norm": 7.516996394659442, + "learning_rate": 9.07247185009617e-06, + "loss": 0.5, + "step": 1466 + }, + { + "epoch": 0.22, + "grad_norm": 8.261252757580161, + "learning_rate": 9.071054273266586e-06, + "loss": 0.5135, + "step": 1467 + }, + { + "epoch": 0.22, + "grad_norm": 9.49978403359483, + "learning_rate": 9.069635724921784e-06, + "loss": 0.5813, + "step": 1468 + }, + { + "epoch": 0.22, + "grad_norm": 9.39739633279772, + "learning_rate": 9.068216205400278e-06, + "loss": 0.5693, + "step": 1469 + }, + { + "epoch": 0.22, + "grad_norm": 7.957432278613746, + "learning_rate": 9.066795715040825e-06, + "loss": 0.513, + "step": 1470 + }, + { + "epoch": 0.22, + "grad_norm": 11.254103134007421, + "learning_rate": 9.065374254182414e-06, + "loss": 0.5117, + "step": 1471 + }, + { + "epoch": 0.22, + "grad_norm": 13.077983048144374, + "learning_rate": 9.063951823164256e-06, + "loss": 0.4606, + "step": 1472 + }, + { + "epoch": 0.22, + "grad_norm": 6.576507268153216, + "learning_rate": 9.062528422325804e-06, + "loss": 0.4501, + "step": 1473 + }, + { + "epoch": 0.22, + "grad_norm": 6.424099464896427, + "learning_rate": 9.061104052006737e-06, + "loss": 0.5181, + "step": 1474 + }, + { + "epoch": 0.22, + "grad_norm": 17.378660237187464, + "learning_rate": 9.059678712546964e-06, + "loss": 0.5073, + "step": 1475 + }, + { + "epoch": 0.22, + "grad_norm": 7.308392613715438, + "learning_rate": 9.05825240428663e-06, + "loss": 0.5583, + "step": 1476 + }, + { + "epoch": 0.22, + "grad_norm": 5.918691571190655, + "learning_rate": 9.056825127566107e-06, + "loss": 0.4525, + "step": 1477 + }, + { + "epoch": 0.22, + "grad_norm": 4.115829345213592, + "learning_rate": 9.055396882726001e-06, + "loss": 0.4572, + "step": 1478 + }, + { + "epoch": 0.22, + "grad_norm": 6.085009054783822, + "learning_rate": 9.053967670107148e-06, + "loss": 0.5051, + "step": 1479 + }, + { + "epoch": 0.22, + "grad_norm": 6.90600339656031, + "learning_rate": 9.052537490050615e-06, + "loss": 0.4836, + "step": 1480 + }, + { + "epoch": 0.22, + "grad_norm": 4.584471229361751, + "learning_rate": 9.051106342897699e-06, + "loss": 0.5384, + "step": 1481 + }, + { + "epoch": 0.22, + "grad_norm": 5.53480885264879, + "learning_rate": 9.049674228989929e-06, + "loss": 0.4941, + "step": 1482 + }, + { + "epoch": 0.22, + "grad_norm": 3.847990377012414, + "learning_rate": 9.048241148669066e-06, + "loss": 0.4394, + "step": 1483 + }, + { + "epoch": 0.22, + "grad_norm": 6.875381513647627, + "learning_rate": 9.046807102277096e-06, + "loss": 0.4773, + "step": 1484 + }, + { + "epoch": 0.22, + "grad_norm": 7.2767679503740545, + "learning_rate": 9.045372090156244e-06, + "loss": 0.4396, + "step": 1485 + }, + { + "epoch": 0.22, + "grad_norm": 6.458650884658711, + "learning_rate": 9.043936112648956e-06, + "loss": 0.498, + "step": 1486 + }, + { + "epoch": 0.22, + "grad_norm": 11.265900380200062, + "learning_rate": 9.042499170097917e-06, + "loss": 0.4541, + "step": 1487 + }, + { + "epoch": 0.22, + "grad_norm": 6.182825735743462, + "learning_rate": 9.041061262846037e-06, + "loss": 0.4927, + "step": 1488 + }, + { + "epoch": 0.22, + "grad_norm": 5.4337311328873446, + "learning_rate": 9.039622391236457e-06, + "loss": 0.4895, + "step": 1489 + }, + { + "epoch": 0.22, + "grad_norm": 4.583484486706033, + "learning_rate": 9.038182555612551e-06, + "loss": 0.4796, + "step": 1490 + }, + { + "epoch": 0.22, + "grad_norm": 4.696687945240696, + "learning_rate": 9.03674175631792e-06, + "loss": 0.4277, + "step": 1491 + }, + { + "epoch": 0.23, + "grad_norm": 7.00230277684635, + "learning_rate": 9.035299993696396e-06, + "loss": 0.4199, + "step": 1492 + }, + { + "epoch": 0.23, + "grad_norm": 9.193402853176698, + "learning_rate": 9.03385726809204e-06, + "loss": 0.5018, + "step": 1493 + }, + { + "epoch": 0.23, + "grad_norm": 9.081424735682337, + "learning_rate": 9.032413579849146e-06, + "loss": 0.544, + "step": 1494 + }, + { + "epoch": 0.23, + "grad_norm": 6.503721000668175, + "learning_rate": 9.030968929312231e-06, + "loss": 0.5325, + "step": 1495 + }, + { + "epoch": 0.23, + "grad_norm": 5.593979988459677, + "learning_rate": 9.02952331682605e-06, + "loss": 0.5785, + "step": 1496 + }, + { + "epoch": 0.23, + "grad_norm": 7.474528584345009, + "learning_rate": 9.028076742735583e-06, + "loss": 0.4555, + "step": 1497 + }, + { + "epoch": 0.23, + "grad_norm": 4.817703541843584, + "learning_rate": 9.026629207386038e-06, + "loss": 0.5077, + "step": 1498 + }, + { + "epoch": 0.23, + "grad_norm": 5.616431368330579, + "learning_rate": 9.025180711122857e-06, + "loss": 0.4743, + "step": 1499 + }, + { + "epoch": 0.23, + "grad_norm": 5.151542388569003, + "learning_rate": 9.023731254291706e-06, + "loss": 0.446, + "step": 1500 + }, + { + "epoch": 0.23, + "grad_norm": 4.627918830248438, + "learning_rate": 9.022280837238484e-06, + "loss": 0.4623, + "step": 1501 + }, + { + "epoch": 0.23, + "grad_norm": 4.129737175707645, + "learning_rate": 9.02082946030932e-06, + "loss": 0.4728, + "step": 1502 + }, + { + "epoch": 0.23, + "grad_norm": 1.372478521838489, + "learning_rate": 9.019377123850566e-06, + "loss": 0.5239, + "step": 1503 + }, + { + "epoch": 0.23, + "grad_norm": 3.2072584269996787, + "learning_rate": 9.017923828208812e-06, + "loss": 0.491, + "step": 1504 + }, + { + "epoch": 0.23, + "grad_norm": 4.561788126108043, + "learning_rate": 9.01646957373087e-06, + "loss": 0.445, + "step": 1505 + }, + { + "epoch": 0.23, + "grad_norm": 1.40397455301525, + "learning_rate": 9.015014360763781e-06, + "loss": 0.5685, + "step": 1506 + }, + { + "epoch": 0.23, + "grad_norm": 3.402329650156023, + "learning_rate": 9.013558189654819e-06, + "loss": 0.6057, + "step": 1507 + }, + { + "epoch": 0.23, + "grad_norm": 5.201592731475415, + "learning_rate": 9.012101060751484e-06, + "loss": 0.406, + "step": 1508 + }, + { + "epoch": 0.23, + "grad_norm": 3.6707778332294816, + "learning_rate": 9.010642974401505e-06, + "loss": 0.4499, + "step": 1509 + }, + { + "epoch": 0.23, + "grad_norm": 4.5218108138367095, + "learning_rate": 9.009183930952837e-06, + "loss": 0.5088, + "step": 1510 + }, + { + "epoch": 0.23, + "grad_norm": 5.041990522030021, + "learning_rate": 9.007723930753667e-06, + "loss": 0.5896, + "step": 1511 + }, + { + "epoch": 0.23, + "grad_norm": 6.519231233376105, + "learning_rate": 9.006262974152411e-06, + "loss": 0.5319, + "step": 1512 + }, + { + "epoch": 0.23, + "grad_norm": 13.890619553144052, + "learning_rate": 9.00480106149771e-06, + "loss": 0.499, + "step": 1513 + }, + { + "epoch": 0.23, + "grad_norm": 12.764277952653995, + "learning_rate": 9.003338193138433e-06, + "loss": 0.5098, + "step": 1514 + }, + { + "epoch": 0.23, + "grad_norm": 6.337403526654539, + "learning_rate": 9.00187436942368e-06, + "loss": 0.5164, + "step": 1515 + }, + { + "epoch": 0.23, + "grad_norm": 4.478355267798711, + "learning_rate": 9.000409590702777e-06, + "loss": 0.4604, + "step": 1516 + }, + { + "epoch": 0.23, + "grad_norm": 6.984036265618275, + "learning_rate": 8.998943857325277e-06, + "loss": 0.4696, + "step": 1517 + }, + { + "epoch": 0.23, + "grad_norm": 3.925662389882426, + "learning_rate": 8.997477169640965e-06, + "loss": 0.5376, + "step": 1518 + }, + { + "epoch": 0.23, + "grad_norm": 5.4826423798942665, + "learning_rate": 8.996009527999849e-06, + "loss": 0.4367, + "step": 1519 + }, + { + "epoch": 0.23, + "grad_norm": 4.728759318804837, + "learning_rate": 8.994540932752168e-06, + "loss": 0.6066, + "step": 1520 + }, + { + "epoch": 0.23, + "grad_norm": 1.687180636863638, + "learning_rate": 8.993071384248386e-06, + "loss": 0.713, + "step": 1521 + }, + { + "epoch": 0.23, + "grad_norm": 7.036612061401911, + "learning_rate": 8.991600882839195e-06, + "loss": 0.5503, + "step": 1522 + }, + { + "epoch": 0.23, + "grad_norm": 4.4033612859676925, + "learning_rate": 8.990129428875517e-06, + "loss": 0.5558, + "step": 1523 + }, + { + "epoch": 0.23, + "grad_norm": 3.339142560411955, + "learning_rate": 8.988657022708497e-06, + "loss": 0.4918, + "step": 1524 + }, + { + "epoch": 0.23, + "grad_norm": 21.437009577271816, + "learning_rate": 8.987183664689511e-06, + "loss": 0.5511, + "step": 1525 + }, + { + "epoch": 0.23, + "grad_norm": 3.1805261051584033, + "learning_rate": 8.985709355170162e-06, + "loss": 0.4578, + "step": 1526 + }, + { + "epoch": 0.23, + "grad_norm": 6.299927412331658, + "learning_rate": 8.984234094502278e-06, + "loss": 0.5639, + "step": 1527 + }, + { + "epoch": 0.23, + "grad_norm": 5.021996371802004, + "learning_rate": 8.982757883037913e-06, + "loss": 0.5398, + "step": 1528 + }, + { + "epoch": 0.23, + "grad_norm": 4.300686914145199, + "learning_rate": 8.98128072112935e-06, + "loss": 0.5288, + "step": 1529 + }, + { + "epoch": 0.23, + "grad_norm": 7.183426579181348, + "learning_rate": 8.979802609129101e-06, + "loss": 0.5156, + "step": 1530 + }, + { + "epoch": 0.23, + "grad_norm": 4.721015940813186, + "learning_rate": 8.978323547389899e-06, + "loss": 0.5001, + "step": 1531 + }, + { + "epoch": 0.23, + "grad_norm": 5.025467704964178, + "learning_rate": 8.97684353626471e-06, + "loss": 0.4918, + "step": 1532 + }, + { + "epoch": 0.23, + "grad_norm": 3.179572041345102, + "learning_rate": 8.97536257610672e-06, + "loss": 0.4668, + "step": 1533 + }, + { + "epoch": 0.23, + "grad_norm": 3.718165624468269, + "learning_rate": 8.973880667269348e-06, + "loss": 0.4776, + "step": 1534 + }, + { + "epoch": 0.23, + "grad_norm": 3.705962047252945, + "learning_rate": 8.972397810106235e-06, + "loss": 0.5, + "step": 1535 + }, + { + "epoch": 0.23, + "grad_norm": 2.8257552670085317, + "learning_rate": 8.970914004971249e-06, + "loss": 0.4914, + "step": 1536 + }, + { + "epoch": 0.23, + "grad_norm": 5.61466705105209, + "learning_rate": 8.969429252218487e-06, + "loss": 0.5056, + "step": 1537 + }, + { + "epoch": 0.23, + "grad_norm": 1.477905846502025, + "learning_rate": 8.967943552202268e-06, + "loss": 0.6105, + "step": 1538 + }, + { + "epoch": 0.23, + "grad_norm": 28.237532498731063, + "learning_rate": 8.966456905277138e-06, + "loss": 0.5358, + "step": 1539 + }, + { + "epoch": 0.23, + "grad_norm": 2.7084151387854067, + "learning_rate": 8.964969311797873e-06, + "loss": 0.4807, + "step": 1540 + }, + { + "epoch": 0.23, + "grad_norm": 7.90764865438277, + "learning_rate": 8.96348077211947e-06, + "loss": 0.5164, + "step": 1541 + }, + { + "epoch": 0.23, + "grad_norm": 3.09509778196222, + "learning_rate": 8.961991286597154e-06, + "loss": 0.4562, + "step": 1542 + }, + { + "epoch": 0.23, + "grad_norm": 4.936573492749508, + "learning_rate": 8.960500855586374e-06, + "loss": 0.4729, + "step": 1543 + }, + { + "epoch": 0.23, + "grad_norm": 3.977975310854192, + "learning_rate": 8.959009479442811e-06, + "loss": 0.5089, + "step": 1544 + }, + { + "epoch": 0.23, + "grad_norm": 7.813690494260851, + "learning_rate": 8.95751715852236e-06, + "loss": 0.4874, + "step": 1545 + }, + { + "epoch": 0.23, + "grad_norm": 3.7836356658002654, + "learning_rate": 8.956023893181152e-06, + "loss": 0.486, + "step": 1546 + }, + { + "epoch": 0.23, + "grad_norm": 2.845656914896341, + "learning_rate": 8.954529683775539e-06, + "loss": 0.4486, + "step": 1547 + }, + { + "epoch": 0.23, + "grad_norm": 3.5431160236267876, + "learning_rate": 8.953034530662096e-06, + "loss": 0.4564, + "step": 1548 + }, + { + "epoch": 0.23, + "grad_norm": 5.295600230765116, + "learning_rate": 8.951538434197631e-06, + "loss": 0.5581, + "step": 1549 + }, + { + "epoch": 0.23, + "grad_norm": 5.8005098960015475, + "learning_rate": 8.950041394739169e-06, + "loss": 0.5365, + "step": 1550 + }, + { + "epoch": 0.23, + "grad_norm": 6.4124557518316845, + "learning_rate": 8.948543412643962e-06, + "loss": 0.5371, + "step": 1551 + }, + { + "epoch": 0.23, + "grad_norm": 3.7992116211150027, + "learning_rate": 8.947044488269489e-06, + "loss": 0.472, + "step": 1552 + }, + { + "epoch": 0.23, + "grad_norm": 3.4404926818713246, + "learning_rate": 8.945544621973454e-06, + "loss": 0.4439, + "step": 1553 + }, + { + "epoch": 0.23, + "grad_norm": 19.724220377121565, + "learning_rate": 8.944043814113785e-06, + "loss": 0.4997, + "step": 1554 + }, + { + "epoch": 0.23, + "grad_norm": 1.5006398971090862, + "learning_rate": 8.942542065048631e-06, + "loss": 0.5852, + "step": 1555 + }, + { + "epoch": 0.23, + "grad_norm": 3.9479806675799196, + "learning_rate": 8.94103937513637e-06, + "loss": 0.4397, + "step": 1556 + }, + { + "epoch": 0.23, + "grad_norm": 4.240396290372782, + "learning_rate": 8.939535744735608e-06, + "loss": 0.5704, + "step": 1557 + }, + { + "epoch": 0.23, + "grad_norm": 4.483672322966049, + "learning_rate": 8.938031174205166e-06, + "loss": 0.5407, + "step": 1558 + }, + { + "epoch": 0.24, + "grad_norm": 7.620883873344044, + "learning_rate": 8.936525663904097e-06, + "loss": 0.5232, + "step": 1559 + }, + { + "epoch": 0.24, + "grad_norm": 7.9346802210421945, + "learning_rate": 8.935019214191673e-06, + "loss": 0.5494, + "step": 1560 + }, + { + "epoch": 0.24, + "grad_norm": 4.200113489173202, + "learning_rate": 8.933511825427393e-06, + "loss": 0.4709, + "step": 1561 + }, + { + "epoch": 0.24, + "grad_norm": 3.8291109058451522, + "learning_rate": 8.932003497970983e-06, + "loss": 0.5548, + "step": 1562 + }, + { + "epoch": 0.24, + "grad_norm": 5.258140076890803, + "learning_rate": 8.930494232182385e-06, + "loss": 0.5561, + "step": 1563 + }, + { + "epoch": 0.24, + "grad_norm": 5.728808823924346, + "learning_rate": 8.928984028421777e-06, + "loss": 0.519, + "step": 1564 + }, + { + "epoch": 0.24, + "grad_norm": 5.744608648630482, + "learning_rate": 8.927472887049546e-06, + "loss": 0.5023, + "step": 1565 + }, + { + "epoch": 0.24, + "grad_norm": 16.546278846576048, + "learning_rate": 8.925960808426313e-06, + "loss": 0.4632, + "step": 1566 + }, + { + "epoch": 0.24, + "grad_norm": 7.536346941375556, + "learning_rate": 8.924447792912921e-06, + "loss": 0.4559, + "step": 1567 + }, + { + "epoch": 0.24, + "grad_norm": 6.440776909911878, + "learning_rate": 8.922933840870436e-06, + "loss": 0.4829, + "step": 1568 + }, + { + "epoch": 0.24, + "grad_norm": 7.081884084987932, + "learning_rate": 8.921418952660146e-06, + "loss": 0.5551, + "step": 1569 + }, + { + "epoch": 0.24, + "grad_norm": 1.6872539177515735, + "learning_rate": 8.919903128643561e-06, + "loss": 0.5538, + "step": 1570 + }, + { + "epoch": 0.24, + "grad_norm": 8.723160141797946, + "learning_rate": 8.918386369182423e-06, + "loss": 0.4926, + "step": 1571 + }, + { + "epoch": 0.24, + "grad_norm": 6.012526672899946, + "learning_rate": 8.916868674638686e-06, + "loss": 0.4464, + "step": 1572 + }, + { + "epoch": 0.24, + "grad_norm": 9.214614306187443, + "learning_rate": 8.915350045374533e-06, + "loss": 0.5044, + "step": 1573 + }, + { + "epoch": 0.24, + "grad_norm": 11.104318510025482, + "learning_rate": 8.91383048175237e-06, + "loss": 0.4585, + "step": 1574 + }, + { + "epoch": 0.24, + "grad_norm": 1.3764786205247317, + "learning_rate": 8.912309984134826e-06, + "loss": 0.6087, + "step": 1575 + }, + { + "epoch": 0.24, + "grad_norm": 10.49536406566925, + "learning_rate": 8.91078855288475e-06, + "loss": 0.4914, + "step": 1576 + }, + { + "epoch": 0.24, + "grad_norm": 5.943308936776226, + "learning_rate": 8.909266188365218e-06, + "loss": 0.4914, + "step": 1577 + }, + { + "epoch": 0.24, + "grad_norm": 5.9575791586285884, + "learning_rate": 8.907742890939526e-06, + "loss": 0.4861, + "step": 1578 + }, + { + "epoch": 0.24, + "grad_norm": 8.578798004886123, + "learning_rate": 8.906218660971192e-06, + "loss": 0.5015, + "step": 1579 + }, + { + "epoch": 0.24, + "grad_norm": 5.298115173163746, + "learning_rate": 8.90469349882396e-06, + "loss": 0.4087, + "step": 1580 + }, + { + "epoch": 0.24, + "grad_norm": 6.414973434589644, + "learning_rate": 8.903167404861792e-06, + "loss": 0.4824, + "step": 1581 + }, + { + "epoch": 0.24, + "grad_norm": 5.485907870921125, + "learning_rate": 8.901640379448875e-06, + "loss": 0.5567, + "step": 1582 + }, + { + "epoch": 0.24, + "grad_norm": 6.755053685400907, + "learning_rate": 8.90011242294962e-06, + "loss": 0.465, + "step": 1583 + }, + { + "epoch": 0.24, + "grad_norm": 8.07057115155756, + "learning_rate": 8.898583535728656e-06, + "loss": 0.4254, + "step": 1584 + }, + { + "epoch": 0.24, + "grad_norm": 17.26107606581141, + "learning_rate": 8.89705371815084e-06, + "loss": 0.5965, + "step": 1585 + }, + { + "epoch": 0.24, + "grad_norm": 8.81838509404585, + "learning_rate": 8.89552297058124e-06, + "loss": 0.5313, + "step": 1586 + }, + { + "epoch": 0.24, + "grad_norm": 5.794821587600955, + "learning_rate": 8.893991293385159e-06, + "loss": 0.4032, + "step": 1587 + }, + { + "epoch": 0.24, + "grad_norm": 7.444793442133021, + "learning_rate": 8.892458686928115e-06, + "loss": 0.4633, + "step": 1588 + }, + { + "epoch": 0.24, + "grad_norm": 6.114230069577213, + "learning_rate": 8.890925151575846e-06, + "loss": 0.5664, + "step": 1589 + }, + { + "epoch": 0.24, + "grad_norm": 8.173006355354904, + "learning_rate": 8.889390687694318e-06, + "loss": 0.5422, + "step": 1590 + }, + { + "epoch": 0.24, + "grad_norm": 7.398966619540447, + "learning_rate": 8.887855295649713e-06, + "loss": 0.4815, + "step": 1591 + }, + { + "epoch": 0.24, + "grad_norm": 69.17134608264283, + "learning_rate": 8.886318975808436e-06, + "loss": 0.5056, + "step": 1592 + }, + { + "epoch": 0.24, + "grad_norm": 2.04047377674635, + "learning_rate": 8.884781728537117e-06, + "loss": 0.5805, + "step": 1593 + }, + { + "epoch": 0.24, + "grad_norm": 6.145511927930641, + "learning_rate": 8.883243554202603e-06, + "loss": 0.4789, + "step": 1594 + }, + { + "epoch": 0.24, + "grad_norm": 7.963253899583847, + "learning_rate": 8.88170445317196e-06, + "loss": 0.577, + "step": 1595 + }, + { + "epoch": 0.24, + "grad_norm": 9.081286073706801, + "learning_rate": 8.880164425812483e-06, + "loss": 0.4723, + "step": 1596 + }, + { + "epoch": 0.24, + "grad_norm": 6.186068393302075, + "learning_rate": 8.878623472491683e-06, + "loss": 0.4703, + "step": 1597 + }, + { + "epoch": 0.24, + "grad_norm": 6.392687277500229, + "learning_rate": 8.877081593577289e-06, + "loss": 0.4525, + "step": 1598 + }, + { + "epoch": 0.24, + "grad_norm": 32.20000426959304, + "learning_rate": 8.875538789437259e-06, + "loss": 0.4454, + "step": 1599 + }, + { + "epoch": 0.24, + "grad_norm": 9.224367896613412, + "learning_rate": 8.873995060439765e-06, + "loss": 0.5762, + "step": 1600 + }, + { + "epoch": 0.24, + "grad_norm": 7.634713571904084, + "learning_rate": 8.872450406953202e-06, + "loss": 0.4568, + "step": 1601 + }, + { + "epoch": 0.24, + "grad_norm": 4.355223764585945, + "learning_rate": 8.870904829346187e-06, + "loss": 0.4946, + "step": 1602 + }, + { + "epoch": 0.24, + "grad_norm": 4.1180019370521315, + "learning_rate": 8.869358327987556e-06, + "loss": 0.4824, + "step": 1603 + }, + { + "epoch": 0.24, + "grad_norm": 3.0992207440796005, + "learning_rate": 8.867810903246364e-06, + "loss": 0.4444, + "step": 1604 + }, + { + "epoch": 0.24, + "grad_norm": 4.1724872996723015, + "learning_rate": 8.86626255549189e-06, + "loss": 0.4056, + "step": 1605 + }, + { + "epoch": 0.24, + "grad_norm": 7.809846539413385, + "learning_rate": 8.86471328509363e-06, + "loss": 0.489, + "step": 1606 + }, + { + "epoch": 0.24, + "grad_norm": 7.3616182603951925, + "learning_rate": 8.8631630924213e-06, + "loss": 0.5177, + "step": 1607 + }, + { + "epoch": 0.24, + "grad_norm": 7.97609224174291, + "learning_rate": 8.861611977844843e-06, + "loss": 0.5472, + "step": 1608 + }, + { + "epoch": 0.24, + "grad_norm": 5.3523603447103945, + "learning_rate": 8.860059941734412e-06, + "loss": 0.4696, + "step": 1609 + }, + { + "epoch": 0.24, + "grad_norm": 9.712176575715722, + "learning_rate": 8.858506984460383e-06, + "loss": 0.5528, + "step": 1610 + }, + { + "epoch": 0.24, + "grad_norm": 6.44691845074557, + "learning_rate": 8.856953106393357e-06, + "loss": 0.5045, + "step": 1611 + }, + { + "epoch": 0.24, + "grad_norm": 6.040833230502797, + "learning_rate": 8.85539830790415e-06, + "loss": 0.4487, + "step": 1612 + }, + { + "epoch": 0.24, + "grad_norm": 5.621001085871318, + "learning_rate": 8.853842589363796e-06, + "loss": 0.5862, + "step": 1613 + }, + { + "epoch": 0.24, + "grad_norm": 5.557456278573803, + "learning_rate": 8.852285951143556e-06, + "loss": 0.5322, + "step": 1614 + }, + { + "epoch": 0.24, + "grad_norm": 4.6880220450133026, + "learning_rate": 8.850728393614903e-06, + "loss": 0.5186, + "step": 1615 + }, + { + "epoch": 0.24, + "grad_norm": 4.288556227132453, + "learning_rate": 8.849169917149532e-06, + "loss": 0.4823, + "step": 1616 + }, + { + "epoch": 0.24, + "grad_norm": 11.296741569900654, + "learning_rate": 8.847610522119358e-06, + "loss": 0.5524, + "step": 1617 + }, + { + "epoch": 0.24, + "grad_norm": 1.972526387373982, + "learning_rate": 8.846050208896512e-06, + "loss": 0.7018, + "step": 1618 + }, + { + "epoch": 0.24, + "grad_norm": 4.158975955011825, + "learning_rate": 8.844488977853351e-06, + "loss": 0.4879, + "step": 1619 + }, + { + "epoch": 0.24, + "grad_norm": 4.6125019554649045, + "learning_rate": 8.842926829362447e-06, + "loss": 0.555, + "step": 1620 + }, + { + "epoch": 0.24, + "grad_norm": 7.780515089934748, + "learning_rate": 8.841363763796585e-06, + "loss": 0.48, + "step": 1621 + }, + { + "epoch": 0.24, + "grad_norm": 3.6402425437624655, + "learning_rate": 8.839799781528778e-06, + "loss": 0.4894, + "step": 1622 + }, + { + "epoch": 0.24, + "grad_norm": 5.701134920357182, + "learning_rate": 8.838234882932254e-06, + "loss": 0.4408, + "step": 1623 + }, + { + "epoch": 0.24, + "grad_norm": 4.8214848962619845, + "learning_rate": 8.836669068380461e-06, + "loss": 0.4687, + "step": 1624 + }, + { + "epoch": 0.25, + "grad_norm": 5.927607111756089, + "learning_rate": 8.835102338247063e-06, + "loss": 0.4023, + "step": 1625 + }, + { + "epoch": 0.25, + "grad_norm": 7.163116439516577, + "learning_rate": 8.833534692905945e-06, + "loss": 0.4706, + "step": 1626 + }, + { + "epoch": 0.25, + "grad_norm": 4.154094327152368, + "learning_rate": 8.83196613273121e-06, + "loss": 0.5008, + "step": 1627 + }, + { + "epoch": 0.25, + "grad_norm": 6.772041329013814, + "learning_rate": 8.830396658097176e-06, + "loss": 0.4576, + "step": 1628 + }, + { + "epoch": 0.25, + "grad_norm": 10.144494067258229, + "learning_rate": 8.828826269378385e-06, + "loss": 0.5608, + "step": 1629 + }, + { + "epoch": 0.25, + "grad_norm": 4.795100358442994, + "learning_rate": 8.827254966949594e-06, + "loss": 0.4369, + "step": 1630 + }, + { + "epoch": 0.25, + "grad_norm": 5.524220784243382, + "learning_rate": 8.825682751185775e-06, + "loss": 0.4601, + "step": 1631 + }, + { + "epoch": 0.25, + "grad_norm": 4.3840387303231445, + "learning_rate": 8.824109622462123e-06, + "loss": 0.4671, + "step": 1632 + }, + { + "epoch": 0.25, + "grad_norm": 5.266428674581977, + "learning_rate": 8.82253558115405e-06, + "loss": 0.4906, + "step": 1633 + }, + { + "epoch": 0.25, + "grad_norm": 4.623526914571225, + "learning_rate": 8.820960627637185e-06, + "loss": 0.5836, + "step": 1634 + }, + { + "epoch": 0.25, + "grad_norm": 7.326559272938898, + "learning_rate": 8.819384762287372e-06, + "loss": 0.5281, + "step": 1635 + }, + { + "epoch": 0.25, + "grad_norm": 5.205320946490698, + "learning_rate": 8.817807985480677e-06, + "loss": 0.4087, + "step": 1636 + }, + { + "epoch": 0.25, + "grad_norm": 1.7248459229059117, + "learning_rate": 8.816230297593382e-06, + "loss": 0.6174, + "step": 1637 + }, + { + "epoch": 0.25, + "grad_norm": 4.766487058133429, + "learning_rate": 8.814651699001984e-06, + "loss": 0.4711, + "step": 1638 + }, + { + "epoch": 0.25, + "grad_norm": 5.560501904242485, + "learning_rate": 8.813072190083201e-06, + "loss": 0.5029, + "step": 1639 + }, + { + "epoch": 0.25, + "grad_norm": 4.781784477683787, + "learning_rate": 8.811491771213965e-06, + "loss": 0.52, + "step": 1640 + }, + { + "epoch": 0.25, + "grad_norm": 6.027907580841462, + "learning_rate": 8.809910442771427e-06, + "loss": 0.4953, + "step": 1641 + }, + { + "epoch": 0.25, + "grad_norm": 3.8979379211286567, + "learning_rate": 8.808328205132957e-06, + "loss": 0.4604, + "step": 1642 + }, + { + "epoch": 0.25, + "grad_norm": 6.450293775066662, + "learning_rate": 8.806745058676137e-06, + "loss": 0.4645, + "step": 1643 + }, + { + "epoch": 0.25, + "grad_norm": 3.4094134569507615, + "learning_rate": 8.80516100377877e-06, + "loss": 0.5406, + "step": 1644 + }, + { + "epoch": 0.25, + "grad_norm": 6.811636257719224, + "learning_rate": 8.803576040818874e-06, + "loss": 0.5321, + "step": 1645 + }, + { + "epoch": 0.25, + "grad_norm": 7.487177067160405, + "learning_rate": 8.801990170174685e-06, + "loss": 0.5319, + "step": 1646 + }, + { + "epoch": 0.25, + "grad_norm": 3.6953686344707206, + "learning_rate": 8.800403392224652e-06, + "loss": 0.5106, + "step": 1647 + }, + { + "epoch": 0.25, + "grad_norm": 8.783852864513696, + "learning_rate": 8.798815707347449e-06, + "loss": 0.4406, + "step": 1648 + }, + { + "epoch": 0.25, + "grad_norm": 10.002935297200702, + "learning_rate": 8.797227115921953e-06, + "loss": 0.5236, + "step": 1649 + }, + { + "epoch": 0.25, + "grad_norm": 10.585681638649321, + "learning_rate": 8.795637618327269e-06, + "loss": 0.5207, + "step": 1650 + }, + { + "epoch": 0.25, + "grad_norm": 3.5058702588937574, + "learning_rate": 8.794047214942716e-06, + "loss": 0.4787, + "step": 1651 + }, + { + "epoch": 0.25, + "grad_norm": 3.0621273299799614, + "learning_rate": 8.792455906147825e-06, + "loss": 0.4954, + "step": 1652 + }, + { + "epoch": 0.25, + "grad_norm": 3.3863423632919316, + "learning_rate": 8.790863692322345e-06, + "loss": 0.4834, + "step": 1653 + }, + { + "epoch": 0.25, + "grad_norm": 4.086800459201511, + "learning_rate": 8.789270573846243e-06, + "loss": 0.4625, + "step": 1654 + }, + { + "epoch": 0.25, + "grad_norm": 4.018283759382251, + "learning_rate": 8.7876765510997e-06, + "loss": 0.5181, + "step": 1655 + }, + { + "epoch": 0.25, + "grad_norm": 4.9126111585435, + "learning_rate": 8.786081624463113e-06, + "loss": 0.4736, + "step": 1656 + }, + { + "epoch": 0.25, + "grad_norm": 3.6098207226673504, + "learning_rate": 8.784485794317094e-06, + "loss": 0.5197, + "step": 1657 + }, + { + "epoch": 0.25, + "grad_norm": 25.434575790810822, + "learning_rate": 8.782889061042473e-06, + "loss": 0.4489, + "step": 1658 + }, + { + "epoch": 0.25, + "grad_norm": 5.623123257587492, + "learning_rate": 8.781291425020291e-06, + "loss": 0.4399, + "step": 1659 + }, + { + "epoch": 0.25, + "grad_norm": 9.588996656431235, + "learning_rate": 8.779692886631812e-06, + "loss": 0.4834, + "step": 1660 + }, + { + "epoch": 0.25, + "grad_norm": 4.141390110432851, + "learning_rate": 8.778093446258506e-06, + "loss": 0.4516, + "step": 1661 + }, + { + "epoch": 0.25, + "grad_norm": 3.9457607614863255, + "learning_rate": 8.776493104282069e-06, + "loss": 0.5136, + "step": 1662 + }, + { + "epoch": 0.25, + "grad_norm": 4.359451779538328, + "learning_rate": 8.774891861084399e-06, + "loss": 0.5075, + "step": 1663 + }, + { + "epoch": 0.25, + "grad_norm": 3.0228773282199333, + "learning_rate": 8.773289717047621e-06, + "loss": 0.5019, + "step": 1664 + }, + { + "epoch": 0.25, + "grad_norm": 4.1570443551814265, + "learning_rate": 8.771686672554068e-06, + "loss": 0.5236, + "step": 1665 + }, + { + "epoch": 0.25, + "grad_norm": 9.122739043388691, + "learning_rate": 8.770082727986292e-06, + "loss": 0.5183, + "step": 1666 + }, + { + "epoch": 0.25, + "grad_norm": 8.616537409213944, + "learning_rate": 8.768477883727056e-06, + "loss": 0.5108, + "step": 1667 + }, + { + "epoch": 0.25, + "grad_norm": 4.688798977017811, + "learning_rate": 8.76687214015934e-06, + "loss": 0.5319, + "step": 1668 + }, + { + "epoch": 0.25, + "grad_norm": 2.3875133199985785, + "learning_rate": 8.765265497666339e-06, + "loss": 0.719, + "step": 1669 + }, + { + "epoch": 0.25, + "grad_norm": 3.675669693868851, + "learning_rate": 8.763657956631461e-06, + "loss": 0.5414, + "step": 1670 + }, + { + "epoch": 0.25, + "grad_norm": 4.7141885523597375, + "learning_rate": 8.762049517438332e-06, + "loss": 0.5163, + "step": 1671 + }, + { + "epoch": 0.25, + "grad_norm": 4.006016801245012, + "learning_rate": 8.760440180470784e-06, + "loss": 0.525, + "step": 1672 + }, + { + "epoch": 0.25, + "grad_norm": 4.873245554602478, + "learning_rate": 8.75882994611287e-06, + "loss": 0.4427, + "step": 1673 + }, + { + "epoch": 0.25, + "grad_norm": 6.400958463737394, + "learning_rate": 8.75721881474886e-06, + "loss": 0.5145, + "step": 1674 + }, + { + "epoch": 0.25, + "grad_norm": 3.8153539939755574, + "learning_rate": 8.75560678676323e-06, + "loss": 0.4316, + "step": 1675 + }, + { + "epoch": 0.25, + "grad_norm": 7.672317338727773, + "learning_rate": 8.753993862540678e-06, + "loss": 0.5213, + "step": 1676 + }, + { + "epoch": 0.25, + "grad_norm": 9.698555107641, + "learning_rate": 8.752380042466107e-06, + "loss": 0.4416, + "step": 1677 + }, + { + "epoch": 0.25, + "grad_norm": 6.236479927148768, + "learning_rate": 8.75076532692464e-06, + "loss": 0.5224, + "step": 1678 + }, + { + "epoch": 0.25, + "grad_norm": 6.088855734919785, + "learning_rate": 8.749149716301612e-06, + "loss": 0.4271, + "step": 1679 + }, + { + "epoch": 0.25, + "grad_norm": 6.606439945243697, + "learning_rate": 8.747533210982576e-06, + "loss": 0.4871, + "step": 1680 + }, + { + "epoch": 0.25, + "grad_norm": 6.2344410052785335, + "learning_rate": 8.745915811353288e-06, + "loss": 0.5627, + "step": 1681 + }, + { + "epoch": 0.25, + "grad_norm": 11.05021026541072, + "learning_rate": 8.744297517799728e-06, + "loss": 0.4676, + "step": 1682 + }, + { + "epoch": 0.25, + "grad_norm": 29.41510511226427, + "learning_rate": 8.742678330708085e-06, + "loss": 0.5105, + "step": 1683 + }, + { + "epoch": 0.25, + "grad_norm": 5.784851741938727, + "learning_rate": 8.741058250464758e-06, + "loss": 0.5444, + "step": 1684 + }, + { + "epoch": 0.25, + "grad_norm": 7.374545347808052, + "learning_rate": 8.739437277456366e-06, + "loss": 0.4593, + "step": 1685 + }, + { + "epoch": 0.25, + "grad_norm": 5.706560139265927, + "learning_rate": 8.737815412069737e-06, + "loss": 0.4056, + "step": 1686 + }, + { + "epoch": 0.25, + "grad_norm": 7.152341776308717, + "learning_rate": 8.73619265469191e-06, + "loss": 0.4901, + "step": 1687 + }, + { + "epoch": 0.25, + "grad_norm": 7.116323451961182, + "learning_rate": 8.734569005710142e-06, + "loss": 0.5244, + "step": 1688 + }, + { + "epoch": 0.25, + "grad_norm": 6.837048672011069, + "learning_rate": 8.732944465511899e-06, + "loss": 0.428, + "step": 1689 + }, + { + "epoch": 0.25, + "grad_norm": 6.685629218608238, + "learning_rate": 8.731319034484862e-06, + "loss": 0.5249, + "step": 1690 + }, + { + "epoch": 0.26, + "grad_norm": 9.938563631714166, + "learning_rate": 8.729692713016921e-06, + "loss": 0.5015, + "step": 1691 + }, + { + "epoch": 0.26, + "grad_norm": 4.318960008574061, + "learning_rate": 8.728065501496183e-06, + "loss": 0.5402, + "step": 1692 + }, + { + "epoch": 0.26, + "grad_norm": 13.79651926512532, + "learning_rate": 8.726437400310965e-06, + "loss": 0.5039, + "step": 1693 + }, + { + "epoch": 0.26, + "grad_norm": 2.627296378205132, + "learning_rate": 8.724808409849795e-06, + "loss": 0.5966, + "step": 1694 + }, + { + "epoch": 0.26, + "grad_norm": 7.423552605889969, + "learning_rate": 8.723178530501417e-06, + "loss": 0.4478, + "step": 1695 + }, + { + "epoch": 0.26, + "grad_norm": 13.005001217358673, + "learning_rate": 8.721547762654786e-06, + "loss": 0.5774, + "step": 1696 + }, + { + "epoch": 0.26, + "grad_norm": 6.527322593397208, + "learning_rate": 8.719916106699063e-06, + "loss": 0.5402, + "step": 1697 + }, + { + "epoch": 0.26, + "grad_norm": 11.381746489586043, + "learning_rate": 8.718283563023628e-06, + "loss": 0.4364, + "step": 1698 + }, + { + "epoch": 0.26, + "grad_norm": 6.644291536834517, + "learning_rate": 8.716650132018075e-06, + "loss": 0.4915, + "step": 1699 + }, + { + "epoch": 0.26, + "grad_norm": 7.505490353424833, + "learning_rate": 8.7150158140722e-06, + "loss": 0.5699, + "step": 1700 + }, + { + "epoch": 0.26, + "grad_norm": 5.310694577414711, + "learning_rate": 8.71338060957602e-06, + "loss": 0.5051, + "step": 1701 + }, + { + "epoch": 0.26, + "grad_norm": 4.898145353850273, + "learning_rate": 8.711744518919757e-06, + "loss": 0.4829, + "step": 1702 + }, + { + "epoch": 0.26, + "grad_norm": 7.500599759507134, + "learning_rate": 8.710107542493846e-06, + "loss": 0.4641, + "step": 1703 + }, + { + "epoch": 0.26, + "grad_norm": 5.785609001946518, + "learning_rate": 8.708469680688941e-06, + "loss": 0.4454, + "step": 1704 + }, + { + "epoch": 0.26, + "grad_norm": 12.207839154157819, + "learning_rate": 8.706830933895893e-06, + "loss": 0.5116, + "step": 1705 + }, + { + "epoch": 0.26, + "grad_norm": 5.763470957268615, + "learning_rate": 8.705191302505777e-06, + "loss": 0.4912, + "step": 1706 + }, + { + "epoch": 0.26, + "grad_norm": 3.8536668164014145, + "learning_rate": 8.703550786909873e-06, + "loss": 0.4294, + "step": 1707 + }, + { + "epoch": 0.26, + "grad_norm": 13.736500341565748, + "learning_rate": 8.701909387499674e-06, + "loss": 0.4742, + "step": 1708 + }, + { + "epoch": 0.26, + "grad_norm": 13.736247885515326, + "learning_rate": 8.70026710466688e-06, + "loss": 0.5239, + "step": 1709 + }, + { + "epoch": 0.26, + "grad_norm": 5.15627137113051, + "learning_rate": 8.69862393880341e-06, + "loss": 0.4705, + "step": 1710 + }, + { + "epoch": 0.26, + "grad_norm": 5.305562459014393, + "learning_rate": 8.696979890301383e-06, + "loss": 0.4959, + "step": 1711 + }, + { + "epoch": 0.26, + "grad_norm": 4.845415504914943, + "learning_rate": 8.695334959553139e-06, + "loss": 0.4682, + "step": 1712 + }, + { + "epoch": 0.26, + "grad_norm": 3.425886614853946, + "learning_rate": 8.69368914695122e-06, + "loss": 0.4691, + "step": 1713 + }, + { + "epoch": 0.26, + "grad_norm": 1.7985279231382951, + "learning_rate": 8.692042452888385e-06, + "loss": 0.6462, + "step": 1714 + }, + { + "epoch": 0.26, + "grad_norm": 8.273808183626967, + "learning_rate": 8.6903948777576e-06, + "loss": 0.4762, + "step": 1715 + }, + { + "epoch": 0.26, + "grad_norm": 4.1320508650919425, + "learning_rate": 8.68874642195204e-06, + "loss": 0.5079, + "step": 1716 + }, + { + "epoch": 0.26, + "grad_norm": 7.4699204319018175, + "learning_rate": 8.687097085865096e-06, + "loss": 0.5072, + "step": 1717 + }, + { + "epoch": 0.26, + "grad_norm": 3.6980225365146056, + "learning_rate": 8.685446869890359e-06, + "loss": 0.5331, + "step": 1718 + }, + { + "epoch": 0.26, + "grad_norm": 4.626394132804195, + "learning_rate": 8.683795774421644e-06, + "loss": 0.4791, + "step": 1719 + }, + { + "epoch": 0.26, + "grad_norm": 4.094627104804033, + "learning_rate": 8.68214379985296e-06, + "loss": 0.4471, + "step": 1720 + }, + { + "epoch": 0.26, + "grad_norm": 4.1343135022707225, + "learning_rate": 8.680490946578538e-06, + "loss": 0.4632, + "step": 1721 + }, + { + "epoch": 0.26, + "grad_norm": 6.642830257804902, + "learning_rate": 8.678837214992815e-06, + "loss": 0.5063, + "step": 1722 + }, + { + "epoch": 0.26, + "grad_norm": 6.4526371088508965, + "learning_rate": 8.677182605490435e-06, + "loss": 0.4812, + "step": 1723 + }, + { + "epoch": 0.26, + "grad_norm": 3.5996428755776635, + "learning_rate": 8.675527118466254e-06, + "loss": 0.4852, + "step": 1724 + }, + { + "epoch": 0.26, + "grad_norm": 9.248127966248044, + "learning_rate": 8.673870754315336e-06, + "loss": 0.4586, + "step": 1725 + }, + { + "epoch": 0.26, + "grad_norm": 6.732716959695629, + "learning_rate": 8.672213513432958e-06, + "loss": 0.5593, + "step": 1726 + }, + { + "epoch": 0.26, + "grad_norm": 5.174808278174984, + "learning_rate": 8.670555396214604e-06, + "loss": 0.4781, + "step": 1727 + }, + { + "epoch": 0.26, + "grad_norm": 3.9326768689224925, + "learning_rate": 8.668896403055963e-06, + "loss": 0.5074, + "step": 1728 + }, + { + "epoch": 0.26, + "grad_norm": 7.063687075272308, + "learning_rate": 8.667236534352936e-06, + "loss": 0.5015, + "step": 1729 + }, + { + "epoch": 0.26, + "grad_norm": 7.289384214794713, + "learning_rate": 8.665575790501638e-06, + "loss": 0.5447, + "step": 1730 + }, + { + "epoch": 0.26, + "grad_norm": 6.288055654048188, + "learning_rate": 8.663914171898388e-06, + "loss": 0.5276, + "step": 1731 + }, + { + "epoch": 0.26, + "grad_norm": 4.591013441382628, + "learning_rate": 8.66225167893971e-06, + "loss": 0.4584, + "step": 1732 + }, + { + "epoch": 0.26, + "grad_norm": 4.608858885699101, + "learning_rate": 8.660588312022345e-06, + "loss": 0.5112, + "step": 1733 + }, + { + "epoch": 0.26, + "grad_norm": 5.480236393874353, + "learning_rate": 8.658924071543235e-06, + "loss": 0.4786, + "step": 1734 + }, + { + "epoch": 0.26, + "grad_norm": 1.6480562704402328, + "learning_rate": 8.657258957899536e-06, + "loss": 0.6693, + "step": 1735 + }, + { + "epoch": 0.26, + "grad_norm": 6.712964484884782, + "learning_rate": 8.655592971488612e-06, + "loss": 0.3909, + "step": 1736 + }, + { + "epoch": 0.26, + "grad_norm": 6.9523993818905465, + "learning_rate": 8.653926112708028e-06, + "loss": 0.5562, + "step": 1737 + }, + { + "epoch": 0.26, + "grad_norm": 5.88565232528944, + "learning_rate": 8.652258381955567e-06, + "loss": 0.557, + "step": 1738 + }, + { + "epoch": 0.26, + "grad_norm": 7.257852941197796, + "learning_rate": 8.650589779629218e-06, + "loss": 0.4583, + "step": 1739 + }, + { + "epoch": 0.26, + "grad_norm": 4.668374004936056, + "learning_rate": 8.64892030612717e-06, + "loss": 0.5121, + "step": 1740 + }, + { + "epoch": 0.26, + "grad_norm": 6.699757509249108, + "learning_rate": 8.647249961847828e-06, + "loss": 0.577, + "step": 1741 + }, + { + "epoch": 0.26, + "grad_norm": 4.65447756940695, + "learning_rate": 8.645578747189803e-06, + "loss": 0.5308, + "step": 1742 + }, + { + "epoch": 0.26, + "grad_norm": 1.3051734868593994, + "learning_rate": 8.643906662551912e-06, + "loss": 0.5594, + "step": 1743 + }, + { + "epoch": 0.26, + "grad_norm": 5.242595266781202, + "learning_rate": 8.642233708333183e-06, + "loss": 0.4674, + "step": 1744 + }, + { + "epoch": 0.26, + "grad_norm": 5.372166397375121, + "learning_rate": 8.640559884932848e-06, + "loss": 0.5248, + "step": 1745 + }, + { + "epoch": 0.26, + "grad_norm": 15.961298152122554, + "learning_rate": 8.638885192750348e-06, + "loss": 0.5336, + "step": 1746 + }, + { + "epoch": 0.26, + "grad_norm": 16.388903237462902, + "learning_rate": 8.637209632185333e-06, + "loss": 0.4675, + "step": 1747 + }, + { + "epoch": 0.26, + "grad_norm": 6.90907103741499, + "learning_rate": 8.635533203637653e-06, + "loss": 0.5293, + "step": 1748 + }, + { + "epoch": 0.26, + "grad_norm": 9.94809111533093, + "learning_rate": 8.633855907507375e-06, + "loss": 0.5483, + "step": 1749 + }, + { + "epoch": 0.26, + "grad_norm": 8.432071700735126, + "learning_rate": 8.632177744194766e-06, + "loss": 0.5178, + "step": 1750 + }, + { + "epoch": 0.26, + "grad_norm": 4.914964138186451, + "learning_rate": 8.630498714100306e-06, + "loss": 0.5118, + "step": 1751 + }, + { + "epoch": 0.26, + "grad_norm": 5.786472124812631, + "learning_rate": 8.628818817624672e-06, + "loss": 0.4803, + "step": 1752 + }, + { + "epoch": 0.26, + "grad_norm": 6.543855766819282, + "learning_rate": 8.62713805516876e-06, + "loss": 0.5887, + "step": 1753 + }, + { + "epoch": 0.26, + "grad_norm": 5.255353077316545, + "learning_rate": 8.625456427133665e-06, + "loss": 0.4387, + "step": 1754 + }, + { + "epoch": 0.26, + "grad_norm": 3.5496368362857282, + "learning_rate": 8.623773933920689e-06, + "loss": 0.4464, + "step": 1755 + }, + { + "epoch": 0.26, + "grad_norm": 6.136869128886629, + "learning_rate": 8.622090575931341e-06, + "loss": 0.5017, + "step": 1756 + }, + { + "epoch": 0.27, + "grad_norm": 20.960506743289706, + "learning_rate": 8.62040635356734e-06, + "loss": 0.4815, + "step": 1757 + }, + { + "epoch": 0.27, + "grad_norm": 3.923631606303919, + "learning_rate": 8.618721267230606e-06, + "loss": 0.4363, + "step": 1758 + }, + { + "epoch": 0.27, + "grad_norm": 4.336558220678053, + "learning_rate": 8.61703531732327e-06, + "loss": 0.4891, + "step": 1759 + }, + { + "epoch": 0.27, + "grad_norm": 4.9102619509329015, + "learning_rate": 8.615348504247663e-06, + "loss": 0.4527, + "step": 1760 + }, + { + "epoch": 0.27, + "grad_norm": 4.356492946306187, + "learning_rate": 8.613660828406328e-06, + "loss": 0.5535, + "step": 1761 + }, + { + "epoch": 0.27, + "grad_norm": 4.588172444952468, + "learning_rate": 8.61197229020201e-06, + "loss": 0.4982, + "step": 1762 + }, + { + "epoch": 0.27, + "grad_norm": 4.8934242683224305, + "learning_rate": 8.610282890037664e-06, + "loss": 0.5157, + "step": 1763 + }, + { + "epoch": 0.27, + "grad_norm": 5.92502392325219, + "learning_rate": 8.608592628316446e-06, + "loss": 0.4622, + "step": 1764 + }, + { + "epoch": 0.27, + "grad_norm": 4.901789152839644, + "learning_rate": 8.606901505441719e-06, + "loss": 0.5146, + "step": 1765 + }, + { + "epoch": 0.27, + "grad_norm": 14.072053580569666, + "learning_rate": 8.605209521817053e-06, + "loss": 0.501, + "step": 1766 + }, + { + "epoch": 0.27, + "grad_norm": 5.903189522576522, + "learning_rate": 8.603516677846222e-06, + "loss": 0.5097, + "step": 1767 + }, + { + "epoch": 0.27, + "grad_norm": 10.053744418915311, + "learning_rate": 8.601822973933207e-06, + "loss": 0.4541, + "step": 1768 + }, + { + "epoch": 0.27, + "grad_norm": 5.612330055149999, + "learning_rate": 8.60012841048219e-06, + "loss": 0.4175, + "step": 1769 + }, + { + "epoch": 0.27, + "grad_norm": 3.844650730344002, + "learning_rate": 8.598432987897565e-06, + "loss": 0.4639, + "step": 1770 + }, + { + "epoch": 0.27, + "grad_norm": 5.197881919392901, + "learning_rate": 8.596736706583926e-06, + "loss": 0.507, + "step": 1771 + }, + { + "epoch": 0.27, + "grad_norm": 16.36907618351591, + "learning_rate": 8.595039566946072e-06, + "loss": 0.5325, + "step": 1772 + }, + { + "epoch": 0.27, + "grad_norm": 7.788018738524345, + "learning_rate": 8.593341569389008e-06, + "loss": 0.4754, + "step": 1773 + }, + { + "epoch": 0.27, + "grad_norm": 4.279323201105887, + "learning_rate": 8.591642714317945e-06, + "loss": 0.4949, + "step": 1774 + }, + { + "epoch": 0.27, + "grad_norm": 12.212077955354141, + "learning_rate": 8.589943002138296e-06, + "loss": 0.4978, + "step": 1775 + }, + { + "epoch": 0.27, + "grad_norm": 3.761324304834908, + "learning_rate": 8.588242433255679e-06, + "loss": 0.468, + "step": 1776 + }, + { + "epoch": 0.27, + "grad_norm": 5.141129198540113, + "learning_rate": 8.586541008075919e-06, + "loss": 0.5107, + "step": 1777 + }, + { + "epoch": 0.27, + "grad_norm": 4.902380163912766, + "learning_rate": 8.584838727005045e-06, + "loss": 0.4897, + "step": 1778 + }, + { + "epoch": 0.27, + "grad_norm": 4.7474945763245, + "learning_rate": 8.583135590449285e-06, + "loss": 0.4152, + "step": 1779 + }, + { + "epoch": 0.27, + "grad_norm": 19.861544170191262, + "learning_rate": 8.581431598815077e-06, + "loss": 0.6019, + "step": 1780 + }, + { + "epoch": 0.27, + "grad_norm": 10.502514597191615, + "learning_rate": 8.57972675250906e-06, + "loss": 0.4737, + "step": 1781 + }, + { + "epoch": 0.27, + "grad_norm": 5.494484292370675, + "learning_rate": 8.57802105193808e-06, + "loss": 0.4464, + "step": 1782 + }, + { + "epoch": 0.27, + "grad_norm": 6.050161142457534, + "learning_rate": 8.576314497509185e-06, + "loss": 0.5978, + "step": 1783 + }, + { + "epoch": 0.27, + "grad_norm": 5.759088505694082, + "learning_rate": 8.574607089629627e-06, + "loss": 0.5524, + "step": 1784 + }, + { + "epoch": 0.27, + "grad_norm": 7.147084885891129, + "learning_rate": 8.572898828706858e-06, + "loss": 0.4786, + "step": 1785 + }, + { + "epoch": 0.27, + "grad_norm": 9.399932466749341, + "learning_rate": 8.571189715148539e-06, + "loss": 0.5037, + "step": 1786 + }, + { + "epoch": 0.27, + "grad_norm": 6.074990533664426, + "learning_rate": 8.569479749362531e-06, + "loss": 0.4467, + "step": 1787 + }, + { + "epoch": 0.27, + "grad_norm": 9.207518606558148, + "learning_rate": 8.567768931756903e-06, + "loss": 0.4336, + "step": 1788 + }, + { + "epoch": 0.27, + "grad_norm": 7.003562436795217, + "learning_rate": 8.566057262739922e-06, + "loss": 0.442, + "step": 1789 + }, + { + "epoch": 0.27, + "grad_norm": 9.510809088126551, + "learning_rate": 8.56434474272006e-06, + "loss": 0.4981, + "step": 1790 + }, + { + "epoch": 0.27, + "grad_norm": 9.564979457302885, + "learning_rate": 8.56263137210599e-06, + "loss": 0.4587, + "step": 1791 + }, + { + "epoch": 0.27, + "grad_norm": 5.7226634684707145, + "learning_rate": 8.560917151306594e-06, + "loss": 0.4504, + "step": 1792 + }, + { + "epoch": 0.27, + "grad_norm": 10.781576613288939, + "learning_rate": 8.559202080730952e-06, + "loss": 0.4877, + "step": 1793 + }, + { + "epoch": 0.27, + "grad_norm": 4.775262744022146, + "learning_rate": 8.557486160788348e-06, + "loss": 0.5346, + "step": 1794 + }, + { + "epoch": 0.27, + "grad_norm": 28.022989280439724, + "learning_rate": 8.55576939188827e-06, + "loss": 0.492, + "step": 1795 + }, + { + "epoch": 0.27, + "grad_norm": 8.935354467138271, + "learning_rate": 8.554051774440404e-06, + "loss": 0.4636, + "step": 1796 + }, + { + "epoch": 0.27, + "grad_norm": 6.599607442048166, + "learning_rate": 8.552333308854645e-06, + "loss": 0.4865, + "step": 1797 + }, + { + "epoch": 0.27, + "grad_norm": 10.659096205944559, + "learning_rate": 8.550613995541084e-06, + "loss": 0.5302, + "step": 1798 + }, + { + "epoch": 0.27, + "grad_norm": 13.627743035687987, + "learning_rate": 8.54889383491002e-06, + "loss": 0.4651, + "step": 1799 + }, + { + "epoch": 0.27, + "grad_norm": 6.434794239492384, + "learning_rate": 8.54717282737195e-06, + "loss": 0.4465, + "step": 1800 + }, + { + "epoch": 0.27, + "grad_norm": 6.042855575229791, + "learning_rate": 8.545450973337575e-06, + "loss": 0.5001, + "step": 1801 + }, + { + "epoch": 0.27, + "grad_norm": 5.450688593313572, + "learning_rate": 8.5437282732178e-06, + "loss": 0.502, + "step": 1802 + }, + { + "epoch": 0.27, + "grad_norm": 8.868801112505036, + "learning_rate": 8.542004727423724e-06, + "loss": 0.5314, + "step": 1803 + }, + { + "epoch": 0.27, + "grad_norm": 11.093308302781576, + "learning_rate": 8.540280336366662e-06, + "loss": 0.5238, + "step": 1804 + }, + { + "epoch": 0.27, + "grad_norm": 12.58213146531138, + "learning_rate": 8.538555100458114e-06, + "loss": 0.5106, + "step": 1805 + }, + { + "epoch": 0.27, + "grad_norm": 4.252357638272064, + "learning_rate": 8.536829020109796e-06, + "loss": 0.4776, + "step": 1806 + }, + { + "epoch": 0.27, + "grad_norm": 7.762650358637841, + "learning_rate": 8.535102095733615e-06, + "loss": 0.5282, + "step": 1807 + }, + { + "epoch": 0.27, + "grad_norm": 10.116321501069386, + "learning_rate": 8.533374327741687e-06, + "loss": 0.5367, + "step": 1808 + }, + { + "epoch": 0.27, + "grad_norm": 7.5679847521469314, + "learning_rate": 8.531645716546322e-06, + "loss": 0.4385, + "step": 1809 + }, + { + "epoch": 0.27, + "grad_norm": 5.333155217016179, + "learning_rate": 8.529916262560039e-06, + "loss": 0.4781, + "step": 1810 + }, + { + "epoch": 0.27, + "grad_norm": 7.0902949407806775, + "learning_rate": 8.528185966195553e-06, + "loss": 0.4163, + "step": 1811 + }, + { + "epoch": 0.27, + "grad_norm": 7.0236505254688915, + "learning_rate": 8.52645482786578e-06, + "loss": 0.4678, + "step": 1812 + }, + { + "epoch": 0.27, + "grad_norm": 14.221946029093298, + "learning_rate": 8.524722847983843e-06, + "loss": 0.4408, + "step": 1813 + }, + { + "epoch": 0.27, + "grad_norm": 5.949250880093887, + "learning_rate": 8.522990026963057e-06, + "loss": 0.4727, + "step": 1814 + }, + { + "epoch": 0.27, + "grad_norm": 5.110156870110036, + "learning_rate": 8.521256365216941e-06, + "loss": 0.4803, + "step": 1815 + }, + { + "epoch": 0.27, + "grad_norm": 13.628590696226684, + "learning_rate": 8.519521863159219e-06, + "loss": 0.5122, + "step": 1816 + }, + { + "epoch": 0.27, + "grad_norm": 6.637886203011313, + "learning_rate": 8.51778652120381e-06, + "loss": 0.4607, + "step": 1817 + }, + { + "epoch": 0.27, + "grad_norm": 5.209315918323017, + "learning_rate": 8.516050339764837e-06, + "loss": 0.5505, + "step": 1818 + }, + { + "epoch": 0.27, + "grad_norm": 4.218812448646503, + "learning_rate": 8.51431331925662e-06, + "loss": 0.4536, + "step": 1819 + }, + { + "epoch": 0.27, + "grad_norm": 6.460554472978976, + "learning_rate": 8.512575460093683e-06, + "loss": 0.4345, + "step": 1820 + }, + { + "epoch": 0.27, + "grad_norm": 7.5248961250705495, + "learning_rate": 8.510836762690746e-06, + "loss": 0.474, + "step": 1821 + }, + { + "epoch": 0.27, + "grad_norm": 5.023876765171489, + "learning_rate": 8.509097227462732e-06, + "loss": 0.5274, + "step": 1822 + }, + { + "epoch": 0.27, + "grad_norm": 8.280240523966562, + "learning_rate": 8.507356854824767e-06, + "loss": 0.4937, + "step": 1823 + }, + { + "epoch": 0.28, + "grad_norm": 6.953024984379047, + "learning_rate": 8.505615645192167e-06, + "loss": 0.5576, + "step": 1824 + }, + { + "epoch": 0.28, + "grad_norm": 6.249564398263031, + "learning_rate": 8.503873598980456e-06, + "loss": 0.5054, + "step": 1825 + }, + { + "epoch": 0.28, + "grad_norm": 6.17611958988426, + "learning_rate": 8.502130716605357e-06, + "loss": 0.4489, + "step": 1826 + }, + { + "epoch": 0.28, + "grad_norm": 9.069451800631331, + "learning_rate": 8.50038699848279e-06, + "loss": 0.4492, + "step": 1827 + }, + { + "epoch": 0.28, + "grad_norm": 6.062277612239777, + "learning_rate": 8.498642445028874e-06, + "loss": 0.4546, + "step": 1828 + }, + { + "epoch": 0.28, + "grad_norm": 1.4205387302616734, + "learning_rate": 8.496897056659931e-06, + "loss": 0.6011, + "step": 1829 + }, + { + "epoch": 0.28, + "grad_norm": 13.201516928975844, + "learning_rate": 8.495150833792478e-06, + "loss": 0.5119, + "step": 1830 + }, + { + "epoch": 0.28, + "grad_norm": 6.601077429374877, + "learning_rate": 8.493403776843234e-06, + "loss": 0.5678, + "step": 1831 + }, + { + "epoch": 0.28, + "grad_norm": 6.349263316708461, + "learning_rate": 8.491655886229118e-06, + "loss": 0.5139, + "step": 1832 + }, + { + "epoch": 0.28, + "grad_norm": 23.935170692534157, + "learning_rate": 8.489907162367243e-06, + "loss": 0.4797, + "step": 1833 + }, + { + "epoch": 0.28, + "grad_norm": 1.2310350281218412, + "learning_rate": 8.488157605674924e-06, + "loss": 0.5893, + "step": 1834 + }, + { + "epoch": 0.28, + "grad_norm": 5.0531365364577745, + "learning_rate": 8.486407216569677e-06, + "loss": 0.4764, + "step": 1835 + }, + { + "epoch": 0.28, + "grad_norm": 7.179423152698777, + "learning_rate": 8.484655995469214e-06, + "loss": 0.4813, + "step": 1836 + }, + { + "epoch": 0.28, + "grad_norm": 9.704398625161359, + "learning_rate": 8.482903942791445e-06, + "loss": 0.4769, + "step": 1837 + }, + { + "epoch": 0.28, + "grad_norm": 1.316044246996018, + "learning_rate": 8.48115105895448e-06, + "loss": 0.5835, + "step": 1838 + }, + { + "epoch": 0.28, + "grad_norm": 7.885186855756579, + "learning_rate": 8.479397344376626e-06, + "loss": 0.4991, + "step": 1839 + }, + { + "epoch": 0.28, + "grad_norm": 12.359084841215594, + "learning_rate": 8.477642799476387e-06, + "loss": 0.4813, + "step": 1840 + }, + { + "epoch": 0.28, + "grad_norm": 7.423745871490641, + "learning_rate": 8.475887424672471e-06, + "loss": 0.4536, + "step": 1841 + }, + { + "epoch": 0.28, + "grad_norm": 1.2671694765264407, + "learning_rate": 8.474131220383779e-06, + "loss": 0.6233, + "step": 1842 + }, + { + "epoch": 0.28, + "grad_norm": 9.084892532420517, + "learning_rate": 8.47237418702941e-06, + "loss": 0.4998, + "step": 1843 + }, + { + "epoch": 0.28, + "grad_norm": 12.93300981373885, + "learning_rate": 8.470616325028663e-06, + "loss": 0.4361, + "step": 1844 + }, + { + "epoch": 0.28, + "grad_norm": 7.623951776019602, + "learning_rate": 8.468857634801033e-06, + "loss": 0.5137, + "step": 1845 + }, + { + "epoch": 0.28, + "grad_norm": 6.602910540911669, + "learning_rate": 8.467098116766213e-06, + "loss": 0.5938, + "step": 1846 + }, + { + "epoch": 0.28, + "grad_norm": 9.850617537819085, + "learning_rate": 8.465337771344094e-06, + "loss": 0.469, + "step": 1847 + }, + { + "epoch": 0.28, + "grad_norm": 15.208655893721819, + "learning_rate": 8.463576598954765e-06, + "loss": 0.5211, + "step": 1848 + }, + { + "epoch": 0.28, + "grad_norm": 17.16457890158877, + "learning_rate": 8.461814600018512e-06, + "loss": 0.5364, + "step": 1849 + }, + { + "epoch": 0.28, + "grad_norm": 13.101825803438341, + "learning_rate": 8.460051774955817e-06, + "loss": 0.475, + "step": 1850 + }, + { + "epoch": 0.28, + "grad_norm": 10.550805611292844, + "learning_rate": 8.45828812418736e-06, + "loss": 0.5078, + "step": 1851 + }, + { + "epoch": 0.28, + "grad_norm": 14.448961568010011, + "learning_rate": 8.456523648134018e-06, + "loss": 0.462, + "step": 1852 + }, + { + "epoch": 0.28, + "grad_norm": 8.933825881135313, + "learning_rate": 8.454758347216866e-06, + "loss": 0.5176, + "step": 1853 + }, + { + "epoch": 0.28, + "grad_norm": 73.74426371723455, + "learning_rate": 8.452992221857175e-06, + "loss": 0.4896, + "step": 1854 + }, + { + "epoch": 0.28, + "grad_norm": 14.565198831120773, + "learning_rate": 8.451225272476413e-06, + "loss": 0.4857, + "step": 1855 + }, + { + "epoch": 0.28, + "grad_norm": 13.397318925358146, + "learning_rate": 8.44945749949624e-06, + "loss": 0.4839, + "step": 1856 + }, + { + "epoch": 0.28, + "grad_norm": 5.387171058174955, + "learning_rate": 8.447688903338521e-06, + "loss": 0.4969, + "step": 1857 + }, + { + "epoch": 0.28, + "grad_norm": 1.5556548758571396, + "learning_rate": 8.445919484425315e-06, + "loss": 0.6326, + "step": 1858 + }, + { + "epoch": 0.28, + "grad_norm": 6.428192186207272, + "learning_rate": 8.444149243178873e-06, + "loss": 0.4953, + "step": 1859 + }, + { + "epoch": 0.28, + "grad_norm": 6.18242170561202, + "learning_rate": 8.442378180021644e-06, + "loss": 0.4722, + "step": 1860 + }, + { + "epoch": 0.28, + "grad_norm": 5.2718846612402706, + "learning_rate": 8.440606295376274e-06, + "loss": 0.443, + "step": 1861 + }, + { + "epoch": 0.28, + "grad_norm": 31.299233178472303, + "learning_rate": 8.438833589665606e-06, + "loss": 0.4877, + "step": 1862 + }, + { + "epoch": 0.28, + "grad_norm": 14.312259855990655, + "learning_rate": 8.43706006331268e-06, + "loss": 0.4551, + "step": 1863 + }, + { + "epoch": 0.28, + "grad_norm": 9.117869103732973, + "learning_rate": 8.435285716740726e-06, + "loss": 0.5438, + "step": 1864 + }, + { + "epoch": 0.28, + "grad_norm": 5.687136956511441, + "learning_rate": 8.433510550373177e-06, + "loss": 0.5039, + "step": 1865 + }, + { + "epoch": 0.28, + "grad_norm": 7.086505593927276, + "learning_rate": 8.431734564633654e-06, + "loss": 0.482, + "step": 1866 + }, + { + "epoch": 0.28, + "grad_norm": 17.851117512392186, + "learning_rate": 8.429957759945981e-06, + "loss": 0.5071, + "step": 1867 + }, + { + "epoch": 0.28, + "grad_norm": 7.240297693730779, + "learning_rate": 8.428180136734176e-06, + "loss": 0.5413, + "step": 1868 + }, + { + "epoch": 0.28, + "grad_norm": 6.473930899582503, + "learning_rate": 8.426401695422444e-06, + "loss": 0.4375, + "step": 1869 + }, + { + "epoch": 0.28, + "grad_norm": 8.6846512656853, + "learning_rate": 8.424622436435199e-06, + "loss": 0.4365, + "step": 1870 + }, + { + "epoch": 0.28, + "grad_norm": 13.679776280358043, + "learning_rate": 8.422842360197037e-06, + "loss": 0.5022, + "step": 1871 + }, + { + "epoch": 0.28, + "grad_norm": 150.99814688875188, + "learning_rate": 8.421061467132761e-06, + "loss": 0.473, + "step": 1872 + }, + { + "epoch": 0.28, + "grad_norm": 12.383248393204392, + "learning_rate": 8.419279757667356e-06, + "loss": 0.519, + "step": 1873 + }, + { + "epoch": 0.28, + "grad_norm": 16.41083440802398, + "learning_rate": 8.417497232226013e-06, + "loss": 0.4275, + "step": 1874 + }, + { + "epoch": 0.28, + "grad_norm": 7.215802619605748, + "learning_rate": 8.41571389123411e-06, + "loss": 0.4904, + "step": 1875 + }, + { + "epoch": 0.28, + "grad_norm": 9.132820859585701, + "learning_rate": 8.413929735117229e-06, + "loss": 0.4951, + "step": 1876 + }, + { + "epoch": 0.28, + "grad_norm": 7.437918597614514, + "learning_rate": 8.412144764301133e-06, + "loss": 0.4751, + "step": 1877 + }, + { + "epoch": 0.28, + "grad_norm": 6.584245146489485, + "learning_rate": 8.410358979211792e-06, + "loss": 0.4664, + "step": 1878 + }, + { + "epoch": 0.28, + "grad_norm": 11.412091687789838, + "learning_rate": 8.408572380275365e-06, + "loss": 0.422, + "step": 1879 + }, + { + "epoch": 0.28, + "grad_norm": 9.384494138680635, + "learning_rate": 8.406784967918203e-06, + "loss": 0.4766, + "step": 1880 + }, + { + "epoch": 0.28, + "grad_norm": 7.106123974759556, + "learning_rate": 8.404996742566855e-06, + "loss": 0.4358, + "step": 1881 + }, + { + "epoch": 0.28, + "grad_norm": 10.07143594434362, + "learning_rate": 8.403207704648064e-06, + "loss": 0.5811, + "step": 1882 + }, + { + "epoch": 0.28, + "grad_norm": 10.226107909620538, + "learning_rate": 8.401417854588763e-06, + "loss": 0.5203, + "step": 1883 + }, + { + "epoch": 0.28, + "grad_norm": 9.044694800131651, + "learning_rate": 8.399627192816082e-06, + "loss": 0.4824, + "step": 1884 + }, + { + "epoch": 0.28, + "grad_norm": 6.049425244878882, + "learning_rate": 8.397835719757343e-06, + "loss": 0.5795, + "step": 1885 + }, + { + "epoch": 0.28, + "grad_norm": 10.192805016853793, + "learning_rate": 8.396043435840064e-06, + "loss": 0.5078, + "step": 1886 + }, + { + "epoch": 0.28, + "grad_norm": 9.214471471189285, + "learning_rate": 8.394250341491958e-06, + "loss": 0.4712, + "step": 1887 + }, + { + "epoch": 0.28, + "grad_norm": 12.350337528536405, + "learning_rate": 8.392456437140923e-06, + "loss": 0.4444, + "step": 1888 + }, + { + "epoch": 0.28, + "grad_norm": 7.81283897269889, + "learning_rate": 8.39066172321506e-06, + "loss": 0.4932, + "step": 1889 + }, + { + "epoch": 0.29, + "grad_norm": 6.287695319714385, + "learning_rate": 8.388866200142656e-06, + "loss": 0.4708, + "step": 1890 + }, + { + "epoch": 0.29, + "grad_norm": 7.109766842376803, + "learning_rate": 8.387069868352196e-06, + "loss": 0.4238, + "step": 1891 + }, + { + "epoch": 0.29, + "grad_norm": 4.712429590629011, + "learning_rate": 8.385272728272356e-06, + "loss": 0.4536, + "step": 1892 + }, + { + "epoch": 0.29, + "grad_norm": 6.013873095311785, + "learning_rate": 8.383474780332002e-06, + "loss": 0.4801, + "step": 1893 + }, + { + "epoch": 0.29, + "grad_norm": 7.849634504896088, + "learning_rate": 8.381676024960202e-06, + "loss": 0.4695, + "step": 1894 + }, + { + "epoch": 0.29, + "grad_norm": 5.673773777896566, + "learning_rate": 8.379876462586202e-06, + "loss": 0.5303, + "step": 1895 + }, + { + "epoch": 0.29, + "grad_norm": 6.223613553587142, + "learning_rate": 8.378076093639458e-06, + "loss": 0.4541, + "step": 1896 + }, + { + "epoch": 0.29, + "grad_norm": 9.140242999750354, + "learning_rate": 8.376274918549605e-06, + "loss": 0.4952, + "step": 1897 + }, + { + "epoch": 0.29, + "grad_norm": 5.101780926044955, + "learning_rate": 8.374472937746474e-06, + "loss": 0.4243, + "step": 1898 + }, + { + "epoch": 0.29, + "grad_norm": 6.443413843434282, + "learning_rate": 8.372670151660092e-06, + "loss": 0.4903, + "step": 1899 + }, + { + "epoch": 0.29, + "grad_norm": 5.087927991425739, + "learning_rate": 8.370866560720671e-06, + "loss": 0.4677, + "step": 1900 + }, + { + "epoch": 0.29, + "grad_norm": 6.292840443964737, + "learning_rate": 8.369062165358626e-06, + "loss": 0.551, + "step": 1901 + }, + { + "epoch": 0.29, + "grad_norm": 5.371315197490912, + "learning_rate": 8.367256966004552e-06, + "loss": 0.4542, + "step": 1902 + }, + { + "epoch": 0.29, + "grad_norm": 7.408560536206505, + "learning_rate": 8.365450963089242e-06, + "loss": 0.4907, + "step": 1903 + }, + { + "epoch": 0.29, + "grad_norm": 5.912018416396802, + "learning_rate": 8.363644157043682e-06, + "loss": 0.5776, + "step": 1904 + }, + { + "epoch": 0.29, + "grad_norm": 59.97594752450321, + "learning_rate": 8.361836548299047e-06, + "loss": 0.477, + "step": 1905 + }, + { + "epoch": 0.29, + "grad_norm": 11.05013248106383, + "learning_rate": 8.360028137286701e-06, + "loss": 0.5311, + "step": 1906 + }, + { + "epoch": 0.29, + "grad_norm": 7.186335230654786, + "learning_rate": 8.358218924438208e-06, + "loss": 0.4646, + "step": 1907 + }, + { + "epoch": 0.29, + "grad_norm": 9.867807096724809, + "learning_rate": 8.356408910185317e-06, + "loss": 0.4589, + "step": 1908 + }, + { + "epoch": 0.29, + "grad_norm": 5.581858110443558, + "learning_rate": 8.354598094959965e-06, + "loss": 0.4849, + "step": 1909 + }, + { + "epoch": 0.29, + "grad_norm": 4.212448272381275, + "learning_rate": 8.352786479194288e-06, + "loss": 0.4546, + "step": 1910 + }, + { + "epoch": 0.29, + "grad_norm": 4.466302170262632, + "learning_rate": 8.350974063320608e-06, + "loss": 0.4207, + "step": 1911 + }, + { + "epoch": 0.29, + "grad_norm": 7.770867287582647, + "learning_rate": 8.349160847771442e-06, + "loss": 0.4561, + "step": 1912 + }, + { + "epoch": 0.29, + "grad_norm": 4.13976262285489, + "learning_rate": 8.347346832979491e-06, + "loss": 0.4789, + "step": 1913 + }, + { + "epoch": 0.29, + "grad_norm": 4.063852300104583, + "learning_rate": 8.345532019377655e-06, + "loss": 0.5, + "step": 1914 + }, + { + "epoch": 0.29, + "grad_norm": 7.668937730923807, + "learning_rate": 8.343716407399019e-06, + "loss": 0.4153, + "step": 1915 + }, + { + "epoch": 0.29, + "grad_norm": 7.095286181012478, + "learning_rate": 8.341899997476859e-06, + "loss": 0.5318, + "step": 1916 + }, + { + "epoch": 0.29, + "grad_norm": 3.9239370408539074, + "learning_rate": 8.340082790044644e-06, + "loss": 0.4303, + "step": 1917 + }, + { + "epoch": 0.29, + "grad_norm": 4.428503042912719, + "learning_rate": 8.338264785536032e-06, + "loss": 0.4307, + "step": 1918 + }, + { + "epoch": 0.29, + "grad_norm": 13.175547894527906, + "learning_rate": 8.33644598438487e-06, + "loss": 0.4566, + "step": 1919 + }, + { + "epoch": 0.29, + "grad_norm": 3.91154551936458, + "learning_rate": 8.334626387025196e-06, + "loss": 0.3902, + "step": 1920 + }, + { + "epoch": 0.29, + "grad_norm": 4.185033562289264, + "learning_rate": 8.332805993891241e-06, + "loss": 0.4078, + "step": 1921 + }, + { + "epoch": 0.29, + "grad_norm": 4.529501153304833, + "learning_rate": 8.330984805417419e-06, + "loss": 0.5096, + "step": 1922 + }, + { + "epoch": 0.29, + "grad_norm": 16.859744204279707, + "learning_rate": 8.32916282203834e-06, + "loss": 0.5028, + "step": 1923 + }, + { + "epoch": 0.29, + "grad_norm": 5.090610679074903, + "learning_rate": 8.327340044188803e-06, + "loss": 0.4525, + "step": 1924 + }, + { + "epoch": 0.29, + "grad_norm": 1.2425053351042594, + "learning_rate": 8.325516472303792e-06, + "loss": 0.5785, + "step": 1925 + }, + { + "epoch": 0.29, + "grad_norm": 3.319463705588319, + "learning_rate": 8.323692106818486e-06, + "loss": 0.4281, + "step": 1926 + }, + { + "epoch": 0.29, + "grad_norm": 7.479883960634091, + "learning_rate": 8.321866948168252e-06, + "loss": 0.4025, + "step": 1927 + }, + { + "epoch": 0.29, + "grad_norm": 5.564403959032149, + "learning_rate": 8.320040996788641e-06, + "loss": 0.4877, + "step": 1928 + }, + { + "epoch": 0.29, + "grad_norm": 4.55778089581736, + "learning_rate": 8.3182142531154e-06, + "loss": 0.4558, + "step": 1929 + }, + { + "epoch": 0.29, + "grad_norm": 5.088171622900685, + "learning_rate": 8.316386717584463e-06, + "loss": 0.4797, + "step": 1930 + }, + { + "epoch": 0.29, + "grad_norm": 4.388169511492117, + "learning_rate": 8.314558390631954e-06, + "loss": 0.4528, + "step": 1931 + }, + { + "epoch": 0.29, + "grad_norm": 9.697441883225904, + "learning_rate": 8.312729272694184e-06, + "loss": 0.5191, + "step": 1932 + }, + { + "epoch": 0.29, + "grad_norm": 6.525395999412714, + "learning_rate": 8.31089936420765e-06, + "loss": 0.5245, + "step": 1933 + }, + { + "epoch": 0.29, + "grad_norm": 5.53724236000614, + "learning_rate": 8.309068665609043e-06, + "loss": 0.4804, + "step": 1934 + }, + { + "epoch": 0.29, + "grad_norm": 3.5059799080755116, + "learning_rate": 8.307237177335238e-06, + "loss": 0.4682, + "step": 1935 + }, + { + "epoch": 0.29, + "grad_norm": 3.9493644428581867, + "learning_rate": 8.305404899823308e-06, + "loss": 0.4159, + "step": 1936 + }, + { + "epoch": 0.29, + "grad_norm": 5.584174767082249, + "learning_rate": 8.303571833510498e-06, + "loss": 0.5067, + "step": 1937 + }, + { + "epoch": 0.29, + "grad_norm": 8.873649077967587, + "learning_rate": 8.301737978834257e-06, + "loss": 0.4753, + "step": 1938 + }, + { + "epoch": 0.29, + "grad_norm": 11.409539324638367, + "learning_rate": 8.299903336232212e-06, + "loss": 0.5078, + "step": 1939 + }, + { + "epoch": 0.29, + "grad_norm": 4.826054198120511, + "learning_rate": 8.298067906142182e-06, + "loss": 0.4995, + "step": 1940 + }, + { + "epoch": 0.29, + "grad_norm": 4.586393167785583, + "learning_rate": 8.296231689002176e-06, + "loss": 0.5092, + "step": 1941 + }, + { + "epoch": 0.29, + "grad_norm": 3.5385528280024623, + "learning_rate": 8.294394685250386e-06, + "loss": 0.4935, + "step": 1942 + }, + { + "epoch": 0.29, + "grad_norm": 7.793228514442019, + "learning_rate": 8.292556895325195e-06, + "loss": 0.4801, + "step": 1943 + }, + { + "epoch": 0.29, + "grad_norm": 3.812104874920025, + "learning_rate": 8.29071831966517e-06, + "loss": 0.4289, + "step": 1944 + }, + { + "epoch": 0.29, + "grad_norm": 5.450796437518967, + "learning_rate": 8.288878958709072e-06, + "loss": 0.4563, + "step": 1945 + }, + { + "epoch": 0.29, + "grad_norm": 3.6190766930547618, + "learning_rate": 8.287038812895845e-06, + "loss": 0.5445, + "step": 1946 + }, + { + "epoch": 0.29, + "grad_norm": 5.100854100676929, + "learning_rate": 8.28519788266462e-06, + "loss": 0.4698, + "step": 1947 + }, + { + "epoch": 0.29, + "grad_norm": 3.1853678960862974, + "learning_rate": 8.283356168454715e-06, + "loss": 0.4519, + "step": 1948 + }, + { + "epoch": 0.29, + "grad_norm": 6.803452130620062, + "learning_rate": 8.281513670705637e-06, + "loss": 0.4749, + "step": 1949 + }, + { + "epoch": 0.29, + "grad_norm": 4.627115010571595, + "learning_rate": 8.27967038985708e-06, + "loss": 0.554, + "step": 1950 + }, + { + "epoch": 0.29, + "grad_norm": 3.8611967206334916, + "learning_rate": 8.277826326348921e-06, + "loss": 0.5131, + "step": 1951 + }, + { + "epoch": 0.29, + "grad_norm": 2.6442814803644947, + "learning_rate": 8.275981480621232e-06, + "loss": 0.4688, + "step": 1952 + }, + { + "epoch": 0.29, + "grad_norm": 2.5180105780469244, + "learning_rate": 8.274135853114263e-06, + "loss": 0.4834, + "step": 1953 + }, + { + "epoch": 0.29, + "grad_norm": 3.0693467847055547, + "learning_rate": 8.272289444268452e-06, + "loss": 0.4519, + "step": 1954 + }, + { + "epoch": 0.29, + "grad_norm": 3.362765111180697, + "learning_rate": 8.270442254524432e-06, + "loss": 0.5201, + "step": 1955 + }, + { + "epoch": 0.3, + "grad_norm": 3.3921777883462862, + "learning_rate": 8.268594284323008e-06, + "loss": 0.4216, + "step": 1956 + }, + { + "epoch": 0.3, + "grad_norm": 2.500942070745847, + "learning_rate": 8.266745534105185e-06, + "loss": 0.4955, + "step": 1957 + }, + { + "epoch": 0.3, + "grad_norm": 2.255584062934611, + "learning_rate": 8.264896004312147e-06, + "loss": 0.498, + "step": 1958 + }, + { + "epoch": 0.3, + "grad_norm": 3.2127037395125395, + "learning_rate": 8.26304569538526e-06, + "loss": 0.521, + "step": 1959 + }, + { + "epoch": 0.3, + "grad_norm": 2.700747247072509, + "learning_rate": 8.261194607766089e-06, + "loss": 0.447, + "step": 1960 + }, + { + "epoch": 0.3, + "grad_norm": 1.4751087350130125, + "learning_rate": 8.259342741896376e-06, + "loss": 0.6339, + "step": 1961 + }, + { + "epoch": 0.3, + "grad_norm": 3.2214553913778934, + "learning_rate": 8.257490098218044e-06, + "loss": 0.4278, + "step": 1962 + }, + { + "epoch": 0.3, + "grad_norm": 8.338134168787052, + "learning_rate": 8.255636677173212e-06, + "loss": 0.4849, + "step": 1963 + }, + { + "epoch": 0.3, + "grad_norm": 3.1859409457637344, + "learning_rate": 8.253782479204178e-06, + "loss": 0.4765, + "step": 1964 + }, + { + "epoch": 0.3, + "grad_norm": 2.5117292554240382, + "learning_rate": 8.251927504753427e-06, + "loss": 0.485, + "step": 1965 + }, + { + "epoch": 0.3, + "grad_norm": 3.4926681944018276, + "learning_rate": 8.25007175426363e-06, + "loss": 0.5457, + "step": 1966 + }, + { + "epoch": 0.3, + "grad_norm": 7.502478848056594, + "learning_rate": 8.248215228177645e-06, + "loss": 0.4623, + "step": 1967 + }, + { + "epoch": 0.3, + "grad_norm": 10.578073808552931, + "learning_rate": 8.24635792693851e-06, + "loss": 0.5076, + "step": 1968 + }, + { + "epoch": 0.3, + "grad_norm": 3.4154922910540093, + "learning_rate": 8.244499850989453e-06, + "loss": 0.5006, + "step": 1969 + }, + { + "epoch": 0.3, + "grad_norm": 2.909656340943361, + "learning_rate": 8.242641000773882e-06, + "loss": 0.5305, + "step": 1970 + }, + { + "epoch": 0.3, + "grad_norm": 2.8961743673901017, + "learning_rate": 8.240781376735395e-06, + "loss": 0.4628, + "step": 1971 + }, + { + "epoch": 0.3, + "grad_norm": 2.917908945541841, + "learning_rate": 8.23892097931777e-06, + "loss": 0.4598, + "step": 1972 + }, + { + "epoch": 0.3, + "grad_norm": 3.741156802336023, + "learning_rate": 8.237059808964975e-06, + "loss": 0.4473, + "step": 1973 + }, + { + "epoch": 0.3, + "grad_norm": 4.032668712719807, + "learning_rate": 8.235197866121155e-06, + "loss": 0.5037, + "step": 1974 + }, + { + "epoch": 0.3, + "grad_norm": 3.0185136848240717, + "learning_rate": 8.233335151230645e-06, + "loss": 0.472, + "step": 1975 + }, + { + "epoch": 0.3, + "grad_norm": 12.429646674056704, + "learning_rate": 8.231471664737967e-06, + "loss": 0.4228, + "step": 1976 + }, + { + "epoch": 0.3, + "grad_norm": 4.228851602991045, + "learning_rate": 8.229607407087816e-06, + "loss": 0.5216, + "step": 1977 + }, + { + "epoch": 0.3, + "grad_norm": 3.5526744296993145, + "learning_rate": 8.227742378725082e-06, + "loss": 0.4672, + "step": 1978 + }, + { + "epoch": 0.3, + "grad_norm": 3.882425841669333, + "learning_rate": 8.225876580094833e-06, + "loss": 0.498, + "step": 1979 + }, + { + "epoch": 0.3, + "grad_norm": 3.087778652514507, + "learning_rate": 8.224010011642326e-06, + "loss": 0.4754, + "step": 1980 + }, + { + "epoch": 0.3, + "grad_norm": 4.187336347503646, + "learning_rate": 8.222142673812995e-06, + "loss": 0.5261, + "step": 1981 + }, + { + "epoch": 0.3, + "grad_norm": 3.523961548911623, + "learning_rate": 8.220274567052461e-06, + "loss": 0.5126, + "step": 1982 + }, + { + "epoch": 0.3, + "grad_norm": 3.5052179445481246, + "learning_rate": 8.21840569180653e-06, + "loss": 0.5107, + "step": 1983 + }, + { + "epoch": 0.3, + "grad_norm": 3.0612608496042757, + "learning_rate": 8.216536048521188e-06, + "loss": 0.4819, + "step": 1984 + }, + { + "epoch": 0.3, + "grad_norm": 3.6089297594096084, + "learning_rate": 8.21466563764261e-06, + "loss": 0.4708, + "step": 1985 + }, + { + "epoch": 0.3, + "grad_norm": 4.026456647855598, + "learning_rate": 8.212794459617144e-06, + "loss": 0.4199, + "step": 1986 + }, + { + "epoch": 0.3, + "grad_norm": 3.656058538823349, + "learning_rate": 8.210922514891333e-06, + "loss": 0.5044, + "step": 1987 + }, + { + "epoch": 0.3, + "grad_norm": 3.9403317306778742, + "learning_rate": 8.209049803911895e-06, + "loss": 0.4356, + "step": 1988 + }, + { + "epoch": 0.3, + "grad_norm": 3.5577057754780164, + "learning_rate": 8.207176327125735e-06, + "loss": 0.5261, + "step": 1989 + }, + { + "epoch": 0.3, + "grad_norm": 3.407363825377123, + "learning_rate": 8.205302084979937e-06, + "loss": 0.4201, + "step": 1990 + }, + { + "epoch": 0.3, + "grad_norm": 3.504710866212139, + "learning_rate": 8.203427077921768e-06, + "loss": 0.5056, + "step": 1991 + }, + { + "epoch": 0.3, + "grad_norm": 3.571440377051897, + "learning_rate": 8.201551306398685e-06, + "loss": 0.4444, + "step": 1992 + }, + { + "epoch": 0.3, + "grad_norm": 2.5763005484626396, + "learning_rate": 8.199674770858315e-06, + "loss": 0.5007, + "step": 1993 + }, + { + "epoch": 0.3, + "grad_norm": 3.670484043552327, + "learning_rate": 8.197797471748478e-06, + "loss": 0.5201, + "step": 1994 + }, + { + "epoch": 0.3, + "grad_norm": 3.552146469875878, + "learning_rate": 8.19591940951717e-06, + "loss": 0.5054, + "step": 1995 + }, + { + "epoch": 0.3, + "grad_norm": 4.3374418475358905, + "learning_rate": 8.194040584612573e-06, + "loss": 0.5143, + "step": 1996 + }, + { + "epoch": 0.3, + "grad_norm": 4.353304581907365, + "learning_rate": 8.19216099748305e-06, + "loss": 0.5117, + "step": 1997 + }, + { + "epoch": 0.3, + "grad_norm": 3.4034030542110356, + "learning_rate": 8.19028064857714e-06, + "loss": 0.5334, + "step": 1998 + }, + { + "epoch": 0.3, + "grad_norm": 3.3089755476199203, + "learning_rate": 8.188399538343574e-06, + "loss": 0.5027, + "step": 1999 + }, + { + "epoch": 0.3, + "grad_norm": 5.868066876994381, + "learning_rate": 8.18651766723126e-06, + "loss": 0.481, + "step": 2000 + }, + { + "epoch": 0.3, + "grad_norm": 5.767863292317751, + "learning_rate": 8.184635035689283e-06, + "loss": 0.4768, + "step": 2001 + }, + { + "epoch": 0.3, + "grad_norm": 8.372190543989447, + "learning_rate": 8.182751644166915e-06, + "loss": 0.3999, + "step": 2002 + }, + { + "epoch": 0.3, + "grad_norm": 4.446942437485215, + "learning_rate": 8.180867493113611e-06, + "loss": 0.5254, + "step": 2003 + }, + { + "epoch": 0.3, + "grad_norm": 4.214872549516012, + "learning_rate": 8.178982582979001e-06, + "loss": 0.5101, + "step": 2004 + }, + { + "epoch": 0.3, + "grad_norm": 5.269443902190875, + "learning_rate": 8.177096914212901e-06, + "loss": 0.4826, + "step": 2005 + }, + { + "epoch": 0.3, + "grad_norm": 4.19947193220622, + "learning_rate": 8.175210487265307e-06, + "loss": 0.4632, + "step": 2006 + }, + { + "epoch": 0.3, + "grad_norm": 5.6315303257785265, + "learning_rate": 8.173323302586393e-06, + "loss": 0.5393, + "step": 2007 + }, + { + "epoch": 0.3, + "grad_norm": 4.239807957951117, + "learning_rate": 8.171435360626519e-06, + "loss": 0.5029, + "step": 2008 + }, + { + "epoch": 0.3, + "grad_norm": 4.537615747047689, + "learning_rate": 8.16954666183622e-06, + "loss": 0.4415, + "step": 2009 + }, + { + "epoch": 0.3, + "grad_norm": 2.150769027654825, + "learning_rate": 8.167657206666217e-06, + "loss": 0.5134, + "step": 2010 + }, + { + "epoch": 0.3, + "grad_norm": 3.5656512067934027, + "learning_rate": 8.165766995567408e-06, + "loss": 0.5854, + "step": 2011 + }, + { + "epoch": 0.3, + "grad_norm": 3.013288406417272, + "learning_rate": 8.16387602899087e-06, + "loss": 0.5072, + "step": 2012 + }, + { + "epoch": 0.3, + "grad_norm": 3.8652192956738816, + "learning_rate": 8.161984307387869e-06, + "loss": 0.5515, + "step": 2013 + }, + { + "epoch": 0.3, + "grad_norm": 2.5103800302150816, + "learning_rate": 8.16009183120984e-06, + "loss": 0.4655, + "step": 2014 + }, + { + "epoch": 0.3, + "grad_norm": 4.4720729120187475, + "learning_rate": 8.158198600908404e-06, + "loss": 0.4939, + "step": 2015 + }, + { + "epoch": 0.3, + "grad_norm": 7.958453184624063, + "learning_rate": 8.156304616935362e-06, + "loss": 0.5417, + "step": 2016 + }, + { + "epoch": 0.3, + "grad_norm": 3.846866877369375, + "learning_rate": 8.15440987974269e-06, + "loss": 0.6142, + "step": 2017 + }, + { + "epoch": 0.3, + "grad_norm": 3.76397675643439, + "learning_rate": 8.152514389782553e-06, + "loss": 0.4306, + "step": 2018 + }, + { + "epoch": 0.3, + "grad_norm": 3.308392708126946, + "learning_rate": 8.150618147507288e-06, + "loss": 0.5295, + "step": 2019 + }, + { + "epoch": 0.3, + "grad_norm": 4.486768825300372, + "learning_rate": 8.148721153369413e-06, + "loss": 0.4603, + "step": 2020 + }, + { + "epoch": 0.3, + "grad_norm": 3.716408944567494, + "learning_rate": 8.146823407821625e-06, + "loss": 0.4265, + "step": 2021 + }, + { + "epoch": 0.3, + "grad_norm": 3.3296597656415257, + "learning_rate": 8.144924911316803e-06, + "loss": 0.4787, + "step": 2022 + }, + { + "epoch": 0.31, + "grad_norm": 5.990574664500636, + "learning_rate": 8.143025664308005e-06, + "loss": 0.4943, + "step": 2023 + }, + { + "epoch": 0.31, + "grad_norm": 3.1312951518334695, + "learning_rate": 8.141125667248463e-06, + "loss": 0.5317, + "step": 2024 + }, + { + "epoch": 0.31, + "grad_norm": 4.681245116193845, + "learning_rate": 8.139224920591598e-06, + "loss": 0.4425, + "step": 2025 + }, + { + "epoch": 0.31, + "grad_norm": 3.8267613626899046, + "learning_rate": 8.137323424790996e-06, + "loss": 0.4611, + "step": 2026 + }, + { + "epoch": 0.31, + "grad_norm": 2.7696190903946976, + "learning_rate": 8.135421180300435e-06, + "loss": 0.5465, + "step": 2027 + }, + { + "epoch": 0.31, + "grad_norm": 3.4065612451213854, + "learning_rate": 8.133518187573864e-06, + "loss": 0.4431, + "step": 2028 + }, + { + "epoch": 0.31, + "grad_norm": 3.7174488705796107, + "learning_rate": 8.131614447065411e-06, + "loss": 0.4446, + "step": 2029 + }, + { + "epoch": 0.31, + "grad_norm": 4.050472334301364, + "learning_rate": 8.129709959229389e-06, + "loss": 0.4785, + "step": 2030 + }, + { + "epoch": 0.31, + "grad_norm": 3.4351775185835813, + "learning_rate": 8.127804724520278e-06, + "loss": 0.5031, + "step": 2031 + }, + { + "epoch": 0.31, + "grad_norm": 3.007100437839703, + "learning_rate": 8.125898743392748e-06, + "loss": 0.4808, + "step": 2032 + }, + { + "epoch": 0.31, + "grad_norm": 1.5586077632724835, + "learning_rate": 8.12399201630164e-06, + "loss": 0.6114, + "step": 2033 + }, + { + "epoch": 0.31, + "grad_norm": 5.376235176776049, + "learning_rate": 8.122084543701974e-06, + "loss": 0.4648, + "step": 2034 + }, + { + "epoch": 0.31, + "grad_norm": 3.6721041804343337, + "learning_rate": 8.12017632604895e-06, + "loss": 0.4845, + "step": 2035 + }, + { + "epoch": 0.31, + "grad_norm": 3.681413460958966, + "learning_rate": 8.118267363797943e-06, + "loss": 0.5184, + "step": 2036 + }, + { + "epoch": 0.31, + "grad_norm": 3.731296360387705, + "learning_rate": 8.116357657404508e-06, + "loss": 0.4976, + "step": 2037 + }, + { + "epoch": 0.31, + "grad_norm": 4.569076338376435, + "learning_rate": 8.114447207324377e-06, + "loss": 0.4644, + "step": 2038 + }, + { + "epoch": 0.31, + "grad_norm": 4.791417564080911, + "learning_rate": 8.11253601401346e-06, + "loss": 0.5108, + "step": 2039 + }, + { + "epoch": 0.31, + "grad_norm": 5.444381450521485, + "learning_rate": 8.110624077927841e-06, + "loss": 0.4431, + "step": 2040 + }, + { + "epoch": 0.31, + "grad_norm": 4.435879462004735, + "learning_rate": 8.108711399523787e-06, + "loss": 0.4465, + "step": 2041 + }, + { + "epoch": 0.31, + "grad_norm": 4.582886714480457, + "learning_rate": 8.106797979257738e-06, + "loss": 0.4837, + "step": 2042 + }, + { + "epoch": 0.31, + "grad_norm": 4.756466978791274, + "learning_rate": 8.10488381758631e-06, + "loss": 0.45, + "step": 2043 + }, + { + "epoch": 0.31, + "grad_norm": 5.009929127273303, + "learning_rate": 8.1029689149663e-06, + "loss": 0.5163, + "step": 2044 + }, + { + "epoch": 0.31, + "grad_norm": 3.113618177217519, + "learning_rate": 8.101053271854681e-06, + "loss": 0.4631, + "step": 2045 + }, + { + "epoch": 0.31, + "grad_norm": 3.933276270302843, + "learning_rate": 8.099136888708598e-06, + "loss": 0.507, + "step": 2046 + }, + { + "epoch": 0.31, + "grad_norm": 4.4842137529285875, + "learning_rate": 8.09721976598538e-06, + "loss": 0.5072, + "step": 2047 + }, + { + "epoch": 0.31, + "grad_norm": 2.8269996026018434, + "learning_rate": 8.095301904142526e-06, + "loss": 0.4863, + "step": 2048 + }, + { + "epoch": 0.31, + "grad_norm": 5.024216530452258, + "learning_rate": 8.093383303637715e-06, + "loss": 0.4922, + "step": 2049 + }, + { + "epoch": 0.31, + "grad_norm": 3.7200555831604305, + "learning_rate": 8.091463964928801e-06, + "loss": 0.4707, + "step": 2050 + }, + { + "epoch": 0.31, + "grad_norm": 7.834822972876296, + "learning_rate": 8.089543888473814e-06, + "loss": 0.4926, + "step": 2051 + }, + { + "epoch": 0.31, + "grad_norm": 4.426336026688418, + "learning_rate": 8.08762307473096e-06, + "loss": 0.467, + "step": 2052 + }, + { + "epoch": 0.31, + "grad_norm": 4.5505786325782305, + "learning_rate": 8.085701524158623e-06, + "loss": 0.5343, + "step": 2053 + }, + { + "epoch": 0.31, + "grad_norm": 4.977518940312848, + "learning_rate": 8.083779237215363e-06, + "loss": 0.4399, + "step": 2054 + }, + { + "epoch": 0.31, + "grad_norm": 3.311429715090669, + "learning_rate": 8.08185621435991e-06, + "loss": 0.4652, + "step": 2055 + }, + { + "epoch": 0.31, + "grad_norm": 7.255482934946208, + "learning_rate": 8.079932456051173e-06, + "loss": 0.4484, + "step": 2056 + }, + { + "epoch": 0.31, + "grad_norm": 4.1025038690054565, + "learning_rate": 8.078007962748243e-06, + "loss": 0.454, + "step": 2057 + }, + { + "epoch": 0.31, + "grad_norm": 4.484045824534379, + "learning_rate": 8.076082734910374e-06, + "loss": 0.4777, + "step": 2058 + }, + { + "epoch": 0.31, + "grad_norm": 5.70323585591674, + "learning_rate": 8.074156772997005e-06, + "loss": 0.4541, + "step": 2059 + }, + { + "epoch": 0.31, + "grad_norm": 5.660005945017618, + "learning_rate": 8.072230077467749e-06, + "loss": 0.5198, + "step": 2060 + }, + { + "epoch": 0.31, + "grad_norm": 4.804770453324357, + "learning_rate": 8.070302648782389e-06, + "loss": 0.4886, + "step": 2061 + }, + { + "epoch": 0.31, + "grad_norm": 4.038516545225295, + "learning_rate": 8.068374487400885e-06, + "loss": 0.4563, + "step": 2062 + }, + { + "epoch": 0.31, + "grad_norm": 4.090637895257619, + "learning_rate": 8.066445593783377e-06, + "loss": 0.4443, + "step": 2063 + }, + { + "epoch": 0.31, + "grad_norm": 5.150128018373972, + "learning_rate": 8.064515968390175e-06, + "loss": 0.4921, + "step": 2064 + }, + { + "epoch": 0.31, + "grad_norm": 5.375706286153132, + "learning_rate": 8.062585611681758e-06, + "loss": 0.4992, + "step": 2065 + }, + { + "epoch": 0.31, + "grad_norm": 3.8341800901534477, + "learning_rate": 8.060654524118793e-06, + "loss": 0.4824, + "step": 2066 + }, + { + "epoch": 0.31, + "grad_norm": 5.327680655176633, + "learning_rate": 8.058722706162112e-06, + "loss": 0.4786, + "step": 2067 + }, + { + "epoch": 0.31, + "grad_norm": 6.437113712214887, + "learning_rate": 8.05679015827272e-06, + "loss": 0.5071, + "step": 2068 + }, + { + "epoch": 0.31, + "grad_norm": 4.543795078075457, + "learning_rate": 8.054856880911806e-06, + "loss": 0.4857, + "step": 2069 + }, + { + "epoch": 0.31, + "grad_norm": 6.009692575470058, + "learning_rate": 8.052922874540722e-06, + "loss": 0.5398, + "step": 2070 + }, + { + "epoch": 0.31, + "grad_norm": 3.408972246639082, + "learning_rate": 8.050988139621001e-06, + "loss": 0.445, + "step": 2071 + }, + { + "epoch": 0.31, + "grad_norm": 3.3962576168139185, + "learning_rate": 8.049052676614346e-06, + "loss": 0.4225, + "step": 2072 + }, + { + "epoch": 0.31, + "grad_norm": 6.576044474025559, + "learning_rate": 8.047116485982638e-06, + "loss": 0.4897, + "step": 2073 + }, + { + "epoch": 0.31, + "grad_norm": 4.573494057779335, + "learning_rate": 8.045179568187925e-06, + "loss": 0.4982, + "step": 2074 + }, + { + "epoch": 0.31, + "grad_norm": 3.952367989162258, + "learning_rate": 8.043241923692436e-06, + "loss": 0.417, + "step": 2075 + }, + { + "epoch": 0.31, + "grad_norm": 3.4992245110181988, + "learning_rate": 8.041303552958567e-06, + "loss": 0.472, + "step": 2076 + }, + { + "epoch": 0.31, + "grad_norm": 8.292086418026972, + "learning_rate": 8.039364456448892e-06, + "loss": 0.5095, + "step": 2077 + }, + { + "epoch": 0.31, + "grad_norm": 3.679280125003664, + "learning_rate": 8.037424634626157e-06, + "loss": 0.4253, + "step": 2078 + }, + { + "epoch": 0.31, + "grad_norm": 3.922984905593826, + "learning_rate": 8.035484087953278e-06, + "loss": 0.538, + "step": 2079 + }, + { + "epoch": 0.31, + "grad_norm": 4.818958788630032, + "learning_rate": 8.03354281689335e-06, + "loss": 0.4862, + "step": 2080 + }, + { + "epoch": 0.31, + "grad_norm": 5.589804268321271, + "learning_rate": 8.031600821909636e-06, + "loss": 0.5003, + "step": 2081 + }, + { + "epoch": 0.31, + "grad_norm": 4.516644248174846, + "learning_rate": 8.029658103465571e-06, + "loss": 0.5076, + "step": 2082 + }, + { + "epoch": 0.31, + "grad_norm": 5.502339125545945, + "learning_rate": 8.027714662024768e-06, + "loss": 0.5039, + "step": 2083 + }, + { + "epoch": 0.31, + "grad_norm": 3.663590284462899, + "learning_rate": 8.025770498051007e-06, + "loss": 0.4998, + "step": 2084 + }, + { + "epoch": 0.31, + "grad_norm": 4.695407621501322, + "learning_rate": 8.023825612008243e-06, + "loss": 0.4742, + "step": 2085 + }, + { + "epoch": 0.31, + "grad_norm": 4.374660107052677, + "learning_rate": 8.021880004360604e-06, + "loss": 0.4294, + "step": 2086 + }, + { + "epoch": 0.31, + "grad_norm": 1.2727153203765817, + "learning_rate": 8.019933675572389e-06, + "loss": 0.5777, + "step": 2087 + }, + { + "epoch": 0.31, + "grad_norm": 4.6615253900440425, + "learning_rate": 8.01798662610807e-06, + "loss": 0.5448, + "step": 2088 + }, + { + "epoch": 0.32, + "grad_norm": 4.530246365631273, + "learning_rate": 8.016038856432287e-06, + "loss": 0.4614, + "step": 2089 + }, + { + "epoch": 0.32, + "grad_norm": 5.065180473846315, + "learning_rate": 8.01409036700986e-06, + "loss": 0.4679, + "step": 2090 + }, + { + "epoch": 0.32, + "grad_norm": 4.371048369007121, + "learning_rate": 8.012141158305773e-06, + "loss": 0.5205, + "step": 2091 + }, + { + "epoch": 0.32, + "grad_norm": 5.587327698869069, + "learning_rate": 8.010191230785184e-06, + "loss": 0.5486, + "step": 2092 + }, + { + "epoch": 0.32, + "grad_norm": 4.821258332716041, + "learning_rate": 8.008240584913426e-06, + "loss": 0.508, + "step": 2093 + }, + { + "epoch": 0.32, + "grad_norm": 2.011982498831261, + "learning_rate": 8.006289221155998e-06, + "loss": 0.6289, + "step": 2094 + }, + { + "epoch": 0.32, + "grad_norm": 5.878147474332953, + "learning_rate": 8.004337139978575e-06, + "loss": 0.5477, + "step": 2095 + }, + { + "epoch": 0.32, + "grad_norm": 4.282814793944137, + "learning_rate": 8.002384341846998e-06, + "loss": 0.5185, + "step": 2096 + }, + { + "epoch": 0.32, + "grad_norm": 6.342754365287702, + "learning_rate": 8.000430827227285e-06, + "loss": 0.543, + "step": 2097 + }, + { + "epoch": 0.32, + "grad_norm": 6.445318457807897, + "learning_rate": 7.998476596585622e-06, + "loss": 0.492, + "step": 2098 + }, + { + "epoch": 0.32, + "grad_norm": 5.047950773267699, + "learning_rate": 7.996521650388365e-06, + "loss": 0.4311, + "step": 2099 + }, + { + "epoch": 0.32, + "grad_norm": 4.51031738832105, + "learning_rate": 7.994565989102042e-06, + "loss": 0.4892, + "step": 2100 + }, + { + "epoch": 0.32, + "grad_norm": 3.7394656305504292, + "learning_rate": 7.992609613193351e-06, + "loss": 0.45, + "step": 2101 + }, + { + "epoch": 0.32, + "grad_norm": 4.070023113346108, + "learning_rate": 7.990652523129163e-06, + "loss": 0.5129, + "step": 2102 + }, + { + "epoch": 0.32, + "grad_norm": 5.837699230467814, + "learning_rate": 7.988694719376516e-06, + "loss": 0.5216, + "step": 2103 + }, + { + "epoch": 0.32, + "grad_norm": 4.500655472133036, + "learning_rate": 7.986736202402617e-06, + "loss": 0.4732, + "step": 2104 + }, + { + "epoch": 0.32, + "grad_norm": 6.040471246124025, + "learning_rate": 7.98477697267485e-06, + "loss": 0.5288, + "step": 2105 + }, + { + "epoch": 0.32, + "grad_norm": 4.184036126587419, + "learning_rate": 7.982817030660766e-06, + "loss": 0.4802, + "step": 2106 + }, + { + "epoch": 0.32, + "grad_norm": 7.128219390586616, + "learning_rate": 7.980856376828078e-06, + "loss": 0.4946, + "step": 2107 + }, + { + "epoch": 0.32, + "grad_norm": 5.718767210908153, + "learning_rate": 7.978895011644686e-06, + "loss": 0.5292, + "step": 2108 + }, + { + "epoch": 0.32, + "grad_norm": 3.639627820588555, + "learning_rate": 7.97693293557864e-06, + "loss": 0.4575, + "step": 2109 + }, + { + "epoch": 0.32, + "grad_norm": 4.486877771308906, + "learning_rate": 7.974970149098175e-06, + "loss": 0.4639, + "step": 2110 + }, + { + "epoch": 0.32, + "grad_norm": 6.861783287525592, + "learning_rate": 7.973006652671686e-06, + "loss": 0.4708, + "step": 2111 + }, + { + "epoch": 0.32, + "grad_norm": 5.028232489728966, + "learning_rate": 7.971042446767747e-06, + "loss": 0.444, + "step": 2112 + }, + { + "epoch": 0.32, + "grad_norm": 7.772587799046671, + "learning_rate": 7.96907753185509e-06, + "loss": 0.4705, + "step": 2113 + }, + { + "epoch": 0.32, + "grad_norm": 4.84025197652857, + "learning_rate": 7.967111908402623e-06, + "loss": 0.499, + "step": 2114 + }, + { + "epoch": 0.32, + "grad_norm": 4.356241538060642, + "learning_rate": 7.965145576879425e-06, + "loss": 0.5044, + "step": 2115 + }, + { + "epoch": 0.32, + "grad_norm": 7.807750504920291, + "learning_rate": 7.963178537754735e-06, + "loss": 0.4851, + "step": 2116 + }, + { + "epoch": 0.32, + "grad_norm": 4.020002449227239, + "learning_rate": 7.961210791497972e-06, + "loss": 0.4761, + "step": 2117 + }, + { + "epoch": 0.32, + "grad_norm": 7.8999533187777, + "learning_rate": 7.959242338578717e-06, + "loss": 0.4742, + "step": 2118 + }, + { + "epoch": 0.32, + "grad_norm": 31.523979780068863, + "learning_rate": 7.957273179466719e-06, + "loss": 0.5224, + "step": 2119 + }, + { + "epoch": 0.32, + "grad_norm": 7.55498993934344, + "learning_rate": 7.955303314631899e-06, + "loss": 0.4993, + "step": 2120 + }, + { + "epoch": 0.32, + "grad_norm": 5.090650539062269, + "learning_rate": 7.953332744544344e-06, + "loss": 0.3998, + "step": 2121 + }, + { + "epoch": 0.32, + "grad_norm": 7.727608535891911, + "learning_rate": 7.951361469674312e-06, + "loss": 0.5469, + "step": 2122 + }, + { + "epoch": 0.32, + "grad_norm": 4.446840171682057, + "learning_rate": 7.949389490492228e-06, + "loss": 0.5006, + "step": 2123 + }, + { + "epoch": 0.32, + "grad_norm": 6.166113422618472, + "learning_rate": 7.947416807468682e-06, + "loss": 0.5453, + "step": 2124 + }, + { + "epoch": 0.32, + "grad_norm": 6.692659368011256, + "learning_rate": 7.945443421074436e-06, + "loss": 0.4698, + "step": 2125 + }, + { + "epoch": 0.32, + "grad_norm": 7.379013079725918, + "learning_rate": 7.943469331780418e-06, + "loss": 0.4651, + "step": 2126 + }, + { + "epoch": 0.32, + "grad_norm": 9.360336372036317, + "learning_rate": 7.941494540057723e-06, + "loss": 0.5045, + "step": 2127 + }, + { + "epoch": 0.32, + "grad_norm": 3.8798164474051116, + "learning_rate": 7.93951904637762e-06, + "loss": 0.4779, + "step": 2128 + }, + { + "epoch": 0.32, + "grad_norm": 6.586178382440598, + "learning_rate": 7.937542851211533e-06, + "loss": 0.4801, + "step": 2129 + }, + { + "epoch": 0.32, + "grad_norm": 3.256108382607711, + "learning_rate": 7.935565955031064e-06, + "loss": 0.5116, + "step": 2130 + }, + { + "epoch": 0.32, + "grad_norm": 11.640907416460164, + "learning_rate": 7.933588358307978e-06, + "loss": 0.5381, + "step": 2131 + }, + { + "epoch": 0.32, + "grad_norm": 5.954041253881225, + "learning_rate": 7.93161006151421e-06, + "loss": 0.5127, + "step": 2132 + }, + { + "epoch": 0.32, + "grad_norm": 4.240094312523461, + "learning_rate": 7.929631065121859e-06, + "loss": 0.5015, + "step": 2133 + }, + { + "epoch": 0.32, + "grad_norm": 1.3686261349287927, + "learning_rate": 7.927651369603192e-06, + "loss": 0.6195, + "step": 2134 + }, + { + "epoch": 0.32, + "grad_norm": 4.316009964017145, + "learning_rate": 7.925670975430644e-06, + "loss": 0.4647, + "step": 2135 + }, + { + "epoch": 0.32, + "grad_norm": 9.770136480265396, + "learning_rate": 7.923689883076813e-06, + "loss": 0.3973, + "step": 2136 + }, + { + "epoch": 0.32, + "grad_norm": 24.42306528823531, + "learning_rate": 7.92170809301447e-06, + "loss": 0.4305, + "step": 2137 + }, + { + "epoch": 0.32, + "grad_norm": 3.8943321634780705, + "learning_rate": 7.919725605716547e-06, + "loss": 0.4623, + "step": 2138 + }, + { + "epoch": 0.32, + "grad_norm": 5.982860020072937, + "learning_rate": 7.917742421656142e-06, + "loss": 0.4728, + "step": 2139 + }, + { + "epoch": 0.32, + "grad_norm": 4.35216111816546, + "learning_rate": 7.915758541306523e-06, + "loss": 0.5555, + "step": 2140 + }, + { + "epoch": 0.32, + "grad_norm": 5.736581155957992, + "learning_rate": 7.913773965141126e-06, + "loss": 0.4504, + "step": 2141 + }, + { + "epoch": 0.32, + "grad_norm": 6.785217131705203, + "learning_rate": 7.911788693633546e-06, + "loss": 0.5016, + "step": 2142 + }, + { + "epoch": 0.32, + "grad_norm": 5.90801967389207, + "learning_rate": 7.90980272725755e-06, + "loss": 0.4722, + "step": 2143 + }, + { + "epoch": 0.32, + "grad_norm": 22.730499328387808, + "learning_rate": 7.907816066487065e-06, + "loss": 0.5073, + "step": 2144 + }, + { + "epoch": 0.32, + "grad_norm": 3.78894965584396, + "learning_rate": 7.90582871179619e-06, + "loss": 0.4589, + "step": 2145 + }, + { + "epoch": 0.32, + "grad_norm": 5.261887595308605, + "learning_rate": 7.903840663659186e-06, + "loss": 0.4717, + "step": 2146 + }, + { + "epoch": 0.32, + "grad_norm": 4.548216292368708, + "learning_rate": 7.90185192255048e-06, + "loss": 0.4711, + "step": 2147 + }, + { + "epoch": 0.32, + "grad_norm": 4.565504178454162, + "learning_rate": 7.899862488944664e-06, + "loss": 0.4094, + "step": 2148 + }, + { + "epoch": 0.32, + "grad_norm": 5.49578837097961, + "learning_rate": 7.897872363316496e-06, + "loss": 0.3931, + "step": 2149 + }, + { + "epoch": 0.32, + "grad_norm": 5.45909877112897, + "learning_rate": 7.895881546140903e-06, + "loss": 0.5676, + "step": 2150 + }, + { + "epoch": 0.32, + "grad_norm": 4.328625233123079, + "learning_rate": 7.893890037892966e-06, + "loss": 0.512, + "step": 2151 + }, + { + "epoch": 0.32, + "grad_norm": 4.042434989107127, + "learning_rate": 7.891897839047944e-06, + "loss": 0.4762, + "step": 2152 + }, + { + "epoch": 0.32, + "grad_norm": 4.744368777257508, + "learning_rate": 7.889904950081252e-06, + "loss": 0.4937, + "step": 2153 + }, + { + "epoch": 0.32, + "grad_norm": 5.674849416026325, + "learning_rate": 7.887911371468474e-06, + "loss": 0.5436, + "step": 2154 + }, + { + "epoch": 0.33, + "grad_norm": 6.013054728693937, + "learning_rate": 7.885917103685354e-06, + "loss": 0.4946, + "step": 2155 + }, + { + "epoch": 0.33, + "grad_norm": 9.551857759603399, + "learning_rate": 7.883922147207805e-06, + "loss": 0.4562, + "step": 2156 + }, + { + "epoch": 0.33, + "grad_norm": 5.031393922642984, + "learning_rate": 7.881926502511906e-06, + "loss": 0.4254, + "step": 2157 + }, + { + "epoch": 0.33, + "grad_norm": 4.71574286855643, + "learning_rate": 7.879930170073891e-06, + "loss": 0.4783, + "step": 2158 + }, + { + "epoch": 0.33, + "grad_norm": 7.616650024329564, + "learning_rate": 7.877933150370168e-06, + "loss": 0.5097, + "step": 2159 + }, + { + "epoch": 0.33, + "grad_norm": 7.863964779273687, + "learning_rate": 7.875935443877305e-06, + "loss": 0.405, + "step": 2160 + }, + { + "epoch": 0.33, + "grad_norm": 4.196974992329872, + "learning_rate": 7.873937051072037e-06, + "loss": 0.4858, + "step": 2161 + }, + { + "epoch": 0.33, + "grad_norm": 1.16667125838661, + "learning_rate": 7.871937972431253e-06, + "loss": 0.5635, + "step": 2162 + }, + { + "epoch": 0.33, + "grad_norm": 4.719646713991938, + "learning_rate": 7.869938208432016e-06, + "loss": 0.4877, + "step": 2163 + }, + { + "epoch": 0.33, + "grad_norm": 8.1988541600288, + "learning_rate": 7.867937759551551e-06, + "loss": 0.5041, + "step": 2164 + }, + { + "epoch": 0.33, + "grad_norm": 4.0627969480865875, + "learning_rate": 7.865936626267244e-06, + "loss": 0.4481, + "step": 2165 + }, + { + "epoch": 0.33, + "grad_norm": 4.260221441621175, + "learning_rate": 7.86393480905664e-06, + "loss": 0.4045, + "step": 2166 + }, + { + "epoch": 0.33, + "grad_norm": 6.323204477134403, + "learning_rate": 7.86193230839746e-06, + "loss": 0.4774, + "step": 2167 + }, + { + "epoch": 0.33, + "grad_norm": 5.634169001136357, + "learning_rate": 7.859929124767574e-06, + "loss": 0.4664, + "step": 2168 + }, + { + "epoch": 0.33, + "grad_norm": 8.307725787834439, + "learning_rate": 7.857925258645022e-06, + "loss": 0.508, + "step": 2169 + }, + { + "epoch": 0.33, + "grad_norm": 3.75387980271713, + "learning_rate": 7.85592071050801e-06, + "loss": 0.5495, + "step": 2170 + }, + { + "epoch": 0.33, + "grad_norm": 4.4334479081515505, + "learning_rate": 7.853915480834897e-06, + "loss": 0.5304, + "step": 2171 + }, + { + "epoch": 0.33, + "grad_norm": 4.610796011653907, + "learning_rate": 7.851909570104215e-06, + "loss": 0.4874, + "step": 2172 + }, + { + "epoch": 0.33, + "grad_norm": 5.46111470450064, + "learning_rate": 7.849902978794653e-06, + "loss": 0.4471, + "step": 2173 + }, + { + "epoch": 0.33, + "grad_norm": 3.3407789685569433, + "learning_rate": 7.847895707385063e-06, + "loss": 0.4945, + "step": 2174 + }, + { + "epoch": 0.33, + "grad_norm": 3.6494318295919395, + "learning_rate": 7.845887756354458e-06, + "loss": 0.4904, + "step": 2175 + }, + { + "epoch": 0.33, + "grad_norm": 7.124591521678854, + "learning_rate": 7.843879126182017e-06, + "loss": 0.5073, + "step": 2176 + }, + { + "epoch": 0.33, + "grad_norm": 4.254035371712426, + "learning_rate": 7.841869817347079e-06, + "loss": 0.4834, + "step": 2177 + }, + { + "epoch": 0.33, + "grad_norm": 8.093151501267945, + "learning_rate": 7.839859830329141e-06, + "loss": 0.4956, + "step": 2178 + }, + { + "epoch": 0.33, + "grad_norm": 4.940168631288694, + "learning_rate": 7.837849165607872e-06, + "loss": 0.4748, + "step": 2179 + }, + { + "epoch": 0.33, + "grad_norm": 4.687471595529828, + "learning_rate": 7.835837823663093e-06, + "loss": 0.415, + "step": 2180 + }, + { + "epoch": 0.33, + "grad_norm": 4.697200784902237, + "learning_rate": 7.833825804974791e-06, + "loss": 0.5021, + "step": 2181 + }, + { + "epoch": 0.33, + "grad_norm": 5.271349965181905, + "learning_rate": 7.831813110023113e-06, + "loss": 0.5292, + "step": 2182 + }, + { + "epoch": 0.33, + "grad_norm": 18.93701093774531, + "learning_rate": 7.829799739288364e-06, + "loss": 0.478, + "step": 2183 + }, + { + "epoch": 0.33, + "grad_norm": 17.014321834641205, + "learning_rate": 7.827785693251023e-06, + "loss": 0.5284, + "step": 2184 + }, + { + "epoch": 0.33, + "grad_norm": 4.866402131474855, + "learning_rate": 7.825770972391713e-06, + "loss": 0.4753, + "step": 2185 + }, + { + "epoch": 0.33, + "grad_norm": 6.236777398818386, + "learning_rate": 7.82375557719123e-06, + "loss": 0.4717, + "step": 2186 + }, + { + "epoch": 0.33, + "grad_norm": 1.504396751582203, + "learning_rate": 7.821739508130528e-06, + "loss": 0.6661, + "step": 2187 + }, + { + "epoch": 0.33, + "grad_norm": 3.9820391702594624, + "learning_rate": 7.819722765690723e-06, + "loss": 0.4298, + "step": 2188 + }, + { + "epoch": 0.33, + "grad_norm": 6.7361601999847975, + "learning_rate": 7.817705350353085e-06, + "loss": 0.4839, + "step": 2189 + }, + { + "epoch": 0.33, + "grad_norm": 8.338259722341007, + "learning_rate": 7.81568726259905e-06, + "loss": 0.5113, + "step": 2190 + }, + { + "epoch": 0.33, + "grad_norm": 7.59208814913395, + "learning_rate": 7.813668502910217e-06, + "loss": 0.4855, + "step": 2191 + }, + { + "epoch": 0.33, + "grad_norm": 5.172904759245342, + "learning_rate": 7.811649071768342e-06, + "loss": 0.4979, + "step": 2192 + }, + { + "epoch": 0.33, + "grad_norm": 1.1817078946551491, + "learning_rate": 7.809628969655338e-06, + "loss": 0.5781, + "step": 2193 + }, + { + "epoch": 0.33, + "grad_norm": 3.7623441585259623, + "learning_rate": 7.807608197053284e-06, + "loss": 0.5057, + "step": 2194 + }, + { + "epoch": 0.33, + "grad_norm": 4.863023361224732, + "learning_rate": 7.805586754444416e-06, + "loss": 0.4687, + "step": 2195 + }, + { + "epoch": 0.33, + "grad_norm": 4.692465100577142, + "learning_rate": 7.803564642311133e-06, + "loss": 0.4898, + "step": 2196 + }, + { + "epoch": 0.33, + "grad_norm": 4.376862031115437, + "learning_rate": 7.801541861135988e-06, + "loss": 0.4951, + "step": 2197 + }, + { + "epoch": 0.33, + "grad_norm": 6.115803755422277, + "learning_rate": 7.799518411401698e-06, + "loss": 0.4707, + "step": 2198 + }, + { + "epoch": 0.33, + "grad_norm": 3.331988467054175, + "learning_rate": 7.797494293591138e-06, + "loss": 0.4712, + "step": 2199 + }, + { + "epoch": 0.33, + "grad_norm": 4.645756969254886, + "learning_rate": 7.795469508187343e-06, + "loss": 0.4239, + "step": 2200 + }, + { + "epoch": 0.33, + "grad_norm": 7.764241281048958, + "learning_rate": 7.79344405567351e-06, + "loss": 0.4836, + "step": 2201 + }, + { + "epoch": 0.33, + "grad_norm": 6.45291251264358, + "learning_rate": 7.791417936532987e-06, + "loss": 0.4405, + "step": 2202 + }, + { + "epoch": 0.33, + "grad_norm": 6.530735790795023, + "learning_rate": 7.78939115124929e-06, + "loss": 0.4865, + "step": 2203 + }, + { + "epoch": 0.33, + "grad_norm": 6.895862206372964, + "learning_rate": 7.787363700306092e-06, + "loss": 0.5127, + "step": 2204 + }, + { + "epoch": 0.33, + "grad_norm": 3.7095273962399045, + "learning_rate": 7.78533558418722e-06, + "loss": 0.4805, + "step": 2205 + }, + { + "epoch": 0.33, + "grad_norm": 5.300751484560304, + "learning_rate": 7.783306803376664e-06, + "loss": 0.5849, + "step": 2206 + }, + { + "epoch": 0.33, + "grad_norm": 5.410188735106823, + "learning_rate": 7.781277358358573e-06, + "loss": 0.444, + "step": 2207 + }, + { + "epoch": 0.33, + "grad_norm": 10.003925819727987, + "learning_rate": 7.77924724961725e-06, + "loss": 0.4389, + "step": 2208 + }, + { + "epoch": 0.33, + "grad_norm": 3.498274220337695, + "learning_rate": 7.777216477637162e-06, + "loss": 0.3882, + "step": 2209 + }, + { + "epoch": 0.33, + "grad_norm": 7.211067056646403, + "learning_rate": 7.775185042902932e-06, + "loss": 0.5186, + "step": 2210 + }, + { + "epoch": 0.33, + "grad_norm": 7.372237635794461, + "learning_rate": 7.773152945899343e-06, + "loss": 0.4679, + "step": 2211 + }, + { + "epoch": 0.33, + "grad_norm": 11.969806409589777, + "learning_rate": 7.77112018711133e-06, + "loss": 0.443, + "step": 2212 + }, + { + "epoch": 0.33, + "grad_norm": 5.113006032377814, + "learning_rate": 7.76908676702399e-06, + "loss": 0.5581, + "step": 2213 + }, + { + "epoch": 0.33, + "grad_norm": 3.381632691074837, + "learning_rate": 7.767052686122582e-06, + "loss": 0.453, + "step": 2214 + }, + { + "epoch": 0.33, + "grad_norm": 7.903511901982955, + "learning_rate": 7.765017944892513e-06, + "loss": 0.459, + "step": 2215 + }, + { + "epoch": 0.33, + "grad_norm": 4.167734325882224, + "learning_rate": 7.762982543819358e-06, + "loss": 0.4724, + "step": 2216 + }, + { + "epoch": 0.33, + "grad_norm": 5.524264263964921, + "learning_rate": 7.760946483388843e-06, + "loss": 0.5182, + "step": 2217 + }, + { + "epoch": 0.33, + "grad_norm": 3.2427223109487247, + "learning_rate": 7.75890976408685e-06, + "loss": 0.4283, + "step": 2218 + }, + { + "epoch": 0.33, + "grad_norm": 3.9112451067848384, + "learning_rate": 7.756872386399424e-06, + "loss": 0.4845, + "step": 2219 + }, + { + "epoch": 0.33, + "grad_norm": 3.023423941003666, + "learning_rate": 7.754834350812765e-06, + "loss": 0.4259, + "step": 2220 + }, + { + "epoch": 0.33, + "grad_norm": 3.550337240890039, + "learning_rate": 7.752795657813227e-06, + "loss": 0.4707, + "step": 2221 + }, + { + "epoch": 0.34, + "grad_norm": 4.832586977104632, + "learning_rate": 7.750756307887323e-06, + "loss": 0.4975, + "step": 2222 + }, + { + "epoch": 0.34, + "grad_norm": 3.6828741998903833, + "learning_rate": 7.748716301521726e-06, + "loss": 0.4354, + "step": 2223 + }, + { + "epoch": 0.34, + "grad_norm": 4.718182301539495, + "learning_rate": 7.746675639203258e-06, + "loss": 0.4857, + "step": 2224 + }, + { + "epoch": 0.34, + "grad_norm": 4.541075844528042, + "learning_rate": 7.744634321418907e-06, + "loss": 0.4612, + "step": 2225 + }, + { + "epoch": 0.34, + "grad_norm": 3.164285514763037, + "learning_rate": 7.74259234865581e-06, + "loss": 0.5385, + "step": 2226 + }, + { + "epoch": 0.34, + "grad_norm": 3.385217029859917, + "learning_rate": 7.74054972140126e-06, + "loss": 0.4892, + "step": 2227 + }, + { + "epoch": 0.34, + "grad_norm": 3.4354853703148183, + "learning_rate": 7.738506440142713e-06, + "loss": 0.474, + "step": 2228 + }, + { + "epoch": 0.34, + "grad_norm": 3.3111682565596974, + "learning_rate": 7.736462505367775e-06, + "loss": 0.5294, + "step": 2229 + }, + { + "epoch": 0.34, + "grad_norm": 1.4776405368640575, + "learning_rate": 7.734417917564212e-06, + "loss": 0.5849, + "step": 2230 + }, + { + "epoch": 0.34, + "grad_norm": 5.123442443517847, + "learning_rate": 7.732372677219942e-06, + "loss": 0.5043, + "step": 2231 + }, + { + "epoch": 0.34, + "grad_norm": 4.7629193807938774, + "learning_rate": 7.730326784823042e-06, + "loss": 0.5866, + "step": 2232 + }, + { + "epoch": 0.34, + "grad_norm": 3.8303656972756643, + "learning_rate": 7.72828024086174e-06, + "loss": 0.4491, + "step": 2233 + }, + { + "epoch": 0.34, + "grad_norm": 3.5550750408239176, + "learning_rate": 7.726233045824427e-06, + "loss": 0.5077, + "step": 2234 + }, + { + "epoch": 0.34, + "grad_norm": 4.013021933844967, + "learning_rate": 7.724185200199642e-06, + "loss": 0.4422, + "step": 2235 + }, + { + "epoch": 0.34, + "grad_norm": 10.074469155291931, + "learning_rate": 7.722136704476086e-06, + "loss": 0.4465, + "step": 2236 + }, + { + "epoch": 0.34, + "grad_norm": 3.5485749830158526, + "learning_rate": 7.720087559142607e-06, + "loss": 0.3921, + "step": 2237 + }, + { + "epoch": 0.34, + "grad_norm": 5.818457222940472, + "learning_rate": 7.718037764688215e-06, + "loss": 0.4346, + "step": 2238 + }, + { + "epoch": 0.34, + "grad_norm": 4.527702045899896, + "learning_rate": 7.71598732160207e-06, + "loss": 0.4758, + "step": 2239 + }, + { + "epoch": 0.34, + "grad_norm": 5.171578178258161, + "learning_rate": 7.713936230373491e-06, + "loss": 0.4446, + "step": 2240 + }, + { + "epoch": 0.34, + "grad_norm": 4.858507846635064, + "learning_rate": 7.711884491491953e-06, + "loss": 0.4672, + "step": 2241 + }, + { + "epoch": 0.34, + "grad_norm": 3.4540914075896705, + "learning_rate": 7.709832105447075e-06, + "loss": 0.4755, + "step": 2242 + }, + { + "epoch": 0.34, + "grad_norm": 3.677163104081454, + "learning_rate": 7.707779072728643e-06, + "loss": 0.4744, + "step": 2243 + }, + { + "epoch": 0.34, + "grad_norm": 4.754609000067211, + "learning_rate": 7.705725393826591e-06, + "loss": 0.4983, + "step": 2244 + }, + { + "epoch": 0.34, + "grad_norm": 11.448159110835741, + "learning_rate": 7.703671069231007e-06, + "loss": 0.4673, + "step": 2245 + }, + { + "epoch": 0.34, + "grad_norm": 5.853395701847896, + "learning_rate": 7.701616099432137e-06, + "loss": 0.464, + "step": 2246 + }, + { + "epoch": 0.34, + "grad_norm": 5.6108452016247705, + "learning_rate": 7.699560484920378e-06, + "loss": 0.4559, + "step": 2247 + }, + { + "epoch": 0.34, + "grad_norm": 2.792459428507114, + "learning_rate": 7.697504226186279e-06, + "loss": 0.4398, + "step": 2248 + }, + { + "epoch": 0.34, + "grad_norm": 3.248093122888479, + "learning_rate": 7.695447323720547e-06, + "loss": 0.5419, + "step": 2249 + }, + { + "epoch": 0.34, + "grad_norm": 4.736821857064174, + "learning_rate": 7.693389778014038e-06, + "loss": 0.4855, + "step": 2250 + }, + { + "epoch": 0.34, + "grad_norm": 4.515647200063691, + "learning_rate": 7.691331589557767e-06, + "loss": 0.4689, + "step": 2251 + }, + { + "epoch": 0.34, + "grad_norm": 2.7439235957701436, + "learning_rate": 7.6892727588429e-06, + "loss": 0.4595, + "step": 2252 + }, + { + "epoch": 0.34, + "grad_norm": 5.486528988819889, + "learning_rate": 7.687213286360754e-06, + "loss": 0.4133, + "step": 2253 + }, + { + "epoch": 0.34, + "grad_norm": 2.944870414467476, + "learning_rate": 7.685153172602799e-06, + "loss": 0.4275, + "step": 2254 + }, + { + "epoch": 0.34, + "grad_norm": 3.8091008053728275, + "learning_rate": 7.683092418060664e-06, + "loss": 0.4873, + "step": 2255 + }, + { + "epoch": 0.34, + "grad_norm": 3.3534539575238926, + "learning_rate": 7.681031023226126e-06, + "loss": 0.4834, + "step": 2256 + }, + { + "epoch": 0.34, + "grad_norm": 4.660781671805133, + "learning_rate": 7.678968988591114e-06, + "loss": 0.4692, + "step": 2257 + }, + { + "epoch": 0.34, + "grad_norm": 5.204042381836633, + "learning_rate": 7.676906314647713e-06, + "loss": 0.4332, + "step": 2258 + }, + { + "epoch": 0.34, + "grad_norm": 4.092228284133106, + "learning_rate": 7.674843001888156e-06, + "loss": 0.5518, + "step": 2259 + }, + { + "epoch": 0.34, + "grad_norm": 7.432305954559312, + "learning_rate": 7.672779050804834e-06, + "loss": 0.4905, + "step": 2260 + }, + { + "epoch": 0.34, + "grad_norm": 4.303047100994622, + "learning_rate": 7.670714461890287e-06, + "loss": 0.4818, + "step": 2261 + }, + { + "epoch": 0.34, + "grad_norm": 4.8671256128651255, + "learning_rate": 7.668649235637209e-06, + "loss": 0.423, + "step": 2262 + }, + { + "epoch": 0.34, + "grad_norm": 8.106252559946471, + "learning_rate": 7.666583372538443e-06, + "loss": 0.4982, + "step": 2263 + }, + { + "epoch": 0.34, + "grad_norm": 3.3922877286906736, + "learning_rate": 7.664516873086987e-06, + "loss": 0.4858, + "step": 2264 + }, + { + "epoch": 0.34, + "grad_norm": 4.430306247078682, + "learning_rate": 7.662449737775992e-06, + "loss": 0.4979, + "step": 2265 + }, + { + "epoch": 0.34, + "grad_norm": 2.739244484901206, + "learning_rate": 7.660381967098753e-06, + "loss": 0.459, + "step": 2266 + }, + { + "epoch": 0.34, + "grad_norm": 3.287673453029182, + "learning_rate": 7.658313561548727e-06, + "loss": 0.4728, + "step": 2267 + }, + { + "epoch": 0.34, + "grad_norm": 4.200883218055887, + "learning_rate": 7.656244521619516e-06, + "loss": 0.4403, + "step": 2268 + }, + { + "epoch": 0.34, + "grad_norm": 3.241313989189559, + "learning_rate": 7.654174847804876e-06, + "loss": 0.5128, + "step": 2269 + }, + { + "epoch": 0.34, + "grad_norm": 2.454916603722773, + "learning_rate": 7.652104540598712e-06, + "loss": 0.4542, + "step": 2270 + }, + { + "epoch": 0.34, + "grad_norm": 4.846682446655618, + "learning_rate": 7.650033600495085e-06, + "loss": 0.4976, + "step": 2271 + }, + { + "epoch": 0.34, + "grad_norm": 2.570160536200375, + "learning_rate": 7.647962027988198e-06, + "loss": 0.5302, + "step": 2272 + }, + { + "epoch": 0.34, + "grad_norm": 2.69400521704816, + "learning_rate": 7.645889823572414e-06, + "loss": 0.5013, + "step": 2273 + }, + { + "epoch": 0.34, + "grad_norm": 4.132253518412157, + "learning_rate": 7.64381698774224e-06, + "loss": 0.4776, + "step": 2274 + }, + { + "epoch": 0.34, + "grad_norm": 4.248818058467961, + "learning_rate": 7.641743520992343e-06, + "loss": 0.5039, + "step": 2275 + }, + { + "epoch": 0.34, + "grad_norm": 2.9206915459343854, + "learning_rate": 7.63966942381753e-06, + "loss": 0.4484, + "step": 2276 + }, + { + "epoch": 0.34, + "grad_norm": 3.018739013453988, + "learning_rate": 7.637594696712764e-06, + "loss": 0.486, + "step": 2277 + }, + { + "epoch": 0.34, + "grad_norm": 3.060724434668855, + "learning_rate": 7.635519340173158e-06, + "loss": 0.4728, + "step": 2278 + }, + { + "epoch": 0.34, + "grad_norm": 3.0903544711181343, + "learning_rate": 7.633443354693972e-06, + "loss": 0.5149, + "step": 2279 + }, + { + "epoch": 0.34, + "grad_norm": 8.938529300876123, + "learning_rate": 7.631366740770622e-06, + "loss": 0.5119, + "step": 2280 + }, + { + "epoch": 0.34, + "grad_norm": 3.3174035234796326, + "learning_rate": 7.629289498898668e-06, + "loss": 0.454, + "step": 2281 + }, + { + "epoch": 0.34, + "grad_norm": 2.745714741978283, + "learning_rate": 7.627211629573823e-06, + "loss": 0.4894, + "step": 2282 + }, + { + "epoch": 0.34, + "grad_norm": 2.440185290054798, + "learning_rate": 7.62513313329195e-06, + "loss": 0.4784, + "step": 2283 + }, + { + "epoch": 0.34, + "grad_norm": 3.5165780264646433, + "learning_rate": 7.623054010549059e-06, + "loss": 0.4701, + "step": 2284 + }, + { + "epoch": 0.34, + "grad_norm": 2.4499738306862464, + "learning_rate": 7.620974261841314e-06, + "loss": 0.4827, + "step": 2285 + }, + { + "epoch": 0.34, + "grad_norm": 3.6224948144255533, + "learning_rate": 7.618893887665023e-06, + "loss": 0.4011, + "step": 2286 + }, + { + "epoch": 0.34, + "grad_norm": 2.793566931630369, + "learning_rate": 7.616812888516647e-06, + "loss": 0.4877, + "step": 2287 + }, + { + "epoch": 0.35, + "grad_norm": 6.391115502612298, + "learning_rate": 7.6147312648927965e-06, + "loss": 0.4667, + "step": 2288 + }, + { + "epoch": 0.35, + "grad_norm": 13.835996460213295, + "learning_rate": 7.612649017290226e-06, + "loss": 0.5398, + "step": 2289 + }, + { + "epoch": 0.35, + "grad_norm": 2.2422443584194647, + "learning_rate": 7.610566146205845e-06, + "loss": 0.4862, + "step": 2290 + }, + { + "epoch": 0.35, + "grad_norm": 3.6073575472593338, + "learning_rate": 7.608482652136708e-06, + "loss": 0.445, + "step": 2291 + }, + { + "epoch": 0.35, + "grad_norm": 3.410911447507514, + "learning_rate": 7.606398535580022e-06, + "loss": 0.4626, + "step": 2292 + }, + { + "epoch": 0.35, + "grad_norm": 2.716683746671777, + "learning_rate": 7.604313797033139e-06, + "loss": 0.4388, + "step": 2293 + }, + { + "epoch": 0.35, + "grad_norm": 2.5714470414665596, + "learning_rate": 7.602228436993556e-06, + "loss": 0.5326, + "step": 2294 + }, + { + "epoch": 0.35, + "grad_norm": 3.2992389994539955, + "learning_rate": 7.60014245595893e-06, + "loss": 0.5278, + "step": 2295 + }, + { + "epoch": 0.35, + "grad_norm": 3.8073232961435317, + "learning_rate": 7.598055854427052e-06, + "loss": 0.3928, + "step": 2296 + }, + { + "epoch": 0.35, + "grad_norm": 1.2828805534073893, + "learning_rate": 7.595968632895872e-06, + "loss": 0.5716, + "step": 2297 + }, + { + "epoch": 0.35, + "grad_norm": 4.351403044743196, + "learning_rate": 7.593880791863484e-06, + "loss": 0.5161, + "step": 2298 + }, + { + "epoch": 0.35, + "grad_norm": 2.6287241846561997, + "learning_rate": 7.59179233182813e-06, + "loss": 0.4746, + "step": 2299 + }, + { + "epoch": 0.35, + "grad_norm": 2.8255561895653836, + "learning_rate": 7.589703253288196e-06, + "loss": 0.4746, + "step": 2300 + }, + { + "epoch": 0.35, + "grad_norm": 3.6237136435664787, + "learning_rate": 7.587613556742224e-06, + "loss": 0.5696, + "step": 2301 + }, + { + "epoch": 0.35, + "grad_norm": 3.6381770416095205, + "learning_rate": 7.585523242688894e-06, + "loss": 0.4796, + "step": 2302 + }, + { + "epoch": 0.35, + "grad_norm": 2.4757415396026063, + "learning_rate": 7.583432311627042e-06, + "loss": 0.4885, + "step": 2303 + }, + { + "epoch": 0.35, + "grad_norm": 3.8351873207164493, + "learning_rate": 7.581340764055642e-06, + "loss": 0.499, + "step": 2304 + }, + { + "epoch": 0.35, + "grad_norm": 3.5355110284374742, + "learning_rate": 7.579248600473827e-06, + "loss": 0.4702, + "step": 2305 + }, + { + "epoch": 0.35, + "grad_norm": 4.452357047864653, + "learning_rate": 7.577155821380866e-06, + "loss": 0.5849, + "step": 2306 + }, + { + "epoch": 0.35, + "grad_norm": 3.314047411421253, + "learning_rate": 7.57506242727618e-06, + "loss": 0.3686, + "step": 2307 + }, + { + "epoch": 0.35, + "grad_norm": 4.573902745993604, + "learning_rate": 7.5729684186593344e-06, + "loss": 0.4979, + "step": 2308 + }, + { + "epoch": 0.35, + "grad_norm": 2.8156751812625647, + "learning_rate": 7.570873796030046e-06, + "loss": 0.4316, + "step": 2309 + }, + { + "epoch": 0.35, + "grad_norm": 4.600049244185723, + "learning_rate": 7.5687785598881734e-06, + "loss": 0.5268, + "step": 2310 + }, + { + "epoch": 0.35, + "grad_norm": 3.5808200959525296, + "learning_rate": 7.566682710733724e-06, + "loss": 0.4497, + "step": 2311 + }, + { + "epoch": 0.35, + "grad_norm": 2.9107101813019423, + "learning_rate": 7.564586249066849e-06, + "loss": 0.5138, + "step": 2312 + }, + { + "epoch": 0.35, + "grad_norm": 3.1324272698959206, + "learning_rate": 7.562489175387847e-06, + "loss": 0.5506, + "step": 2313 + }, + { + "epoch": 0.35, + "grad_norm": 2.5583265096612626, + "learning_rate": 7.5603914901971655e-06, + "loss": 0.4833, + "step": 2314 + }, + { + "epoch": 0.35, + "grad_norm": 3.217738882956242, + "learning_rate": 7.5582931939953945e-06, + "loss": 0.4615, + "step": 2315 + }, + { + "epoch": 0.35, + "grad_norm": 2.723701861670058, + "learning_rate": 7.55619428728327e-06, + "loss": 0.4541, + "step": 2316 + }, + { + "epoch": 0.35, + "grad_norm": 2.7545212931361975, + "learning_rate": 7.554094770561675e-06, + "loss": 0.4732, + "step": 2317 + }, + { + "epoch": 0.35, + "grad_norm": 1.5956854653926493, + "learning_rate": 7.5519946443316375e-06, + "loss": 0.5931, + "step": 2318 + }, + { + "epoch": 0.35, + "grad_norm": 2.698105812583981, + "learning_rate": 7.5498939090943325e-06, + "loss": 0.4715, + "step": 2319 + }, + { + "epoch": 0.35, + "grad_norm": 2.582070491072131, + "learning_rate": 7.547792565351076e-06, + "loss": 0.4373, + "step": 2320 + }, + { + "epoch": 0.35, + "grad_norm": 2.326872809435797, + "learning_rate": 7.545690613603333e-06, + "loss": 0.4178, + "step": 2321 + }, + { + "epoch": 0.35, + "grad_norm": 2.9036913839925713, + "learning_rate": 7.543588054352715e-06, + "loss": 0.4681, + "step": 2322 + }, + { + "epoch": 0.35, + "grad_norm": 3.012533937870758, + "learning_rate": 7.541484888100974e-06, + "loss": 0.5056, + "step": 2323 + }, + { + "epoch": 0.35, + "grad_norm": 2.433188318684449, + "learning_rate": 7.53938111535001e-06, + "loss": 0.4928, + "step": 2324 + }, + { + "epoch": 0.35, + "grad_norm": 3.0515687308988624, + "learning_rate": 7.537276736601864e-06, + "loss": 0.4644, + "step": 2325 + }, + { + "epoch": 0.35, + "grad_norm": 1.3941245715115476, + "learning_rate": 7.535171752358727e-06, + "loss": 0.6249, + "step": 2326 + }, + { + "epoch": 0.35, + "grad_norm": 4.400350993669927, + "learning_rate": 7.533066163122932e-06, + "loss": 0.4655, + "step": 2327 + }, + { + "epoch": 0.35, + "grad_norm": 4.9743790153201415, + "learning_rate": 7.5309599693969556e-06, + "loss": 0.4487, + "step": 2328 + }, + { + "epoch": 0.35, + "grad_norm": 3.7286994813150485, + "learning_rate": 7.528853171683419e-06, + "loss": 0.4254, + "step": 2329 + }, + { + "epoch": 0.35, + "grad_norm": 9.625766798890735, + "learning_rate": 7.526745770485087e-06, + "loss": 0.4968, + "step": 2330 + }, + { + "epoch": 0.35, + "grad_norm": 2.8421209314991422, + "learning_rate": 7.524637766304872e-06, + "loss": 0.467, + "step": 2331 + }, + { + "epoch": 0.35, + "grad_norm": 3.300125231132899, + "learning_rate": 7.522529159645824e-06, + "loss": 0.4586, + "step": 2332 + }, + { + "epoch": 0.35, + "grad_norm": 5.874058698185691, + "learning_rate": 7.520419951011142e-06, + "loss": 0.4456, + "step": 2333 + }, + { + "epoch": 0.35, + "grad_norm": 5.771212291214637, + "learning_rate": 7.518310140904168e-06, + "loss": 0.4834, + "step": 2334 + }, + { + "epoch": 0.35, + "grad_norm": 2.948613403980211, + "learning_rate": 7.516199729828385e-06, + "loss": 0.4857, + "step": 2335 + }, + { + "epoch": 0.35, + "grad_norm": 5.149792528040112, + "learning_rate": 7.51408871828742e-06, + "loss": 0.5548, + "step": 2336 + }, + { + "epoch": 0.35, + "grad_norm": 8.524187215245263, + "learning_rate": 7.511977106785047e-06, + "loss": 0.4868, + "step": 2337 + }, + { + "epoch": 0.35, + "grad_norm": 4.165713280904329, + "learning_rate": 7.509864895825177e-06, + "loss": 0.5515, + "step": 2338 + }, + { + "epoch": 0.35, + "grad_norm": 2.5807550343802252, + "learning_rate": 7.5077520859118725e-06, + "loss": 0.5035, + "step": 2339 + }, + { + "epoch": 0.35, + "grad_norm": 3.3115251734617095, + "learning_rate": 7.505638677549327e-06, + "loss": 0.5427, + "step": 2340 + }, + { + "epoch": 0.35, + "grad_norm": 3.3351459670309946, + "learning_rate": 7.503524671241891e-06, + "loss": 0.512, + "step": 2341 + }, + { + "epoch": 0.35, + "grad_norm": 3.637112794821689, + "learning_rate": 7.501410067494044e-06, + "loss": 0.5465, + "step": 2342 + }, + { + "epoch": 0.35, + "grad_norm": 8.841462445033107, + "learning_rate": 7.499294866810416e-06, + "loss": 0.5146, + "step": 2343 + }, + { + "epoch": 0.35, + "grad_norm": 3.1341016899078467, + "learning_rate": 7.497179069695781e-06, + "loss": 0.4752, + "step": 2344 + }, + { + "epoch": 0.35, + "grad_norm": 3.411183226226576, + "learning_rate": 7.49506267665505e-06, + "loss": 0.4508, + "step": 2345 + }, + { + "epoch": 0.35, + "grad_norm": 3.631883951717825, + "learning_rate": 7.492945688193278e-06, + "loss": 0.5189, + "step": 2346 + }, + { + "epoch": 0.35, + "grad_norm": 5.519052420568145, + "learning_rate": 7.490828104815663e-06, + "loss": 0.4944, + "step": 2347 + }, + { + "epoch": 0.35, + "grad_norm": 3.5069326027306844, + "learning_rate": 7.488709927027544e-06, + "loss": 0.4421, + "step": 2348 + }, + { + "epoch": 0.35, + "grad_norm": 3.147271464713538, + "learning_rate": 7.486591155334404e-06, + "loss": 0.5075, + "step": 2349 + }, + { + "epoch": 0.35, + "grad_norm": 3.916707981575327, + "learning_rate": 7.484471790241865e-06, + "loss": 0.5159, + "step": 2350 + }, + { + "epoch": 0.35, + "grad_norm": 2.3412114822708805, + "learning_rate": 7.482351832255691e-06, + "loss": 0.5657, + "step": 2351 + }, + { + "epoch": 0.35, + "grad_norm": 4.940698847921472, + "learning_rate": 7.480231281881789e-06, + "loss": 0.4997, + "step": 2352 + }, + { + "epoch": 0.35, + "grad_norm": 3.1639305801434627, + "learning_rate": 7.4781101396262064e-06, + "loss": 0.5072, + "step": 2353 + }, + { + "epoch": 0.36, + "grad_norm": 2.6802727470418577, + "learning_rate": 7.475988405995133e-06, + "loss": 0.524, + "step": 2354 + }, + { + "epoch": 0.36, + "grad_norm": 2.8229294585406794, + "learning_rate": 7.473866081494896e-06, + "loss": 0.4526, + "step": 2355 + }, + { + "epoch": 0.36, + "grad_norm": 3.328084063388005, + "learning_rate": 7.471743166631969e-06, + "loss": 0.4862, + "step": 2356 + }, + { + "epoch": 0.36, + "grad_norm": 3.15578375667564, + "learning_rate": 7.469619661912962e-06, + "loss": 0.4907, + "step": 2357 + }, + { + "epoch": 0.36, + "grad_norm": 2.786440732658542, + "learning_rate": 7.467495567844628e-06, + "loss": 0.44, + "step": 2358 + }, + { + "epoch": 0.36, + "grad_norm": 2.9863262528581376, + "learning_rate": 7.465370884933863e-06, + "loss": 0.5004, + "step": 2359 + }, + { + "epoch": 0.36, + "grad_norm": 4.616551227698675, + "learning_rate": 7.463245613687695e-06, + "loss": 0.4941, + "step": 2360 + }, + { + "epoch": 0.36, + "grad_norm": 2.6790899021916617, + "learning_rate": 7.461119754613302e-06, + "loss": 0.4546, + "step": 2361 + }, + { + "epoch": 0.36, + "grad_norm": 3.6973709898783294, + "learning_rate": 7.458993308217999e-06, + "loss": 0.4428, + "step": 2362 + }, + { + "epoch": 0.36, + "grad_norm": 1.3537657484537213, + "learning_rate": 7.456866275009238e-06, + "loss": 0.6041, + "step": 2363 + }, + { + "epoch": 0.36, + "grad_norm": 3.7908744390982507, + "learning_rate": 7.4547386554946165e-06, + "loss": 0.5042, + "step": 2364 + }, + { + "epoch": 0.36, + "grad_norm": 4.617095600969997, + "learning_rate": 7.4526104501818665e-06, + "loss": 0.4904, + "step": 2365 + }, + { + "epoch": 0.36, + "grad_norm": 3.0657080777062142, + "learning_rate": 7.450481659578862e-06, + "loss": 0.4887, + "step": 2366 + }, + { + "epoch": 0.36, + "grad_norm": 3.4908255036798472, + "learning_rate": 7.44835228419362e-06, + "loss": 0.4522, + "step": 2367 + }, + { + "epoch": 0.36, + "grad_norm": 4.520778842206725, + "learning_rate": 7.44622232453429e-06, + "loss": 0.4988, + "step": 2368 + }, + { + "epoch": 0.36, + "grad_norm": 4.701702868806101, + "learning_rate": 7.444091781109169e-06, + "loss": 0.4542, + "step": 2369 + }, + { + "epoch": 0.36, + "grad_norm": 5.796542487044817, + "learning_rate": 7.441960654426688e-06, + "loss": 0.459, + "step": 2370 + }, + { + "epoch": 0.36, + "grad_norm": 1.2967643453209285, + "learning_rate": 7.4398289449954165e-06, + "loss": 0.5814, + "step": 2371 + }, + { + "epoch": 0.36, + "grad_norm": 5.303351148039858, + "learning_rate": 7.437696653324067e-06, + "loss": 0.417, + "step": 2372 + }, + { + "epoch": 0.36, + "grad_norm": 3.679285475377113, + "learning_rate": 7.4355637799214884e-06, + "loss": 0.446, + "step": 2373 + }, + { + "epoch": 0.36, + "grad_norm": 6.3864743657630445, + "learning_rate": 7.433430325296669e-06, + "loss": 0.472, + "step": 2374 + }, + { + "epoch": 0.36, + "grad_norm": 3.5446376520745755, + "learning_rate": 7.431296289958736e-06, + "loss": 0.4379, + "step": 2375 + }, + { + "epoch": 0.36, + "grad_norm": 5.117399289277465, + "learning_rate": 7.429161674416953e-06, + "loss": 0.5216, + "step": 2376 + }, + { + "epoch": 0.36, + "grad_norm": 2.944676639685801, + "learning_rate": 7.427026479180727e-06, + "loss": 0.4441, + "step": 2377 + }, + { + "epoch": 0.36, + "grad_norm": 4.794498971384045, + "learning_rate": 7.4248907047596005e-06, + "loss": 0.5262, + "step": 2378 + }, + { + "epoch": 0.36, + "grad_norm": 5.6051392071934245, + "learning_rate": 7.422754351663252e-06, + "loss": 0.5123, + "step": 2379 + }, + { + "epoch": 0.36, + "grad_norm": 2.391459230053948, + "learning_rate": 7.4206174204015e-06, + "loss": 0.4023, + "step": 2380 + }, + { + "epoch": 0.36, + "grad_norm": 1.2943844882663575, + "learning_rate": 7.418479911484303e-06, + "loss": 0.5447, + "step": 2381 + }, + { + "epoch": 0.36, + "grad_norm": 2.5665395781799307, + "learning_rate": 7.416341825421755e-06, + "loss": 0.4606, + "step": 2382 + }, + { + "epoch": 0.36, + "grad_norm": 6.271612990798521, + "learning_rate": 7.414203162724087e-06, + "loss": 0.4694, + "step": 2383 + }, + { + "epoch": 0.36, + "grad_norm": 3.0382331565958105, + "learning_rate": 7.41206392390167e-06, + "loss": 0.484, + "step": 2384 + }, + { + "epoch": 0.36, + "grad_norm": 3.5697827937076445, + "learning_rate": 7.409924109465011e-06, + "loss": 0.4507, + "step": 2385 + }, + { + "epoch": 0.36, + "grad_norm": 2.4761784504394564, + "learning_rate": 7.4077837199247545e-06, + "loss": 0.466, + "step": 2386 + }, + { + "epoch": 0.36, + "grad_norm": 4.205812400079637, + "learning_rate": 7.405642755791684e-06, + "loss": 0.4937, + "step": 2387 + }, + { + "epoch": 0.36, + "grad_norm": 3.056025525539817, + "learning_rate": 7.403501217576716e-06, + "loss": 0.4878, + "step": 2388 + }, + { + "epoch": 0.36, + "grad_norm": 2.3889560841604407, + "learning_rate": 7.401359105790909e-06, + "loss": 0.4498, + "step": 2389 + }, + { + "epoch": 0.36, + "grad_norm": 3.2738953265966377, + "learning_rate": 7.399216420945453e-06, + "loss": 0.4348, + "step": 2390 + }, + { + "epoch": 0.36, + "grad_norm": 3.613248927475757, + "learning_rate": 7.39707316355168e-06, + "loss": 0.425, + "step": 2391 + }, + { + "epoch": 0.36, + "grad_norm": 2.996498917868073, + "learning_rate": 7.394929334121057e-06, + "loss": 0.4822, + "step": 2392 + }, + { + "epoch": 0.36, + "grad_norm": 4.128055419570635, + "learning_rate": 7.392784933165186e-06, + "loss": 0.424, + "step": 2393 + }, + { + "epoch": 0.36, + "grad_norm": 4.165272945381678, + "learning_rate": 7.390639961195805e-06, + "loss": 0.5441, + "step": 2394 + }, + { + "epoch": 0.36, + "grad_norm": 4.127464113392247, + "learning_rate": 7.3884944187247895e-06, + "loss": 0.4873, + "step": 2395 + }, + { + "epoch": 0.36, + "grad_norm": 7.947277047673751, + "learning_rate": 7.386348306264153e-06, + "loss": 0.5021, + "step": 2396 + }, + { + "epoch": 0.36, + "grad_norm": 2.38805023443246, + "learning_rate": 7.384201624326042e-06, + "loss": 0.4678, + "step": 2397 + }, + { + "epoch": 0.36, + "grad_norm": 2.4039732258795277, + "learning_rate": 7.38205437342274e-06, + "loss": 0.4571, + "step": 2398 + }, + { + "epoch": 0.36, + "grad_norm": 4.550805545956856, + "learning_rate": 7.379906554066667e-06, + "loss": 0.4614, + "step": 2399 + }, + { + "epoch": 0.36, + "grad_norm": 3.0738208257797357, + "learning_rate": 7.3777581667703764e-06, + "loss": 0.4821, + "step": 2400 + }, + { + "epoch": 0.36, + "grad_norm": 2.6846303795490853, + "learning_rate": 7.3756092120465615e-06, + "loss": 0.45, + "step": 2401 + }, + { + "epoch": 0.36, + "grad_norm": 2.4463112035762338, + "learning_rate": 7.373459690408046e-06, + "loss": 0.4696, + "step": 2402 + }, + { + "epoch": 0.36, + "grad_norm": 3.898412590499899, + "learning_rate": 7.371309602367792e-06, + "loss": 0.4397, + "step": 2403 + }, + { + "epoch": 0.36, + "grad_norm": 3.3678424224651757, + "learning_rate": 7.369158948438895e-06, + "loss": 0.4646, + "step": 2404 + }, + { + "epoch": 0.36, + "grad_norm": 2.5468587515720715, + "learning_rate": 7.367007729134589e-06, + "loss": 0.5007, + "step": 2405 + }, + { + "epoch": 0.36, + "grad_norm": 2.84200309342235, + "learning_rate": 7.3648559449682364e-06, + "loss": 0.4899, + "step": 2406 + }, + { + "epoch": 0.36, + "grad_norm": 3.1385236052087917, + "learning_rate": 7.36270359645334e-06, + "loss": 0.5357, + "step": 2407 + }, + { + "epoch": 0.36, + "grad_norm": 4.0117544221422285, + "learning_rate": 7.360550684103538e-06, + "loss": 0.4323, + "step": 2408 + }, + { + "epoch": 0.36, + "grad_norm": 3.278077874312744, + "learning_rate": 7.358397208432597e-06, + "loss": 0.4423, + "step": 2409 + }, + { + "epoch": 0.36, + "grad_norm": 3.739929815210438, + "learning_rate": 7.356243169954426e-06, + "loss": 0.4289, + "step": 2410 + }, + { + "epoch": 0.36, + "grad_norm": 2.6890506583928087, + "learning_rate": 7.35408856918306e-06, + "loss": 0.4918, + "step": 2411 + }, + { + "epoch": 0.36, + "grad_norm": 3.5527607087240387, + "learning_rate": 7.3519334066326744e-06, + "loss": 0.4567, + "step": 2412 + }, + { + "epoch": 0.36, + "grad_norm": 2.7259827107013255, + "learning_rate": 7.349777682817576e-06, + "loss": 0.4772, + "step": 2413 + }, + { + "epoch": 0.36, + "grad_norm": 3.01317696317138, + "learning_rate": 7.347621398252207e-06, + "loss": 0.5182, + "step": 2414 + }, + { + "epoch": 0.36, + "grad_norm": 3.901459310104945, + "learning_rate": 7.345464553451141e-06, + "loss": 0.3925, + "step": 2415 + }, + { + "epoch": 0.36, + "grad_norm": 3.798720250285504, + "learning_rate": 7.343307148929089e-06, + "loss": 0.503, + "step": 2416 + }, + { + "epoch": 0.36, + "grad_norm": 9.764285321686053, + "learning_rate": 7.341149185200891e-06, + "loss": 0.4157, + "step": 2417 + }, + { + "epoch": 0.36, + "grad_norm": 2.822440712892292, + "learning_rate": 7.338990662781524e-06, + "loss": 0.4875, + "step": 2418 + }, + { + "epoch": 0.36, + "grad_norm": 3.5520900723640674, + "learning_rate": 7.336831582186097e-06, + "loss": 0.4654, + "step": 2419 + }, + { + "epoch": 0.37, + "grad_norm": 3.1943192699259804, + "learning_rate": 7.334671943929853e-06, + "loss": 0.4738, + "step": 2420 + }, + { + "epoch": 0.37, + "grad_norm": 2.9235056902629255, + "learning_rate": 7.332511748528167e-06, + "loss": 0.5514, + "step": 2421 + }, + { + "epoch": 0.37, + "grad_norm": 2.5964164022569878, + "learning_rate": 7.330350996496547e-06, + "loss": 0.5358, + "step": 2422 + }, + { + "epoch": 0.37, + "grad_norm": 1.2529298418572723, + "learning_rate": 7.3281896883506355e-06, + "loss": 0.5752, + "step": 2423 + }, + { + "epoch": 0.37, + "grad_norm": 2.547050492474515, + "learning_rate": 7.3260278246062045e-06, + "loss": 0.5059, + "step": 2424 + }, + { + "epoch": 0.37, + "grad_norm": 2.565889330482098, + "learning_rate": 7.323865405779163e-06, + "loss": 0.4515, + "step": 2425 + }, + { + "epoch": 0.37, + "grad_norm": 2.3993677637499897, + "learning_rate": 7.321702432385546e-06, + "loss": 0.4807, + "step": 2426 + }, + { + "epoch": 0.37, + "grad_norm": 3.890712382095551, + "learning_rate": 7.31953890494153e-06, + "loss": 0.5415, + "step": 2427 + }, + { + "epoch": 0.37, + "grad_norm": 3.5513493780949887, + "learning_rate": 7.317374823963415e-06, + "loss": 0.5037, + "step": 2428 + }, + { + "epoch": 0.37, + "grad_norm": 2.369435061019334, + "learning_rate": 7.315210189967637e-06, + "loss": 0.4041, + "step": 2429 + }, + { + "epoch": 0.37, + "grad_norm": 2.6106424036222062, + "learning_rate": 7.313045003470766e-06, + "loss": 0.4101, + "step": 2430 + }, + { + "epoch": 0.37, + "grad_norm": 3.58224470044946, + "learning_rate": 7.310879264989498e-06, + "loss": 0.4305, + "step": 2431 + }, + { + "epoch": 0.37, + "grad_norm": 5.9394626167088385, + "learning_rate": 7.308712975040667e-06, + "loss": 0.4888, + "step": 2432 + }, + { + "epoch": 0.37, + "grad_norm": 4.503333802951347, + "learning_rate": 7.306546134141234e-06, + "loss": 0.48, + "step": 2433 + }, + { + "epoch": 0.37, + "grad_norm": 2.3862982817127865, + "learning_rate": 7.304378742808296e-06, + "loss": 0.5569, + "step": 2434 + }, + { + "epoch": 0.37, + "grad_norm": 6.174515425988138, + "learning_rate": 7.3022108015590755e-06, + "loss": 0.4866, + "step": 2435 + }, + { + "epoch": 0.37, + "grad_norm": 4.341559137287811, + "learning_rate": 7.30004231091093e-06, + "loss": 0.3901, + "step": 2436 + }, + { + "epoch": 0.37, + "grad_norm": 3.4811153408009345, + "learning_rate": 7.29787327138135e-06, + "loss": 0.5211, + "step": 2437 + }, + { + "epoch": 0.37, + "grad_norm": 3.3739158207486137, + "learning_rate": 7.2957036834879505e-06, + "loss": 0.4942, + "step": 2438 + }, + { + "epoch": 0.37, + "grad_norm": 3.7242955373101747, + "learning_rate": 7.2935335477484844e-06, + "loss": 0.4635, + "step": 2439 + }, + { + "epoch": 0.37, + "grad_norm": 3.770147307770478, + "learning_rate": 7.291362864680831e-06, + "loss": 0.5273, + "step": 2440 + }, + { + "epoch": 0.37, + "grad_norm": 4.908437433983117, + "learning_rate": 7.289191634803002e-06, + "loss": 0.4769, + "step": 2441 + }, + { + "epoch": 0.37, + "grad_norm": 5.200020574832539, + "learning_rate": 7.28701985863314e-06, + "loss": 0.5732, + "step": 2442 + }, + { + "epoch": 0.37, + "grad_norm": 3.6926356792365813, + "learning_rate": 7.284847536689514e-06, + "loss": 0.4559, + "step": 2443 + }, + { + "epoch": 0.37, + "grad_norm": 3.7538166653297127, + "learning_rate": 7.282674669490528e-06, + "loss": 0.5405, + "step": 2444 + }, + { + "epoch": 0.37, + "grad_norm": 6.4560967316352516, + "learning_rate": 7.280501257554717e-06, + "loss": 0.4932, + "step": 2445 + }, + { + "epoch": 0.37, + "grad_norm": 3.7161029463295336, + "learning_rate": 7.278327301400738e-06, + "loss": 0.4319, + "step": 2446 + }, + { + "epoch": 0.37, + "grad_norm": 1.345377974682153, + "learning_rate": 7.276152801547389e-06, + "loss": 0.5685, + "step": 2447 + }, + { + "epoch": 0.37, + "grad_norm": 4.2537992393948, + "learning_rate": 7.273977758513588e-06, + "loss": 0.4742, + "step": 2448 + }, + { + "epoch": 0.37, + "grad_norm": 5.267840225793219, + "learning_rate": 7.271802172818387e-06, + "loss": 0.4522, + "step": 2449 + }, + { + "epoch": 0.37, + "grad_norm": 3.9102798015901823, + "learning_rate": 7.269626044980968e-06, + "loss": 0.5225, + "step": 2450 + }, + { + "epoch": 0.37, + "grad_norm": 8.132715424658986, + "learning_rate": 7.267449375520644e-06, + "loss": 0.39, + "step": 2451 + }, + { + "epoch": 0.37, + "grad_norm": 13.300986338579055, + "learning_rate": 7.2652721649568516e-06, + "loss": 0.523, + "step": 2452 + }, + { + "epoch": 0.37, + "grad_norm": 8.450007266556247, + "learning_rate": 7.2630944138091595e-06, + "loss": 0.5306, + "step": 2453 + }, + { + "epoch": 0.37, + "grad_norm": 10.908865235441064, + "learning_rate": 7.260916122597267e-06, + "loss": 0.5516, + "step": 2454 + }, + { + "epoch": 0.37, + "grad_norm": 8.942873889334752, + "learning_rate": 7.258737291840999e-06, + "loss": 0.4663, + "step": 2455 + }, + { + "epoch": 0.37, + "grad_norm": 7.583131320200942, + "learning_rate": 7.256557922060315e-06, + "loss": 0.5487, + "step": 2456 + }, + { + "epoch": 0.37, + "grad_norm": 4.199426091843973, + "learning_rate": 7.254378013775296e-06, + "loss": 0.5279, + "step": 2457 + }, + { + "epoch": 0.37, + "grad_norm": 7.850121215742621, + "learning_rate": 7.2521975675061555e-06, + "loss": 0.4969, + "step": 2458 + }, + { + "epoch": 0.37, + "grad_norm": 11.240548874900549, + "learning_rate": 7.250016583773232e-06, + "loss": 0.4426, + "step": 2459 + }, + { + "epoch": 0.37, + "grad_norm": 16.928179558543754, + "learning_rate": 7.247835063097e-06, + "loss": 0.5458, + "step": 2460 + }, + { + "epoch": 0.37, + "grad_norm": 22.699289561346102, + "learning_rate": 7.245653005998053e-06, + "loss": 0.5332, + "step": 2461 + }, + { + "epoch": 0.37, + "grad_norm": 7.047685134894629, + "learning_rate": 7.243470412997118e-06, + "loss": 0.4457, + "step": 2462 + }, + { + "epoch": 0.37, + "grad_norm": 17.866870740156013, + "learning_rate": 7.24128728461505e-06, + "loss": 0.475, + "step": 2463 + }, + { + "epoch": 0.37, + "grad_norm": 3.2274055359494023, + "learning_rate": 7.239103621372825e-06, + "loss": 0.4988, + "step": 2464 + }, + { + "epoch": 0.37, + "grad_norm": 1.3161080676896624, + "learning_rate": 7.2369194237915565e-06, + "loss": 0.5564, + "step": 2465 + }, + { + "epoch": 0.37, + "grad_norm": 5.130710110155509, + "learning_rate": 7.234734692392479e-06, + "loss": 0.555, + "step": 2466 + }, + { + "epoch": 0.37, + "grad_norm": 2.7000968887129133, + "learning_rate": 7.232549427696958e-06, + "loss": 0.4387, + "step": 2467 + }, + { + "epoch": 0.37, + "grad_norm": 13.615869986831212, + "learning_rate": 7.23036363022648e-06, + "loss": 0.4546, + "step": 2468 + }, + { + "epoch": 0.37, + "grad_norm": 5.047224633925523, + "learning_rate": 7.228177300502668e-06, + "loss": 0.4678, + "step": 2469 + }, + { + "epoch": 0.37, + "grad_norm": 3.5798114342673992, + "learning_rate": 7.225990439047265e-06, + "loss": 0.4365, + "step": 2470 + }, + { + "epoch": 0.37, + "grad_norm": 2.920529402273849, + "learning_rate": 7.223803046382142e-06, + "loss": 0.4861, + "step": 2471 + }, + { + "epoch": 0.37, + "grad_norm": 4.208049628186236, + "learning_rate": 7.2216151230293e-06, + "loss": 0.5203, + "step": 2472 + }, + { + "epoch": 0.37, + "grad_norm": 3.46167637221668, + "learning_rate": 7.219426669510862e-06, + "loss": 0.5041, + "step": 2473 + }, + { + "epoch": 0.37, + "grad_norm": 5.303352840461777, + "learning_rate": 7.2172376863490824e-06, + "loss": 0.4607, + "step": 2474 + }, + { + "epoch": 0.37, + "grad_norm": 3.4698930047824708, + "learning_rate": 7.215048174066337e-06, + "loss": 0.4473, + "step": 2475 + }, + { + "epoch": 0.37, + "grad_norm": 6.249019520549649, + "learning_rate": 7.212858133185132e-06, + "loss": 0.4515, + "step": 2476 + }, + { + "epoch": 0.37, + "grad_norm": 1.1874611045886068, + "learning_rate": 7.210667564228099e-06, + "loss": 0.5608, + "step": 2477 + }, + { + "epoch": 0.37, + "grad_norm": 2.500591819340289, + "learning_rate": 7.208476467717992e-06, + "loss": 0.4921, + "step": 2478 + }, + { + "epoch": 0.37, + "grad_norm": 3.2456384618481753, + "learning_rate": 7.206284844177695e-06, + "loss": 0.4683, + "step": 2479 + }, + { + "epoch": 0.37, + "grad_norm": 4.266524121256204, + "learning_rate": 7.204092694130218e-06, + "loss": 0.4988, + "step": 2480 + }, + { + "epoch": 0.37, + "grad_norm": 2.8134156725042487, + "learning_rate": 7.201900018098692e-06, + "loss": 0.4699, + "step": 2481 + }, + { + "epoch": 0.37, + "grad_norm": 4.045493583913286, + "learning_rate": 7.19970681660638e-06, + "loss": 0.4907, + "step": 2482 + }, + { + "epoch": 0.37, + "grad_norm": 2.4213431954863918, + "learning_rate": 7.197513090176663e-06, + "loss": 0.5486, + "step": 2483 + }, + { + "epoch": 0.37, + "grad_norm": 3.543827736508919, + "learning_rate": 7.195318839333054e-06, + "loss": 0.4544, + "step": 2484 + }, + { + "epoch": 0.37, + "grad_norm": 5.4460178828026455, + "learning_rate": 7.193124064599188e-06, + "loss": 0.5397, + "step": 2485 + }, + { + "epoch": 0.37, + "grad_norm": 3.13067526898253, + "learning_rate": 7.190928766498828e-06, + "loss": 0.543, + "step": 2486 + }, + { + "epoch": 0.38, + "grad_norm": 2.655323900086479, + "learning_rate": 7.188732945555854e-06, + "loss": 0.4816, + "step": 2487 + }, + { + "epoch": 0.38, + "grad_norm": 3.159541709150244, + "learning_rate": 7.186536602294278e-06, + "loss": 0.497, + "step": 2488 + }, + { + "epoch": 0.38, + "grad_norm": 2.733146261504955, + "learning_rate": 7.184339737238235e-06, + "loss": 0.478, + "step": 2489 + }, + { + "epoch": 0.38, + "grad_norm": 2.944714026763166, + "learning_rate": 7.182142350911986e-06, + "loss": 0.4398, + "step": 2490 + }, + { + "epoch": 0.38, + "grad_norm": 4.5159824289025154, + "learning_rate": 7.179944443839913e-06, + "loss": 0.4895, + "step": 2491 + }, + { + "epoch": 0.38, + "grad_norm": 1.2306893866020683, + "learning_rate": 7.177746016546525e-06, + "loss": 0.5594, + "step": 2492 + }, + { + "epoch": 0.38, + "grad_norm": 2.9144531074063504, + "learning_rate": 7.175547069556451e-06, + "loss": 0.5251, + "step": 2493 + }, + { + "epoch": 0.38, + "grad_norm": 3.4025611349858624, + "learning_rate": 7.17334760339445e-06, + "loss": 0.3815, + "step": 2494 + }, + { + "epoch": 0.38, + "grad_norm": 4.163767397372717, + "learning_rate": 7.171147618585401e-06, + "loss": 0.4447, + "step": 2495 + }, + { + "epoch": 0.38, + "grad_norm": 10.862737775664096, + "learning_rate": 7.16894711565431e-06, + "loss": 0.5153, + "step": 2496 + }, + { + "epoch": 0.38, + "grad_norm": 4.520721774046978, + "learning_rate": 7.1667460951263e-06, + "loss": 0.4075, + "step": 2497 + }, + { + "epoch": 0.38, + "grad_norm": 3.9311274655638675, + "learning_rate": 7.164544557526626e-06, + "loss": 0.5436, + "step": 2498 + }, + { + "epoch": 0.38, + "grad_norm": 3.1469865371162267, + "learning_rate": 7.162342503380659e-06, + "loss": 0.4243, + "step": 2499 + }, + { + "epoch": 0.38, + "grad_norm": 5.427717836716093, + "learning_rate": 7.160139933213899e-06, + "loss": 0.3954, + "step": 2500 + }, + { + "epoch": 0.38, + "grad_norm": 5.069241649688368, + "learning_rate": 7.1579368475519665e-06, + "loss": 0.4758, + "step": 2501 + }, + { + "epoch": 0.38, + "grad_norm": 5.682528917077807, + "learning_rate": 7.155733246920603e-06, + "loss": 0.5123, + "step": 2502 + }, + { + "epoch": 0.38, + "grad_norm": 5.282706188952751, + "learning_rate": 7.153529131845678e-06, + "loss": 0.4466, + "step": 2503 + }, + { + "epoch": 0.38, + "grad_norm": 6.174855962183863, + "learning_rate": 7.151324502853179e-06, + "loss": 0.494, + "step": 2504 + }, + { + "epoch": 0.38, + "grad_norm": 3.778926872876768, + "learning_rate": 7.149119360469218e-06, + "loss": 0.3703, + "step": 2505 + }, + { + "epoch": 0.38, + "grad_norm": 3.167661947319722, + "learning_rate": 7.146913705220031e-06, + "loss": 0.4402, + "step": 2506 + }, + { + "epoch": 0.38, + "grad_norm": 4.67117642766957, + "learning_rate": 7.144707537631974e-06, + "loss": 0.4509, + "step": 2507 + }, + { + "epoch": 0.38, + "grad_norm": 4.167825265498907, + "learning_rate": 7.142500858231526e-06, + "loss": 0.4666, + "step": 2508 + }, + { + "epoch": 0.38, + "grad_norm": 4.545973808294085, + "learning_rate": 7.140293667545289e-06, + "loss": 0.5098, + "step": 2509 + }, + { + "epoch": 0.38, + "grad_norm": 4.069696407357052, + "learning_rate": 7.138085966099986e-06, + "loss": 0.4161, + "step": 2510 + }, + { + "epoch": 0.38, + "grad_norm": 4.335668373189224, + "learning_rate": 7.135877754422462e-06, + "loss": 0.523, + "step": 2511 + }, + { + "epoch": 0.38, + "grad_norm": 3.009478497883999, + "learning_rate": 7.133669033039685e-06, + "loss": 0.4811, + "step": 2512 + }, + { + "epoch": 0.38, + "grad_norm": 2.7001275513313785, + "learning_rate": 7.131459802478742e-06, + "loss": 0.3922, + "step": 2513 + }, + { + "epoch": 0.38, + "grad_norm": 2.7703645069479585, + "learning_rate": 7.129250063266844e-06, + "loss": 0.4212, + "step": 2514 + }, + { + "epoch": 0.38, + "grad_norm": 4.550220398959434, + "learning_rate": 7.127039815931323e-06, + "loss": 0.499, + "step": 2515 + }, + { + "epoch": 0.38, + "grad_norm": 5.65764183758504, + "learning_rate": 7.124829060999632e-06, + "loss": 0.4994, + "step": 2516 + }, + { + "epoch": 0.38, + "grad_norm": 2.9428146673129807, + "learning_rate": 7.122617798999343e-06, + "loss": 0.4783, + "step": 2517 + }, + { + "epoch": 0.38, + "grad_norm": 3.350277750916895, + "learning_rate": 7.120406030458151e-06, + "loss": 0.4727, + "step": 2518 + }, + { + "epoch": 0.38, + "grad_norm": 3.074620205019584, + "learning_rate": 7.118193755903875e-06, + "loss": 0.4587, + "step": 2519 + }, + { + "epoch": 0.38, + "grad_norm": 4.147323801255245, + "learning_rate": 7.115980975864449e-06, + "loss": 0.5663, + "step": 2520 + }, + { + "epoch": 0.38, + "grad_norm": 3.986781005598588, + "learning_rate": 7.113767690867932e-06, + "loss": 0.4779, + "step": 2521 + }, + { + "epoch": 0.38, + "grad_norm": 4.093697721263707, + "learning_rate": 7.111553901442499e-06, + "loss": 0.5608, + "step": 2522 + }, + { + "epoch": 0.38, + "grad_norm": 3.360996968374214, + "learning_rate": 7.109339608116449e-06, + "loss": 0.4529, + "step": 2523 + }, + { + "epoch": 0.38, + "grad_norm": 2.7226960540930536, + "learning_rate": 7.107124811418201e-06, + "loss": 0.4583, + "step": 2524 + }, + { + "epoch": 0.38, + "grad_norm": 4.157622868312493, + "learning_rate": 7.1049095118762935e-06, + "loss": 0.537, + "step": 2525 + }, + { + "epoch": 0.38, + "grad_norm": 3.4151379421395025, + "learning_rate": 7.102693710019386e-06, + "loss": 0.4528, + "step": 2526 + }, + { + "epoch": 0.38, + "grad_norm": 2.352865024462654, + "learning_rate": 7.100477406376255e-06, + "loss": 0.51, + "step": 2527 + }, + { + "epoch": 0.38, + "grad_norm": 3.491197550580401, + "learning_rate": 7.098260601475798e-06, + "loss": 0.4745, + "step": 2528 + }, + { + "epoch": 0.38, + "grad_norm": 3.673017389117518, + "learning_rate": 7.096043295847035e-06, + "loss": 0.439, + "step": 2529 + }, + { + "epoch": 0.38, + "grad_norm": 3.117805873415266, + "learning_rate": 7.0938254900191e-06, + "loss": 0.4942, + "step": 2530 + }, + { + "epoch": 0.38, + "grad_norm": 8.208658959351665, + "learning_rate": 7.0916071845212546e-06, + "loss": 0.4647, + "step": 2531 + }, + { + "epoch": 0.38, + "grad_norm": 2.984412937025655, + "learning_rate": 7.089388379882869e-06, + "loss": 0.5581, + "step": 2532 + }, + { + "epoch": 0.38, + "grad_norm": 2.9007183279521063, + "learning_rate": 7.087169076633442e-06, + "loss": 0.5013, + "step": 2533 + }, + { + "epoch": 0.38, + "grad_norm": 5.48976722203762, + "learning_rate": 7.084949275302586e-06, + "loss": 0.5164, + "step": 2534 + }, + { + "epoch": 0.38, + "grad_norm": 6.246883197610956, + "learning_rate": 7.082728976420033e-06, + "loss": 0.5332, + "step": 2535 + }, + { + "epoch": 0.38, + "grad_norm": 3.246441603180961, + "learning_rate": 7.080508180515636e-06, + "loss": 0.5152, + "step": 2536 + }, + { + "epoch": 0.38, + "grad_norm": 2.7679058375365506, + "learning_rate": 7.078286888119364e-06, + "loss": 0.4669, + "step": 2537 + }, + { + "epoch": 0.38, + "grad_norm": 2.9939204605539587, + "learning_rate": 7.076065099761307e-06, + "loss": 0.4893, + "step": 2538 + }, + { + "epoch": 0.38, + "grad_norm": 7.434791090843588, + "learning_rate": 7.0738428159716695e-06, + "loss": 0.4548, + "step": 2539 + }, + { + "epoch": 0.38, + "grad_norm": 5.319598766866957, + "learning_rate": 7.0716200372807795e-06, + "loss": 0.3992, + "step": 2540 + }, + { + "epoch": 0.38, + "grad_norm": 3.64989431744513, + "learning_rate": 7.0693967642190784e-06, + "loss": 0.4575, + "step": 2541 + }, + { + "epoch": 0.38, + "grad_norm": 2.951892777578616, + "learning_rate": 7.0671729973171275e-06, + "loss": 0.4577, + "step": 2542 + }, + { + "epoch": 0.38, + "grad_norm": 4.497582046138382, + "learning_rate": 7.064948737105607e-06, + "loss": 0.3853, + "step": 2543 + }, + { + "epoch": 0.38, + "grad_norm": 3.771240642969287, + "learning_rate": 7.0627239841153124e-06, + "loss": 0.476, + "step": 2544 + }, + { + "epoch": 0.38, + "grad_norm": 3.6398774430586376, + "learning_rate": 7.060498738877159e-06, + "loss": 0.4996, + "step": 2545 + }, + { + "epoch": 0.38, + "grad_norm": 5.69946825518343, + "learning_rate": 7.058273001922179e-06, + "loss": 0.4385, + "step": 2546 + }, + { + "epoch": 0.38, + "grad_norm": 3.5087073478259616, + "learning_rate": 7.056046773781521e-06, + "loss": 0.5168, + "step": 2547 + }, + { + "epoch": 0.38, + "grad_norm": 2.7703037449211636, + "learning_rate": 7.05382005498645e-06, + "loss": 0.4804, + "step": 2548 + }, + { + "epoch": 0.38, + "grad_norm": 2.7462161805338607, + "learning_rate": 7.051592846068351e-06, + "loss": 0.4503, + "step": 2549 + }, + { + "epoch": 0.38, + "grad_norm": 2.6226172722161563, + "learning_rate": 7.049365147558726e-06, + "loss": 0.4589, + "step": 2550 + }, + { + "epoch": 0.38, + "grad_norm": 3.4481356288883025, + "learning_rate": 7.047136959989193e-06, + "loss": 0.4107, + "step": 2551 + }, + { + "epoch": 0.38, + "grad_norm": 4.801266433403592, + "learning_rate": 7.044908283891481e-06, + "loss": 0.4935, + "step": 2552 + }, + { + "epoch": 0.39, + "grad_norm": 3.068944555113061, + "learning_rate": 7.042679119797443e-06, + "loss": 0.4857, + "step": 2553 + }, + { + "epoch": 0.39, + "grad_norm": 4.584399671310988, + "learning_rate": 7.040449468239048e-06, + "loss": 0.4576, + "step": 2554 + }, + { + "epoch": 0.39, + "grad_norm": 89.62009189447063, + "learning_rate": 7.038219329748377e-06, + "loss": 0.5017, + "step": 2555 + }, + { + "epoch": 0.39, + "grad_norm": 8.08707089089859, + "learning_rate": 7.0359887048576325e-06, + "loss": 0.4956, + "step": 2556 + }, + { + "epoch": 0.39, + "grad_norm": 3.529183891188903, + "learning_rate": 7.033757594099126e-06, + "loss": 0.4326, + "step": 2557 + }, + { + "epoch": 0.39, + "grad_norm": 3.6028692682055605, + "learning_rate": 7.031525998005293e-06, + "loss": 0.53, + "step": 2558 + }, + { + "epoch": 0.39, + "grad_norm": 2.7971696327190503, + "learning_rate": 7.029293917108678e-06, + "loss": 0.5071, + "step": 2559 + }, + { + "epoch": 0.39, + "grad_norm": 4.487357698147262, + "learning_rate": 7.027061351941948e-06, + "loss": 0.403, + "step": 2560 + }, + { + "epoch": 0.39, + "grad_norm": 3.2582944488532712, + "learning_rate": 7.0248283030378775e-06, + "loss": 0.4743, + "step": 2561 + }, + { + "epoch": 0.39, + "grad_norm": 2.7277108087876063, + "learning_rate": 7.022594770929363e-06, + "loss": 0.4105, + "step": 2562 + }, + { + "epoch": 0.39, + "grad_norm": 3.279027233181638, + "learning_rate": 7.0203607561494125e-06, + "loss": 0.5217, + "step": 2563 + }, + { + "epoch": 0.39, + "grad_norm": 3.8943965921958297, + "learning_rate": 7.018126259231151e-06, + "loss": 0.5714, + "step": 2564 + }, + { + "epoch": 0.39, + "grad_norm": 4.499912385775661, + "learning_rate": 7.01589128070782e-06, + "loss": 0.4685, + "step": 2565 + }, + { + "epoch": 0.39, + "grad_norm": 4.634490660377617, + "learning_rate": 7.013655821112774e-06, + "loss": 0.4815, + "step": 2566 + }, + { + "epoch": 0.39, + "grad_norm": 3.7637684660022455, + "learning_rate": 7.01141988097948e-06, + "loss": 0.4504, + "step": 2567 + }, + { + "epoch": 0.39, + "grad_norm": 3.7158219227317977, + "learning_rate": 7.009183460841523e-06, + "loss": 0.4582, + "step": 2568 + }, + { + "epoch": 0.39, + "grad_norm": 3.298136652730904, + "learning_rate": 7.006946561232602e-06, + "loss": 0.5172, + "step": 2569 + }, + { + "epoch": 0.39, + "grad_norm": 4.296329536122423, + "learning_rate": 7.004709182686531e-06, + "loss": 0.5034, + "step": 2570 + }, + { + "epoch": 0.39, + "grad_norm": 4.531712317033271, + "learning_rate": 7.002471325737236e-06, + "loss": 0.4295, + "step": 2571 + }, + { + "epoch": 0.39, + "grad_norm": 3.391637753698668, + "learning_rate": 7.000232990918758e-06, + "loss": 0.4927, + "step": 2572 + }, + { + "epoch": 0.39, + "grad_norm": 3.180925132182002, + "learning_rate": 6.997994178765255e-06, + "loss": 0.4871, + "step": 2573 + }, + { + "epoch": 0.39, + "grad_norm": 6.303999549390409, + "learning_rate": 6.995754889810995e-06, + "loss": 0.4929, + "step": 2574 + }, + { + "epoch": 0.39, + "grad_norm": 3.3592218086160037, + "learning_rate": 6.993515124590362e-06, + "loss": 0.481, + "step": 2575 + }, + { + "epoch": 0.39, + "grad_norm": 3.9474698849426706, + "learning_rate": 6.9912748836378505e-06, + "loss": 0.4834, + "step": 2576 + }, + { + "epoch": 0.39, + "grad_norm": 4.190630069585373, + "learning_rate": 6.989034167488073e-06, + "loss": 0.509, + "step": 2577 + }, + { + "epoch": 0.39, + "grad_norm": 8.831245770172945, + "learning_rate": 6.986792976675753e-06, + "loss": 0.5488, + "step": 2578 + }, + { + "epoch": 0.39, + "grad_norm": 1.2584833943129166, + "learning_rate": 6.98455131173573e-06, + "loss": 0.5645, + "step": 2579 + }, + { + "epoch": 0.39, + "grad_norm": 5.6062386734017196, + "learning_rate": 6.982309173202951e-06, + "loss": 0.5275, + "step": 2580 + }, + { + "epoch": 0.39, + "grad_norm": 5.740900755627528, + "learning_rate": 6.98006656161248e-06, + "loss": 0.546, + "step": 2581 + }, + { + "epoch": 0.39, + "grad_norm": 16.033685579990035, + "learning_rate": 6.9778234774994935e-06, + "loss": 0.5056, + "step": 2582 + }, + { + "epoch": 0.39, + "grad_norm": 5.817444108062227, + "learning_rate": 6.97557992139928e-06, + "loss": 0.4031, + "step": 2583 + }, + { + "epoch": 0.39, + "grad_norm": 3.7861456121972408, + "learning_rate": 6.9733358938472425e-06, + "loss": 0.4846, + "step": 2584 + }, + { + "epoch": 0.39, + "grad_norm": 3.9977890308450332, + "learning_rate": 6.971091395378895e-06, + "loss": 0.4232, + "step": 2585 + }, + { + "epoch": 0.39, + "grad_norm": 2.4247895174454985, + "learning_rate": 6.96884642652986e-06, + "loss": 0.429, + "step": 2586 + }, + { + "epoch": 0.39, + "grad_norm": 1.3825941655559477, + "learning_rate": 6.966600987835882e-06, + "loss": 0.62, + "step": 2587 + }, + { + "epoch": 0.39, + "grad_norm": 7.201280933879752, + "learning_rate": 6.964355079832808e-06, + "loss": 0.4298, + "step": 2588 + }, + { + "epoch": 0.39, + "grad_norm": 4.560599717145375, + "learning_rate": 6.962108703056602e-06, + "loss": 0.4791, + "step": 2589 + }, + { + "epoch": 0.39, + "grad_norm": 4.4151280085291935, + "learning_rate": 6.95986185804334e-06, + "loss": 0.4078, + "step": 2590 + }, + { + "epoch": 0.39, + "grad_norm": 3.751651247825529, + "learning_rate": 6.957614545329207e-06, + "loss": 0.468, + "step": 2591 + }, + { + "epoch": 0.39, + "grad_norm": 3.4408467067611945, + "learning_rate": 6.9553667654504995e-06, + "loss": 0.4543, + "step": 2592 + }, + { + "epoch": 0.39, + "grad_norm": 11.750691414016371, + "learning_rate": 6.953118518943629e-06, + "loss": 0.4532, + "step": 2593 + }, + { + "epoch": 0.39, + "grad_norm": 10.756542615531776, + "learning_rate": 6.950869806345117e-06, + "loss": 0.4617, + "step": 2594 + }, + { + "epoch": 0.39, + "grad_norm": 1.1192898662299482, + "learning_rate": 6.948620628191596e-06, + "loss": 0.5645, + "step": 2595 + }, + { + "epoch": 0.39, + "grad_norm": 4.420109002475213, + "learning_rate": 6.9463709850198055e-06, + "loss": 0.4584, + "step": 2596 + }, + { + "epoch": 0.39, + "grad_norm": 5.393486846886865, + "learning_rate": 6.944120877366605e-06, + "loss": 0.4592, + "step": 2597 + }, + { + "epoch": 0.39, + "grad_norm": 5.636295957254785, + "learning_rate": 6.941870305768954e-06, + "loss": 0.4767, + "step": 2598 + }, + { + "epoch": 0.39, + "grad_norm": 3.395552489517268, + "learning_rate": 6.939619270763933e-06, + "loss": 0.4649, + "step": 2599 + }, + { + "epoch": 0.39, + "grad_norm": 7.602451829478677, + "learning_rate": 6.937367772888726e-06, + "loss": 0.4931, + "step": 2600 + }, + { + "epoch": 0.39, + "grad_norm": 4.37539369439954, + "learning_rate": 6.935115812680629e-06, + "loss": 0.5108, + "step": 2601 + }, + { + "epoch": 0.39, + "grad_norm": 3.113245454140188, + "learning_rate": 6.932863390677052e-06, + "loss": 0.4491, + "step": 2602 + }, + { + "epoch": 0.39, + "grad_norm": 3.5929787021613375, + "learning_rate": 6.93061050741551e-06, + "loss": 0.4919, + "step": 2603 + }, + { + "epoch": 0.39, + "grad_norm": 4.731494332202367, + "learning_rate": 6.928357163433631e-06, + "loss": 0.4634, + "step": 2604 + }, + { + "epoch": 0.39, + "grad_norm": 10.75513019726705, + "learning_rate": 6.926103359269152e-06, + "loss": 0.4525, + "step": 2605 + }, + { + "epoch": 0.39, + "grad_norm": 6.562060049884234, + "learning_rate": 6.9238490954599215e-06, + "loss": 0.5108, + "step": 2606 + }, + { + "epoch": 0.39, + "grad_norm": 7.474435969949516, + "learning_rate": 6.921594372543894e-06, + "loss": 0.5554, + "step": 2607 + }, + { + "epoch": 0.39, + "grad_norm": 1.190822645139277, + "learning_rate": 6.91933919105914e-06, + "loss": 0.5453, + "step": 2608 + }, + { + "epoch": 0.39, + "grad_norm": 4.384564796257071, + "learning_rate": 6.917083551543832e-06, + "loss": 0.5438, + "step": 2609 + }, + { + "epoch": 0.39, + "grad_norm": 3.8306014508925017, + "learning_rate": 6.914827454536255e-06, + "loss": 0.428, + "step": 2610 + }, + { + "epoch": 0.39, + "grad_norm": 4.965405593412001, + "learning_rate": 6.912570900574805e-06, + "loss": 0.616, + "step": 2611 + }, + { + "epoch": 0.39, + "grad_norm": 5.0653542434579135, + "learning_rate": 6.910313890197983e-06, + "loss": 0.4735, + "step": 2612 + }, + { + "epoch": 0.39, + "grad_norm": 3.590459889582739, + "learning_rate": 6.908056423944404e-06, + "loss": 0.4413, + "step": 2613 + }, + { + "epoch": 0.39, + "grad_norm": 26.201162635074574, + "learning_rate": 6.905798502352788e-06, + "loss": 0.5028, + "step": 2614 + }, + { + "epoch": 0.39, + "grad_norm": 4.153301014924827, + "learning_rate": 6.903540125961966e-06, + "loss": 0.4381, + "step": 2615 + }, + { + "epoch": 0.39, + "grad_norm": 5.043632976805833, + "learning_rate": 6.9012812953108746e-06, + "loss": 0.4538, + "step": 2616 + }, + { + "epoch": 0.39, + "grad_norm": 8.920155585033736, + "learning_rate": 6.8990220109385595e-06, + "loss": 0.4753, + "step": 2617 + }, + { + "epoch": 0.39, + "grad_norm": 3.7248487419543044, + "learning_rate": 6.896762273384179e-06, + "loss": 0.4228, + "step": 2618 + }, + { + "epoch": 0.4, + "grad_norm": 3.510265667654125, + "learning_rate": 6.894502083186993e-06, + "loss": 0.4925, + "step": 2619 + }, + { + "epoch": 0.4, + "grad_norm": 3.166504893131973, + "learning_rate": 6.892241440886377e-06, + "loss": 0.5124, + "step": 2620 + }, + { + "epoch": 0.4, + "grad_norm": 6.151127580555455, + "learning_rate": 6.889980347021804e-06, + "loss": 0.5123, + "step": 2621 + }, + { + "epoch": 0.4, + "grad_norm": 2.8269478317390186, + "learning_rate": 6.887718802132864e-06, + "loss": 0.4582, + "step": 2622 + }, + { + "epoch": 0.4, + "grad_norm": 1.3073494768057674, + "learning_rate": 6.885456806759253e-06, + "loss": 0.5866, + "step": 2623 + }, + { + "epoch": 0.4, + "grad_norm": 3.750272131537456, + "learning_rate": 6.88319436144077e-06, + "loss": 0.4874, + "step": 2624 + }, + { + "epoch": 0.4, + "grad_norm": 2.7393839217074505, + "learning_rate": 6.880931466717327e-06, + "loss": 0.3877, + "step": 2625 + }, + { + "epoch": 0.4, + "grad_norm": 2.707357918916511, + "learning_rate": 6.878668123128939e-06, + "loss": 0.4752, + "step": 2626 + }, + { + "epoch": 0.4, + "grad_norm": 1.5067168084540215, + "learning_rate": 6.876404331215728e-06, + "loss": 0.5904, + "step": 2627 + }, + { + "epoch": 0.4, + "grad_norm": 3.538787638451028, + "learning_rate": 6.874140091517927e-06, + "loss": 0.4801, + "step": 2628 + }, + { + "epoch": 0.4, + "grad_norm": 2.435453555584165, + "learning_rate": 6.871875404575874e-06, + "loss": 0.4794, + "step": 2629 + }, + { + "epoch": 0.4, + "grad_norm": 4.94482919738694, + "learning_rate": 6.8696102709300104e-06, + "loss": 0.4962, + "step": 2630 + }, + { + "epoch": 0.4, + "grad_norm": 2.989348944456226, + "learning_rate": 6.867344691120891e-06, + "loss": 0.4813, + "step": 2631 + }, + { + "epoch": 0.4, + "grad_norm": 4.077974254547494, + "learning_rate": 6.865078665689169e-06, + "loss": 0.4352, + "step": 2632 + }, + { + "epoch": 0.4, + "grad_norm": 2.929602877981309, + "learning_rate": 6.86281219517561e-06, + "loss": 0.5044, + "step": 2633 + }, + { + "epoch": 0.4, + "grad_norm": 2.7794802641800005, + "learning_rate": 6.860545280121086e-06, + "loss": 0.464, + "step": 2634 + }, + { + "epoch": 0.4, + "grad_norm": 3.8820943903575693, + "learning_rate": 6.858277921066568e-06, + "loss": 0.484, + "step": 2635 + }, + { + "epoch": 0.4, + "grad_norm": 3.942733930052468, + "learning_rate": 6.856010118553143e-06, + "loss": 0.4804, + "step": 2636 + }, + { + "epoch": 0.4, + "grad_norm": 3.0416033016526267, + "learning_rate": 6.8537418731219955e-06, + "loss": 0.5735, + "step": 2637 + }, + { + "epoch": 0.4, + "grad_norm": 5.392032665513893, + "learning_rate": 6.851473185314419e-06, + "loss": 0.4148, + "step": 2638 + }, + { + "epoch": 0.4, + "grad_norm": 4.30508253906485, + "learning_rate": 6.849204055671815e-06, + "loss": 0.4815, + "step": 2639 + }, + { + "epoch": 0.4, + "grad_norm": 4.76620153612275, + "learning_rate": 6.846934484735685e-06, + "loss": 0.4613, + "step": 2640 + }, + { + "epoch": 0.4, + "grad_norm": 2.4042593511934442, + "learning_rate": 6.844664473047641e-06, + "loss": 0.473, + "step": 2641 + }, + { + "epoch": 0.4, + "grad_norm": 3.2144097064836137, + "learning_rate": 6.8423940211493964e-06, + "loss": 0.4952, + "step": 2642 + }, + { + "epoch": 0.4, + "grad_norm": 2.6414567024726314, + "learning_rate": 6.840123129582773e-06, + "loss": 0.5099, + "step": 2643 + }, + { + "epoch": 0.4, + "grad_norm": 3.0920252857656245, + "learning_rate": 6.837851798889694e-06, + "loss": 0.451, + "step": 2644 + }, + { + "epoch": 0.4, + "grad_norm": 5.073033550987081, + "learning_rate": 6.83558002961219e-06, + "loss": 0.4978, + "step": 2645 + }, + { + "epoch": 0.4, + "grad_norm": 5.663003090004529, + "learning_rate": 6.833307822292393e-06, + "loss": 0.4602, + "step": 2646 + }, + { + "epoch": 0.4, + "grad_norm": 2.653286777601565, + "learning_rate": 6.831035177472545e-06, + "loss": 0.4853, + "step": 2647 + }, + { + "epoch": 0.4, + "grad_norm": 3.406204248708831, + "learning_rate": 6.82876209569499e-06, + "loss": 0.4933, + "step": 2648 + }, + { + "epoch": 0.4, + "grad_norm": 2.94126719179757, + "learning_rate": 6.826488577502173e-06, + "loss": 0.402, + "step": 2649 + }, + { + "epoch": 0.4, + "grad_norm": 4.236495961773163, + "learning_rate": 6.824214623436645e-06, + "loss": 0.4123, + "step": 2650 + }, + { + "epoch": 0.4, + "grad_norm": 5.4983167598309715, + "learning_rate": 6.821940234041065e-06, + "loss": 0.4863, + "step": 2651 + }, + { + "epoch": 0.4, + "grad_norm": 1.365559063673132, + "learning_rate": 6.819665409858188e-06, + "loss": 0.5593, + "step": 2652 + }, + { + "epoch": 0.4, + "grad_norm": 2.526755912636174, + "learning_rate": 6.817390151430884e-06, + "loss": 0.4294, + "step": 2653 + }, + { + "epoch": 0.4, + "grad_norm": 17.18988335039718, + "learning_rate": 6.815114459302115e-06, + "loss": 0.4283, + "step": 2654 + }, + { + "epoch": 0.4, + "grad_norm": 3.6461214322071704, + "learning_rate": 6.8128383340149516e-06, + "loss": 0.4337, + "step": 2655 + }, + { + "epoch": 0.4, + "grad_norm": 7.053656956856703, + "learning_rate": 6.81056177611257e-06, + "loss": 0.4957, + "step": 2656 + }, + { + "epoch": 0.4, + "grad_norm": 3.7726857858812695, + "learning_rate": 6.808284786138246e-06, + "loss": 0.479, + "step": 2657 + }, + { + "epoch": 0.4, + "grad_norm": 2.6193265523656386, + "learning_rate": 6.806007364635361e-06, + "loss": 0.4444, + "step": 2658 + }, + { + "epoch": 0.4, + "grad_norm": 2.791467929122126, + "learning_rate": 6.803729512147396e-06, + "loss": 0.4993, + "step": 2659 + }, + { + "epoch": 0.4, + "grad_norm": 2.739295639482101, + "learning_rate": 6.801451229217938e-06, + "loss": 0.4635, + "step": 2660 + }, + { + "epoch": 0.4, + "grad_norm": 1.52218733029149, + "learning_rate": 6.799172516390678e-06, + "loss": 0.5663, + "step": 2661 + }, + { + "epoch": 0.4, + "grad_norm": 2.6392401882982703, + "learning_rate": 6.7968933742094035e-06, + "loss": 0.4145, + "step": 2662 + }, + { + "epoch": 0.4, + "grad_norm": 2.830439049912536, + "learning_rate": 6.794613803218012e-06, + "loss": 0.5252, + "step": 2663 + }, + { + "epoch": 0.4, + "grad_norm": 2.4767301364663012, + "learning_rate": 6.792333803960497e-06, + "loss": 0.4029, + "step": 2664 + }, + { + "epoch": 0.4, + "grad_norm": 2.4096789682837514, + "learning_rate": 6.790053376980958e-06, + "loss": 0.4777, + "step": 2665 + }, + { + "epoch": 0.4, + "grad_norm": 2.8997601813328835, + "learning_rate": 6.787772522823596e-06, + "loss": 0.4425, + "step": 2666 + }, + { + "epoch": 0.4, + "grad_norm": 3.877382753823121, + "learning_rate": 6.7854912420327114e-06, + "loss": 0.4772, + "step": 2667 + }, + { + "epoch": 0.4, + "grad_norm": 3.7273241582245613, + "learning_rate": 6.783209535152712e-06, + "loss": 0.4881, + "step": 2668 + }, + { + "epoch": 0.4, + "grad_norm": 2.5765791607966126, + "learning_rate": 6.780927402728099e-06, + "loss": 0.5036, + "step": 2669 + }, + { + "epoch": 0.4, + "grad_norm": 2.9779269179484147, + "learning_rate": 6.778644845303482e-06, + "loss": 0.4409, + "step": 2670 + }, + { + "epoch": 0.4, + "grad_norm": 2.7333926579117813, + "learning_rate": 6.776361863423572e-06, + "loss": 0.4434, + "step": 2671 + }, + { + "epoch": 0.4, + "grad_norm": 3.7413579779823234, + "learning_rate": 6.774078457633177e-06, + "loss": 0.4753, + "step": 2672 + }, + { + "epoch": 0.4, + "grad_norm": 3.3693737489658044, + "learning_rate": 6.771794628477209e-06, + "loss": 0.4619, + "step": 2673 + }, + { + "epoch": 0.4, + "grad_norm": 6.365353032449093, + "learning_rate": 6.769510376500677e-06, + "loss": 0.4195, + "step": 2674 + }, + { + "epoch": 0.4, + "grad_norm": 4.382994192876083, + "learning_rate": 6.767225702248699e-06, + "loss": 0.4237, + "step": 2675 + }, + { + "epoch": 0.4, + "grad_norm": 3.852593872063646, + "learning_rate": 6.764940606266485e-06, + "loss": 0.5618, + "step": 2676 + }, + { + "epoch": 0.4, + "grad_norm": 4.902473712492849, + "learning_rate": 6.762655089099353e-06, + "loss": 0.4808, + "step": 2677 + }, + { + "epoch": 0.4, + "grad_norm": 3.385107298871977, + "learning_rate": 6.760369151292719e-06, + "loss": 0.5603, + "step": 2678 + }, + { + "epoch": 0.4, + "grad_norm": 2.9931059468018706, + "learning_rate": 6.758082793392094e-06, + "loss": 0.5248, + "step": 2679 + }, + { + "epoch": 0.4, + "grad_norm": 2.9078021827484686, + "learning_rate": 6.755796015943097e-06, + "loss": 0.4673, + "step": 2680 + }, + { + "epoch": 0.4, + "grad_norm": 4.063288847436795, + "learning_rate": 6.753508819491443e-06, + "loss": 0.4333, + "step": 2681 + }, + { + "epoch": 0.4, + "grad_norm": 3.9265000427314765, + "learning_rate": 6.751221204582948e-06, + "loss": 0.4893, + "step": 2682 + }, + { + "epoch": 0.4, + "grad_norm": 3.3807908021344044, + "learning_rate": 6.748933171763527e-06, + "loss": 0.4696, + "step": 2683 + }, + { + "epoch": 0.4, + "grad_norm": 5.415496291709117, + "learning_rate": 6.746644721579198e-06, + "loss": 0.4686, + "step": 2684 + }, + { + "epoch": 0.4, + "grad_norm": 3.424958446614556, + "learning_rate": 6.744355854576074e-06, + "loss": 0.4904, + "step": 2685 + }, + { + "epoch": 0.41, + "grad_norm": 2.938663623029467, + "learning_rate": 6.742066571300368e-06, + "loss": 0.4533, + "step": 2686 + }, + { + "epoch": 0.41, + "grad_norm": 3.5134535953486945, + "learning_rate": 6.739776872298398e-06, + "loss": 0.4413, + "step": 2687 + }, + { + "epoch": 0.41, + "grad_norm": 4.575249574934905, + "learning_rate": 6.7374867581165745e-06, + "loss": 0.5178, + "step": 2688 + }, + { + "epoch": 0.41, + "grad_norm": 4.442213835426479, + "learning_rate": 6.735196229301409e-06, + "loss": 0.5173, + "step": 2689 + }, + { + "epoch": 0.41, + "grad_norm": 2.9244530376008386, + "learning_rate": 6.732905286399516e-06, + "loss": 0.4261, + "step": 2690 + }, + { + "epoch": 0.41, + "grad_norm": 2.8923016878278243, + "learning_rate": 6.730613929957601e-06, + "loss": 0.5049, + "step": 2691 + }, + { + "epoch": 0.41, + "grad_norm": 4.700886026041967, + "learning_rate": 6.728322160522476e-06, + "loss": 0.5345, + "step": 2692 + }, + { + "epoch": 0.41, + "grad_norm": 2.131206645849946, + "learning_rate": 6.726029978641046e-06, + "loss": 0.4321, + "step": 2693 + }, + { + "epoch": 0.41, + "grad_norm": 1.3231796218617438, + "learning_rate": 6.723737384860319e-06, + "loss": 0.5587, + "step": 2694 + }, + { + "epoch": 0.41, + "grad_norm": 4.23720195648198, + "learning_rate": 6.721444379727398e-06, + "loss": 0.4699, + "step": 2695 + }, + { + "epoch": 0.41, + "grad_norm": 5.067988253870886, + "learning_rate": 6.719150963789484e-06, + "loss": 0.4491, + "step": 2696 + }, + { + "epoch": 0.41, + "grad_norm": 3.792634355597209, + "learning_rate": 6.7168571375938775e-06, + "loss": 0.4494, + "step": 2697 + }, + { + "epoch": 0.41, + "grad_norm": 3.7008036062053256, + "learning_rate": 6.714562901687978e-06, + "loss": 0.4924, + "step": 2698 + }, + { + "epoch": 0.41, + "grad_norm": 4.170969714281913, + "learning_rate": 6.712268256619279e-06, + "loss": 0.4247, + "step": 2699 + }, + { + "epoch": 0.41, + "grad_norm": 8.560364176622954, + "learning_rate": 6.7099732029353735e-06, + "loss": 0.4192, + "step": 2700 + }, + { + "epoch": 0.41, + "grad_norm": 10.472714094273426, + "learning_rate": 6.707677741183957e-06, + "loss": 0.4725, + "step": 2701 + }, + { + "epoch": 0.41, + "grad_norm": 4.1451821467443475, + "learning_rate": 6.705381871912813e-06, + "loss": 0.5135, + "step": 2702 + }, + { + "epoch": 0.41, + "grad_norm": 4.2289625758079215, + "learning_rate": 6.703085595669832e-06, + "loss": 0.4566, + "step": 2703 + }, + { + "epoch": 0.41, + "grad_norm": 2.8351475757162126, + "learning_rate": 6.70078891300299e-06, + "loss": 0.394, + "step": 2704 + }, + { + "epoch": 0.41, + "grad_norm": 2.7606016174901455, + "learning_rate": 6.698491824460371e-06, + "loss": 0.4341, + "step": 2705 + }, + { + "epoch": 0.41, + "grad_norm": 4.930043687640907, + "learning_rate": 6.6961943305901515e-06, + "loss": 0.4231, + "step": 2706 + }, + { + "epoch": 0.41, + "grad_norm": 17.15976904357539, + "learning_rate": 6.693896431940605e-06, + "loss": 0.9852, + "step": 2707 + }, + { + "epoch": 0.41, + "grad_norm": 6.857941503136434, + "learning_rate": 6.6915981290601e-06, + "loss": 0.5087, + "step": 2708 + }, + { + "epoch": 0.41, + "grad_norm": 2.6352471493517857, + "learning_rate": 6.6892994224971044e-06, + "loss": 0.5104, + "step": 2709 + }, + { + "epoch": 0.41, + "grad_norm": 4.391006285927482, + "learning_rate": 6.6870003128001784e-06, + "loss": 0.5035, + "step": 2710 + }, + { + "epoch": 0.41, + "grad_norm": 6.589500913637127, + "learning_rate": 6.684700800517981e-06, + "loss": 0.4585, + "step": 2711 + }, + { + "epoch": 0.41, + "grad_norm": 5.365026576219805, + "learning_rate": 6.682400886199272e-06, + "loss": 0.4772, + "step": 2712 + }, + { + "epoch": 0.41, + "grad_norm": 4.490439023431818, + "learning_rate": 6.680100570392898e-06, + "loss": 0.5004, + "step": 2713 + }, + { + "epoch": 0.41, + "grad_norm": 3.1036526206907173, + "learning_rate": 6.677799853647804e-06, + "loss": 0.4785, + "step": 2714 + }, + { + "epoch": 0.41, + "grad_norm": 3.4725433148499447, + "learning_rate": 6.675498736513036e-06, + "loss": 0.4204, + "step": 2715 + }, + { + "epoch": 0.41, + "grad_norm": 6.750064706379368, + "learning_rate": 6.673197219537731e-06, + "loss": 0.5236, + "step": 2716 + }, + { + "epoch": 0.41, + "grad_norm": 2.923850515142398, + "learning_rate": 6.670895303271123e-06, + "loss": 0.5273, + "step": 2717 + }, + { + "epoch": 0.41, + "grad_norm": 4.204699151378934, + "learning_rate": 6.668592988262539e-06, + "loss": 0.5328, + "step": 2718 + }, + { + "epoch": 0.41, + "grad_norm": 4.654279067061114, + "learning_rate": 6.666290275061404e-06, + "loss": 0.4557, + "step": 2719 + }, + { + "epoch": 0.41, + "grad_norm": 2.9806359355838428, + "learning_rate": 6.663987164217236e-06, + "loss": 0.4571, + "step": 2720 + }, + { + "epoch": 0.41, + "grad_norm": 3.730724760495364, + "learning_rate": 6.66168365627965e-06, + "loss": 0.4758, + "step": 2721 + }, + { + "epoch": 0.41, + "grad_norm": 4.565231152658073, + "learning_rate": 6.659379751798352e-06, + "loss": 0.5295, + "step": 2722 + }, + { + "epoch": 0.41, + "grad_norm": 3.6104070435410693, + "learning_rate": 6.657075451323148e-06, + "loss": 0.4436, + "step": 2723 + }, + { + "epoch": 0.41, + "grad_norm": 4.834102785072117, + "learning_rate": 6.654770755403934e-06, + "loss": 0.5049, + "step": 2724 + }, + { + "epoch": 0.41, + "grad_norm": 3.2652921551846097, + "learning_rate": 6.652465664590702e-06, + "loss": 0.4132, + "step": 2725 + }, + { + "epoch": 0.41, + "grad_norm": 3.202111885046156, + "learning_rate": 6.650160179433539e-06, + "loss": 0.4889, + "step": 2726 + }, + { + "epoch": 0.41, + "grad_norm": 7.784382570564905, + "learning_rate": 6.647854300482626e-06, + "loss": 0.434, + "step": 2727 + }, + { + "epoch": 0.41, + "grad_norm": 3.0026001243955185, + "learning_rate": 6.645548028288234e-06, + "loss": 0.5168, + "step": 2728 + }, + { + "epoch": 0.41, + "grad_norm": 3.922247361610575, + "learning_rate": 6.643241363400732e-06, + "loss": 0.4567, + "step": 2729 + }, + { + "epoch": 0.41, + "grad_norm": 4.6272641005123125, + "learning_rate": 6.640934306370585e-06, + "loss": 0.4556, + "step": 2730 + }, + { + "epoch": 0.41, + "grad_norm": 5.772712535106437, + "learning_rate": 6.6386268577483474e-06, + "loss": 0.5463, + "step": 2731 + }, + { + "epoch": 0.41, + "grad_norm": 2.38711761039972, + "learning_rate": 6.636319018084667e-06, + "loss": 0.5496, + "step": 2732 + }, + { + "epoch": 0.41, + "grad_norm": 2.984509874288353, + "learning_rate": 6.634010787930284e-06, + "loss": 0.4662, + "step": 2733 + }, + { + "epoch": 0.41, + "grad_norm": 3.1742101561885017, + "learning_rate": 6.631702167836037e-06, + "loss": 0.4679, + "step": 2734 + }, + { + "epoch": 0.41, + "grad_norm": 4.682229800713317, + "learning_rate": 6.629393158352854e-06, + "loss": 0.5045, + "step": 2735 + }, + { + "epoch": 0.41, + "grad_norm": 2.6638278667558954, + "learning_rate": 6.627083760031755e-06, + "loss": 0.4134, + "step": 2736 + }, + { + "epoch": 0.41, + "grad_norm": 3.4168407905465195, + "learning_rate": 6.624773973423856e-06, + "loss": 0.4827, + "step": 2737 + }, + { + "epoch": 0.41, + "grad_norm": 4.037439292160289, + "learning_rate": 6.622463799080362e-06, + "loss": 0.4877, + "step": 2738 + }, + { + "epoch": 0.41, + "grad_norm": 1.2624109053429897, + "learning_rate": 6.620153237552572e-06, + "loss": 0.5612, + "step": 2739 + }, + { + "epoch": 0.41, + "grad_norm": 4.513823717854631, + "learning_rate": 6.617842289391881e-06, + "loss": 0.4468, + "step": 2740 + }, + { + "epoch": 0.41, + "grad_norm": 4.593256282675752, + "learning_rate": 6.615530955149768e-06, + "loss": 0.4998, + "step": 2741 + }, + { + "epoch": 0.41, + "grad_norm": 4.1171043512366605, + "learning_rate": 6.613219235377815e-06, + "loss": 0.4787, + "step": 2742 + }, + { + "epoch": 0.41, + "grad_norm": 2.900379386048598, + "learning_rate": 6.610907130627685e-06, + "loss": 0.3699, + "step": 2743 + }, + { + "epoch": 0.41, + "grad_norm": 4.394613372515317, + "learning_rate": 6.60859464145114e-06, + "loss": 0.5168, + "step": 2744 + }, + { + "epoch": 0.41, + "grad_norm": 3.217660764172856, + "learning_rate": 6.606281768400032e-06, + "loss": 0.4124, + "step": 2745 + }, + { + "epoch": 0.41, + "grad_norm": 5.916730129757846, + "learning_rate": 6.603968512026305e-06, + "loss": 0.5176, + "step": 2746 + }, + { + "epoch": 0.41, + "grad_norm": 7.302544562951499, + "learning_rate": 6.601654872881993e-06, + "loss": 0.4461, + "step": 2747 + }, + { + "epoch": 0.41, + "grad_norm": 6.583520799382047, + "learning_rate": 6.5993408515192224e-06, + "loss": 0.4112, + "step": 2748 + }, + { + "epoch": 0.41, + "grad_norm": 7.622447817236105, + "learning_rate": 6.59702644849021e-06, + "loss": 0.4569, + "step": 2749 + }, + { + "epoch": 0.41, + "grad_norm": 14.327070902254965, + "learning_rate": 6.5947116643472646e-06, + "loss": 0.4354, + "step": 2750 + }, + { + "epoch": 0.41, + "grad_norm": 3.266120893302219, + "learning_rate": 6.592396499642786e-06, + "loss": 0.4705, + "step": 2751 + }, + { + "epoch": 0.42, + "grad_norm": 8.911482668587954, + "learning_rate": 6.5900809549292646e-06, + "loss": 0.4877, + "step": 2752 + }, + { + "epoch": 0.42, + "grad_norm": 6.1065688896443655, + "learning_rate": 6.587765030759279e-06, + "loss": 0.4363, + "step": 2753 + }, + { + "epoch": 0.42, + "grad_norm": 6.30583973820975, + "learning_rate": 6.585448727685505e-06, + "loss": 0.4262, + "step": 2754 + }, + { + "epoch": 0.42, + "grad_norm": 1.1605210794029632, + "learning_rate": 6.583132046260701e-06, + "loss": 0.524, + "step": 2755 + }, + { + "epoch": 0.42, + "grad_norm": 7.583254289165055, + "learning_rate": 6.580814987037719e-06, + "loss": 0.5538, + "step": 2756 + }, + { + "epoch": 0.42, + "grad_norm": 5.897575167215788, + "learning_rate": 6.578497550569504e-06, + "loss": 0.452, + "step": 2757 + }, + { + "epoch": 0.42, + "grad_norm": 5.633504636764331, + "learning_rate": 6.576179737409087e-06, + "loss": 0.4644, + "step": 2758 + }, + { + "epoch": 0.42, + "grad_norm": 9.925804311796853, + "learning_rate": 6.573861548109589e-06, + "loss": 0.4791, + "step": 2759 + }, + { + "epoch": 0.42, + "grad_norm": 5.528698096542152, + "learning_rate": 6.571542983224224e-06, + "loss": 0.4503, + "step": 2760 + }, + { + "epoch": 0.42, + "grad_norm": 8.019478417165658, + "learning_rate": 6.569224043306293e-06, + "loss": 0.4652, + "step": 2761 + }, + { + "epoch": 0.42, + "grad_norm": 5.897658191236235, + "learning_rate": 6.566904728909188e-06, + "loss": 0.4387, + "step": 2762 + }, + { + "epoch": 0.42, + "grad_norm": 5.914205376551238, + "learning_rate": 6.564585040586388e-06, + "loss": 0.4214, + "step": 2763 + }, + { + "epoch": 0.42, + "grad_norm": 26.90624661216952, + "learning_rate": 6.562264978891462e-06, + "loss": 0.4742, + "step": 2764 + }, + { + "epoch": 0.42, + "grad_norm": 6.4542601911009205, + "learning_rate": 6.559944544378072e-06, + "loss": 0.5071, + "step": 2765 + }, + { + "epoch": 0.42, + "grad_norm": 20.323793108299714, + "learning_rate": 6.557623737599963e-06, + "loss": 0.4653, + "step": 2766 + }, + { + "epoch": 0.42, + "grad_norm": 10.130791431956046, + "learning_rate": 6.555302559110973e-06, + "loss": 0.4572, + "step": 2767 + }, + { + "epoch": 0.42, + "grad_norm": 1.554891748820894, + "learning_rate": 6.552981009465027e-06, + "loss": 0.6201, + "step": 2768 + }, + { + "epoch": 0.42, + "grad_norm": 10.87098358450737, + "learning_rate": 6.550659089216138e-06, + "loss": 0.4579, + "step": 2769 + }, + { + "epoch": 0.42, + "grad_norm": 8.068972908832782, + "learning_rate": 6.548336798918411e-06, + "loss": 0.5396, + "step": 2770 + }, + { + "epoch": 0.42, + "grad_norm": 6.827314715195966, + "learning_rate": 6.5460141391260345e-06, + "loss": 0.45, + "step": 2771 + }, + { + "epoch": 0.42, + "grad_norm": 11.243824523464562, + "learning_rate": 6.543691110393289e-06, + "loss": 0.4345, + "step": 2772 + }, + { + "epoch": 0.42, + "grad_norm": 8.203276108272592, + "learning_rate": 6.541367713274539e-06, + "loss": 0.5186, + "step": 2773 + }, + { + "epoch": 0.42, + "grad_norm": 9.386941785964687, + "learning_rate": 6.539043948324239e-06, + "loss": 0.5822, + "step": 2774 + }, + { + "epoch": 0.42, + "grad_norm": 8.838100108215123, + "learning_rate": 6.536719816096936e-06, + "loss": 0.4712, + "step": 2775 + }, + { + "epoch": 0.42, + "grad_norm": 5.719951804297019, + "learning_rate": 6.534395317147257e-06, + "loss": 0.4603, + "step": 2776 + }, + { + "epoch": 0.42, + "grad_norm": 8.897280460439331, + "learning_rate": 6.532070452029922e-06, + "loss": 0.4738, + "step": 2777 + }, + { + "epoch": 0.42, + "grad_norm": 1.3966527219859393, + "learning_rate": 6.529745221299731e-06, + "loss": 0.6114, + "step": 2778 + }, + { + "epoch": 0.42, + "grad_norm": 7.436220128884038, + "learning_rate": 6.527419625511581e-06, + "loss": 0.4946, + "step": 2779 + }, + { + "epoch": 0.42, + "grad_norm": 25.73745791775027, + "learning_rate": 6.52509366522045e-06, + "loss": 0.3347, + "step": 2780 + }, + { + "epoch": 0.42, + "grad_norm": 15.190280469048854, + "learning_rate": 6.522767340981406e-06, + "loss": 0.4625, + "step": 2781 + }, + { + "epoch": 0.42, + "grad_norm": 1.2932950851439577, + "learning_rate": 6.5204406533496e-06, + "loss": 0.5365, + "step": 2782 + }, + { + "epoch": 0.42, + "grad_norm": 12.829908481643743, + "learning_rate": 6.518113602880275e-06, + "loss": 0.497, + "step": 2783 + }, + { + "epoch": 0.42, + "grad_norm": 7.175146064295874, + "learning_rate": 6.5157861901287545e-06, + "loss": 0.4694, + "step": 2784 + }, + { + "epoch": 0.42, + "grad_norm": 16.411835261163326, + "learning_rate": 6.513458415650452e-06, + "loss": 0.5201, + "step": 2785 + }, + { + "epoch": 0.42, + "grad_norm": 5.1997373048831514, + "learning_rate": 6.51113028000087e-06, + "loss": 0.525, + "step": 2786 + }, + { + "epoch": 0.42, + "grad_norm": 13.510608987435019, + "learning_rate": 6.508801783735591e-06, + "loss": 0.5129, + "step": 2787 + }, + { + "epoch": 0.42, + "grad_norm": 46.12006136789878, + "learning_rate": 6.5064729274102875e-06, + "loss": 0.5015, + "step": 2788 + }, + { + "epoch": 0.42, + "grad_norm": 5.514114891752724, + "learning_rate": 6.504143711580718e-06, + "loss": 0.4637, + "step": 2789 + }, + { + "epoch": 0.42, + "grad_norm": 19.662999277017793, + "learning_rate": 6.501814136802725e-06, + "loss": 0.5133, + "step": 2790 + }, + { + "epoch": 0.42, + "grad_norm": 6.990169314082399, + "learning_rate": 6.499484203632238e-06, + "loss": 0.4741, + "step": 2791 + }, + { + "epoch": 0.42, + "grad_norm": 10.690199786685389, + "learning_rate": 6.497153912625271e-06, + "loss": 0.5041, + "step": 2792 + }, + { + "epoch": 0.42, + "grad_norm": 8.318113545468492, + "learning_rate": 6.494823264337925e-06, + "loss": 0.4537, + "step": 2793 + }, + { + "epoch": 0.42, + "grad_norm": 7.4449557626080916, + "learning_rate": 6.492492259326384e-06, + "loss": 0.3757, + "step": 2794 + }, + { + "epoch": 0.42, + "grad_norm": 15.016525407445265, + "learning_rate": 6.490160898146919e-06, + "loss": 0.4966, + "step": 2795 + }, + { + "epoch": 0.42, + "grad_norm": 13.048025426642337, + "learning_rate": 6.487829181355887e-06, + "loss": 0.4937, + "step": 2796 + }, + { + "epoch": 0.42, + "grad_norm": 7.3644151964490465, + "learning_rate": 6.4854971095097245e-06, + "loss": 0.4087, + "step": 2797 + }, + { + "epoch": 0.42, + "grad_norm": 10.052544094906661, + "learning_rate": 6.483164683164958e-06, + "loss": 0.4557, + "step": 2798 + }, + { + "epoch": 0.42, + "grad_norm": 9.395910897467711, + "learning_rate": 6.480831902878198e-06, + "loss": 0.5157, + "step": 2799 + }, + { + "epoch": 0.42, + "grad_norm": 11.946570993195593, + "learning_rate": 6.47849876920614e-06, + "loss": 0.4455, + "step": 2800 + }, + { + "epoch": 0.42, + "grad_norm": 7.9682337963206935, + "learning_rate": 6.476165282705559e-06, + "loss": 0.4763, + "step": 2801 + }, + { + "epoch": 0.42, + "grad_norm": 12.55711060984555, + "learning_rate": 6.473831443933318e-06, + "loss": 0.385, + "step": 2802 + }, + { + "epoch": 0.42, + "grad_norm": 7.912547954730923, + "learning_rate": 6.471497253446366e-06, + "loss": 0.4995, + "step": 2803 + }, + { + "epoch": 0.42, + "grad_norm": 4.345932591804477, + "learning_rate": 6.469162711801731e-06, + "loss": 0.4921, + "step": 2804 + }, + { + "epoch": 0.42, + "grad_norm": 4.083865790119315, + "learning_rate": 6.466827819556529e-06, + "loss": 0.4579, + "step": 2805 + }, + { + "epoch": 0.42, + "grad_norm": 7.677919737187017, + "learning_rate": 6.464492577267958e-06, + "loss": 0.4584, + "step": 2806 + }, + { + "epoch": 0.42, + "grad_norm": 13.30620290685418, + "learning_rate": 6.4621569854933e-06, + "loss": 0.4596, + "step": 2807 + }, + { + "epoch": 0.42, + "grad_norm": 10.296907954068159, + "learning_rate": 6.459821044789918e-06, + "loss": 0.4988, + "step": 2808 + }, + { + "epoch": 0.42, + "grad_norm": 33.04980072630393, + "learning_rate": 6.4574847557152595e-06, + "loss": 0.4587, + "step": 2809 + }, + { + "epoch": 0.42, + "grad_norm": 6.7201287175221935, + "learning_rate": 6.45514811882686e-06, + "loss": 0.4357, + "step": 2810 + }, + { + "epoch": 0.42, + "grad_norm": 5.302298262811125, + "learning_rate": 6.452811134682329e-06, + "loss": 0.5603, + "step": 2811 + }, + { + "epoch": 0.42, + "grad_norm": 8.509496070364374, + "learning_rate": 6.45047380383937e-06, + "loss": 0.4915, + "step": 2812 + }, + { + "epoch": 0.42, + "grad_norm": 15.623023107917717, + "learning_rate": 6.448136126855755e-06, + "loss": 0.5302, + "step": 2813 + }, + { + "epoch": 0.42, + "grad_norm": 16.87577088574106, + "learning_rate": 6.4457981042893525e-06, + "loss": 0.5389, + "step": 2814 + }, + { + "epoch": 0.42, + "grad_norm": 7.378613929374242, + "learning_rate": 6.443459736698106e-06, + "loss": 0.4677, + "step": 2815 + }, + { + "epoch": 0.42, + "grad_norm": 8.793061140847403, + "learning_rate": 6.441121024640043e-06, + "loss": 0.4414, + "step": 2816 + }, + { + "epoch": 0.42, + "grad_norm": 16.861273084357705, + "learning_rate": 6.438781968673271e-06, + "loss": 0.4633, + "step": 2817 + }, + { + "epoch": 0.43, + "grad_norm": 27.011331304719207, + "learning_rate": 6.436442569355984e-06, + "loss": 0.4701, + "step": 2818 + }, + { + "epoch": 0.43, + "grad_norm": 7.70671248446562, + "learning_rate": 6.4341028272464545e-06, + "loss": 0.4488, + "step": 2819 + }, + { + "epoch": 0.43, + "grad_norm": 8.547948149137019, + "learning_rate": 6.4317627429030385e-06, + "loss": 0.4944, + "step": 2820 + }, + { + "epoch": 0.43, + "grad_norm": 8.668611592824467, + "learning_rate": 6.429422316884174e-06, + "loss": 0.386, + "step": 2821 + }, + { + "epoch": 0.43, + "grad_norm": 10.367443276145769, + "learning_rate": 6.427081549748377e-06, + "loss": 0.5022, + "step": 2822 + }, + { + "epoch": 0.43, + "grad_norm": 9.768215903741776, + "learning_rate": 6.4247404420542495e-06, + "loss": 0.3762, + "step": 2823 + }, + { + "epoch": 0.43, + "grad_norm": 10.087593458796759, + "learning_rate": 6.422398994360473e-06, + "loss": 0.4221, + "step": 2824 + }, + { + "epoch": 0.43, + "grad_norm": 5.9940429562708175, + "learning_rate": 6.420057207225807e-06, + "loss": 0.5322, + "step": 2825 + }, + { + "epoch": 0.43, + "grad_norm": 9.998240436246018, + "learning_rate": 6.417715081209098e-06, + "loss": 0.4671, + "step": 2826 + }, + { + "epoch": 0.43, + "grad_norm": 10.367729028584408, + "learning_rate": 6.415372616869269e-06, + "loss": 0.4909, + "step": 2827 + }, + { + "epoch": 0.43, + "grad_norm": 5.7039831743808, + "learning_rate": 6.413029814765326e-06, + "loss": 0.4222, + "step": 2828 + }, + { + "epoch": 0.43, + "grad_norm": 4.812814161463242, + "learning_rate": 6.410686675456354e-06, + "loss": 0.397, + "step": 2829 + }, + { + "epoch": 0.43, + "grad_norm": 9.167465800082248, + "learning_rate": 6.408343199501519e-06, + "loss": 0.5039, + "step": 2830 + }, + { + "epoch": 0.43, + "grad_norm": 7.224493902731253, + "learning_rate": 6.4059993874600665e-06, + "loss": 0.4704, + "step": 2831 + }, + { + "epoch": 0.43, + "grad_norm": 8.032891669575696, + "learning_rate": 6.4036552398913244e-06, + "loss": 0.4686, + "step": 2832 + }, + { + "epoch": 0.43, + "grad_norm": 11.38400033097673, + "learning_rate": 6.401310757354698e-06, + "loss": 0.4795, + "step": 2833 + }, + { + "epoch": 0.43, + "grad_norm": 5.645819897136621, + "learning_rate": 6.398965940409676e-06, + "loss": 0.4452, + "step": 2834 + }, + { + "epoch": 0.43, + "grad_norm": 6.188581229438318, + "learning_rate": 6.396620789615825e-06, + "loss": 0.5087, + "step": 2835 + }, + { + "epoch": 0.43, + "grad_norm": 8.871289230179782, + "learning_rate": 6.39427530553279e-06, + "loss": 0.4458, + "step": 2836 + }, + { + "epoch": 0.43, + "grad_norm": 7.4049003430802145, + "learning_rate": 6.391929488720294e-06, + "loss": 0.5149, + "step": 2837 + }, + { + "epoch": 0.43, + "grad_norm": 10.797600214784543, + "learning_rate": 6.3895833397381464e-06, + "loss": 0.5331, + "step": 2838 + }, + { + "epoch": 0.43, + "grad_norm": 10.833469750964092, + "learning_rate": 6.38723685914623e-06, + "loss": 0.4709, + "step": 2839 + }, + { + "epoch": 0.43, + "grad_norm": 20.736665990322223, + "learning_rate": 6.3848900475045086e-06, + "loss": 0.4775, + "step": 2840 + }, + { + "epoch": 0.43, + "grad_norm": 14.638327172300627, + "learning_rate": 6.382542905373022e-06, + "loss": 0.5171, + "step": 2841 + }, + { + "epoch": 0.43, + "grad_norm": 5.702794886538009, + "learning_rate": 6.3801954333118955e-06, + "loss": 0.5315, + "step": 2842 + }, + { + "epoch": 0.43, + "grad_norm": 8.925505891023349, + "learning_rate": 6.377847631881326e-06, + "loss": 0.4176, + "step": 2843 + }, + { + "epoch": 0.43, + "grad_norm": 7.628373974834523, + "learning_rate": 6.3754995016415936e-06, + "loss": 0.448, + "step": 2844 + }, + { + "epoch": 0.43, + "grad_norm": 4.734274034482985, + "learning_rate": 6.373151043153056e-06, + "loss": 0.4636, + "step": 2845 + }, + { + "epoch": 0.43, + "grad_norm": 5.02214068032078, + "learning_rate": 6.370802256976148e-06, + "loss": 0.46, + "step": 2846 + }, + { + "epoch": 0.43, + "grad_norm": 7.225247930226508, + "learning_rate": 6.368453143671383e-06, + "loss": 0.5111, + "step": 2847 + }, + { + "epoch": 0.43, + "grad_norm": 4.876982908781532, + "learning_rate": 6.366103703799351e-06, + "loss": 0.4639, + "step": 2848 + }, + { + "epoch": 0.43, + "grad_norm": 5.788792257363526, + "learning_rate": 6.363753937920725e-06, + "loss": 0.5428, + "step": 2849 + }, + { + "epoch": 0.43, + "grad_norm": 7.585629500633913, + "learning_rate": 6.3614038465962514e-06, + "loss": 0.4506, + "step": 2850 + }, + { + "epoch": 0.43, + "grad_norm": 19.438511110730275, + "learning_rate": 6.359053430386755e-06, + "loss": 0.426, + "step": 2851 + }, + { + "epoch": 0.43, + "grad_norm": 6.928408966575904, + "learning_rate": 6.356702689853137e-06, + "loss": 0.4141, + "step": 2852 + }, + { + "epoch": 0.43, + "grad_norm": 4.621456090744973, + "learning_rate": 6.354351625556381e-06, + "loss": 0.4445, + "step": 2853 + }, + { + "epoch": 0.43, + "grad_norm": 3.5093117084897054, + "learning_rate": 6.3520002380575395e-06, + "loss": 0.5141, + "step": 2854 + }, + { + "epoch": 0.43, + "grad_norm": 4.82002362526606, + "learning_rate": 6.349648527917752e-06, + "loss": 0.4355, + "step": 2855 + }, + { + "epoch": 0.43, + "grad_norm": 6.172747252242528, + "learning_rate": 6.347296495698225e-06, + "loss": 0.5529, + "step": 2856 + }, + { + "epoch": 0.43, + "grad_norm": 3.5994608789766995, + "learning_rate": 6.344944141960251e-06, + "loss": 0.4565, + "step": 2857 + }, + { + "epoch": 0.43, + "grad_norm": 31.020871409318158, + "learning_rate": 6.342591467265193e-06, + "loss": 0.4096, + "step": 2858 + }, + { + "epoch": 0.43, + "grad_norm": 1.2352114791426783, + "learning_rate": 6.3402384721744935e-06, + "loss": 0.5142, + "step": 2859 + }, + { + "epoch": 0.43, + "grad_norm": 3.1856316205273822, + "learning_rate": 6.33788515724967e-06, + "loss": 0.5133, + "step": 2860 + }, + { + "epoch": 0.43, + "grad_norm": 9.749681326450956, + "learning_rate": 6.335531523052316e-06, + "loss": 0.4016, + "step": 2861 + }, + { + "epoch": 0.43, + "grad_norm": 120.5668977647027, + "learning_rate": 6.333177570144103e-06, + "loss": 0.4847, + "step": 2862 + }, + { + "epoch": 0.43, + "grad_norm": 6.707753304881009, + "learning_rate": 6.330823299086777e-06, + "loss": 0.4409, + "step": 2863 + }, + { + "epoch": 0.43, + "grad_norm": 10.447715442702926, + "learning_rate": 6.328468710442163e-06, + "loss": 0.4939, + "step": 2864 + }, + { + "epoch": 0.43, + "grad_norm": 4.226041478807944, + "learning_rate": 6.326113804772157e-06, + "loss": 0.4415, + "step": 2865 + }, + { + "epoch": 0.43, + "grad_norm": 4.833078021697118, + "learning_rate": 6.3237585826387325e-06, + "loss": 0.4589, + "step": 2866 + }, + { + "epoch": 0.43, + "grad_norm": 4.486664846964552, + "learning_rate": 6.321403044603941e-06, + "loss": 0.4958, + "step": 2867 + }, + { + "epoch": 0.43, + "grad_norm": 4.413022349163082, + "learning_rate": 6.319047191229906e-06, + "loss": 0.4783, + "step": 2868 + }, + { + "epoch": 0.43, + "grad_norm": 1.099621573316986, + "learning_rate": 6.3166910230788284e-06, + "loss": 0.5262, + "step": 2869 + }, + { + "epoch": 0.43, + "grad_norm": 4.572350286280132, + "learning_rate": 6.314334540712983e-06, + "loss": 0.444, + "step": 2870 + }, + { + "epoch": 0.43, + "grad_norm": 3.686426869676033, + "learning_rate": 6.311977744694722e-06, + "loss": 0.4338, + "step": 2871 + }, + { + "epoch": 0.43, + "grad_norm": 3.834913841187811, + "learning_rate": 6.309620635586466e-06, + "loss": 0.4183, + "step": 2872 + }, + { + "epoch": 0.43, + "grad_norm": 4.081157674525627, + "learning_rate": 6.307263213950721e-06, + "loss": 0.4527, + "step": 2873 + }, + { + "epoch": 0.43, + "grad_norm": 5.439799046523862, + "learning_rate": 6.304905480350055e-06, + "loss": 0.4303, + "step": 2874 + }, + { + "epoch": 0.43, + "grad_norm": 3.352210517561995, + "learning_rate": 6.302547435347121e-06, + "loss": 0.4769, + "step": 2875 + }, + { + "epoch": 0.43, + "grad_norm": 6.048311095692171, + "learning_rate": 6.300189079504644e-06, + "loss": 0.4039, + "step": 2876 + }, + { + "epoch": 0.43, + "grad_norm": 15.943869898815723, + "learning_rate": 6.297830413385415e-06, + "loss": 0.4409, + "step": 2877 + }, + { + "epoch": 0.43, + "grad_norm": 5.017159949879932, + "learning_rate": 6.295471437552309e-06, + "loss": 0.5156, + "step": 2878 + }, + { + "epoch": 0.43, + "grad_norm": 6.167930754050777, + "learning_rate": 6.293112152568274e-06, + "loss": 0.4864, + "step": 2879 + }, + { + "epoch": 0.43, + "grad_norm": 4.434436766618643, + "learning_rate": 6.290752558996325e-06, + "loss": 0.4384, + "step": 2880 + }, + { + "epoch": 0.43, + "grad_norm": 3.8966942400788023, + "learning_rate": 6.288392657399555e-06, + "loss": 0.4764, + "step": 2881 + }, + { + "epoch": 0.43, + "grad_norm": 7.304484480744687, + "learning_rate": 6.286032448341133e-06, + "loss": 0.4454, + "step": 2882 + }, + { + "epoch": 0.43, + "grad_norm": 7.025010823083057, + "learning_rate": 6.283671932384296e-06, + "loss": 0.4403, + "step": 2883 + }, + { + "epoch": 0.43, + "grad_norm": 5.814595715705873, + "learning_rate": 6.281311110092358e-06, + "loss": 0.4136, + "step": 2884 + }, + { + "epoch": 0.44, + "grad_norm": 19.38526408510347, + "learning_rate": 6.278949982028705e-06, + "loss": 0.4122, + "step": 2885 + }, + { + "epoch": 0.44, + "grad_norm": 9.674326131668105, + "learning_rate": 6.276588548756795e-06, + "loss": 0.434, + "step": 2886 + }, + { + "epoch": 0.44, + "grad_norm": 3.7460754247759183, + "learning_rate": 6.274226810840161e-06, + "loss": 0.4332, + "step": 2887 + }, + { + "epoch": 0.44, + "grad_norm": 6.511335587793365, + "learning_rate": 6.271864768842406e-06, + "loss": 0.493, + "step": 2888 + }, + { + "epoch": 0.44, + "grad_norm": 1.2983960605419433, + "learning_rate": 6.2695024233272095e-06, + "loss": 0.5474, + "step": 2889 + }, + { + "epoch": 0.44, + "grad_norm": 8.195212234202778, + "learning_rate": 6.267139774858318e-06, + "loss": 0.4485, + "step": 2890 + }, + { + "epoch": 0.44, + "grad_norm": 4.9996118883143, + "learning_rate": 6.264776823999556e-06, + "loss": 0.4593, + "step": 2891 + }, + { + "epoch": 0.44, + "grad_norm": 14.903877953451142, + "learning_rate": 6.262413571314814e-06, + "loss": 0.5282, + "step": 2892 + }, + { + "epoch": 0.44, + "grad_norm": 129.60616215718593, + "learning_rate": 6.260050017368062e-06, + "loss": 0.448, + "step": 2893 + }, + { + "epoch": 0.44, + "grad_norm": 4.992329053609898, + "learning_rate": 6.2576861627233355e-06, + "loss": 0.3721, + "step": 2894 + }, + { + "epoch": 0.44, + "grad_norm": 5.775458184546797, + "learning_rate": 6.255322007944744e-06, + "loss": 0.521, + "step": 2895 + }, + { + "epoch": 0.44, + "grad_norm": 7.276729067301927, + "learning_rate": 6.2529575535964685e-06, + "loss": 0.5539, + "step": 2896 + }, + { + "epoch": 0.44, + "grad_norm": 9.387276350721105, + "learning_rate": 6.250592800242762e-06, + "loss": 0.4548, + "step": 2897 + }, + { + "epoch": 0.44, + "grad_norm": 7.599231443871725, + "learning_rate": 6.248227748447949e-06, + "loss": 0.4904, + "step": 2898 + }, + { + "epoch": 0.44, + "grad_norm": 7.804613201083422, + "learning_rate": 6.245862398776427e-06, + "loss": 0.506, + "step": 2899 + }, + { + "epoch": 0.44, + "grad_norm": 9.42167919671319, + "learning_rate": 6.243496751792658e-06, + "loss": 0.4713, + "step": 2900 + }, + { + "epoch": 0.44, + "grad_norm": 3.849394671621753, + "learning_rate": 6.241130808061182e-06, + "loss": 0.5078, + "step": 2901 + }, + { + "epoch": 0.44, + "grad_norm": 7.8874500944831825, + "learning_rate": 6.238764568146606e-06, + "loss": 0.4847, + "step": 2902 + }, + { + "epoch": 0.44, + "grad_norm": 16.206077161020335, + "learning_rate": 6.236398032613609e-06, + "loss": 0.4706, + "step": 2903 + }, + { + "epoch": 0.44, + "grad_norm": 3.811333191992275, + "learning_rate": 6.234031202026942e-06, + "loss": 0.5102, + "step": 2904 + }, + { + "epoch": 0.44, + "grad_norm": 7.271011927450286, + "learning_rate": 6.231664076951421e-06, + "loss": 0.4578, + "step": 2905 + }, + { + "epoch": 0.44, + "grad_norm": 3.9928845148593854, + "learning_rate": 6.229296657951942e-06, + "loss": 0.4449, + "step": 2906 + }, + { + "epoch": 0.44, + "grad_norm": 19.619239275853378, + "learning_rate": 6.226928945593459e-06, + "loss": 0.4396, + "step": 2907 + }, + { + "epoch": 0.44, + "grad_norm": 4.51387948735736, + "learning_rate": 6.224560940441006e-06, + "loss": 0.492, + "step": 2908 + }, + { + "epoch": 0.44, + "grad_norm": 5.58547597251968, + "learning_rate": 6.222192643059681e-06, + "loss": 0.4752, + "step": 2909 + }, + { + "epoch": 0.44, + "grad_norm": 5.127400337743388, + "learning_rate": 6.219824054014656e-06, + "loss": 0.4914, + "step": 2910 + }, + { + "epoch": 0.44, + "grad_norm": 5.298296148806085, + "learning_rate": 6.217455173871168e-06, + "loss": 0.4532, + "step": 2911 + }, + { + "epoch": 0.44, + "grad_norm": 6.266291317185697, + "learning_rate": 6.215086003194528e-06, + "loss": 0.5083, + "step": 2912 + }, + { + "epoch": 0.44, + "grad_norm": 6.063603729960131, + "learning_rate": 6.212716542550112e-06, + "loss": 0.4731, + "step": 2913 + }, + { + "epoch": 0.44, + "grad_norm": 5.798648517513329, + "learning_rate": 6.21034679250337e-06, + "loss": 0.4797, + "step": 2914 + }, + { + "epoch": 0.44, + "grad_norm": 6.163752400320914, + "learning_rate": 6.207976753619817e-06, + "loss": 0.442, + "step": 2915 + }, + { + "epoch": 0.44, + "grad_norm": 3.554722972050721, + "learning_rate": 6.205606426465036e-06, + "loss": 0.4609, + "step": 2916 + }, + { + "epoch": 0.44, + "grad_norm": 4.858743791737207, + "learning_rate": 6.203235811604687e-06, + "loss": 0.4178, + "step": 2917 + }, + { + "epoch": 0.44, + "grad_norm": 4.430806749111668, + "learning_rate": 6.200864909604488e-06, + "loss": 0.463, + "step": 2918 + }, + { + "epoch": 0.44, + "grad_norm": 2.683070536151628, + "learning_rate": 6.198493721030234e-06, + "loss": 0.4217, + "step": 2919 + }, + { + "epoch": 0.44, + "grad_norm": 5.751872888152386, + "learning_rate": 6.196122246447779e-06, + "loss": 0.5221, + "step": 2920 + }, + { + "epoch": 0.44, + "grad_norm": 8.220702100998107, + "learning_rate": 6.193750486423057e-06, + "loss": 0.4912, + "step": 2921 + }, + { + "epoch": 0.44, + "grad_norm": 5.584599976744937, + "learning_rate": 6.191378441522061e-06, + "loss": 0.4837, + "step": 2922 + }, + { + "epoch": 0.44, + "grad_norm": 4.298815818935616, + "learning_rate": 6.1890061123108565e-06, + "loss": 0.521, + "step": 2923 + }, + { + "epoch": 0.44, + "grad_norm": 5.805676013351562, + "learning_rate": 6.186633499355576e-06, + "loss": 0.4255, + "step": 2924 + }, + { + "epoch": 0.44, + "grad_norm": 4.917810138462583, + "learning_rate": 6.184260603222416e-06, + "loss": 0.4925, + "step": 2925 + }, + { + "epoch": 0.44, + "grad_norm": 6.903642368695931, + "learning_rate": 6.181887424477646e-06, + "loss": 0.4293, + "step": 2926 + }, + { + "epoch": 0.44, + "grad_norm": 12.539622236503787, + "learning_rate": 6.179513963687601e-06, + "loss": 0.4939, + "step": 2927 + }, + { + "epoch": 0.44, + "grad_norm": 19.066186474789436, + "learning_rate": 6.177140221418684e-06, + "loss": 0.4281, + "step": 2928 + }, + { + "epoch": 0.44, + "grad_norm": 6.1219456875319365, + "learning_rate": 6.174766198237362e-06, + "loss": 0.5037, + "step": 2929 + }, + { + "epoch": 0.44, + "grad_norm": 29.955314239193036, + "learning_rate": 6.17239189471017e-06, + "loss": 0.526, + "step": 2930 + }, + { + "epoch": 0.44, + "grad_norm": 16.007468067120975, + "learning_rate": 6.1700173114037146e-06, + "loss": 0.4886, + "step": 2931 + }, + { + "epoch": 0.44, + "grad_norm": 5.564053407660757, + "learning_rate": 6.167642448884663e-06, + "loss": 0.5325, + "step": 2932 + }, + { + "epoch": 0.44, + "grad_norm": 1.240686494865597, + "learning_rate": 6.165267307719754e-06, + "loss": 0.5792, + "step": 2933 + }, + { + "epoch": 0.44, + "grad_norm": 1.253415449361312, + "learning_rate": 6.1628918884757895e-06, + "loss": 0.5987, + "step": 2934 + }, + { + "epoch": 0.44, + "grad_norm": 24.433133552177132, + "learning_rate": 6.160516191719639e-06, + "loss": 0.4968, + "step": 2935 + }, + { + "epoch": 0.44, + "grad_norm": 5.9931852423106236, + "learning_rate": 6.158140218018236e-06, + "loss": 0.462, + "step": 2936 + }, + { + "epoch": 0.44, + "grad_norm": 19.236499732538242, + "learning_rate": 6.155763967938585e-06, + "loss": 0.4212, + "step": 2937 + }, + { + "epoch": 0.44, + "grad_norm": 8.274228560984213, + "learning_rate": 6.153387442047753e-06, + "loss": 0.5129, + "step": 2938 + }, + { + "epoch": 0.44, + "grad_norm": 1.294532477232685, + "learning_rate": 6.151010640912873e-06, + "loss": 0.569, + "step": 2939 + }, + { + "epoch": 0.44, + "grad_norm": 8.491773618404347, + "learning_rate": 6.1486335651011455e-06, + "loss": 0.4585, + "step": 2940 + }, + { + "epoch": 0.44, + "grad_norm": 7.56092831572986, + "learning_rate": 6.146256215179834e-06, + "loss": 0.435, + "step": 2941 + }, + { + "epoch": 0.44, + "grad_norm": 11.236197360791412, + "learning_rate": 6.143878591716266e-06, + "loss": 0.4664, + "step": 2942 + }, + { + "epoch": 0.44, + "grad_norm": 10.667814557033001, + "learning_rate": 6.141500695277841e-06, + "loss": 0.4689, + "step": 2943 + }, + { + "epoch": 0.44, + "grad_norm": 6.74286595299248, + "learning_rate": 6.139122526432017e-06, + "loss": 0.4937, + "step": 2944 + }, + { + "epoch": 0.44, + "grad_norm": 7.1687008557766445, + "learning_rate": 6.136744085746322e-06, + "loss": 0.49, + "step": 2945 + }, + { + "epoch": 0.44, + "grad_norm": 6.098445146363322, + "learning_rate": 6.134365373788344e-06, + "loss": 0.4851, + "step": 2946 + }, + { + "epoch": 0.44, + "grad_norm": 1.2617298511860449, + "learning_rate": 6.13198639112574e-06, + "loss": 0.5173, + "step": 2947 + }, + { + "epoch": 0.44, + "grad_norm": 8.057634896107505, + "learning_rate": 6.129607138326229e-06, + "loss": 0.5721, + "step": 2948 + }, + { + "epoch": 0.44, + "grad_norm": 5.514425756558621, + "learning_rate": 6.127227615957594e-06, + "loss": 0.4605, + "step": 2949 + }, + { + "epoch": 0.44, + "grad_norm": 1.322004129411983, + "learning_rate": 6.124847824587684e-06, + "loss": 0.5908, + "step": 2950 + }, + { + "epoch": 0.45, + "grad_norm": 5.698888176923222, + "learning_rate": 6.1224677647844124e-06, + "loss": 0.4521, + "step": 2951 + }, + { + "epoch": 0.45, + "grad_norm": 5.897884546136255, + "learning_rate": 6.120087437115757e-06, + "loss": 0.4381, + "step": 2952 + }, + { + "epoch": 0.45, + "grad_norm": 5.502939995314409, + "learning_rate": 6.117706842149758e-06, + "loss": 0.5154, + "step": 2953 + }, + { + "epoch": 0.45, + "grad_norm": 5.389267912671945, + "learning_rate": 6.115325980454517e-06, + "loss": 0.4302, + "step": 2954 + }, + { + "epoch": 0.45, + "grad_norm": 3.7781321062543447, + "learning_rate": 6.112944852598205e-06, + "loss": 0.4931, + "step": 2955 + }, + { + "epoch": 0.45, + "grad_norm": 5.893630618320507, + "learning_rate": 6.1105634591490525e-06, + "loss": 0.4351, + "step": 2956 + }, + { + "epoch": 0.45, + "grad_norm": 7.101882233375057, + "learning_rate": 6.108181800675358e-06, + "loss": 0.3645, + "step": 2957 + }, + { + "epoch": 0.45, + "grad_norm": 5.667989651168263, + "learning_rate": 6.105799877745475e-06, + "loss": 0.4367, + "step": 2958 + }, + { + "epoch": 0.45, + "grad_norm": 5.842490529575291, + "learning_rate": 6.103417690927827e-06, + "loss": 0.4476, + "step": 2959 + }, + { + "epoch": 0.45, + "grad_norm": 8.933347081976434, + "learning_rate": 6.101035240790897e-06, + "loss": 0.3655, + "step": 2960 + }, + { + "epoch": 0.45, + "grad_norm": 5.3566911180389605, + "learning_rate": 6.098652527903235e-06, + "loss": 0.4941, + "step": 2961 + }, + { + "epoch": 0.45, + "grad_norm": 8.994200167373279, + "learning_rate": 6.096269552833448e-06, + "loss": 0.4327, + "step": 2962 + }, + { + "epoch": 0.45, + "grad_norm": 5.863694621041805, + "learning_rate": 6.09388631615021e-06, + "loss": 0.4955, + "step": 2963 + }, + { + "epoch": 0.45, + "grad_norm": 5.045503656825029, + "learning_rate": 6.091502818422258e-06, + "loss": 0.484, + "step": 2964 + }, + { + "epoch": 0.45, + "grad_norm": 6.017810884325633, + "learning_rate": 6.0891190602183845e-06, + "loss": 0.5009, + "step": 2965 + }, + { + "epoch": 0.45, + "grad_norm": 9.515713266760821, + "learning_rate": 6.0867350421074524e-06, + "loss": 0.4287, + "step": 2966 + }, + { + "epoch": 0.45, + "grad_norm": 8.32627328553966, + "learning_rate": 6.084350764658381e-06, + "loss": 0.4568, + "step": 2967 + }, + { + "epoch": 0.45, + "grad_norm": 10.132788183300953, + "learning_rate": 6.081966228440156e-06, + "loss": 0.4727, + "step": 2968 + }, + { + "epoch": 0.45, + "grad_norm": 12.956433016385434, + "learning_rate": 6.079581434021819e-06, + "loss": 0.4863, + "step": 2969 + }, + { + "epoch": 0.45, + "grad_norm": 4.482706597331589, + "learning_rate": 6.077196381972482e-06, + "loss": 0.5453, + "step": 2970 + }, + { + "epoch": 0.45, + "grad_norm": 8.695678259087538, + "learning_rate": 6.074811072861307e-06, + "loss": 0.3629, + "step": 2971 + }, + { + "epoch": 0.45, + "grad_norm": 4.313988620049725, + "learning_rate": 6.072425507257528e-06, + "loss": 0.3593, + "step": 2972 + }, + { + "epoch": 0.45, + "grad_norm": 4.535246856373871, + "learning_rate": 6.070039685730433e-06, + "loss": 0.4588, + "step": 2973 + }, + { + "epoch": 0.45, + "grad_norm": 8.974839937793737, + "learning_rate": 6.067653608849376e-06, + "loss": 0.4814, + "step": 2974 + }, + { + "epoch": 0.45, + "grad_norm": 69.96755068662458, + "learning_rate": 6.065267277183767e-06, + "loss": 0.4312, + "step": 2975 + }, + { + "epoch": 0.45, + "grad_norm": 8.180487673274635, + "learning_rate": 6.0628806913030815e-06, + "loss": 0.4752, + "step": 2976 + }, + { + "epoch": 0.45, + "grad_norm": 4.563480804634143, + "learning_rate": 6.060493851776852e-06, + "loss": 0.4193, + "step": 2977 + }, + { + "epoch": 0.45, + "grad_norm": 7.128795744027954, + "learning_rate": 6.058106759174677e-06, + "loss": 0.3979, + "step": 2978 + }, + { + "epoch": 0.45, + "grad_norm": 7.203556351127735, + "learning_rate": 6.0557194140662064e-06, + "loss": 0.443, + "step": 2979 + }, + { + "epoch": 0.45, + "grad_norm": 6.689772728688631, + "learning_rate": 6.053331817021159e-06, + "loss": 0.4923, + "step": 2980 + }, + { + "epoch": 0.45, + "grad_norm": 9.22409546078687, + "learning_rate": 6.05094396860931e-06, + "loss": 0.4475, + "step": 2981 + }, + { + "epoch": 0.45, + "grad_norm": 6.347506414721554, + "learning_rate": 6.048555869400493e-06, + "loss": 0.4423, + "step": 2982 + }, + { + "epoch": 0.45, + "grad_norm": 5.315507120057155, + "learning_rate": 6.0461675199646065e-06, + "loss": 0.4338, + "step": 2983 + }, + { + "epoch": 0.45, + "grad_norm": 10.523601489962932, + "learning_rate": 6.043778920871601e-06, + "loss": 0.4318, + "step": 2984 + }, + { + "epoch": 0.45, + "grad_norm": 4.406587377530553, + "learning_rate": 6.0413900726914955e-06, + "loss": 0.4481, + "step": 2985 + }, + { + "epoch": 0.45, + "grad_norm": 4.437395983043228, + "learning_rate": 6.039000975994362e-06, + "loss": 0.4691, + "step": 2986 + }, + { + "epoch": 0.45, + "grad_norm": 6.753539845800579, + "learning_rate": 6.036611631350333e-06, + "loss": 0.4003, + "step": 2987 + }, + { + "epoch": 0.45, + "grad_norm": 7.453974796678845, + "learning_rate": 6.034222039329604e-06, + "loss": 0.413, + "step": 2988 + }, + { + "epoch": 0.45, + "grad_norm": 4.52419402041998, + "learning_rate": 6.031832200502422e-06, + "loss": 0.4512, + "step": 2989 + }, + { + "epoch": 0.45, + "grad_norm": 4.097790472489987, + "learning_rate": 6.029442115439102e-06, + "loss": 0.5015, + "step": 2990 + }, + { + "epoch": 0.45, + "grad_norm": 10.216568186887628, + "learning_rate": 6.02705178471001e-06, + "loss": 0.4749, + "step": 2991 + }, + { + "epoch": 0.45, + "grad_norm": 13.386485668775096, + "learning_rate": 6.024661208885576e-06, + "loss": 0.4326, + "step": 2992 + }, + { + "epoch": 0.45, + "grad_norm": 16.246062311818296, + "learning_rate": 6.022270388536287e-06, + "loss": 0.447, + "step": 2993 + }, + { + "epoch": 0.45, + "grad_norm": 5.8761523724036016, + "learning_rate": 6.019879324232684e-06, + "loss": 0.4688, + "step": 2994 + }, + { + "epoch": 0.45, + "grad_norm": 5.590549779223677, + "learning_rate": 6.017488016545372e-06, + "loss": 0.4403, + "step": 2995 + }, + { + "epoch": 0.45, + "grad_norm": 7.081755614938339, + "learning_rate": 6.015096466045012e-06, + "loss": 0.4681, + "step": 2996 + }, + { + "epoch": 0.45, + "grad_norm": 5.453224836260443, + "learning_rate": 6.012704673302323e-06, + "loss": 0.4298, + "step": 2997 + }, + { + "epoch": 0.45, + "grad_norm": 5.630456567721023, + "learning_rate": 6.010312638888082e-06, + "loss": 0.5031, + "step": 2998 + }, + { + "epoch": 0.45, + "grad_norm": 4.1757028398868545, + "learning_rate": 6.007920363373124e-06, + "loss": 0.4027, + "step": 2999 + }, + { + "epoch": 0.45, + "grad_norm": 8.383145004097821, + "learning_rate": 6.005527847328338e-06, + "loss": 0.4884, + "step": 3000 + }, + { + "epoch": 0.45, + "grad_norm": 6.420703968434824, + "learning_rate": 6.003135091324677e-06, + "loss": 0.426, + "step": 3001 + }, + { + "epoch": 0.45, + "grad_norm": 5.395177362597513, + "learning_rate": 6.0007420959331465e-06, + "loss": 0.4361, + "step": 3002 + }, + { + "epoch": 0.45, + "grad_norm": 12.976079561168797, + "learning_rate": 5.998348861724808e-06, + "loss": 0.5099, + "step": 3003 + }, + { + "epoch": 0.45, + "grad_norm": 3.645064769081378, + "learning_rate": 5.995955389270784e-06, + "loss": 0.3893, + "step": 3004 + }, + { + "epoch": 0.45, + "grad_norm": 4.445898048616861, + "learning_rate": 5.993561679142253e-06, + "loss": 0.3685, + "step": 3005 + }, + { + "epoch": 0.45, + "grad_norm": 17.122033836583807, + "learning_rate": 5.991167731910448e-06, + "loss": 0.379, + "step": 3006 + }, + { + "epoch": 0.45, + "grad_norm": 6.388934948867373, + "learning_rate": 5.98877354814666e-06, + "loss": 0.4928, + "step": 3007 + }, + { + "epoch": 0.45, + "grad_norm": 5.602511908593512, + "learning_rate": 5.986379128422236e-06, + "loss": 0.4612, + "step": 3008 + }, + { + "epoch": 0.45, + "grad_norm": 9.319162821256345, + "learning_rate": 5.9839844733085806e-06, + "loss": 0.4506, + "step": 3009 + }, + { + "epoch": 0.45, + "grad_norm": 10.287549852979044, + "learning_rate": 5.981589583377155e-06, + "loss": 0.3952, + "step": 3010 + }, + { + "epoch": 0.45, + "grad_norm": 5.361216336811319, + "learning_rate": 5.97919445919947e-06, + "loss": 0.4621, + "step": 3011 + }, + { + "epoch": 0.45, + "grad_norm": 8.1208449239265, + "learning_rate": 5.976799101347102e-06, + "loss": 0.4737, + "step": 3012 + }, + { + "epoch": 0.45, + "grad_norm": 3.6287622580700205, + "learning_rate": 5.9744035103916775e-06, + "loss": 0.4348, + "step": 3013 + }, + { + "epoch": 0.45, + "grad_norm": 4.240008806756864, + "learning_rate": 5.972007686904878e-06, + "loss": 0.468, + "step": 3014 + }, + { + "epoch": 0.45, + "grad_norm": 3.601990239797793, + "learning_rate": 5.969611631458444e-06, + "loss": 0.4018, + "step": 3015 + }, + { + "epoch": 0.45, + "grad_norm": 9.564357271598327, + "learning_rate": 5.967215344624168e-06, + "loss": 0.4803, + "step": 3016 + }, + { + "epoch": 0.46, + "grad_norm": 14.972478710890037, + "learning_rate": 5.964818826973903e-06, + "loss": 0.474, + "step": 3017 + }, + { + "epoch": 0.46, + "grad_norm": 4.423299847576006, + "learning_rate": 5.9624220790795474e-06, + "loss": 0.4762, + "step": 3018 + }, + { + "epoch": 0.46, + "grad_norm": 4.5295518993204995, + "learning_rate": 5.960025101513063e-06, + "loss": 0.4957, + "step": 3019 + }, + { + "epoch": 0.46, + "grad_norm": 5.077293027793658, + "learning_rate": 5.957627894846465e-06, + "loss": 0.4197, + "step": 3020 + }, + { + "epoch": 0.46, + "grad_norm": 3.4222217448199177, + "learning_rate": 5.955230459651822e-06, + "loss": 0.4742, + "step": 3021 + }, + { + "epoch": 0.46, + "grad_norm": 4.972283015825895, + "learning_rate": 5.952832796501256e-06, + "loss": 0.491, + "step": 3022 + }, + { + "epoch": 0.46, + "grad_norm": 14.03138382708002, + "learning_rate": 5.950434905966945e-06, + "loss": 0.4811, + "step": 3023 + }, + { + "epoch": 0.46, + "grad_norm": 1.1859448600123303, + "learning_rate": 5.948036788621121e-06, + "loss": 0.5771, + "step": 3024 + }, + { + "epoch": 0.46, + "grad_norm": 23.586624891973372, + "learning_rate": 5.945638445036069e-06, + "loss": 0.452, + "step": 3025 + }, + { + "epoch": 0.46, + "grad_norm": 4.575839725218991, + "learning_rate": 5.9432398757841315e-06, + "loss": 0.4491, + "step": 3026 + }, + { + "epoch": 0.46, + "grad_norm": 1.2751361252434745, + "learning_rate": 5.940841081437702e-06, + "loss": 0.5689, + "step": 3027 + }, + { + "epoch": 0.46, + "grad_norm": 4.626488534885077, + "learning_rate": 5.9384420625692266e-06, + "loss": 0.4316, + "step": 3028 + }, + { + "epoch": 0.46, + "grad_norm": 4.1366265256247265, + "learning_rate": 5.9360428197512056e-06, + "loss": 0.4276, + "step": 3029 + }, + { + "epoch": 0.46, + "grad_norm": 4.883829318000987, + "learning_rate": 5.933643353556195e-06, + "loss": 0.4926, + "step": 3030 + }, + { + "epoch": 0.46, + "grad_norm": 3.4098243410612716, + "learning_rate": 5.931243664556803e-06, + "loss": 0.4011, + "step": 3031 + }, + { + "epoch": 0.46, + "grad_norm": 5.896959075466838, + "learning_rate": 5.9288437533256915e-06, + "loss": 0.4683, + "step": 3032 + }, + { + "epoch": 0.46, + "grad_norm": 5.886693300823188, + "learning_rate": 5.926443620435572e-06, + "loss": 0.627, + "step": 3033 + }, + { + "epoch": 0.46, + "grad_norm": 4.108506864501156, + "learning_rate": 5.924043266459214e-06, + "loss": 0.4845, + "step": 3034 + }, + { + "epoch": 0.46, + "grad_norm": 4.8393924733125395, + "learning_rate": 5.921642691969436e-06, + "loss": 0.5092, + "step": 3035 + }, + { + "epoch": 0.46, + "grad_norm": 29.411254033657546, + "learning_rate": 5.919241897539111e-06, + "loss": 0.4501, + "step": 3036 + }, + { + "epoch": 0.46, + "grad_norm": 8.560448452029831, + "learning_rate": 5.916840883741164e-06, + "loss": 0.4848, + "step": 3037 + }, + { + "epoch": 0.46, + "grad_norm": 3.642124747659159, + "learning_rate": 5.91443965114857e-06, + "loss": 0.4386, + "step": 3038 + }, + { + "epoch": 0.46, + "grad_norm": 1.2141450607545425, + "learning_rate": 5.912038200334362e-06, + "loss": 0.5436, + "step": 3039 + }, + { + "epoch": 0.46, + "grad_norm": 15.123352826719152, + "learning_rate": 5.9096365318716195e-06, + "loss": 0.428, + "step": 3040 + }, + { + "epoch": 0.46, + "grad_norm": 5.770042914816767, + "learning_rate": 5.9072346463334765e-06, + "loss": 0.4803, + "step": 3041 + }, + { + "epoch": 0.46, + "grad_norm": 4.913405825811076, + "learning_rate": 5.904832544293118e-06, + "loss": 0.4608, + "step": 3042 + }, + { + "epoch": 0.46, + "grad_norm": 6.972297140527095, + "learning_rate": 5.902430226323781e-06, + "loss": 0.4749, + "step": 3043 + }, + { + "epoch": 0.46, + "grad_norm": 5.366406699894982, + "learning_rate": 5.900027692998755e-06, + "loss": 0.4251, + "step": 3044 + }, + { + "epoch": 0.46, + "grad_norm": 9.614672933492882, + "learning_rate": 5.8976249448913784e-06, + "loss": 0.4149, + "step": 3045 + }, + { + "epoch": 0.46, + "grad_norm": 4.955260843285969, + "learning_rate": 5.895221982575042e-06, + "loss": 0.494, + "step": 3046 + }, + { + "epoch": 0.46, + "grad_norm": 3.730007061925838, + "learning_rate": 5.892818806623189e-06, + "loss": 0.3844, + "step": 3047 + }, + { + "epoch": 0.46, + "grad_norm": 7.816388027808729, + "learning_rate": 5.890415417609312e-06, + "loss": 0.4758, + "step": 3048 + }, + { + "epoch": 0.46, + "grad_norm": 6.510906777338831, + "learning_rate": 5.8880118161069556e-06, + "loss": 0.4753, + "step": 3049 + }, + { + "epoch": 0.46, + "grad_norm": 4.334214220022214, + "learning_rate": 5.885608002689714e-06, + "loss": 0.4528, + "step": 3050 + }, + { + "epoch": 0.46, + "grad_norm": 9.108381841817872, + "learning_rate": 5.883203977931236e-06, + "loss": 0.3675, + "step": 3051 + }, + { + "epoch": 0.46, + "grad_norm": 3.808269688565124, + "learning_rate": 5.880799742405212e-06, + "loss": 0.4496, + "step": 3052 + }, + { + "epoch": 0.46, + "grad_norm": 4.014758805013585, + "learning_rate": 5.8783952966853905e-06, + "loss": 0.4421, + "step": 3053 + }, + { + "epoch": 0.46, + "grad_norm": 13.325225442197393, + "learning_rate": 5.875990641345568e-06, + "loss": 0.4498, + "step": 3054 + }, + { + "epoch": 0.46, + "grad_norm": 11.027122161639543, + "learning_rate": 5.87358577695959e-06, + "loss": 0.4113, + "step": 3055 + }, + { + "epoch": 0.46, + "grad_norm": 6.003579691132278, + "learning_rate": 5.871180704101354e-06, + "loss": 0.4459, + "step": 3056 + }, + { + "epoch": 0.46, + "grad_norm": 6.305201862857531, + "learning_rate": 5.868775423344806e-06, + "loss": 0.5017, + "step": 3057 + }, + { + "epoch": 0.46, + "grad_norm": 4.622884414503929, + "learning_rate": 5.866369935263939e-06, + "loss": 0.4444, + "step": 3058 + }, + { + "epoch": 0.46, + "grad_norm": 6.524586138913282, + "learning_rate": 5.8639642404328e-06, + "loss": 0.506, + "step": 3059 + }, + { + "epoch": 0.46, + "grad_norm": 6.826045078682481, + "learning_rate": 5.861558339425481e-06, + "loss": 0.4714, + "step": 3060 + }, + { + "epoch": 0.46, + "grad_norm": 4.703327001043192, + "learning_rate": 5.859152232816131e-06, + "loss": 0.4246, + "step": 3061 + }, + { + "epoch": 0.46, + "grad_norm": 4.895368051133767, + "learning_rate": 5.856745921178937e-06, + "loss": 0.457, + "step": 3062 + }, + { + "epoch": 0.46, + "grad_norm": 6.933247989064422, + "learning_rate": 5.854339405088143e-06, + "loss": 0.4776, + "step": 3063 + }, + { + "epoch": 0.46, + "grad_norm": 5.457887112796842, + "learning_rate": 5.851932685118037e-06, + "loss": 0.4871, + "step": 3064 + }, + { + "epoch": 0.46, + "grad_norm": 8.895334093952295, + "learning_rate": 5.8495257618429615e-06, + "loss": 0.4061, + "step": 3065 + }, + { + "epoch": 0.46, + "grad_norm": 5.639967382847509, + "learning_rate": 5.8471186358373025e-06, + "loss": 0.4711, + "step": 3066 + }, + { + "epoch": 0.46, + "grad_norm": 6.67272662542438, + "learning_rate": 5.844711307675493e-06, + "loss": 0.524, + "step": 3067 + }, + { + "epoch": 0.46, + "grad_norm": 6.549650819979633, + "learning_rate": 5.842303777932022e-06, + "loss": 0.4642, + "step": 3068 + }, + { + "epoch": 0.46, + "grad_norm": 11.534854335683184, + "learning_rate": 5.8398960471814196e-06, + "loss": 0.4714, + "step": 3069 + }, + { + "epoch": 0.46, + "grad_norm": 11.34067912607852, + "learning_rate": 5.8374881159982645e-06, + "loss": 0.4238, + "step": 3070 + }, + { + "epoch": 0.46, + "grad_norm": 5.3661023885875405, + "learning_rate": 5.835079984957187e-06, + "loss": 0.4671, + "step": 3071 + }, + { + "epoch": 0.46, + "grad_norm": 5.909384356509735, + "learning_rate": 5.832671654632861e-06, + "loss": 0.4224, + "step": 3072 + }, + { + "epoch": 0.46, + "grad_norm": 4.963602018061618, + "learning_rate": 5.830263125600011e-06, + "loss": 0.4146, + "step": 3073 + }, + { + "epoch": 0.46, + "grad_norm": 6.645261360097426, + "learning_rate": 5.827854398433407e-06, + "loss": 0.4449, + "step": 3074 + }, + { + "epoch": 0.46, + "grad_norm": 6.172568981876063, + "learning_rate": 5.8254454737078665e-06, + "loss": 0.4587, + "step": 3075 + }, + { + "epoch": 0.46, + "grad_norm": 16.8571769690598, + "learning_rate": 5.8230363519982565e-06, + "loss": 0.4594, + "step": 3076 + }, + { + "epoch": 0.46, + "grad_norm": 4.482528579650665, + "learning_rate": 5.820627033879488e-06, + "loss": 0.4195, + "step": 3077 + }, + { + "epoch": 0.46, + "grad_norm": 11.199140684334441, + "learning_rate": 5.818217519926518e-06, + "loss": 0.4516, + "step": 3078 + }, + { + "epoch": 0.46, + "grad_norm": 6.378680061356225, + "learning_rate": 5.815807810714356e-06, + "loss": 0.5173, + "step": 3079 + }, + { + "epoch": 0.46, + "grad_norm": 6.083222178669987, + "learning_rate": 5.813397906818052e-06, + "loss": 0.4285, + "step": 3080 + }, + { + "epoch": 0.46, + "grad_norm": 7.450696444483291, + "learning_rate": 5.810987808812705e-06, + "loss": 0.4665, + "step": 3081 + }, + { + "epoch": 0.46, + "grad_norm": 5.041495777521601, + "learning_rate": 5.808577517273459e-06, + "loss": 0.4449, + "step": 3082 + }, + { + "epoch": 0.47, + "grad_norm": 7.750091877057997, + "learning_rate": 5.806167032775508e-06, + "loss": 0.4641, + "step": 3083 + }, + { + "epoch": 0.47, + "grad_norm": 7.551963841233672, + "learning_rate": 5.803756355894087e-06, + "loss": 0.4508, + "step": 3084 + }, + { + "epoch": 0.47, + "grad_norm": 15.36025155679563, + "learning_rate": 5.8013454872044816e-06, + "loss": 0.4417, + "step": 3085 + }, + { + "epoch": 0.47, + "grad_norm": 5.392521546269121, + "learning_rate": 5.798934427282019e-06, + "loss": 0.5054, + "step": 3086 + }, + { + "epoch": 0.47, + "grad_norm": 5.372660424430705, + "learning_rate": 5.796523176702074e-06, + "loss": 0.4856, + "step": 3087 + }, + { + "epoch": 0.47, + "grad_norm": 8.768462398963454, + "learning_rate": 5.794111736040064e-06, + "loss": 0.5417, + "step": 3088 + }, + { + "epoch": 0.47, + "grad_norm": 6.717548903726133, + "learning_rate": 5.79170010587146e-06, + "loss": 0.4305, + "step": 3089 + }, + { + "epoch": 0.47, + "grad_norm": 6.035969281525046, + "learning_rate": 5.7892882867717705e-06, + "loss": 0.4787, + "step": 3090 + }, + { + "epoch": 0.47, + "grad_norm": 7.81313869060724, + "learning_rate": 5.786876279316551e-06, + "loss": 0.526, + "step": 3091 + }, + { + "epoch": 0.47, + "grad_norm": 9.19821449774543, + "learning_rate": 5.784464084081402e-06, + "loss": 0.5263, + "step": 3092 + }, + { + "epoch": 0.47, + "grad_norm": 7.06840073370822, + "learning_rate": 5.782051701641969e-06, + "loss": 0.3849, + "step": 3093 + }, + { + "epoch": 0.47, + "grad_norm": 6.374673519137275, + "learning_rate": 5.779639132573942e-06, + "loss": 0.4842, + "step": 3094 + }, + { + "epoch": 0.47, + "grad_norm": 6.43732725047366, + "learning_rate": 5.7772263774530565e-06, + "loss": 0.4515, + "step": 3095 + }, + { + "epoch": 0.47, + "grad_norm": 10.467800868152375, + "learning_rate": 5.7748134368550935e-06, + "loss": 0.4214, + "step": 3096 + }, + { + "epoch": 0.47, + "grad_norm": 6.01038279518574, + "learning_rate": 5.772400311355872e-06, + "loss": 0.4347, + "step": 3097 + }, + { + "epoch": 0.47, + "grad_norm": 10.074339124672884, + "learning_rate": 5.7699870015312646e-06, + "loss": 0.4683, + "step": 3098 + }, + { + "epoch": 0.47, + "grad_norm": 9.039062793877982, + "learning_rate": 5.767573507957178e-06, + "loss": 0.4439, + "step": 3099 + }, + { + "epoch": 0.47, + "grad_norm": 4.917505861340575, + "learning_rate": 5.76515983120957e-06, + "loss": 0.4443, + "step": 3100 + }, + { + "epoch": 0.47, + "grad_norm": 4.843045981194791, + "learning_rate": 5.762745971864441e-06, + "loss": 0.4158, + "step": 3101 + }, + { + "epoch": 0.47, + "grad_norm": 6.043465873075565, + "learning_rate": 5.760331930497831e-06, + "loss": 0.4972, + "step": 3102 + }, + { + "epoch": 0.47, + "grad_norm": 4.517738193249536, + "learning_rate": 5.757917707685829e-06, + "loss": 0.5144, + "step": 3103 + }, + { + "epoch": 0.47, + "grad_norm": 13.37729348296037, + "learning_rate": 5.7555033040045625e-06, + "loss": 0.4687, + "step": 3104 + }, + { + "epoch": 0.47, + "grad_norm": 5.414470789802739, + "learning_rate": 5.753088720030206e-06, + "loss": 0.4717, + "step": 3105 + }, + { + "epoch": 0.47, + "grad_norm": 5.791109326288943, + "learning_rate": 5.7506739563389705e-06, + "loss": 0.4175, + "step": 3106 + }, + { + "epoch": 0.47, + "grad_norm": 6.865592504916592, + "learning_rate": 5.7482590135071194e-06, + "loss": 0.4792, + "step": 3107 + }, + { + "epoch": 0.47, + "grad_norm": 8.334393345982736, + "learning_rate": 5.745843892110952e-06, + "loss": 0.4627, + "step": 3108 + }, + { + "epoch": 0.47, + "grad_norm": 10.000214262766905, + "learning_rate": 5.743428592726813e-06, + "loss": 0.4532, + "step": 3109 + }, + { + "epoch": 0.47, + "grad_norm": 7.043919600037595, + "learning_rate": 5.741013115931087e-06, + "loss": 0.4832, + "step": 3110 + }, + { + "epoch": 0.47, + "grad_norm": 1.399708475505839, + "learning_rate": 5.738597462300205e-06, + "loss": 0.577, + "step": 3111 + }, + { + "epoch": 0.47, + "grad_norm": 5.3845451784220995, + "learning_rate": 5.736181632410635e-06, + "loss": 0.4082, + "step": 3112 + }, + { + "epoch": 0.47, + "grad_norm": 7.765795252000268, + "learning_rate": 5.733765626838894e-06, + "loss": 0.4195, + "step": 3113 + }, + { + "epoch": 0.47, + "grad_norm": 4.236385503238355, + "learning_rate": 5.731349446161533e-06, + "loss": 0.47, + "step": 3114 + }, + { + "epoch": 0.47, + "grad_norm": 8.706118423609485, + "learning_rate": 5.728933090955151e-06, + "loss": 0.4786, + "step": 3115 + }, + { + "epoch": 0.47, + "grad_norm": 7.119660145827664, + "learning_rate": 5.726516561796387e-06, + "loss": 0.4838, + "step": 3116 + }, + { + "epoch": 0.47, + "grad_norm": 7.537871335432183, + "learning_rate": 5.7240998592619165e-06, + "loss": 0.5145, + "step": 3117 + }, + { + "epoch": 0.47, + "grad_norm": 21.038663859852814, + "learning_rate": 5.7216829839284646e-06, + "loss": 0.4717, + "step": 3118 + }, + { + "epoch": 0.47, + "grad_norm": 12.840097205113555, + "learning_rate": 5.719265936372793e-06, + "loss": 0.4899, + "step": 3119 + }, + { + "epoch": 0.47, + "grad_norm": 4.2076129392836465, + "learning_rate": 5.716848717171706e-06, + "loss": 0.5292, + "step": 3120 + }, + { + "epoch": 0.47, + "grad_norm": 5.39907907781943, + "learning_rate": 5.714431326902047e-06, + "loss": 0.4762, + "step": 3121 + }, + { + "epoch": 0.47, + "grad_norm": 12.618395041778575, + "learning_rate": 5.7120137661407005e-06, + "loss": 0.4632, + "step": 3122 + }, + { + "epoch": 0.47, + "grad_norm": 5.451566632969732, + "learning_rate": 5.709596035464592e-06, + "loss": 0.4813, + "step": 3123 + }, + { + "epoch": 0.47, + "grad_norm": 5.416503003580778, + "learning_rate": 5.70717813545069e-06, + "loss": 0.5047, + "step": 3124 + }, + { + "epoch": 0.47, + "grad_norm": 35.65028924243356, + "learning_rate": 5.704760066676003e-06, + "loss": 0.4634, + "step": 3125 + }, + { + "epoch": 0.47, + "grad_norm": 5.474458387051392, + "learning_rate": 5.702341829717575e-06, + "loss": 0.4493, + "step": 3126 + }, + { + "epoch": 0.47, + "grad_norm": 13.452484435768707, + "learning_rate": 5.699923425152495e-06, + "loss": 0.5138, + "step": 3127 + }, + { + "epoch": 0.47, + "grad_norm": 4.983730366911048, + "learning_rate": 5.697504853557888e-06, + "loss": 0.4153, + "step": 3128 + }, + { + "epoch": 0.47, + "grad_norm": 3.9315589684090515, + "learning_rate": 5.695086115510924e-06, + "loss": 0.4539, + "step": 3129 + }, + { + "epoch": 0.47, + "grad_norm": 23.148437575675043, + "learning_rate": 5.69266721158881e-06, + "loss": 0.5023, + "step": 3130 + }, + { + "epoch": 0.47, + "grad_norm": 3.789955312540621, + "learning_rate": 5.69024814236879e-06, + "loss": 0.4387, + "step": 3131 + }, + { + "epoch": 0.47, + "grad_norm": 3.658923412725847, + "learning_rate": 5.687828908428153e-06, + "loss": 0.4704, + "step": 3132 + }, + { + "epoch": 0.47, + "grad_norm": 9.219025318694479, + "learning_rate": 5.685409510344223e-06, + "loss": 0.4534, + "step": 3133 + }, + { + "epoch": 0.47, + "grad_norm": 4.564538956915076, + "learning_rate": 5.682989948694363e-06, + "loss": 0.4089, + "step": 3134 + }, + { + "epoch": 0.47, + "grad_norm": 6.167202023758512, + "learning_rate": 5.680570224055979e-06, + "loss": 0.5171, + "step": 3135 + }, + { + "epoch": 0.47, + "grad_norm": 3.651144185075979, + "learning_rate": 5.678150337006512e-06, + "loss": 0.4182, + "step": 3136 + }, + { + "epoch": 0.47, + "grad_norm": 5.456699787212029, + "learning_rate": 5.675730288123442e-06, + "loss": 0.4599, + "step": 3137 + }, + { + "epoch": 0.47, + "grad_norm": 4.057024384405909, + "learning_rate": 5.673310077984293e-06, + "loss": 0.3857, + "step": 3138 + }, + { + "epoch": 0.47, + "grad_norm": 1.2932104838601441, + "learning_rate": 5.670889707166618e-06, + "loss": 0.5601, + "step": 3139 + }, + { + "epoch": 0.47, + "grad_norm": 4.059070323209951, + "learning_rate": 5.668469176248017e-06, + "loss": 0.4636, + "step": 3140 + }, + { + "epoch": 0.47, + "grad_norm": 3.9412784999811357, + "learning_rate": 5.666048485806124e-06, + "loss": 0.4786, + "step": 3141 + }, + { + "epoch": 0.47, + "grad_norm": 3.1548373543652737, + "learning_rate": 5.663627636418611e-06, + "loss": 0.4784, + "step": 3142 + }, + { + "epoch": 0.47, + "grad_norm": 6.181993005099054, + "learning_rate": 5.661206628663189e-06, + "loss": 0.5575, + "step": 3143 + }, + { + "epoch": 0.47, + "grad_norm": 3.9533606093609426, + "learning_rate": 5.658785463117607e-06, + "loss": 0.4793, + "step": 3144 + }, + { + "epoch": 0.47, + "grad_norm": 3.8632699036884715, + "learning_rate": 5.656364140359653e-06, + "loss": 0.5806, + "step": 3145 + }, + { + "epoch": 0.47, + "grad_norm": 2.6685592634425714, + "learning_rate": 5.6539426609671485e-06, + "loss": 0.3926, + "step": 3146 + }, + { + "epoch": 0.47, + "grad_norm": 4.063058798954238, + "learning_rate": 5.651521025517954e-06, + "loss": 0.5079, + "step": 3147 + }, + { + "epoch": 0.47, + "grad_norm": 4.164566714346429, + "learning_rate": 5.64909923458997e-06, + "loss": 0.4597, + "step": 3148 + }, + { + "epoch": 0.47, + "grad_norm": 3.9349549252412657, + "learning_rate": 5.646677288761132e-06, + "loss": 0.5664, + "step": 3149 + }, + { + "epoch": 0.48, + "grad_norm": 3.889440916712922, + "learning_rate": 5.644255188609411e-06, + "loss": 0.4477, + "step": 3150 + }, + { + "epoch": 0.48, + "grad_norm": 3.5492922835504976, + "learning_rate": 5.6418329347128165e-06, + "loss": 0.4897, + "step": 3151 + }, + { + "epoch": 0.48, + "grad_norm": 1.5571806458707342, + "learning_rate": 5.639410527649395e-06, + "loss": 0.5251, + "step": 3152 + }, + { + "epoch": 0.48, + "grad_norm": 3.635909692338665, + "learning_rate": 5.6369879679972285e-06, + "loss": 0.4029, + "step": 3153 + }, + { + "epoch": 0.48, + "grad_norm": 4.345980058289814, + "learning_rate": 5.634565256334435e-06, + "loss": 0.5002, + "step": 3154 + }, + { + "epoch": 0.48, + "grad_norm": 3.6397571867578837, + "learning_rate": 5.632142393239174e-06, + "loss": 0.471, + "step": 3155 + }, + { + "epoch": 0.48, + "grad_norm": 3.3704828441445147, + "learning_rate": 5.629719379289633e-06, + "loss": 0.3915, + "step": 3156 + }, + { + "epoch": 0.48, + "grad_norm": 23.507070834633065, + "learning_rate": 5.627296215064039e-06, + "loss": 0.4643, + "step": 3157 + }, + { + "epoch": 0.48, + "grad_norm": 2.957406963770855, + "learning_rate": 5.624872901140657e-06, + "loss": 0.3978, + "step": 3158 + }, + { + "epoch": 0.48, + "grad_norm": 11.218637599985948, + "learning_rate": 5.622449438097785e-06, + "loss": 0.4338, + "step": 3159 + }, + { + "epoch": 0.48, + "grad_norm": 4.282680276785911, + "learning_rate": 5.620025826513758e-06, + "loss": 0.5381, + "step": 3160 + }, + { + "epoch": 0.48, + "grad_norm": 6.2077390147945515, + "learning_rate": 5.6176020669669464e-06, + "loss": 0.4575, + "step": 3161 + }, + { + "epoch": 0.48, + "grad_norm": 4.664140470623212, + "learning_rate": 5.615178160035755e-06, + "loss": 0.4852, + "step": 3162 + }, + { + "epoch": 0.48, + "grad_norm": 5.866206035415342, + "learning_rate": 5.612754106298625e-06, + "loss": 0.4993, + "step": 3163 + }, + { + "epoch": 0.48, + "grad_norm": 3.7056785463742274, + "learning_rate": 5.61032990633403e-06, + "loss": 0.4652, + "step": 3164 + }, + { + "epoch": 0.48, + "grad_norm": 5.247469840658331, + "learning_rate": 5.607905560720481e-06, + "loss": 0.5389, + "step": 3165 + }, + { + "epoch": 0.48, + "grad_norm": 3.8592977303152116, + "learning_rate": 5.605481070036523e-06, + "loss": 0.4525, + "step": 3166 + }, + { + "epoch": 0.48, + "grad_norm": 3.0220763494375933, + "learning_rate": 5.603056434860739e-06, + "loss": 0.3894, + "step": 3167 + }, + { + "epoch": 0.48, + "grad_norm": 5.249027478167296, + "learning_rate": 5.600631655771739e-06, + "loss": 0.5713, + "step": 3168 + }, + { + "epoch": 0.48, + "grad_norm": 3.4263642422843046, + "learning_rate": 5.5982067333481735e-06, + "loss": 0.4534, + "step": 3169 + }, + { + "epoch": 0.48, + "grad_norm": 6.606751129313969, + "learning_rate": 5.5957816681687246e-06, + "loss": 0.4616, + "step": 3170 + }, + { + "epoch": 0.48, + "grad_norm": 4.884304660456333, + "learning_rate": 5.5933564608121095e-06, + "loss": 0.5166, + "step": 3171 + }, + { + "epoch": 0.48, + "grad_norm": 1.6970126005004829, + "learning_rate": 5.590931111857079e-06, + "loss": 0.5386, + "step": 3172 + }, + { + "epoch": 0.48, + "grad_norm": 9.154039772658649, + "learning_rate": 5.5885056218824185e-06, + "loss": 0.4867, + "step": 3173 + }, + { + "epoch": 0.48, + "grad_norm": 6.0363520755483, + "learning_rate": 5.586079991466945e-06, + "loss": 0.4555, + "step": 3174 + }, + { + "epoch": 0.48, + "grad_norm": 4.127212776221932, + "learning_rate": 5.58365422118951e-06, + "loss": 0.4436, + "step": 3175 + }, + { + "epoch": 0.48, + "grad_norm": 4.647537751906127, + "learning_rate": 5.581228311629e-06, + "loss": 0.4517, + "step": 3176 + }, + { + "epoch": 0.48, + "grad_norm": 5.317199225642473, + "learning_rate": 5.578802263364333e-06, + "loss": 0.4455, + "step": 3177 + }, + { + "epoch": 0.48, + "grad_norm": 3.6251371340360707, + "learning_rate": 5.5763760769744594e-06, + "loss": 0.54, + "step": 3178 + }, + { + "epoch": 0.48, + "grad_norm": 3.3214395582701854, + "learning_rate": 5.573949753038364e-06, + "loss": 0.4648, + "step": 3179 + }, + { + "epoch": 0.48, + "grad_norm": 4.297957349116306, + "learning_rate": 5.571523292135067e-06, + "loss": 0.4989, + "step": 3180 + }, + { + "epoch": 0.48, + "grad_norm": 2.976242648602837, + "learning_rate": 5.569096694843613e-06, + "loss": 0.4671, + "step": 3181 + }, + { + "epoch": 0.48, + "grad_norm": 2.802481147927998, + "learning_rate": 5.566669961743088e-06, + "loss": 0.4788, + "step": 3182 + }, + { + "epoch": 0.48, + "grad_norm": 4.06989713775913, + "learning_rate": 5.564243093412605e-06, + "loss": 0.4598, + "step": 3183 + }, + { + "epoch": 0.48, + "grad_norm": 4.489522273614001, + "learning_rate": 5.561816090431315e-06, + "loss": 0.4766, + "step": 3184 + }, + { + "epoch": 0.48, + "grad_norm": 4.1403164438120665, + "learning_rate": 5.559388953378393e-06, + "loss": 0.4475, + "step": 3185 + }, + { + "epoch": 0.48, + "grad_norm": 5.89871877495417, + "learning_rate": 5.556961682833052e-06, + "loss": 0.4424, + "step": 3186 + }, + { + "epoch": 0.48, + "grad_norm": 4.2063029776138885, + "learning_rate": 5.5545342793745345e-06, + "loss": 0.4882, + "step": 3187 + }, + { + "epoch": 0.48, + "grad_norm": 5.376312124732114, + "learning_rate": 5.552106743582115e-06, + "loss": 0.53, + "step": 3188 + }, + { + "epoch": 0.48, + "grad_norm": 3.554156742740164, + "learning_rate": 5.549679076035104e-06, + "loss": 0.4648, + "step": 3189 + }, + { + "epoch": 0.48, + "grad_norm": 5.550521146322817, + "learning_rate": 5.5472512773128326e-06, + "loss": 0.4514, + "step": 3190 + }, + { + "epoch": 0.48, + "grad_norm": 3.6665951731813227, + "learning_rate": 5.544823347994677e-06, + "loss": 0.4079, + "step": 3191 + }, + { + "epoch": 0.48, + "grad_norm": 6.409750586423111, + "learning_rate": 5.542395288660032e-06, + "loss": 0.4182, + "step": 3192 + }, + { + "epoch": 0.48, + "grad_norm": 4.780824638495456, + "learning_rate": 5.5399670998883305e-06, + "loss": 0.426, + "step": 3193 + }, + { + "epoch": 0.48, + "grad_norm": 1.3605253066651224, + "learning_rate": 5.537538782259037e-06, + "loss": 0.5659, + "step": 3194 + }, + { + "epoch": 0.48, + "grad_norm": 4.005783623385917, + "learning_rate": 5.535110336351642e-06, + "loss": 0.4581, + "step": 3195 + }, + { + "epoch": 0.48, + "grad_norm": 5.0520683998268225, + "learning_rate": 5.5326817627456695e-06, + "loss": 0.4219, + "step": 3196 + }, + { + "epoch": 0.48, + "grad_norm": 6.130545303079728, + "learning_rate": 5.5302530620206765e-06, + "loss": 0.4916, + "step": 3197 + }, + { + "epoch": 0.48, + "grad_norm": 1.2227916493186013, + "learning_rate": 5.527824234756243e-06, + "loss": 0.551, + "step": 3198 + }, + { + "epoch": 0.48, + "grad_norm": 6.863211075529677, + "learning_rate": 5.5253952815319865e-06, + "loss": 0.4407, + "step": 3199 + }, + { + "epoch": 0.48, + "grad_norm": 9.756551073754293, + "learning_rate": 5.522966202927551e-06, + "loss": 0.4191, + "step": 3200 + }, + { + "epoch": 0.48, + "grad_norm": 4.56040429887214, + "learning_rate": 5.52053699952261e-06, + "loss": 0.445, + "step": 3201 + }, + { + "epoch": 0.48, + "grad_norm": 7.0316646358828825, + "learning_rate": 5.518107671896871e-06, + "loss": 0.5828, + "step": 3202 + }, + { + "epoch": 0.48, + "grad_norm": 4.409697557185427, + "learning_rate": 5.515678220630064e-06, + "loss": 0.4597, + "step": 3203 + }, + { + "epoch": 0.48, + "grad_norm": 7.987591006317606, + "learning_rate": 5.513248646301957e-06, + "loss": 0.4235, + "step": 3204 + }, + { + "epoch": 0.48, + "grad_norm": 4.000521828200423, + "learning_rate": 5.510818949492337e-06, + "loss": 0.4584, + "step": 3205 + }, + { + "epoch": 0.48, + "grad_norm": 7.053772872980277, + "learning_rate": 5.508389130781031e-06, + "loss": 0.4009, + "step": 3206 + }, + { + "epoch": 0.48, + "grad_norm": 3.221718470839906, + "learning_rate": 5.505959190747889e-06, + "loss": 0.4596, + "step": 3207 + }, + { + "epoch": 0.48, + "grad_norm": 6.0661858479859765, + "learning_rate": 5.503529129972792e-06, + "loss": 0.4706, + "step": 3208 + }, + { + "epoch": 0.48, + "grad_norm": 7.998377295945082, + "learning_rate": 5.501098949035648e-06, + "loss": 0.4746, + "step": 3209 + }, + { + "epoch": 0.48, + "grad_norm": 7.9352148526871265, + "learning_rate": 5.498668648516395e-06, + "loss": 0.4232, + "step": 3210 + }, + { + "epoch": 0.48, + "grad_norm": 4.2745708213396005, + "learning_rate": 5.496238228994997e-06, + "loss": 0.4596, + "step": 3211 + }, + { + "epoch": 0.48, + "grad_norm": 7.5177934557602395, + "learning_rate": 5.493807691051451e-06, + "loss": 0.5331, + "step": 3212 + }, + { + "epoch": 0.48, + "grad_norm": 6.363429555229731, + "learning_rate": 5.4913770352657814e-06, + "loss": 0.4343, + "step": 3213 + }, + { + "epoch": 0.48, + "grad_norm": 4.585552795595535, + "learning_rate": 5.488946262218037e-06, + "loss": 0.4672, + "step": 3214 + }, + { + "epoch": 0.48, + "grad_norm": 4.127516266899229, + "learning_rate": 5.486515372488294e-06, + "loss": 0.3972, + "step": 3215 + }, + { + "epoch": 0.49, + "grad_norm": 3.732876608185631, + "learning_rate": 5.484084366656666e-06, + "loss": 0.5211, + "step": 3216 + }, + { + "epoch": 0.49, + "grad_norm": 4.764826690842418, + "learning_rate": 5.481653245303281e-06, + "loss": 0.411, + "step": 3217 + }, + { + "epoch": 0.49, + "grad_norm": 4.601291358917296, + "learning_rate": 5.479222009008306e-06, + "loss": 0.4699, + "step": 3218 + }, + { + "epoch": 0.49, + "grad_norm": 5.095162361491254, + "learning_rate": 5.476790658351927e-06, + "loss": 0.4565, + "step": 3219 + }, + { + "epoch": 0.49, + "grad_norm": 4.618087612634257, + "learning_rate": 5.474359193914363e-06, + "loss": 0.4525, + "step": 3220 + }, + { + "epoch": 0.49, + "grad_norm": 5.389409260587045, + "learning_rate": 5.471927616275856e-06, + "loss": 0.433, + "step": 3221 + }, + { + "epoch": 0.49, + "grad_norm": 4.865514228573659, + "learning_rate": 5.4694959260166795e-06, + "loss": 0.4686, + "step": 3222 + }, + { + "epoch": 0.49, + "grad_norm": 5.805259582049252, + "learning_rate": 5.46706412371713e-06, + "loss": 0.4209, + "step": 3223 + }, + { + "epoch": 0.49, + "grad_norm": 4.579052577552933, + "learning_rate": 5.464632209957532e-06, + "loss": 0.4378, + "step": 3224 + }, + { + "epoch": 0.49, + "grad_norm": 5.466297555880539, + "learning_rate": 5.462200185318236e-06, + "loss": 0.4411, + "step": 3225 + }, + { + "epoch": 0.49, + "grad_norm": 10.273337063617493, + "learning_rate": 5.459768050379622e-06, + "loss": 0.4215, + "step": 3226 + }, + { + "epoch": 0.49, + "grad_norm": 4.0711764515032725, + "learning_rate": 5.457335805722092e-06, + "loss": 0.5188, + "step": 3227 + }, + { + "epoch": 0.49, + "grad_norm": 6.757428239371602, + "learning_rate": 5.454903451926077e-06, + "loss": 0.4578, + "step": 3228 + }, + { + "epoch": 0.49, + "grad_norm": 4.411607230427831, + "learning_rate": 5.452470989572031e-06, + "loss": 0.4217, + "step": 3229 + }, + { + "epoch": 0.49, + "grad_norm": 3.433383694292181, + "learning_rate": 5.45003841924044e-06, + "loss": 0.4538, + "step": 3230 + }, + { + "epoch": 0.49, + "grad_norm": 6.925206867271547, + "learning_rate": 5.44760574151181e-06, + "loss": 0.4763, + "step": 3231 + }, + { + "epoch": 0.49, + "grad_norm": 4.112728685718616, + "learning_rate": 5.4451729569666735e-06, + "loss": 0.4749, + "step": 3232 + }, + { + "epoch": 0.49, + "grad_norm": 4.744233646869358, + "learning_rate": 5.4427400661855916e-06, + "loss": 0.4184, + "step": 3233 + }, + { + "epoch": 0.49, + "grad_norm": 8.840975923901068, + "learning_rate": 5.440307069749147e-06, + "loss": 0.416, + "step": 3234 + }, + { + "epoch": 0.49, + "grad_norm": 4.694575746940418, + "learning_rate": 5.437873968237948e-06, + "loss": 0.461, + "step": 3235 + }, + { + "epoch": 0.49, + "grad_norm": 6.246738936081393, + "learning_rate": 5.4354407622326324e-06, + "loss": 0.4307, + "step": 3236 + }, + { + "epoch": 0.49, + "grad_norm": 9.398998426939157, + "learning_rate": 5.4330074523138586e-06, + "loss": 0.3538, + "step": 3237 + }, + { + "epoch": 0.49, + "grad_norm": 9.15310211778712, + "learning_rate": 5.430574039062312e-06, + "loss": 0.5053, + "step": 3238 + }, + { + "epoch": 0.49, + "grad_norm": 4.0256818057927894, + "learning_rate": 5.428140523058698e-06, + "loss": 0.4252, + "step": 3239 + }, + { + "epoch": 0.49, + "grad_norm": 9.379003183699032, + "learning_rate": 5.425706904883753e-06, + "loss": 0.4589, + "step": 3240 + }, + { + "epoch": 0.49, + "grad_norm": 9.1879335713915, + "learning_rate": 5.4232731851182355e-06, + "loss": 0.4863, + "step": 3241 + }, + { + "epoch": 0.49, + "grad_norm": 5.705671182181169, + "learning_rate": 5.420839364342925e-06, + "loss": 0.4858, + "step": 3242 + }, + { + "epoch": 0.49, + "grad_norm": 5.488484698737066, + "learning_rate": 5.418405443138631e-06, + "loss": 0.3787, + "step": 3243 + }, + { + "epoch": 0.49, + "grad_norm": 6.579148638539216, + "learning_rate": 5.415971422086182e-06, + "loss": 0.4243, + "step": 3244 + }, + { + "epoch": 0.49, + "grad_norm": 4.3155983631965125, + "learning_rate": 5.413537301766433e-06, + "loss": 0.4531, + "step": 3245 + }, + { + "epoch": 0.49, + "grad_norm": 19.742292378614664, + "learning_rate": 5.411103082760259e-06, + "loss": 0.4864, + "step": 3246 + }, + { + "epoch": 0.49, + "grad_norm": 4.792634384931771, + "learning_rate": 5.408668765648565e-06, + "loss": 0.5039, + "step": 3247 + }, + { + "epoch": 0.49, + "grad_norm": 5.374334102062656, + "learning_rate": 5.406234351012275e-06, + "loss": 0.4756, + "step": 3248 + }, + { + "epoch": 0.49, + "grad_norm": 6.155682520256152, + "learning_rate": 5.4037998394323356e-06, + "loss": 0.4069, + "step": 3249 + }, + { + "epoch": 0.49, + "grad_norm": 6.325556722529026, + "learning_rate": 5.401365231489718e-06, + "loss": 0.4651, + "step": 3250 + }, + { + "epoch": 0.49, + "grad_norm": 12.561341732055103, + "learning_rate": 5.398930527765416e-06, + "loss": 0.4859, + "step": 3251 + }, + { + "epoch": 0.49, + "grad_norm": 28.47595791948495, + "learning_rate": 5.396495728840448e-06, + "loss": 0.4391, + "step": 3252 + }, + { + "epoch": 0.49, + "grad_norm": 3.1696913615848277, + "learning_rate": 5.3940608352958534e-06, + "loss": 0.4805, + "step": 3253 + }, + { + "epoch": 0.49, + "grad_norm": 3.6818854932250598, + "learning_rate": 5.391625847712695e-06, + "loss": 0.4323, + "step": 3254 + }, + { + "epoch": 0.49, + "grad_norm": 3.8075845104537915, + "learning_rate": 5.389190766672057e-06, + "loss": 0.4603, + "step": 3255 + }, + { + "epoch": 0.49, + "grad_norm": 5.665670113898421, + "learning_rate": 5.386755592755044e-06, + "loss": 0.4566, + "step": 3256 + }, + { + "epoch": 0.49, + "grad_norm": 3.9516943925407477, + "learning_rate": 5.384320326542789e-06, + "loss": 0.4552, + "step": 3257 + }, + { + "epoch": 0.49, + "grad_norm": 5.091935876677221, + "learning_rate": 5.381884968616442e-06, + "loss": 0.4509, + "step": 3258 + }, + { + "epoch": 0.49, + "grad_norm": 5.63573354157472, + "learning_rate": 5.379449519557175e-06, + "loss": 0.4148, + "step": 3259 + }, + { + "epoch": 0.49, + "grad_norm": 6.5816478696515395, + "learning_rate": 5.377013979946183e-06, + "loss": 0.5229, + "step": 3260 + }, + { + "epoch": 0.49, + "grad_norm": 3.262931556483842, + "learning_rate": 5.374578350364686e-06, + "loss": 0.4633, + "step": 3261 + }, + { + "epoch": 0.49, + "grad_norm": 9.859426386570577, + "learning_rate": 5.372142631393916e-06, + "loss": 0.4557, + "step": 3262 + }, + { + "epoch": 0.49, + "grad_norm": 1.3176677048114953, + "learning_rate": 5.369706823615138e-06, + "loss": 0.5563, + "step": 3263 + }, + { + "epoch": 0.49, + "grad_norm": 3.791295119135724, + "learning_rate": 5.367270927609627e-06, + "loss": 0.441, + "step": 3264 + }, + { + "epoch": 0.49, + "grad_norm": 1.4060839149248263, + "learning_rate": 5.364834943958688e-06, + "loss": 0.5696, + "step": 3265 + }, + { + "epoch": 0.49, + "grad_norm": 3.6067184379781727, + "learning_rate": 5.362398873243644e-06, + "loss": 0.4033, + "step": 3266 + }, + { + "epoch": 0.49, + "grad_norm": 6.160678843813218, + "learning_rate": 5.359962716045836e-06, + "loss": 0.4208, + "step": 3267 + }, + { + "epoch": 0.49, + "grad_norm": 4.26679962276524, + "learning_rate": 5.35752647294663e-06, + "loss": 0.4455, + "step": 3268 + }, + { + "epoch": 0.49, + "grad_norm": 7.798644079797333, + "learning_rate": 5.355090144527408e-06, + "loss": 0.4663, + "step": 3269 + }, + { + "epoch": 0.49, + "grad_norm": 3.861094812045507, + "learning_rate": 5.352653731369576e-06, + "loss": 0.4877, + "step": 3270 + }, + { + "epoch": 0.49, + "grad_norm": 3.9589362656198666, + "learning_rate": 5.350217234054559e-06, + "loss": 0.5565, + "step": 3271 + }, + { + "epoch": 0.49, + "grad_norm": 3.0391228827973937, + "learning_rate": 5.3477806531638025e-06, + "loss": 0.5136, + "step": 3272 + }, + { + "epoch": 0.49, + "grad_norm": 3.149651375953562, + "learning_rate": 5.345343989278771e-06, + "loss": 0.4026, + "step": 3273 + }, + { + "epoch": 0.49, + "grad_norm": 6.921150674950357, + "learning_rate": 5.342907242980948e-06, + "loss": 0.4502, + "step": 3274 + }, + { + "epoch": 0.49, + "grad_norm": 4.765230244320097, + "learning_rate": 5.340470414851839e-06, + "loss": 0.4853, + "step": 3275 + }, + { + "epoch": 0.49, + "grad_norm": 3.481198134182762, + "learning_rate": 5.338033505472968e-06, + "loss": 0.3824, + "step": 3276 + }, + { + "epoch": 0.49, + "grad_norm": 3.769270456440631, + "learning_rate": 5.335596515425879e-06, + "loss": 0.4569, + "step": 3277 + }, + { + "epoch": 0.49, + "grad_norm": 6.019814958256495, + "learning_rate": 5.333159445292135e-06, + "loss": 0.5026, + "step": 3278 + }, + { + "epoch": 0.49, + "grad_norm": 3.4954519176525025, + "learning_rate": 5.330722295653315e-06, + "loss": 0.443, + "step": 3279 + }, + { + "epoch": 0.49, + "grad_norm": 3.9237392661841275, + "learning_rate": 5.328285067091022e-06, + "loss": 0.4735, + "step": 3280 + }, + { + "epoch": 0.49, + "grad_norm": 3.9428035995371125, + "learning_rate": 5.3258477601868745e-06, + "loss": 0.3763, + "step": 3281 + }, + { + "epoch": 0.5, + "grad_norm": 3.3629913060942727, + "learning_rate": 5.3234103755225115e-06, + "loss": 0.5532, + "step": 3282 + }, + { + "epoch": 0.5, + "grad_norm": 17.176192576684052, + "learning_rate": 5.32097291367959e-06, + "loss": 0.5333, + "step": 3283 + }, + { + "epoch": 0.5, + "grad_norm": 6.081856887425179, + "learning_rate": 5.318535375239784e-06, + "loss": 0.5695, + "step": 3284 + }, + { + "epoch": 0.5, + "grad_norm": 3.0722452641289366, + "learning_rate": 5.316097760784786e-06, + "loss": 0.4614, + "step": 3285 + }, + { + "epoch": 0.5, + "grad_norm": 4.365557311997831, + "learning_rate": 5.313660070896311e-06, + "loss": 0.4329, + "step": 3286 + }, + { + "epoch": 0.5, + "grad_norm": 4.263080487688134, + "learning_rate": 5.3112223061560865e-06, + "loss": 0.4472, + "step": 3287 + }, + { + "epoch": 0.5, + "grad_norm": 3.374427471569894, + "learning_rate": 5.30878446714586e-06, + "loss": 0.453, + "step": 3288 + }, + { + "epoch": 0.5, + "grad_norm": 5.508626132987856, + "learning_rate": 5.3063465544473965e-06, + "loss": 0.3698, + "step": 3289 + }, + { + "epoch": 0.5, + "grad_norm": 2.4049375358944123, + "learning_rate": 5.30390856864248e-06, + "loss": 0.4769, + "step": 3290 + }, + { + "epoch": 0.5, + "grad_norm": 3.712612801149028, + "learning_rate": 5.301470510312909e-06, + "loss": 0.4495, + "step": 3291 + }, + { + "epoch": 0.5, + "grad_norm": 4.097102779736724, + "learning_rate": 5.299032380040504e-06, + "loss": 0.4283, + "step": 3292 + }, + { + "epoch": 0.5, + "grad_norm": 5.75776519062649, + "learning_rate": 5.296594178407096e-06, + "loss": 0.4164, + "step": 3293 + }, + { + "epoch": 0.5, + "grad_norm": 9.08760899435055, + "learning_rate": 5.294155905994539e-06, + "loss": 0.4637, + "step": 3294 + }, + { + "epoch": 0.5, + "grad_norm": 2.766519620726101, + "learning_rate": 5.291717563384702e-06, + "loss": 0.4387, + "step": 3295 + }, + { + "epoch": 0.5, + "grad_norm": 4.0430251977363705, + "learning_rate": 5.289279151159467e-06, + "loss": 0.4561, + "step": 3296 + }, + { + "epoch": 0.5, + "grad_norm": 4.380963646955826, + "learning_rate": 5.286840669900742e-06, + "loss": 0.464, + "step": 3297 + }, + { + "epoch": 0.5, + "grad_norm": 5.4721056471701806, + "learning_rate": 5.284402120190439e-06, + "loss": 0.4235, + "step": 3298 + }, + { + "epoch": 0.5, + "grad_norm": 3.7011138622413067, + "learning_rate": 5.281963502610497e-06, + "loss": 0.4309, + "step": 3299 + }, + { + "epoch": 0.5, + "grad_norm": 3.9534246719248185, + "learning_rate": 5.2795248177428646e-06, + "loss": 0.4663, + "step": 3300 + }, + { + "epoch": 0.5, + "grad_norm": 4.759452463026602, + "learning_rate": 5.2770860661695115e-06, + "loss": 0.4636, + "step": 3301 + }, + { + "epoch": 0.5, + "grad_norm": 3.5500426992803087, + "learning_rate": 5.274647248472419e-06, + "loss": 0.5299, + "step": 3302 + }, + { + "epoch": 0.5, + "grad_norm": 2.5934580441323676, + "learning_rate": 5.272208365233585e-06, + "loss": 0.4794, + "step": 3303 + }, + { + "epoch": 0.5, + "grad_norm": 4.033740734255481, + "learning_rate": 5.269769417035026e-06, + "loss": 0.5054, + "step": 3304 + }, + { + "epoch": 0.5, + "grad_norm": 3.6379714799954668, + "learning_rate": 5.267330404458769e-06, + "loss": 0.3783, + "step": 3305 + }, + { + "epoch": 0.5, + "grad_norm": 3.5875397094240573, + "learning_rate": 5.264891328086862e-06, + "loss": 0.4506, + "step": 3306 + }, + { + "epoch": 0.5, + "grad_norm": 4.385035323002471, + "learning_rate": 5.262452188501365e-06, + "loss": 0.4453, + "step": 3307 + }, + { + "epoch": 0.5, + "grad_norm": 4.584980069027644, + "learning_rate": 5.260012986284354e-06, + "loss": 0.4003, + "step": 3308 + }, + { + "epoch": 0.5, + "grad_norm": 5.772230307358214, + "learning_rate": 5.257573722017917e-06, + "loss": 0.4062, + "step": 3309 + }, + { + "epoch": 0.5, + "grad_norm": 4.049980850076305, + "learning_rate": 5.2551343962841615e-06, + "loss": 0.4964, + "step": 3310 + }, + { + "epoch": 0.5, + "grad_norm": 4.5467979341422895, + "learning_rate": 5.252695009665206e-06, + "loss": 0.4641, + "step": 3311 + }, + { + "epoch": 0.5, + "grad_norm": 5.859719462182213, + "learning_rate": 5.250255562743188e-06, + "loss": 0.3876, + "step": 3312 + }, + { + "epoch": 0.5, + "grad_norm": 6.047033734465715, + "learning_rate": 5.247816056100255e-06, + "loss": 0.4758, + "step": 3313 + }, + { + "epoch": 0.5, + "grad_norm": 8.240647672560057, + "learning_rate": 5.245376490318566e-06, + "loss": 0.4309, + "step": 3314 + }, + { + "epoch": 0.5, + "grad_norm": 2.3777127749030087, + "learning_rate": 5.242936865980304e-06, + "loss": 0.4516, + "step": 3315 + }, + { + "epoch": 0.5, + "grad_norm": 3.6658855171079057, + "learning_rate": 5.240497183667657e-06, + "loss": 0.4586, + "step": 3316 + }, + { + "epoch": 0.5, + "grad_norm": 3.9630513085929864, + "learning_rate": 5.238057443962834e-06, + "loss": 0.4859, + "step": 3317 + }, + { + "epoch": 0.5, + "grad_norm": 3.5330773653344494, + "learning_rate": 5.235617647448048e-06, + "loss": 0.4297, + "step": 3318 + }, + { + "epoch": 0.5, + "grad_norm": 21.25727874186291, + "learning_rate": 5.233177794705536e-06, + "loss": 0.4126, + "step": 3319 + }, + { + "epoch": 0.5, + "grad_norm": 4.769918163210967, + "learning_rate": 5.230737886317541e-06, + "loss": 0.4863, + "step": 3320 + }, + { + "epoch": 0.5, + "grad_norm": 3.4990919755753653, + "learning_rate": 5.2282979228663236e-06, + "loss": 0.4412, + "step": 3321 + }, + { + "epoch": 0.5, + "grad_norm": 6.892564054651963, + "learning_rate": 5.225857904934155e-06, + "loss": 0.4076, + "step": 3322 + }, + { + "epoch": 0.5, + "grad_norm": 4.186331258285111, + "learning_rate": 5.2234178331033215e-06, + "loss": 0.4439, + "step": 3323 + }, + { + "epoch": 0.5, + "grad_norm": 4.139353996842082, + "learning_rate": 5.22097770795612e-06, + "loss": 0.5027, + "step": 3324 + }, + { + "epoch": 0.5, + "grad_norm": 4.530008368315678, + "learning_rate": 5.218537530074862e-06, + "loss": 0.4458, + "step": 3325 + }, + { + "epoch": 0.5, + "grad_norm": 2.962023616624714, + "learning_rate": 5.21609730004187e-06, + "loss": 0.4584, + "step": 3326 + }, + { + "epoch": 0.5, + "grad_norm": 3.3513286606755646, + "learning_rate": 5.213657018439482e-06, + "loss": 0.436, + "step": 3327 + }, + { + "epoch": 0.5, + "grad_norm": 3.4170556046407374, + "learning_rate": 5.211216685850042e-06, + "loss": 0.4536, + "step": 3328 + }, + { + "epoch": 0.5, + "grad_norm": 22.31515906934759, + "learning_rate": 5.208776302855915e-06, + "loss": 0.4659, + "step": 3329 + }, + { + "epoch": 0.5, + "grad_norm": 20.357757128154105, + "learning_rate": 5.20633587003947e-06, + "loss": 0.4537, + "step": 3330 + }, + { + "epoch": 0.5, + "grad_norm": 3.8639261122811344, + "learning_rate": 5.203895387983093e-06, + "loss": 0.4983, + "step": 3331 + }, + { + "epoch": 0.5, + "grad_norm": 3.0484410183988553, + "learning_rate": 5.2014548572691796e-06, + "loss": 0.4693, + "step": 3332 + }, + { + "epoch": 0.5, + "grad_norm": 3.036665199129306, + "learning_rate": 5.199014278480135e-06, + "loss": 0.3851, + "step": 3333 + }, + { + "epoch": 0.5, + "grad_norm": 3.1032911895418964, + "learning_rate": 5.19657365219838e-06, + "loss": 0.4369, + "step": 3334 + }, + { + "epoch": 0.5, + "grad_norm": 4.020590575955939, + "learning_rate": 5.194132979006346e-06, + "loss": 0.4796, + "step": 3335 + }, + { + "epoch": 0.5, + "grad_norm": 5.8025778690321035, + "learning_rate": 5.191692259486475e-06, + "loss": 0.5066, + "step": 3336 + }, + { + "epoch": 0.5, + "grad_norm": 4.1991070049848505, + "learning_rate": 5.189251494221218e-06, + "loss": 0.4471, + "step": 3337 + }, + { + "epoch": 0.5, + "grad_norm": 7.210517795523312, + "learning_rate": 5.186810683793038e-06, + "loss": 0.4184, + "step": 3338 + }, + { + "epoch": 0.5, + "grad_norm": 3.433575860469508, + "learning_rate": 5.184369828784409e-06, + "loss": 0.4968, + "step": 3339 + }, + { + "epoch": 0.5, + "grad_norm": 7.297485175785062, + "learning_rate": 5.181928929777817e-06, + "loss": 0.3983, + "step": 3340 + }, + { + "epoch": 0.5, + "grad_norm": 12.269897665984946, + "learning_rate": 5.1794879873557605e-06, + "loss": 0.4311, + "step": 3341 + }, + { + "epoch": 0.5, + "grad_norm": 3.592431741408581, + "learning_rate": 5.177047002100739e-06, + "loss": 0.5051, + "step": 3342 + }, + { + "epoch": 0.5, + "grad_norm": 5.899059396345337, + "learning_rate": 5.174605974595274e-06, + "loss": 0.4218, + "step": 3343 + }, + { + "epoch": 0.5, + "grad_norm": 3.497862822273135, + "learning_rate": 5.172164905421888e-06, + "loss": 0.4616, + "step": 3344 + }, + { + "epoch": 0.5, + "grad_norm": 5.662840702861832, + "learning_rate": 5.169723795163119e-06, + "loss": 0.4173, + "step": 3345 + }, + { + "epoch": 0.5, + "grad_norm": 3.9742356443026807, + "learning_rate": 5.167282644401512e-06, + "loss": 0.3888, + "step": 3346 + }, + { + "epoch": 0.5, + "grad_norm": 2.6869484046200975, + "learning_rate": 5.164841453719623e-06, + "loss": 0.4747, + "step": 3347 + }, + { + "epoch": 0.5, + "grad_norm": 14.391452105445113, + "learning_rate": 5.162400223700017e-06, + "loss": 0.473, + "step": 3348 + }, + { + "epoch": 0.51, + "grad_norm": 4.7068099547723525, + "learning_rate": 5.159958954925267e-06, + "loss": 0.4785, + "step": 3349 + }, + { + "epoch": 0.51, + "grad_norm": 3.1050363365132787, + "learning_rate": 5.157517647977958e-06, + "loss": 0.5281, + "step": 3350 + }, + { + "epoch": 0.51, + "grad_norm": 3.330353150023166, + "learning_rate": 5.155076303440686e-06, + "loss": 0.4592, + "step": 3351 + }, + { + "epoch": 0.51, + "grad_norm": 5.705451851741254, + "learning_rate": 5.152634921896046e-06, + "loss": 0.4859, + "step": 3352 + }, + { + "epoch": 0.51, + "grad_norm": 3.0708551705973224, + "learning_rate": 5.1501935039266524e-06, + "loss": 0.4347, + "step": 3353 + }, + { + "epoch": 0.51, + "grad_norm": 3.6008086476195897, + "learning_rate": 5.147752050115127e-06, + "loss": 0.4852, + "step": 3354 + }, + { + "epoch": 0.51, + "grad_norm": 3.35196073413082, + "learning_rate": 5.145310561044091e-06, + "loss": 0.4481, + "step": 3355 + }, + { + "epoch": 0.51, + "grad_norm": 2.6751035264760574, + "learning_rate": 5.142869037296188e-06, + "loss": 0.4832, + "step": 3356 + }, + { + "epoch": 0.51, + "grad_norm": 4.90243119613261, + "learning_rate": 5.1404274794540565e-06, + "loss": 0.398, + "step": 3357 + }, + { + "epoch": 0.51, + "grad_norm": 4.796291533480356, + "learning_rate": 5.137985888100352e-06, + "loss": 0.4277, + "step": 3358 + }, + { + "epoch": 0.51, + "grad_norm": 3.791838010266537, + "learning_rate": 5.135544263817735e-06, + "loss": 0.4246, + "step": 3359 + }, + { + "epoch": 0.51, + "grad_norm": 4.547302138630425, + "learning_rate": 5.133102607188875e-06, + "loss": 0.4722, + "step": 3360 + }, + { + "epoch": 0.51, + "grad_norm": 6.452572250905354, + "learning_rate": 5.130660918796446e-06, + "loss": 0.4756, + "step": 3361 + }, + { + "epoch": 0.51, + "grad_norm": 3.576033157228893, + "learning_rate": 5.128219199223132e-06, + "loss": 0.473, + "step": 3362 + }, + { + "epoch": 0.51, + "grad_norm": 3.1144084907418708, + "learning_rate": 5.125777449051627e-06, + "loss": 0.4815, + "step": 3363 + }, + { + "epoch": 0.51, + "grad_norm": 1.240967014170178, + "learning_rate": 5.123335668864627e-06, + "loss": 0.5644, + "step": 3364 + }, + { + "epoch": 0.51, + "grad_norm": 2.890279233273213, + "learning_rate": 5.120893859244838e-06, + "loss": 0.4148, + "step": 3365 + }, + { + "epoch": 0.51, + "grad_norm": 6.346384067330006, + "learning_rate": 5.1184520207749735e-06, + "loss": 0.445, + "step": 3366 + }, + { + "epoch": 0.51, + "grad_norm": 2.867170831572803, + "learning_rate": 5.116010154037753e-06, + "loss": 0.3849, + "step": 3367 + }, + { + "epoch": 0.51, + "grad_norm": 3.337960371320644, + "learning_rate": 5.113568259615901e-06, + "loss": 0.4872, + "step": 3368 + }, + { + "epoch": 0.51, + "grad_norm": 4.906170119967276, + "learning_rate": 5.111126338092153e-06, + "loss": 0.4266, + "step": 3369 + }, + { + "epoch": 0.51, + "grad_norm": 6.5116594669574335, + "learning_rate": 5.1086843900492476e-06, + "loss": 0.4774, + "step": 3370 + }, + { + "epoch": 0.51, + "grad_norm": 3.5027918612460565, + "learning_rate": 5.106242416069932e-06, + "loss": 0.4677, + "step": 3371 + }, + { + "epoch": 0.51, + "grad_norm": 7.938188817263368, + "learning_rate": 5.103800416736957e-06, + "loss": 0.437, + "step": 3372 + }, + { + "epoch": 0.51, + "grad_norm": 4.0124811809810375, + "learning_rate": 5.101358392633078e-06, + "loss": 0.4875, + "step": 3373 + }, + { + "epoch": 0.51, + "grad_norm": 7.506166327783403, + "learning_rate": 5.098916344341062e-06, + "loss": 0.4918, + "step": 3374 + }, + { + "epoch": 0.51, + "grad_norm": 1.2820857996999695, + "learning_rate": 5.096474272443678e-06, + "loss": 0.59, + "step": 3375 + }, + { + "epoch": 0.51, + "grad_norm": 5.173620473422902, + "learning_rate": 5.094032177523703e-06, + "loss": 0.451, + "step": 3376 + }, + { + "epoch": 0.51, + "grad_norm": 4.6933139236618375, + "learning_rate": 5.091590060163916e-06, + "loss": 0.4567, + "step": 3377 + }, + { + "epoch": 0.51, + "grad_norm": 5.172780238157196, + "learning_rate": 5.089147920947101e-06, + "loss": 0.4621, + "step": 3378 + }, + { + "epoch": 0.51, + "grad_norm": 4.907892624629719, + "learning_rate": 5.086705760456053e-06, + "loss": 0.502, + "step": 3379 + }, + { + "epoch": 0.51, + "grad_norm": 5.487782358215405, + "learning_rate": 5.084263579273566e-06, + "loss": 0.532, + "step": 3380 + }, + { + "epoch": 0.51, + "grad_norm": 10.051137053257438, + "learning_rate": 5.081821377982446e-06, + "loss": 0.399, + "step": 3381 + }, + { + "epoch": 0.51, + "grad_norm": 4.859558571389233, + "learning_rate": 5.079379157165493e-06, + "loss": 0.4624, + "step": 3382 + }, + { + "epoch": 0.51, + "grad_norm": 3.5097424993680546, + "learning_rate": 5.076936917405522e-06, + "loss": 0.4135, + "step": 3383 + }, + { + "epoch": 0.51, + "grad_norm": 3.059271148389168, + "learning_rate": 5.074494659285346e-06, + "loss": 0.4256, + "step": 3384 + }, + { + "epoch": 0.51, + "grad_norm": 6.08453940537398, + "learning_rate": 5.072052383387787e-06, + "loss": 0.4586, + "step": 3385 + }, + { + "epoch": 0.51, + "grad_norm": 2.7070257113029568, + "learning_rate": 5.069610090295667e-06, + "loss": 0.3907, + "step": 3386 + }, + { + "epoch": 0.51, + "grad_norm": 3.5096620019079543, + "learning_rate": 5.067167780591815e-06, + "loss": 0.4839, + "step": 3387 + }, + { + "epoch": 0.51, + "grad_norm": 5.309144822229189, + "learning_rate": 5.064725454859064e-06, + "loss": 0.5119, + "step": 3388 + }, + { + "epoch": 0.51, + "grad_norm": 4.240076456281339, + "learning_rate": 5.062283113680249e-06, + "loss": 0.3928, + "step": 3389 + }, + { + "epoch": 0.51, + "grad_norm": 3.413247312515577, + "learning_rate": 5.05984075763821e-06, + "loss": 0.486, + "step": 3390 + }, + { + "epoch": 0.51, + "grad_norm": 3.784550943300344, + "learning_rate": 5.0573983873157885e-06, + "loss": 0.4729, + "step": 3391 + }, + { + "epoch": 0.51, + "grad_norm": 5.358901926133858, + "learning_rate": 5.0549560032958335e-06, + "loss": 0.4562, + "step": 3392 + }, + { + "epoch": 0.51, + "grad_norm": 2.7249252171028173, + "learning_rate": 5.052513606161192e-06, + "loss": 0.4634, + "step": 3393 + }, + { + "epoch": 0.51, + "grad_norm": 3.6424462382218405, + "learning_rate": 5.05007119649472e-06, + "loss": 0.4088, + "step": 3394 + }, + { + "epoch": 0.51, + "grad_norm": 2.8784175764397717, + "learning_rate": 5.047628774879273e-06, + "loss": 0.4778, + "step": 3395 + }, + { + "epoch": 0.51, + "grad_norm": 4.221259237159651, + "learning_rate": 5.0451863418977074e-06, + "loss": 0.4523, + "step": 3396 + }, + { + "epoch": 0.51, + "grad_norm": 3.4938120375126993, + "learning_rate": 5.042743898132886e-06, + "loss": 0.4433, + "step": 3397 + }, + { + "epoch": 0.51, + "grad_norm": 3.752674577893305, + "learning_rate": 5.040301444167672e-06, + "loss": 0.5248, + "step": 3398 + }, + { + "epoch": 0.51, + "grad_norm": 2.7552701164392324, + "learning_rate": 5.037858980584934e-06, + "loss": 0.5008, + "step": 3399 + }, + { + "epoch": 0.51, + "grad_norm": 3.733078586853428, + "learning_rate": 5.035416507967541e-06, + "loss": 0.5384, + "step": 3400 + }, + { + "epoch": 0.51, + "grad_norm": 1.167652804430579, + "learning_rate": 5.032974026898363e-06, + "loss": 0.6161, + "step": 3401 + }, + { + "epoch": 0.51, + "grad_norm": 3.3638938074112876, + "learning_rate": 5.030531537960272e-06, + "loss": 0.4794, + "step": 3402 + }, + { + "epoch": 0.51, + "grad_norm": 3.5496377118684292, + "learning_rate": 5.028089041736142e-06, + "loss": 0.5091, + "step": 3403 + }, + { + "epoch": 0.51, + "grad_norm": 3.008969015618689, + "learning_rate": 5.025646538808852e-06, + "loss": 0.4277, + "step": 3404 + }, + { + "epoch": 0.51, + "grad_norm": 3.5026658681671448, + "learning_rate": 5.023204029761279e-06, + "loss": 0.4501, + "step": 3405 + }, + { + "epoch": 0.51, + "grad_norm": 3.1698791553949257, + "learning_rate": 5.020761515176304e-06, + "loss": 0.4124, + "step": 3406 + }, + { + "epoch": 0.51, + "grad_norm": 4.139961506890189, + "learning_rate": 5.018318995636807e-06, + "loss": 0.422, + "step": 3407 + }, + { + "epoch": 0.51, + "grad_norm": 3.5418882889432277, + "learning_rate": 5.015876471725669e-06, + "loss": 0.4943, + "step": 3408 + }, + { + "epoch": 0.51, + "grad_norm": 3.97062695454358, + "learning_rate": 5.013433944025775e-06, + "loss": 0.4516, + "step": 3409 + }, + { + "epoch": 0.51, + "grad_norm": 4.58842963321321, + "learning_rate": 5.0109914131200085e-06, + "loss": 0.4406, + "step": 3410 + }, + { + "epoch": 0.51, + "grad_norm": 5.241393409859594, + "learning_rate": 5.008548879591255e-06, + "loss": 0.527, + "step": 3411 + }, + { + "epoch": 0.51, + "grad_norm": 4.624821053524552, + "learning_rate": 5.006106344022399e-06, + "loss": 0.4587, + "step": 3412 + }, + { + "epoch": 0.51, + "grad_norm": 2.8728721755382445, + "learning_rate": 5.003663806996326e-06, + "loss": 0.4271, + "step": 3413 + }, + { + "epoch": 0.51, + "grad_norm": 3.3522566835872807, + "learning_rate": 5.001221269095923e-06, + "loss": 0.4548, + "step": 3414 + }, + { + "epoch": 0.52, + "grad_norm": 3.482307662634816, + "learning_rate": 4.998778730904077e-06, + "loss": 0.477, + "step": 3415 + }, + { + "epoch": 0.52, + "grad_norm": 3.170252465197291, + "learning_rate": 4.9963361930036755e-06, + "loss": 0.4839, + "step": 3416 + }, + { + "epoch": 0.52, + "grad_norm": 2.7121381836810534, + "learning_rate": 4.993893655977603e-06, + "loss": 0.4861, + "step": 3417 + }, + { + "epoch": 0.52, + "grad_norm": 1.2954829777667467, + "learning_rate": 4.991451120408747e-06, + "loss": 0.5517, + "step": 3418 + }, + { + "epoch": 0.52, + "grad_norm": 3.957209305995384, + "learning_rate": 4.989008586879993e-06, + "loss": 0.4631, + "step": 3419 + }, + { + "epoch": 0.52, + "grad_norm": 2.286742550451276, + "learning_rate": 4.986566055974225e-06, + "loss": 0.4693, + "step": 3420 + }, + { + "epoch": 0.52, + "grad_norm": 3.191885250935636, + "learning_rate": 4.984123528274332e-06, + "loss": 0.4561, + "step": 3421 + }, + { + "epoch": 0.52, + "grad_norm": 2.6663191754587747, + "learning_rate": 4.981681004363195e-06, + "loss": 0.4831, + "step": 3422 + }, + { + "epoch": 0.52, + "grad_norm": 2.3930109168328557, + "learning_rate": 4.979238484823697e-06, + "loss": 0.4218, + "step": 3423 + }, + { + "epoch": 0.52, + "grad_norm": 3.283414588613568, + "learning_rate": 4.976795970238723e-06, + "loss": 0.4411, + "step": 3424 + }, + { + "epoch": 0.52, + "grad_norm": 2.1801747095590076, + "learning_rate": 4.974353461191149e-06, + "loss": 0.3749, + "step": 3425 + }, + { + "epoch": 0.52, + "grad_norm": 3.62708561780288, + "learning_rate": 4.97191095826386e-06, + "loss": 0.3877, + "step": 3426 + }, + { + "epoch": 0.52, + "grad_norm": 2.3634553758177157, + "learning_rate": 4.9694684620397315e-06, + "loss": 0.3894, + "step": 3427 + }, + { + "epoch": 0.52, + "grad_norm": 4.557484764878095, + "learning_rate": 4.967025973101638e-06, + "loss": 0.4785, + "step": 3428 + }, + { + "epoch": 0.52, + "grad_norm": 3.365239611851695, + "learning_rate": 4.96458349203246e-06, + "loss": 0.485, + "step": 3429 + }, + { + "epoch": 0.52, + "grad_norm": 3.050010411987575, + "learning_rate": 4.962141019415066e-06, + "loss": 0.4137, + "step": 3430 + }, + { + "epoch": 0.52, + "grad_norm": 7.3825976652597225, + "learning_rate": 4.9596985558323285e-06, + "loss": 0.4809, + "step": 3431 + }, + { + "epoch": 0.52, + "grad_norm": 2.3565590900118, + "learning_rate": 4.957256101867117e-06, + "loss": 0.4765, + "step": 3432 + }, + { + "epoch": 0.52, + "grad_norm": 1.2770063943017262, + "learning_rate": 4.954813658102294e-06, + "loss": 0.5567, + "step": 3433 + }, + { + "epoch": 0.52, + "grad_norm": 7.935390010925133, + "learning_rate": 4.95237122512073e-06, + "loss": 0.4746, + "step": 3434 + }, + { + "epoch": 0.52, + "grad_norm": 3.3716757944283295, + "learning_rate": 4.949928803505281e-06, + "loss": 0.4408, + "step": 3435 + }, + { + "epoch": 0.52, + "grad_norm": 2.8882367873943666, + "learning_rate": 4.94748639383881e-06, + "loss": 0.4884, + "step": 3436 + }, + { + "epoch": 0.52, + "grad_norm": 3.3926998299187745, + "learning_rate": 4.94504399670417e-06, + "loss": 0.4838, + "step": 3437 + }, + { + "epoch": 0.52, + "grad_norm": 4.577280721413775, + "learning_rate": 4.942601612684212e-06, + "loss": 0.4813, + "step": 3438 + }, + { + "epoch": 0.52, + "grad_norm": 3.6345878639724227, + "learning_rate": 4.940159242361792e-06, + "loss": 0.4442, + "step": 3439 + }, + { + "epoch": 0.52, + "grad_norm": 3.8389289827307134, + "learning_rate": 4.937716886319752e-06, + "loss": 0.4319, + "step": 3440 + }, + { + "epoch": 0.52, + "grad_norm": 7.153677529000984, + "learning_rate": 4.935274545140938e-06, + "loss": 0.5238, + "step": 3441 + }, + { + "epoch": 0.52, + "grad_norm": 3.4125762357007385, + "learning_rate": 4.932832219408187e-06, + "loss": 0.448, + "step": 3442 + }, + { + "epoch": 0.52, + "grad_norm": 3.0153865510135063, + "learning_rate": 4.930389909704334e-06, + "loss": 0.4648, + "step": 3443 + }, + { + "epoch": 0.52, + "grad_norm": 6.4607241572899525, + "learning_rate": 4.927947616612216e-06, + "loss": 0.4211, + "step": 3444 + }, + { + "epoch": 0.52, + "grad_norm": 5.608041559920161, + "learning_rate": 4.925505340714654e-06, + "loss": 0.4555, + "step": 3445 + }, + { + "epoch": 0.52, + "grad_norm": 2.629418764208967, + "learning_rate": 4.92306308259448e-06, + "loss": 0.395, + "step": 3446 + }, + { + "epoch": 0.52, + "grad_norm": 2.71131896726482, + "learning_rate": 4.920620842834509e-06, + "loss": 0.4477, + "step": 3447 + }, + { + "epoch": 0.52, + "grad_norm": 1.381204380253442, + "learning_rate": 4.918178622017557e-06, + "loss": 0.5077, + "step": 3448 + }, + { + "epoch": 0.52, + "grad_norm": 3.0009703084447716, + "learning_rate": 4.915736420726435e-06, + "loss": 0.4198, + "step": 3449 + }, + { + "epoch": 0.52, + "grad_norm": 3.1031666761428283, + "learning_rate": 4.913294239543948e-06, + "loss": 0.484, + "step": 3450 + }, + { + "epoch": 0.52, + "grad_norm": 4.641347489466055, + "learning_rate": 4.9108520790529e-06, + "loss": 0.501, + "step": 3451 + }, + { + "epoch": 0.52, + "grad_norm": 3.4244278328985156, + "learning_rate": 4.908409939836088e-06, + "loss": 0.4523, + "step": 3452 + }, + { + "epoch": 0.52, + "grad_norm": 7.546748770485189, + "learning_rate": 4.905967822476298e-06, + "loss": 0.4223, + "step": 3453 + }, + { + "epoch": 0.52, + "grad_norm": 3.35592321881806, + "learning_rate": 4.903525727556323e-06, + "loss": 0.4751, + "step": 3454 + }, + { + "epoch": 0.52, + "grad_norm": 3.3165390379995285, + "learning_rate": 4.901083655658938e-06, + "loss": 0.4951, + "step": 3455 + }, + { + "epoch": 0.52, + "grad_norm": 3.0046155185822014, + "learning_rate": 4.8986416073669235e-06, + "loss": 0.4488, + "step": 3456 + }, + { + "epoch": 0.52, + "grad_norm": 3.5446113442596725, + "learning_rate": 4.896199583263046e-06, + "loss": 0.4214, + "step": 3457 + }, + { + "epoch": 0.52, + "grad_norm": 14.525826216912545, + "learning_rate": 4.89375758393007e-06, + "loss": 0.5256, + "step": 3458 + }, + { + "epoch": 0.52, + "grad_norm": 1.2675446046835244, + "learning_rate": 4.891315609950753e-06, + "loss": 0.6005, + "step": 3459 + }, + { + "epoch": 0.52, + "grad_norm": 4.834310777039866, + "learning_rate": 4.888873661907847e-06, + "loss": 0.3408, + "step": 3460 + }, + { + "epoch": 0.52, + "grad_norm": 3.8742551774174125, + "learning_rate": 4.8864317403841e-06, + "loss": 0.4435, + "step": 3461 + }, + { + "epoch": 0.52, + "grad_norm": 3.219946019862945, + "learning_rate": 4.88398984596225e-06, + "loss": 0.4313, + "step": 3462 + }, + { + "epoch": 0.52, + "grad_norm": 3.6458496770574573, + "learning_rate": 4.881547979225027e-06, + "loss": 0.434, + "step": 3463 + }, + { + "epoch": 0.52, + "grad_norm": 3.833995588738102, + "learning_rate": 4.879106140755164e-06, + "loss": 0.542, + "step": 3464 + }, + { + "epoch": 0.52, + "grad_norm": 5.028555726158057, + "learning_rate": 4.876664331135374e-06, + "loss": 0.4186, + "step": 3465 + }, + { + "epoch": 0.52, + "grad_norm": 3.3753813061826565, + "learning_rate": 4.874222550948375e-06, + "loss": 0.3891, + "step": 3466 + }, + { + "epoch": 0.52, + "grad_norm": 3.3051854712314426, + "learning_rate": 4.871780800776869e-06, + "loss": 0.4434, + "step": 3467 + }, + { + "epoch": 0.52, + "grad_norm": 4.725181173749494, + "learning_rate": 4.8693390812035555e-06, + "loss": 0.4406, + "step": 3468 + }, + { + "epoch": 0.52, + "grad_norm": 8.357822365901962, + "learning_rate": 4.866897392811127e-06, + "loss": 0.4371, + "step": 3469 + }, + { + "epoch": 0.52, + "grad_norm": 2.6877054952492574, + "learning_rate": 4.8644557361822655e-06, + "loss": 0.4448, + "step": 3470 + }, + { + "epoch": 0.52, + "grad_norm": 4.7100074004978705, + "learning_rate": 4.862014111899649e-06, + "loss": 0.4414, + "step": 3471 + }, + { + "epoch": 0.52, + "grad_norm": 3.1444855497419875, + "learning_rate": 4.859572520545946e-06, + "loss": 0.4122, + "step": 3472 + }, + { + "epoch": 0.52, + "grad_norm": 2.699209978669724, + "learning_rate": 4.857130962703814e-06, + "loss": 0.4115, + "step": 3473 + }, + { + "epoch": 0.52, + "grad_norm": 4.167948988219005, + "learning_rate": 4.85468943895591e-06, + "loss": 0.5134, + "step": 3474 + }, + { + "epoch": 0.52, + "grad_norm": 5.054866792012111, + "learning_rate": 4.852247949884875e-06, + "loss": 0.4394, + "step": 3475 + }, + { + "epoch": 0.52, + "grad_norm": 2.760939822803174, + "learning_rate": 4.849806496073348e-06, + "loss": 0.3993, + "step": 3476 + }, + { + "epoch": 0.52, + "grad_norm": 3.7559279808200188, + "learning_rate": 4.8473650781039565e-06, + "loss": 0.5151, + "step": 3477 + }, + { + "epoch": 0.52, + "grad_norm": 6.8757257256668805, + "learning_rate": 4.844923696559316e-06, + "loss": 0.5334, + "step": 3478 + }, + { + "epoch": 0.52, + "grad_norm": 3.0021347551935254, + "learning_rate": 4.8424823520220425e-06, + "loss": 0.4195, + "step": 3479 + }, + { + "epoch": 0.52, + "grad_norm": 3.4106709025296054, + "learning_rate": 4.840041045074733e-06, + "loss": 0.5534, + "step": 3480 + }, + { + "epoch": 0.53, + "grad_norm": 3.992383492305769, + "learning_rate": 4.8375997762999844e-06, + "loss": 0.4749, + "step": 3481 + }, + { + "epoch": 0.53, + "grad_norm": 1.5171999896043895, + "learning_rate": 4.835158546280379e-06, + "loss": 0.5981, + "step": 3482 + }, + { + "epoch": 0.53, + "grad_norm": 3.875873049237427, + "learning_rate": 4.83271735559849e-06, + "loss": 0.5574, + "step": 3483 + }, + { + "epoch": 0.53, + "grad_norm": 5.236480791126966, + "learning_rate": 4.830276204836884e-06, + "loss": 0.4705, + "step": 3484 + }, + { + "epoch": 0.53, + "grad_norm": 3.2456375566861184, + "learning_rate": 4.827835094578113e-06, + "loss": 0.4734, + "step": 3485 + }, + { + "epoch": 0.53, + "grad_norm": 4.903724623353245, + "learning_rate": 4.825394025404728e-06, + "loss": 0.5015, + "step": 3486 + }, + { + "epoch": 0.53, + "grad_norm": 4.153910391826314, + "learning_rate": 4.822952997899263e-06, + "loss": 0.4516, + "step": 3487 + }, + { + "epoch": 0.53, + "grad_norm": 2.7390976593294263, + "learning_rate": 4.820512012644242e-06, + "loss": 0.4871, + "step": 3488 + }, + { + "epoch": 0.53, + "grad_norm": 4.668034220044751, + "learning_rate": 4.8180710702221835e-06, + "loss": 0.4366, + "step": 3489 + }, + { + "epoch": 0.53, + "grad_norm": 3.389067934098476, + "learning_rate": 4.8156301712155915e-06, + "loss": 0.4785, + "step": 3490 + }, + { + "epoch": 0.53, + "grad_norm": 3.793780583605442, + "learning_rate": 4.813189316206965e-06, + "loss": 0.4415, + "step": 3491 + }, + { + "epoch": 0.53, + "grad_norm": 2.904876710735061, + "learning_rate": 4.8107485057787846e-06, + "loss": 0.4349, + "step": 3492 + }, + { + "epoch": 0.53, + "grad_norm": 4.150573629415088, + "learning_rate": 4.808307740513526e-06, + "loss": 0.4682, + "step": 3493 + }, + { + "epoch": 0.53, + "grad_norm": 5.073998108765457, + "learning_rate": 4.8058670209936554e-06, + "loss": 0.4255, + "step": 3494 + }, + { + "epoch": 0.53, + "grad_norm": 2.980940730043345, + "learning_rate": 4.80342634780162e-06, + "loss": 0.4555, + "step": 3495 + }, + { + "epoch": 0.53, + "grad_norm": 4.513436261901973, + "learning_rate": 4.800985721519866e-06, + "loss": 0.4799, + "step": 3496 + }, + { + "epoch": 0.53, + "grad_norm": 3.044406585000728, + "learning_rate": 4.798545142730824e-06, + "loss": 0.4575, + "step": 3497 + }, + { + "epoch": 0.53, + "grad_norm": 23.838611026002955, + "learning_rate": 4.7961046120169075e-06, + "loss": 0.5083, + "step": 3498 + }, + { + "epoch": 0.53, + "grad_norm": 4.464785884601695, + "learning_rate": 4.7936641299605304e-06, + "loss": 0.474, + "step": 3499 + }, + { + "epoch": 0.53, + "grad_norm": 18.869656414745627, + "learning_rate": 4.791223697144085e-06, + "loss": 0.4329, + "step": 3500 + }, + { + "epoch": 0.53, + "grad_norm": 4.012713917989065, + "learning_rate": 4.788783314149959e-06, + "loss": 0.4908, + "step": 3501 + }, + { + "epoch": 0.53, + "grad_norm": 8.153512107442264, + "learning_rate": 4.786342981560521e-06, + "loss": 0.4977, + "step": 3502 + }, + { + "epoch": 0.53, + "grad_norm": 3.136883813680489, + "learning_rate": 4.78390269995813e-06, + "loss": 0.4626, + "step": 3503 + }, + { + "epoch": 0.53, + "grad_norm": 3.828426853825977, + "learning_rate": 4.7814624699251395e-06, + "loss": 0.5404, + "step": 3504 + }, + { + "epoch": 0.53, + "grad_norm": 4.287874335497635, + "learning_rate": 4.779022292043881e-06, + "loss": 0.471, + "step": 3505 + }, + { + "epoch": 0.53, + "grad_norm": 7.841635754919218, + "learning_rate": 4.77658216689668e-06, + "loss": 0.4742, + "step": 3506 + }, + { + "epoch": 0.53, + "grad_norm": 3.390665406607505, + "learning_rate": 4.774142095065847e-06, + "loss": 0.5069, + "step": 3507 + }, + { + "epoch": 0.53, + "grad_norm": 3.2450440516247485, + "learning_rate": 4.771702077133677e-06, + "loss": 0.4492, + "step": 3508 + }, + { + "epoch": 0.53, + "grad_norm": 4.482684831989166, + "learning_rate": 4.76926211368246e-06, + "loss": 0.4567, + "step": 3509 + }, + { + "epoch": 0.53, + "grad_norm": 3.9931534306242527, + "learning_rate": 4.766822205294465e-06, + "loss": 0.5023, + "step": 3510 + }, + { + "epoch": 0.53, + "grad_norm": 3.0534445668670958, + "learning_rate": 4.764382352551954e-06, + "loss": 0.453, + "step": 3511 + }, + { + "epoch": 0.53, + "grad_norm": 5.19558116585608, + "learning_rate": 4.761942556037169e-06, + "loss": 0.4593, + "step": 3512 + }, + { + "epoch": 0.53, + "grad_norm": 3.5384557717650473, + "learning_rate": 4.759502816332343e-06, + "loss": 0.4345, + "step": 3513 + }, + { + "epoch": 0.53, + "grad_norm": 1.215280363167575, + "learning_rate": 4.757063134019697e-06, + "loss": 0.5998, + "step": 3514 + }, + { + "epoch": 0.53, + "grad_norm": 2.44681201668645, + "learning_rate": 4.754623509681434e-06, + "loss": 0.4065, + "step": 3515 + }, + { + "epoch": 0.53, + "grad_norm": 3.620920828336688, + "learning_rate": 4.752183943899747e-06, + "loss": 0.4442, + "step": 3516 + }, + { + "epoch": 0.53, + "grad_norm": 3.985622264758341, + "learning_rate": 4.749744437256813e-06, + "loss": 0.4817, + "step": 3517 + }, + { + "epoch": 0.53, + "grad_norm": 3.0293660522348484, + "learning_rate": 4.747304990334794e-06, + "loss": 0.4616, + "step": 3518 + }, + { + "epoch": 0.53, + "grad_norm": 3.6476129275878977, + "learning_rate": 4.74486560371584e-06, + "loss": 0.4505, + "step": 3519 + }, + { + "epoch": 0.53, + "grad_norm": 2.9244168520030143, + "learning_rate": 4.742426277982083e-06, + "loss": 0.4416, + "step": 3520 + }, + { + "epoch": 0.53, + "grad_norm": 1.0611069597390574, + "learning_rate": 4.739987013715648e-06, + "loss": 0.5044, + "step": 3521 + }, + { + "epoch": 0.53, + "grad_norm": 5.316638499942588, + "learning_rate": 4.737547811498636e-06, + "loss": 0.4032, + "step": 3522 + }, + { + "epoch": 0.53, + "grad_norm": 3.8644422214268297, + "learning_rate": 4.735108671913138e-06, + "loss": 0.4901, + "step": 3523 + }, + { + "epoch": 0.53, + "grad_norm": 5.390755803162155, + "learning_rate": 4.732669595541232e-06, + "loss": 0.4043, + "step": 3524 + }, + { + "epoch": 0.53, + "grad_norm": 3.588097898209697, + "learning_rate": 4.730230582964975e-06, + "loss": 0.44, + "step": 3525 + }, + { + "epoch": 0.53, + "grad_norm": 5.852318814887911, + "learning_rate": 4.727791634766416e-06, + "loss": 0.4509, + "step": 3526 + }, + { + "epoch": 0.53, + "grad_norm": 2.402263480213148, + "learning_rate": 4.725352751527583e-06, + "loss": 0.4865, + "step": 3527 + }, + { + "epoch": 0.53, + "grad_norm": 3.466337588194331, + "learning_rate": 4.72291393383049e-06, + "loss": 0.4737, + "step": 3528 + }, + { + "epoch": 0.53, + "grad_norm": 3.140726653614173, + "learning_rate": 4.720475182257137e-06, + "loss": 0.4926, + "step": 3529 + }, + { + "epoch": 0.53, + "grad_norm": 5.453446035964061, + "learning_rate": 4.718036497389506e-06, + "loss": 0.4336, + "step": 3530 + }, + { + "epoch": 0.53, + "grad_norm": 3.208028076463462, + "learning_rate": 4.715597879809562e-06, + "loss": 0.4315, + "step": 3531 + }, + { + "epoch": 0.53, + "grad_norm": 1.2282949751536858, + "learning_rate": 4.713159330099261e-06, + "loss": 0.5179, + "step": 3532 + }, + { + "epoch": 0.53, + "grad_norm": 3.6535437494079477, + "learning_rate": 4.710720848840532e-06, + "loss": 0.4213, + "step": 3533 + }, + { + "epoch": 0.53, + "grad_norm": 3.4255539553471235, + "learning_rate": 4.7082824366153e-06, + "loss": 0.4841, + "step": 3534 + }, + { + "epoch": 0.53, + "grad_norm": 3.104363131189795, + "learning_rate": 4.7058440940054625e-06, + "loss": 0.4456, + "step": 3535 + }, + { + "epoch": 0.53, + "grad_norm": 3.3264529913533143, + "learning_rate": 4.703405821592905e-06, + "loss": 0.4624, + "step": 3536 + }, + { + "epoch": 0.53, + "grad_norm": 3.263774946068872, + "learning_rate": 4.7009676199594985e-06, + "loss": 0.4387, + "step": 3537 + }, + { + "epoch": 0.53, + "grad_norm": 3.2520655007153954, + "learning_rate": 4.69852948968709e-06, + "loss": 0.4522, + "step": 3538 + }, + { + "epoch": 0.53, + "grad_norm": 3.720797670736403, + "learning_rate": 4.6960914313575205e-06, + "loss": 0.4249, + "step": 3539 + }, + { + "epoch": 0.53, + "grad_norm": 3.2293799596105424, + "learning_rate": 4.693653445552605e-06, + "loss": 0.4194, + "step": 3540 + }, + { + "epoch": 0.53, + "grad_norm": 3.284295098013576, + "learning_rate": 4.691215532854141e-06, + "loss": 0.3881, + "step": 3541 + }, + { + "epoch": 0.53, + "grad_norm": 3.8329973408280282, + "learning_rate": 4.688777693843915e-06, + "loss": 0.4622, + "step": 3542 + }, + { + "epoch": 0.53, + "grad_norm": 2.5504158785385314, + "learning_rate": 4.68633992910369e-06, + "loss": 0.4835, + "step": 3543 + }, + { + "epoch": 0.53, + "grad_norm": 2.991906721170582, + "learning_rate": 4.683902239215215e-06, + "loss": 0.5291, + "step": 3544 + }, + { + "epoch": 0.53, + "grad_norm": 5.125463922956898, + "learning_rate": 4.681464624760219e-06, + "loss": 0.4237, + "step": 3545 + }, + { + "epoch": 0.53, + "grad_norm": 13.947853773046074, + "learning_rate": 4.679027086320412e-06, + "loss": 0.4548, + "step": 3546 + }, + { + "epoch": 0.53, + "grad_norm": 6.153086345969848, + "learning_rate": 4.676589624477491e-06, + "loss": 0.4703, + "step": 3547 + }, + { + "epoch": 0.54, + "grad_norm": 6.305823470890065, + "learning_rate": 4.674152239813126e-06, + "loss": 0.4695, + "step": 3548 + }, + { + "epoch": 0.54, + "grad_norm": 2.899897516440773, + "learning_rate": 4.671714932908979e-06, + "loss": 0.3633, + "step": 3549 + }, + { + "epoch": 0.54, + "grad_norm": 4.75380547454389, + "learning_rate": 4.669277704346687e-06, + "loss": 0.4496, + "step": 3550 + }, + { + "epoch": 0.54, + "grad_norm": 3.588039909086197, + "learning_rate": 4.666840554707866e-06, + "loss": 0.4735, + "step": 3551 + }, + { + "epoch": 0.54, + "grad_norm": 7.048772352324333, + "learning_rate": 4.664403484574122e-06, + "loss": 0.5019, + "step": 3552 + }, + { + "epoch": 0.54, + "grad_norm": 4.212233422467621, + "learning_rate": 4.661966494527032e-06, + "loss": 0.4051, + "step": 3553 + }, + { + "epoch": 0.54, + "grad_norm": 3.3540716855398864, + "learning_rate": 4.659529585148162e-06, + "loss": 0.4256, + "step": 3554 + }, + { + "epoch": 0.54, + "grad_norm": 2.7140547875838186, + "learning_rate": 4.657092757019055e-06, + "loss": 0.4162, + "step": 3555 + }, + { + "epoch": 0.54, + "grad_norm": 2.665371427384759, + "learning_rate": 4.654656010721231e-06, + "loss": 0.4506, + "step": 3556 + }, + { + "epoch": 0.54, + "grad_norm": 3.6544569022344486, + "learning_rate": 4.652219346836199e-06, + "loss": 0.4794, + "step": 3557 + }, + { + "epoch": 0.54, + "grad_norm": 2.9876235667536752, + "learning_rate": 4.649782765945442e-06, + "loss": 0.5115, + "step": 3558 + }, + { + "epoch": 0.54, + "grad_norm": 3.095931963052549, + "learning_rate": 4.6473462686304255e-06, + "loss": 0.4694, + "step": 3559 + }, + { + "epoch": 0.54, + "grad_norm": 2.9133563055814085, + "learning_rate": 4.6449098554725945e-06, + "loss": 0.4443, + "step": 3560 + }, + { + "epoch": 0.54, + "grad_norm": 3.130199727341025, + "learning_rate": 4.642473527053372e-06, + "loss": 0.492, + "step": 3561 + }, + { + "epoch": 0.54, + "grad_norm": 4.7056252491219555, + "learning_rate": 4.640037283954165e-06, + "loss": 0.4036, + "step": 3562 + }, + { + "epoch": 0.54, + "grad_norm": 3.8756960968885674, + "learning_rate": 4.6376011267563576e-06, + "loss": 0.4642, + "step": 3563 + }, + { + "epoch": 0.54, + "grad_norm": 3.374028069167655, + "learning_rate": 4.635165056041314e-06, + "loss": 0.4738, + "step": 3564 + }, + { + "epoch": 0.54, + "grad_norm": 4.456154783867747, + "learning_rate": 4.632729072390376e-06, + "loss": 0.4613, + "step": 3565 + }, + { + "epoch": 0.54, + "grad_norm": 4.646974499538116, + "learning_rate": 4.630293176384865e-06, + "loss": 0.4082, + "step": 3566 + }, + { + "epoch": 0.54, + "grad_norm": 3.4369401726325983, + "learning_rate": 4.627857368606086e-06, + "loss": 0.4994, + "step": 3567 + }, + { + "epoch": 0.54, + "grad_norm": 4.427581206120353, + "learning_rate": 4.625421649635317e-06, + "loss": 0.4739, + "step": 3568 + }, + { + "epoch": 0.54, + "grad_norm": 6.110166471095125, + "learning_rate": 4.622986020053818e-06, + "loss": 0.4779, + "step": 3569 + }, + { + "epoch": 0.54, + "grad_norm": 3.7541408821356703, + "learning_rate": 4.620550480442828e-06, + "loss": 0.3812, + "step": 3570 + }, + { + "epoch": 0.54, + "grad_norm": 4.822937875550511, + "learning_rate": 4.618115031383559e-06, + "loss": 0.4627, + "step": 3571 + }, + { + "epoch": 0.54, + "grad_norm": 3.0805632944686683, + "learning_rate": 4.615679673457213e-06, + "loss": 0.4992, + "step": 3572 + }, + { + "epoch": 0.54, + "grad_norm": 3.4835279501143415, + "learning_rate": 4.613244407244956e-06, + "loss": 0.4542, + "step": 3573 + }, + { + "epoch": 0.54, + "grad_norm": 4.832094378239397, + "learning_rate": 4.610809233327946e-06, + "loss": 0.4584, + "step": 3574 + }, + { + "epoch": 0.54, + "grad_norm": 1.2386887127766082, + "learning_rate": 4.608374152287307e-06, + "loss": 0.547, + "step": 3575 + }, + { + "epoch": 0.54, + "grad_norm": 4.806360868548605, + "learning_rate": 4.605939164704147e-06, + "loss": 0.4643, + "step": 3576 + }, + { + "epoch": 0.54, + "grad_norm": 2.997720747786057, + "learning_rate": 4.603504271159553e-06, + "loss": 0.4532, + "step": 3577 + }, + { + "epoch": 0.54, + "grad_norm": 3.2957150311531964, + "learning_rate": 4.601069472234584e-06, + "loss": 0.4619, + "step": 3578 + }, + { + "epoch": 0.54, + "grad_norm": 3.649172866262382, + "learning_rate": 4.598634768510284e-06, + "loss": 0.4372, + "step": 3579 + }, + { + "epoch": 0.54, + "grad_norm": 1.3373489891189552, + "learning_rate": 4.596200160567667e-06, + "loss": 0.5865, + "step": 3580 + }, + { + "epoch": 0.54, + "grad_norm": 4.024451249224027, + "learning_rate": 4.593765648987727e-06, + "loss": 0.391, + "step": 3581 + }, + { + "epoch": 0.54, + "grad_norm": 3.3412811684671477, + "learning_rate": 4.591331234351436e-06, + "loss": 0.456, + "step": 3582 + }, + { + "epoch": 0.54, + "grad_norm": 3.7626667516222256, + "learning_rate": 4.588896917239741e-06, + "loss": 0.3939, + "step": 3583 + }, + { + "epoch": 0.54, + "grad_norm": 3.4364873392986612, + "learning_rate": 4.5864626982335685e-06, + "loss": 0.4549, + "step": 3584 + }, + { + "epoch": 0.54, + "grad_norm": 3.1055795871425915, + "learning_rate": 4.584028577913819e-06, + "loss": 0.4614, + "step": 3585 + }, + { + "epoch": 0.54, + "grad_norm": 1.1312700284967037, + "learning_rate": 4.58159455686137e-06, + "loss": 0.4969, + "step": 3586 + }, + { + "epoch": 0.54, + "grad_norm": 4.522486460425019, + "learning_rate": 4.579160635657076e-06, + "loss": 0.505, + "step": 3587 + }, + { + "epoch": 0.54, + "grad_norm": 17.064758804235545, + "learning_rate": 4.576726814881765e-06, + "loss": 0.4407, + "step": 3588 + }, + { + "epoch": 0.54, + "grad_norm": 1.3221708935160172, + "learning_rate": 4.574293095116248e-06, + "loss": 0.5853, + "step": 3589 + }, + { + "epoch": 0.54, + "grad_norm": 4.492522266300199, + "learning_rate": 4.5718594769413045e-06, + "loss": 0.4729, + "step": 3590 + }, + { + "epoch": 0.54, + "grad_norm": 4.20958806292807, + "learning_rate": 4.56942596093769e-06, + "loss": 0.4889, + "step": 3591 + }, + { + "epoch": 0.54, + "grad_norm": 2.4163001256298524, + "learning_rate": 4.566992547686142e-06, + "loss": 0.4267, + "step": 3592 + }, + { + "epoch": 0.54, + "grad_norm": 3.388778521911487, + "learning_rate": 4.564559237767368e-06, + "loss": 0.4623, + "step": 3593 + }, + { + "epoch": 0.54, + "grad_norm": 3.944930482158423, + "learning_rate": 4.562126031762053e-06, + "loss": 0.4673, + "step": 3594 + }, + { + "epoch": 0.54, + "grad_norm": 5.410210767571355, + "learning_rate": 4.5596929302508566e-06, + "loss": 0.5191, + "step": 3595 + }, + { + "epoch": 0.54, + "grad_norm": 4.328477899141395, + "learning_rate": 4.557259933814409e-06, + "loss": 0.3894, + "step": 3596 + }, + { + "epoch": 0.54, + "grad_norm": 3.1610464189136445, + "learning_rate": 4.554827043033328e-06, + "loss": 0.4368, + "step": 3597 + }, + { + "epoch": 0.54, + "grad_norm": 1.2219190631737484, + "learning_rate": 4.552394258488192e-06, + "loss": 0.5273, + "step": 3598 + }, + { + "epoch": 0.54, + "grad_norm": 5.53375942750944, + "learning_rate": 4.549961580759561e-06, + "loss": 0.4417, + "step": 3599 + }, + { + "epoch": 0.54, + "grad_norm": 1.2157975588817882, + "learning_rate": 4.547529010427971e-06, + "loss": 0.5519, + "step": 3600 + }, + { + "epoch": 0.54, + "grad_norm": 5.087248834398162, + "learning_rate": 4.5450965480739245e-06, + "loss": 0.4797, + "step": 3601 + }, + { + "epoch": 0.54, + "grad_norm": 1.3323876344802354, + "learning_rate": 4.54266419427791e-06, + "loss": 0.5814, + "step": 3602 + }, + { + "epoch": 0.54, + "grad_norm": 3.4293596943243267, + "learning_rate": 4.5402319496203796e-06, + "loss": 0.4677, + "step": 3603 + }, + { + "epoch": 0.54, + "grad_norm": 3.3907581821304524, + "learning_rate": 4.537799814681765e-06, + "loss": 0.4952, + "step": 3604 + }, + { + "epoch": 0.54, + "grad_norm": 27.195689806612886, + "learning_rate": 4.535367790042471e-06, + "loss": 0.4623, + "step": 3605 + }, + { + "epoch": 0.54, + "grad_norm": 3.5527307043628285, + "learning_rate": 4.532935876282871e-06, + "loss": 0.4476, + "step": 3606 + }, + { + "epoch": 0.54, + "grad_norm": 1.3124562313135855, + "learning_rate": 4.530504073983322e-06, + "loss": 0.6166, + "step": 3607 + }, + { + "epoch": 0.54, + "grad_norm": 2.6438443670303857, + "learning_rate": 4.528072383724144e-06, + "loss": 0.4315, + "step": 3608 + }, + { + "epoch": 0.54, + "grad_norm": 1.0726134382175725, + "learning_rate": 4.525640806085638e-06, + "loss": 0.5123, + "step": 3609 + }, + { + "epoch": 0.54, + "grad_norm": 2.6558606207230753, + "learning_rate": 4.523209341648075e-06, + "loss": 0.3796, + "step": 3610 + }, + { + "epoch": 0.54, + "grad_norm": 2.890507619464277, + "learning_rate": 4.520777990991696e-06, + "loss": 0.5054, + "step": 3611 + }, + { + "epoch": 0.54, + "grad_norm": 3.4124917559136767, + "learning_rate": 4.518346754696721e-06, + "loss": 0.4584, + "step": 3612 + }, + { + "epoch": 0.54, + "grad_norm": 10.879553414196396, + "learning_rate": 4.515915633343335e-06, + "loss": 0.393, + "step": 3613 + }, + { + "epoch": 0.55, + "grad_norm": 1.2004468339785968, + "learning_rate": 4.5134846275117065e-06, + "loss": 0.5976, + "step": 3614 + }, + { + "epoch": 0.55, + "grad_norm": 2.8007658352487934, + "learning_rate": 4.5110537377819665e-06, + "loss": 0.4512, + "step": 3615 + }, + { + "epoch": 0.55, + "grad_norm": 3.9969787394457663, + "learning_rate": 4.50862296473422e-06, + "loss": 0.4689, + "step": 3616 + }, + { + "epoch": 0.55, + "grad_norm": 3.9921251545900076, + "learning_rate": 4.5061923089485505e-06, + "loss": 0.4661, + "step": 3617 + }, + { + "epoch": 0.55, + "grad_norm": 3.041834346511043, + "learning_rate": 4.503761771005004e-06, + "loss": 0.418, + "step": 3618 + }, + { + "epoch": 0.55, + "grad_norm": 3.693491790877917, + "learning_rate": 4.501331351483607e-06, + "loss": 0.3758, + "step": 3619 + }, + { + "epoch": 0.55, + "grad_norm": 5.070403620321998, + "learning_rate": 4.498901050964353e-06, + "loss": 0.4571, + "step": 3620 + }, + { + "epoch": 0.55, + "grad_norm": 1.3269682285479902, + "learning_rate": 4.496470870027209e-06, + "loss": 0.5596, + "step": 3621 + }, + { + "epoch": 0.55, + "grad_norm": 3.919025404219239, + "learning_rate": 4.494040809252112e-06, + "loss": 0.4293, + "step": 3622 + }, + { + "epoch": 0.55, + "grad_norm": 6.059555164955259, + "learning_rate": 4.491610869218969e-06, + "loss": 0.4819, + "step": 3623 + }, + { + "epoch": 0.55, + "grad_norm": 3.3111506942735636, + "learning_rate": 4.489181050507664e-06, + "loss": 0.4553, + "step": 3624 + }, + { + "epoch": 0.55, + "grad_norm": 7.048236455299816, + "learning_rate": 4.486751353698047e-06, + "loss": 0.3941, + "step": 3625 + }, + { + "epoch": 0.55, + "grad_norm": 2.829447067519785, + "learning_rate": 4.4843217793699365e-06, + "loss": 0.5054, + "step": 3626 + }, + { + "epoch": 0.55, + "grad_norm": 5.883516326739647, + "learning_rate": 4.481892328103131e-06, + "loss": 0.3779, + "step": 3627 + }, + { + "epoch": 0.55, + "grad_norm": 2.844861984948812, + "learning_rate": 4.47946300047739e-06, + "loss": 0.4639, + "step": 3628 + }, + { + "epoch": 0.55, + "grad_norm": 3.212595259719701, + "learning_rate": 4.477033797072451e-06, + "loss": 0.3977, + "step": 3629 + }, + { + "epoch": 0.55, + "grad_norm": 3.839727917115151, + "learning_rate": 4.474604718468016e-06, + "loss": 0.4317, + "step": 3630 + }, + { + "epoch": 0.55, + "grad_norm": 4.450431118581267, + "learning_rate": 4.472175765243758e-06, + "loss": 0.464, + "step": 3631 + }, + { + "epoch": 0.55, + "grad_norm": 4.077424784882286, + "learning_rate": 4.469746937979325e-06, + "loss": 0.4681, + "step": 3632 + }, + { + "epoch": 0.55, + "grad_norm": 4.1701420017884745, + "learning_rate": 4.4673182372543305e-06, + "loss": 0.4798, + "step": 3633 + }, + { + "epoch": 0.55, + "grad_norm": 3.6906097518137866, + "learning_rate": 4.464889663648359e-06, + "loss": 0.4627, + "step": 3634 + }, + { + "epoch": 0.55, + "grad_norm": 7.046488246071865, + "learning_rate": 4.462461217740965e-06, + "loss": 0.4384, + "step": 3635 + }, + { + "epoch": 0.55, + "grad_norm": 4.058887673260045, + "learning_rate": 4.46003290011167e-06, + "loss": 0.4799, + "step": 3636 + }, + { + "epoch": 0.55, + "grad_norm": 3.614535493773436, + "learning_rate": 4.45760471133997e-06, + "loss": 0.4443, + "step": 3637 + }, + { + "epoch": 0.55, + "grad_norm": 3.582446944665605, + "learning_rate": 4.455176652005325e-06, + "loss": 0.4284, + "step": 3638 + }, + { + "epoch": 0.55, + "grad_norm": 4.483392356907345, + "learning_rate": 4.452748722687168e-06, + "loss": 0.3582, + "step": 3639 + }, + { + "epoch": 0.55, + "grad_norm": 6.290613832679188, + "learning_rate": 4.4503209239648995e-06, + "loss": 0.4562, + "step": 3640 + }, + { + "epoch": 0.55, + "grad_norm": 4.754892929241506, + "learning_rate": 4.447893256417885e-06, + "loss": 0.473, + "step": 3641 + }, + { + "epoch": 0.55, + "grad_norm": 7.935003859721646, + "learning_rate": 4.445465720625467e-06, + "loss": 0.4454, + "step": 3642 + }, + { + "epoch": 0.55, + "grad_norm": 2.7707915972103616, + "learning_rate": 4.443038317166951e-06, + "loss": 0.4685, + "step": 3643 + }, + { + "epoch": 0.55, + "grad_norm": 6.505217139008351, + "learning_rate": 4.440611046621608e-06, + "loss": 0.5013, + "step": 3644 + }, + { + "epoch": 0.55, + "grad_norm": 3.1161057654199564, + "learning_rate": 4.438183909568687e-06, + "loss": 0.4878, + "step": 3645 + }, + { + "epoch": 0.55, + "grad_norm": 2.870952892683315, + "learning_rate": 4.435756906587395e-06, + "loss": 0.4676, + "step": 3646 + }, + { + "epoch": 0.55, + "grad_norm": 6.4913246651433125, + "learning_rate": 4.433330038256914e-06, + "loss": 0.4599, + "step": 3647 + }, + { + "epoch": 0.55, + "grad_norm": 5.72607318514973, + "learning_rate": 4.430903305156389e-06, + "loss": 0.4127, + "step": 3648 + }, + { + "epoch": 0.55, + "grad_norm": 4.005872535105403, + "learning_rate": 4.4284767078649345e-06, + "loss": 0.4233, + "step": 3649 + }, + { + "epoch": 0.55, + "grad_norm": 3.3677120183549696, + "learning_rate": 4.4260502469616365e-06, + "loss": 0.3943, + "step": 3650 + }, + { + "epoch": 0.55, + "grad_norm": 3.5162997676667143, + "learning_rate": 4.423623923025541e-06, + "loss": 0.4082, + "step": 3651 + }, + { + "epoch": 0.55, + "grad_norm": 3.178985082861699, + "learning_rate": 4.421197736635669e-06, + "loss": 0.412, + "step": 3652 + }, + { + "epoch": 0.55, + "grad_norm": 14.477750216243061, + "learning_rate": 4.418771688371003e-06, + "loss": 0.5137, + "step": 3653 + }, + { + "epoch": 0.55, + "grad_norm": 1.233838056149836, + "learning_rate": 4.4163457788104905e-06, + "loss": 0.5534, + "step": 3654 + }, + { + "epoch": 0.55, + "grad_norm": 3.922280358920338, + "learning_rate": 4.413920008533057e-06, + "loss": 0.4505, + "step": 3655 + }, + { + "epoch": 0.55, + "grad_norm": 3.8067760976042635, + "learning_rate": 4.411494378117583e-06, + "loss": 0.489, + "step": 3656 + }, + { + "epoch": 0.55, + "grad_norm": 2.8003119663702747, + "learning_rate": 4.409068888142923e-06, + "loss": 0.4217, + "step": 3657 + }, + { + "epoch": 0.55, + "grad_norm": 5.866391730923889, + "learning_rate": 4.406643539187893e-06, + "loss": 0.3683, + "step": 3658 + }, + { + "epoch": 0.55, + "grad_norm": 5.1739151640845185, + "learning_rate": 4.404218331831277e-06, + "loss": 0.49, + "step": 3659 + }, + { + "epoch": 0.55, + "grad_norm": 1.1844224348561958, + "learning_rate": 4.401793266651829e-06, + "loss": 0.5543, + "step": 3660 + }, + { + "epoch": 0.55, + "grad_norm": 2.8016563765552527, + "learning_rate": 4.399368344228262e-06, + "loss": 0.4667, + "step": 3661 + }, + { + "epoch": 0.55, + "grad_norm": 4.425879411029741, + "learning_rate": 4.396943565139262e-06, + "loss": 0.4275, + "step": 3662 + }, + { + "epoch": 0.55, + "grad_norm": 4.083956558982024, + "learning_rate": 4.3945189299634775e-06, + "loss": 0.473, + "step": 3663 + }, + { + "epoch": 0.55, + "grad_norm": 5.586302234383876, + "learning_rate": 4.3920944392795206e-06, + "loss": 0.4508, + "step": 3664 + }, + { + "epoch": 0.55, + "grad_norm": 3.0047529969942084, + "learning_rate": 4.389670093665972e-06, + "loss": 0.432, + "step": 3665 + }, + { + "epoch": 0.55, + "grad_norm": 3.3782134489602105, + "learning_rate": 4.387245893701376e-06, + "loss": 0.503, + "step": 3666 + }, + { + "epoch": 0.55, + "grad_norm": 7.184324921621955, + "learning_rate": 4.3848218399642454e-06, + "loss": 0.4479, + "step": 3667 + }, + { + "epoch": 0.55, + "grad_norm": 3.079358196536918, + "learning_rate": 4.382397933033054e-06, + "loss": 0.4686, + "step": 3668 + }, + { + "epoch": 0.55, + "grad_norm": 5.192506832573775, + "learning_rate": 4.3799741734862424e-06, + "loss": 0.4596, + "step": 3669 + }, + { + "epoch": 0.55, + "grad_norm": 3.1529257194915834, + "learning_rate": 4.377550561902217e-06, + "loss": 0.5427, + "step": 3670 + }, + { + "epoch": 0.55, + "grad_norm": 3.2221924724097963, + "learning_rate": 4.375127098859343e-06, + "loss": 0.4757, + "step": 3671 + }, + { + "epoch": 0.55, + "grad_norm": 3.345586262988627, + "learning_rate": 4.372703784935962e-06, + "loss": 0.3744, + "step": 3672 + }, + { + "epoch": 0.55, + "grad_norm": 3.80529402242681, + "learning_rate": 4.37028062071037e-06, + "loss": 0.4115, + "step": 3673 + }, + { + "epoch": 0.55, + "grad_norm": 3.1655249544247033, + "learning_rate": 4.367857606760828e-06, + "loss": 0.4583, + "step": 3674 + }, + { + "epoch": 0.55, + "grad_norm": 2.344405582861784, + "learning_rate": 4.3654347436655655e-06, + "loss": 0.4396, + "step": 3675 + }, + { + "epoch": 0.55, + "grad_norm": 4.201644393759148, + "learning_rate": 4.363012032002772e-06, + "loss": 0.4743, + "step": 3676 + }, + { + "epoch": 0.55, + "grad_norm": 4.286190166757507, + "learning_rate": 4.360589472350607e-06, + "loss": 0.4432, + "step": 3677 + }, + { + "epoch": 0.55, + "grad_norm": 3.328887891816517, + "learning_rate": 4.358167065287186e-06, + "loss": 0.4665, + "step": 3678 + }, + { + "epoch": 0.55, + "grad_norm": 3.09657255677466, + "learning_rate": 4.3557448113905894e-06, + "loss": 0.4167, + "step": 3679 + }, + { + "epoch": 0.56, + "grad_norm": 1.356866312441596, + "learning_rate": 4.3533227112388694e-06, + "loss": 0.5428, + "step": 3680 + }, + { + "epoch": 0.56, + "grad_norm": 4.559404359490227, + "learning_rate": 4.35090076541003e-06, + "loss": 0.4833, + "step": 3681 + }, + { + "epoch": 0.56, + "grad_norm": 2.9347378630454957, + "learning_rate": 4.348478974482047e-06, + "loss": 0.4567, + "step": 3682 + }, + { + "epoch": 0.56, + "grad_norm": 3.0611278272803855, + "learning_rate": 4.346057339032854e-06, + "loss": 0.4312, + "step": 3683 + }, + { + "epoch": 0.56, + "grad_norm": 3.6304892091145695, + "learning_rate": 4.3436358596403476e-06, + "loss": 0.5078, + "step": 3684 + }, + { + "epoch": 0.56, + "grad_norm": 4.780309014631191, + "learning_rate": 4.3412145368823934e-06, + "loss": 0.4932, + "step": 3685 + }, + { + "epoch": 0.56, + "grad_norm": 3.486716513964562, + "learning_rate": 4.338793371336812e-06, + "loss": 0.4246, + "step": 3686 + }, + { + "epoch": 0.56, + "grad_norm": 3.3010646883317905, + "learning_rate": 4.336372363581391e-06, + "loss": 0.4222, + "step": 3687 + }, + { + "epoch": 0.56, + "grad_norm": 1.4046670620123414, + "learning_rate": 4.3339515141938795e-06, + "loss": 0.5594, + "step": 3688 + }, + { + "epoch": 0.56, + "grad_norm": 8.510420916282598, + "learning_rate": 4.331530823751984e-06, + "loss": 0.425, + "step": 3689 + }, + { + "epoch": 0.56, + "grad_norm": 6.270868980735197, + "learning_rate": 4.329110292833383e-06, + "loss": 0.5166, + "step": 3690 + }, + { + "epoch": 0.56, + "grad_norm": 3.2253763390533923, + "learning_rate": 4.326689922015709e-06, + "loss": 0.4495, + "step": 3691 + }, + { + "epoch": 0.56, + "grad_norm": 2.5563899126097223, + "learning_rate": 4.324269711876559e-06, + "loss": 0.5175, + "step": 3692 + }, + { + "epoch": 0.56, + "grad_norm": 3.2519443604917377, + "learning_rate": 4.321849662993491e-06, + "loss": 0.4851, + "step": 3693 + }, + { + "epoch": 0.56, + "grad_norm": 4.962968745083644, + "learning_rate": 4.319429775944021e-06, + "loss": 0.3481, + "step": 3694 + }, + { + "epoch": 0.56, + "grad_norm": 2.8017135791269934, + "learning_rate": 4.317010051305639e-06, + "loss": 0.4358, + "step": 3695 + }, + { + "epoch": 0.56, + "grad_norm": 5.813601825032154, + "learning_rate": 4.314590489655778e-06, + "loss": 0.4951, + "step": 3696 + }, + { + "epoch": 0.56, + "grad_norm": 3.906634301008147, + "learning_rate": 4.3121710915718474e-06, + "loss": 0.3819, + "step": 3697 + }, + { + "epoch": 0.56, + "grad_norm": 11.113220627126022, + "learning_rate": 4.309751857631211e-06, + "loss": 0.4649, + "step": 3698 + }, + { + "epoch": 0.56, + "grad_norm": 2.9539136184721326, + "learning_rate": 4.3073327884111915e-06, + "loss": 0.448, + "step": 3699 + }, + { + "epoch": 0.56, + "grad_norm": 3.8079398349205285, + "learning_rate": 4.304913884489078e-06, + "loss": 0.5251, + "step": 3700 + }, + { + "epoch": 0.56, + "grad_norm": 3.809916124103518, + "learning_rate": 4.302495146442112e-06, + "loss": 0.4297, + "step": 3701 + }, + { + "epoch": 0.56, + "grad_norm": 3.7283355053578866, + "learning_rate": 4.300076574847507e-06, + "loss": 0.4976, + "step": 3702 + }, + { + "epoch": 0.56, + "grad_norm": 3.8822465858160426, + "learning_rate": 4.297658170282427e-06, + "loss": 0.4829, + "step": 3703 + }, + { + "epoch": 0.56, + "grad_norm": 3.3182148679641714, + "learning_rate": 4.2952399333239985e-06, + "loss": 0.438, + "step": 3704 + }, + { + "epoch": 0.56, + "grad_norm": 3.9196694711338833, + "learning_rate": 4.2928218645493105e-06, + "loss": 0.4983, + "step": 3705 + }, + { + "epoch": 0.56, + "grad_norm": 4.448278511388973, + "learning_rate": 4.290403964535408e-06, + "loss": 0.5016, + "step": 3706 + }, + { + "epoch": 0.56, + "grad_norm": 4.080095031982521, + "learning_rate": 4.287986233859301e-06, + "loss": 0.4241, + "step": 3707 + }, + { + "epoch": 0.56, + "grad_norm": 4.054066037293088, + "learning_rate": 4.285568673097955e-06, + "loss": 0.4011, + "step": 3708 + }, + { + "epoch": 0.56, + "grad_norm": 6.095032667614165, + "learning_rate": 4.283151282828295e-06, + "loss": 0.5172, + "step": 3709 + }, + { + "epoch": 0.56, + "grad_norm": 3.4706088869054237, + "learning_rate": 4.280734063627208e-06, + "loss": 0.3854, + "step": 3710 + }, + { + "epoch": 0.56, + "grad_norm": 4.460625969113091, + "learning_rate": 4.2783170160715346e-06, + "loss": 0.4206, + "step": 3711 + }, + { + "epoch": 0.56, + "grad_norm": 3.928018662756119, + "learning_rate": 4.275900140738084e-06, + "loss": 0.4158, + "step": 3712 + }, + { + "epoch": 0.56, + "grad_norm": 3.8811917846404276, + "learning_rate": 4.273483438203616e-06, + "loss": 0.4266, + "step": 3713 + }, + { + "epoch": 0.56, + "grad_norm": 3.100769638870864, + "learning_rate": 4.27106690904485e-06, + "loss": 0.4353, + "step": 3714 + }, + { + "epoch": 0.56, + "grad_norm": 4.457538457042008, + "learning_rate": 4.268650553838468e-06, + "loss": 0.5121, + "step": 3715 + }, + { + "epoch": 0.56, + "grad_norm": 4.917510206711214, + "learning_rate": 4.2662343731611065e-06, + "loss": 0.4845, + "step": 3716 + }, + { + "epoch": 0.56, + "grad_norm": 4.224251413796545, + "learning_rate": 4.2638183675893655e-06, + "loss": 0.4281, + "step": 3717 + }, + { + "epoch": 0.56, + "grad_norm": 2.75078392617113, + "learning_rate": 4.261402537699797e-06, + "loss": 0.4972, + "step": 3718 + }, + { + "epoch": 0.56, + "grad_norm": 5.276124561260907, + "learning_rate": 4.258986884068913e-06, + "loss": 0.4608, + "step": 3719 + }, + { + "epoch": 0.56, + "grad_norm": 5.023113067498944, + "learning_rate": 4.256571407273189e-06, + "loss": 0.5003, + "step": 3720 + }, + { + "epoch": 0.56, + "grad_norm": 3.7639214701540804, + "learning_rate": 4.254156107889049e-06, + "loss": 0.4742, + "step": 3721 + }, + { + "epoch": 0.56, + "grad_norm": 3.8077414587185223, + "learning_rate": 4.251740986492882e-06, + "loss": 0.4822, + "step": 3722 + }, + { + "epoch": 0.56, + "grad_norm": 8.41816501884602, + "learning_rate": 4.249326043661032e-06, + "loss": 0.499, + "step": 3723 + }, + { + "epoch": 0.56, + "grad_norm": 6.467159527683841, + "learning_rate": 4.246911279969797e-06, + "loss": 0.4309, + "step": 3724 + }, + { + "epoch": 0.56, + "grad_norm": 3.644577999164011, + "learning_rate": 4.244496695995438e-06, + "loss": 0.5314, + "step": 3725 + }, + { + "epoch": 0.56, + "grad_norm": 6.479901681264197, + "learning_rate": 4.242082292314172e-06, + "loss": 0.4539, + "step": 3726 + }, + { + "epoch": 0.56, + "grad_norm": 4.084483768623457, + "learning_rate": 4.23966806950217e-06, + "loss": 0.446, + "step": 3727 + }, + { + "epoch": 0.56, + "grad_norm": 3.3706273889282485, + "learning_rate": 4.2372540281355614e-06, + "loss": 0.4237, + "step": 3728 + }, + { + "epoch": 0.56, + "grad_norm": 5.074422438592611, + "learning_rate": 4.2348401687904305e-06, + "loss": 0.4096, + "step": 3729 + }, + { + "epoch": 0.56, + "grad_norm": 3.561739359933813, + "learning_rate": 4.232426492042824e-06, + "loss": 0.4529, + "step": 3730 + }, + { + "epoch": 0.56, + "grad_norm": 3.9143420312390447, + "learning_rate": 4.230012998468738e-06, + "loss": 0.4447, + "step": 3731 + }, + { + "epoch": 0.56, + "grad_norm": 5.7307871671537125, + "learning_rate": 4.227599688644129e-06, + "loss": 0.4299, + "step": 3732 + }, + { + "epoch": 0.56, + "grad_norm": 3.3124886170339365, + "learning_rate": 4.22518656314491e-06, + "loss": 0.3715, + "step": 3733 + }, + { + "epoch": 0.56, + "grad_norm": 4.042866287318259, + "learning_rate": 4.2227736225469434e-06, + "loss": 0.4493, + "step": 3734 + }, + { + "epoch": 0.56, + "grad_norm": 3.625237889074587, + "learning_rate": 4.220360867426059e-06, + "loss": 0.531, + "step": 3735 + }, + { + "epoch": 0.56, + "grad_norm": 3.035402015602318, + "learning_rate": 4.217948298358031e-06, + "loss": 0.4937, + "step": 3736 + }, + { + "epoch": 0.56, + "grad_norm": 6.100650195189811, + "learning_rate": 4.215535915918599e-06, + "loss": 0.4963, + "step": 3737 + }, + { + "epoch": 0.56, + "grad_norm": 5.002068530550297, + "learning_rate": 4.2131237206834506e-06, + "loss": 0.4514, + "step": 3738 + }, + { + "epoch": 0.56, + "grad_norm": 6.8512357998123425, + "learning_rate": 4.21071171322823e-06, + "loss": 0.3758, + "step": 3739 + }, + { + "epoch": 0.56, + "grad_norm": 3.9706212876280618, + "learning_rate": 4.2082998941285405e-06, + "loss": 0.455, + "step": 3740 + }, + { + "epoch": 0.56, + "grad_norm": 2.925160611815083, + "learning_rate": 4.205888263959935e-06, + "loss": 0.4452, + "step": 3741 + }, + { + "epoch": 0.56, + "grad_norm": 6.460936886482379, + "learning_rate": 4.2034768232979286e-06, + "loss": 0.423, + "step": 3742 + }, + { + "epoch": 0.56, + "grad_norm": 2.693464740018308, + "learning_rate": 4.201065572717984e-06, + "loss": 0.3508, + "step": 3743 + }, + { + "epoch": 0.56, + "grad_norm": 3.2977808098530543, + "learning_rate": 4.19865451279552e-06, + "loss": 0.416, + "step": 3744 + }, + { + "epoch": 0.56, + "grad_norm": 6.949640480079124, + "learning_rate": 4.196243644105915e-06, + "loss": 0.4342, + "step": 3745 + }, + { + "epoch": 0.57, + "grad_norm": 4.115900430881847, + "learning_rate": 4.193832967224493e-06, + "loss": 0.5142, + "step": 3746 + }, + { + "epoch": 0.57, + "grad_norm": 2.6471569624513993, + "learning_rate": 4.191422482726542e-06, + "loss": 0.4338, + "step": 3747 + }, + { + "epoch": 0.57, + "grad_norm": 4.4609638393392315, + "learning_rate": 4.1890121911872975e-06, + "loss": 0.4472, + "step": 3748 + }, + { + "epoch": 0.57, + "grad_norm": 2.512341882136168, + "learning_rate": 4.18660209318195e-06, + "loss": 0.4591, + "step": 3749 + }, + { + "epoch": 0.57, + "grad_norm": 5.30351029899018, + "learning_rate": 4.184192189285647e-06, + "loss": 0.413, + "step": 3750 + }, + { + "epoch": 0.57, + "grad_norm": 3.422912584568043, + "learning_rate": 4.181782480073482e-06, + "loss": 0.4381, + "step": 3751 + }, + { + "epoch": 0.57, + "grad_norm": 4.255355689820745, + "learning_rate": 4.179372966120514e-06, + "loss": 0.4618, + "step": 3752 + }, + { + "epoch": 0.57, + "grad_norm": 4.7357101275621725, + "learning_rate": 4.176963648001746e-06, + "loss": 0.4832, + "step": 3753 + }, + { + "epoch": 0.57, + "grad_norm": 5.910943432851932, + "learning_rate": 4.1745545262921334e-06, + "loss": 0.4549, + "step": 3754 + }, + { + "epoch": 0.57, + "grad_norm": 4.466433753037585, + "learning_rate": 4.172145601566594e-06, + "loss": 0.4296, + "step": 3755 + }, + { + "epoch": 0.57, + "grad_norm": 3.1377886032315687, + "learning_rate": 4.169736874399991e-06, + "loss": 0.4878, + "step": 3756 + }, + { + "epoch": 0.57, + "grad_norm": 10.435219589425918, + "learning_rate": 4.16732834536714e-06, + "loss": 0.4481, + "step": 3757 + }, + { + "epoch": 0.57, + "grad_norm": 4.205351584041125, + "learning_rate": 4.164920015042816e-06, + "loss": 0.419, + "step": 3758 + }, + { + "epoch": 0.57, + "grad_norm": 5.7306223133844325, + "learning_rate": 4.162511884001736e-06, + "loss": 0.4446, + "step": 3759 + }, + { + "epoch": 0.57, + "grad_norm": 2.3161335503631997, + "learning_rate": 4.160103952818582e-06, + "loss": 0.4518, + "step": 3760 + }, + { + "epoch": 0.57, + "grad_norm": 3.998832938898976, + "learning_rate": 4.1576962220679796e-06, + "loss": 0.5471, + "step": 3761 + }, + { + "epoch": 0.57, + "grad_norm": 7.144881721483335, + "learning_rate": 4.1552886923245075e-06, + "loss": 0.4373, + "step": 3762 + }, + { + "epoch": 0.57, + "grad_norm": 11.807120142824246, + "learning_rate": 4.152881364162701e-06, + "loss": 0.4269, + "step": 3763 + }, + { + "epoch": 0.57, + "grad_norm": 8.196372374604001, + "learning_rate": 4.150474238157039e-06, + "loss": 0.4393, + "step": 3764 + }, + { + "epoch": 0.57, + "grad_norm": 4.075919598805602, + "learning_rate": 4.148067314881964e-06, + "loss": 0.4154, + "step": 3765 + }, + { + "epoch": 0.57, + "grad_norm": 3.802354117209687, + "learning_rate": 4.14566059491186e-06, + "loss": 0.4626, + "step": 3766 + }, + { + "epoch": 0.57, + "grad_norm": 3.5181329680198337, + "learning_rate": 4.143254078821065e-06, + "loss": 0.392, + "step": 3767 + }, + { + "epoch": 0.57, + "grad_norm": 4.618790600780879, + "learning_rate": 4.140847767183872e-06, + "loss": 0.4955, + "step": 3768 + }, + { + "epoch": 0.57, + "grad_norm": 5.253947578786492, + "learning_rate": 4.138441660574518e-06, + "loss": 0.5044, + "step": 3769 + }, + { + "epoch": 0.57, + "grad_norm": 6.004823198878446, + "learning_rate": 4.136035759567202e-06, + "loss": 0.4908, + "step": 3770 + }, + { + "epoch": 0.57, + "grad_norm": 5.920748463492911, + "learning_rate": 4.1336300647360635e-06, + "loss": 0.4411, + "step": 3771 + }, + { + "epoch": 0.57, + "grad_norm": 4.042136379001271, + "learning_rate": 4.131224576655195e-06, + "loss": 0.4198, + "step": 3772 + }, + { + "epoch": 0.57, + "grad_norm": 2.691848950251611, + "learning_rate": 4.1288192958986464e-06, + "loss": 0.419, + "step": 3773 + }, + { + "epoch": 0.57, + "grad_norm": 3.5028573514819126, + "learning_rate": 4.12641422304041e-06, + "loss": 0.3994, + "step": 3774 + }, + { + "epoch": 0.57, + "grad_norm": 3.4277369782924043, + "learning_rate": 4.124009358654433e-06, + "loss": 0.3919, + "step": 3775 + }, + { + "epoch": 0.57, + "grad_norm": 2.8204239078453863, + "learning_rate": 4.121604703314612e-06, + "loss": 0.5225, + "step": 3776 + }, + { + "epoch": 0.57, + "grad_norm": 5.892782365351085, + "learning_rate": 4.119200257594789e-06, + "loss": 0.4549, + "step": 3777 + }, + { + "epoch": 0.57, + "grad_norm": 3.5938108099082258, + "learning_rate": 4.116796022068767e-06, + "loss": 0.5055, + "step": 3778 + }, + { + "epoch": 0.57, + "grad_norm": 3.307273099932867, + "learning_rate": 4.1143919973102856e-06, + "loss": 0.4042, + "step": 3779 + }, + { + "epoch": 0.57, + "grad_norm": 5.118192768755623, + "learning_rate": 4.111988183893046e-06, + "loss": 0.4717, + "step": 3780 + }, + { + "epoch": 0.57, + "grad_norm": 2.6645901761303006, + "learning_rate": 4.10958458239069e-06, + "loss": 0.4744, + "step": 3781 + }, + { + "epoch": 0.57, + "grad_norm": 3.6274248632772688, + "learning_rate": 4.107181193376812e-06, + "loss": 0.4243, + "step": 3782 + }, + { + "epoch": 0.57, + "grad_norm": 4.944124399338622, + "learning_rate": 4.104778017424961e-06, + "loss": 0.3698, + "step": 3783 + }, + { + "epoch": 0.57, + "grad_norm": 3.716516990297118, + "learning_rate": 4.102375055108624e-06, + "loss": 0.4686, + "step": 3784 + }, + { + "epoch": 0.57, + "grad_norm": 2.919077800281757, + "learning_rate": 4.099972307001248e-06, + "loss": 0.4612, + "step": 3785 + }, + { + "epoch": 0.57, + "grad_norm": 3.56710923583181, + "learning_rate": 4.097569773676221e-06, + "loss": 0.4254, + "step": 3786 + }, + { + "epoch": 0.57, + "grad_norm": 3.2556409000249342, + "learning_rate": 4.0951674557068824e-06, + "loss": 0.4384, + "step": 3787 + }, + { + "epoch": 0.57, + "grad_norm": 3.0533035373997905, + "learning_rate": 4.092765353666524e-06, + "loss": 0.4396, + "step": 3788 + }, + { + "epoch": 0.57, + "grad_norm": 3.400837710023985, + "learning_rate": 4.09036346812838e-06, + "loss": 0.3972, + "step": 3789 + }, + { + "epoch": 0.57, + "grad_norm": 3.530494071447724, + "learning_rate": 4.087961799665639e-06, + "loss": 0.4477, + "step": 3790 + }, + { + "epoch": 0.57, + "grad_norm": 4.191102419148896, + "learning_rate": 4.085560348851431e-06, + "loss": 0.5014, + "step": 3791 + }, + { + "epoch": 0.57, + "grad_norm": 7.522334971938472, + "learning_rate": 4.083159116258838e-06, + "loss": 0.4815, + "step": 3792 + }, + { + "epoch": 0.57, + "grad_norm": 2.6877758985199396, + "learning_rate": 4.080758102460891e-06, + "loss": 0.4611, + "step": 3793 + }, + { + "epoch": 0.57, + "grad_norm": 5.37841878880807, + "learning_rate": 4.078357308030565e-06, + "loss": 0.4809, + "step": 3794 + }, + { + "epoch": 0.57, + "grad_norm": 3.064971415371922, + "learning_rate": 4.075956733540787e-06, + "loss": 0.4598, + "step": 3795 + }, + { + "epoch": 0.57, + "grad_norm": 9.699938349323046, + "learning_rate": 4.073556379564429e-06, + "loss": 0.4672, + "step": 3796 + }, + { + "epoch": 0.57, + "grad_norm": 4.044675633798052, + "learning_rate": 4.07115624667431e-06, + "loss": 0.4538, + "step": 3797 + }, + { + "epoch": 0.57, + "grad_norm": 3.8069917767795776, + "learning_rate": 4.0687563354431986e-06, + "loss": 0.4338, + "step": 3798 + }, + { + "epoch": 0.57, + "grad_norm": 6.894886480147642, + "learning_rate": 4.066356646443806e-06, + "loss": 0.4628, + "step": 3799 + }, + { + "epoch": 0.57, + "grad_norm": 3.9780328737289157, + "learning_rate": 4.063957180248796e-06, + "loss": 0.4856, + "step": 3800 + }, + { + "epoch": 0.57, + "grad_norm": 4.070128793278779, + "learning_rate": 4.061557937430777e-06, + "loss": 0.5372, + "step": 3801 + }, + { + "epoch": 0.57, + "grad_norm": 4.388383629547504, + "learning_rate": 4.0591589185623e-06, + "loss": 0.5216, + "step": 3802 + }, + { + "epoch": 0.57, + "grad_norm": 3.093885847147076, + "learning_rate": 4.05676012421587e-06, + "loss": 0.4383, + "step": 3803 + }, + { + "epoch": 0.57, + "grad_norm": 4.833820887662677, + "learning_rate": 4.05436155496393e-06, + "loss": 0.4603, + "step": 3804 + }, + { + "epoch": 0.57, + "grad_norm": 3.2681093337286358, + "learning_rate": 4.05196321137888e-06, + "loss": 0.4555, + "step": 3805 + }, + { + "epoch": 0.57, + "grad_norm": 3.210998529514009, + "learning_rate": 4.049565094033057e-06, + "loss": 0.4544, + "step": 3806 + }, + { + "epoch": 0.57, + "grad_norm": 4.290992340049877, + "learning_rate": 4.0471672034987446e-06, + "loss": 0.4026, + "step": 3807 + }, + { + "epoch": 0.57, + "grad_norm": 3.415609489644776, + "learning_rate": 4.044769540348179e-06, + "loss": 0.4378, + "step": 3808 + }, + { + "epoch": 0.57, + "grad_norm": 5.760407663262526, + "learning_rate": 4.042372105153535e-06, + "loss": 0.4597, + "step": 3809 + }, + { + "epoch": 0.57, + "grad_norm": 5.844833424338622, + "learning_rate": 4.039974898486938e-06, + "loss": 0.4378, + "step": 3810 + }, + { + "epoch": 0.57, + "grad_norm": 3.609878177192518, + "learning_rate": 4.037577920920455e-06, + "loss": 0.4371, + "step": 3811 + }, + { + "epoch": 0.57, + "grad_norm": 4.063551689916742, + "learning_rate": 4.035181173026099e-06, + "loss": 0.4554, + "step": 3812 + }, + { + "epoch": 0.58, + "grad_norm": 4.095990008225165, + "learning_rate": 4.032784655375832e-06, + "loss": 0.4336, + "step": 3813 + }, + { + "epoch": 0.58, + "grad_norm": 4.589989040144424, + "learning_rate": 4.030388368541557e-06, + "loss": 0.3961, + "step": 3814 + }, + { + "epoch": 0.58, + "grad_norm": 4.968738490214082, + "learning_rate": 4.027992313095124e-06, + "loss": 0.4436, + "step": 3815 + }, + { + "epoch": 0.58, + "grad_norm": 1.1874661783453915, + "learning_rate": 4.025596489608326e-06, + "loss": 0.5373, + "step": 3816 + }, + { + "epoch": 0.58, + "grad_norm": 5.586353906262547, + "learning_rate": 4.0232008986528985e-06, + "loss": 0.4975, + "step": 3817 + }, + { + "epoch": 0.58, + "grad_norm": 4.4645308538466, + "learning_rate": 4.0208055408005316e-06, + "loss": 0.4693, + "step": 3818 + }, + { + "epoch": 0.58, + "grad_norm": 3.6754331202646413, + "learning_rate": 4.018410416622848e-06, + "loss": 0.4635, + "step": 3819 + }, + { + "epoch": 0.58, + "grad_norm": 10.29712552204372, + "learning_rate": 4.016015526691421e-06, + "loss": 0.4155, + "step": 3820 + }, + { + "epoch": 0.58, + "grad_norm": 3.2708343902202897, + "learning_rate": 4.013620871577765e-06, + "loss": 0.4009, + "step": 3821 + }, + { + "epoch": 0.58, + "grad_norm": 3.6925404903174863, + "learning_rate": 4.011226451853341e-06, + "loss": 0.4177, + "step": 3822 + }, + { + "epoch": 0.58, + "grad_norm": 8.33518671856447, + "learning_rate": 4.0088322680895545e-06, + "loss": 0.443, + "step": 3823 + }, + { + "epoch": 0.58, + "grad_norm": 4.42611930784702, + "learning_rate": 4.0064383208577475e-06, + "loss": 0.4592, + "step": 3824 + }, + { + "epoch": 0.58, + "grad_norm": 5.064120276902632, + "learning_rate": 4.0040446107292165e-06, + "loss": 0.4299, + "step": 3825 + }, + { + "epoch": 0.58, + "grad_norm": 3.52094255201696, + "learning_rate": 4.001651138275194e-06, + "loss": 0.3608, + "step": 3826 + }, + { + "epoch": 0.58, + "grad_norm": 3.7214776779876364, + "learning_rate": 3.999257904066855e-06, + "loss": 0.3773, + "step": 3827 + }, + { + "epoch": 0.58, + "grad_norm": 7.891657857256591, + "learning_rate": 3.996864908675325e-06, + "loss": 0.5148, + "step": 3828 + }, + { + "epoch": 0.58, + "grad_norm": 4.0822886419417035, + "learning_rate": 3.994472152671662e-06, + "loss": 0.4498, + "step": 3829 + }, + { + "epoch": 0.58, + "grad_norm": 3.5296903332642313, + "learning_rate": 3.992079636626878e-06, + "loss": 0.4994, + "step": 3830 + }, + { + "epoch": 0.58, + "grad_norm": 5.315017312131069, + "learning_rate": 3.989687361111919e-06, + "loss": 0.5009, + "step": 3831 + }, + { + "epoch": 0.58, + "grad_norm": 4.009823787285301, + "learning_rate": 3.9872953266976774e-06, + "loss": 0.5108, + "step": 3832 + }, + { + "epoch": 0.58, + "grad_norm": 2.4213315323951665, + "learning_rate": 3.9849035339549904e-06, + "loss": 0.4418, + "step": 3833 + }, + { + "epoch": 0.58, + "grad_norm": 7.9316200526211755, + "learning_rate": 3.982511983454629e-06, + "loss": 0.4361, + "step": 3834 + }, + { + "epoch": 0.58, + "grad_norm": 4.226637870072047, + "learning_rate": 3.980120675767319e-06, + "loss": 0.3685, + "step": 3835 + }, + { + "epoch": 0.58, + "grad_norm": 3.098428565683648, + "learning_rate": 3.977729611463717e-06, + "loss": 0.446, + "step": 3836 + }, + { + "epoch": 0.58, + "grad_norm": 3.206832987115077, + "learning_rate": 3.9753387911144245e-06, + "loss": 0.4422, + "step": 3837 + }, + { + "epoch": 0.58, + "grad_norm": 3.682644375850624, + "learning_rate": 3.972948215289992e-06, + "loss": 0.5019, + "step": 3838 + }, + { + "epoch": 0.58, + "grad_norm": 4.819916263655875, + "learning_rate": 3.970557884560899e-06, + "loss": 0.5358, + "step": 3839 + }, + { + "epoch": 0.58, + "grad_norm": 3.8543266558686375, + "learning_rate": 3.968167799497579e-06, + "loss": 0.4445, + "step": 3840 + }, + { + "epoch": 0.58, + "grad_norm": 5.8166683220953255, + "learning_rate": 3.9657779606703985e-06, + "loss": 0.4785, + "step": 3841 + }, + { + "epoch": 0.58, + "grad_norm": 6.7611725251145485, + "learning_rate": 3.963388368649667e-06, + "loss": 0.4448, + "step": 3842 + }, + { + "epoch": 0.58, + "grad_norm": 3.332272397440074, + "learning_rate": 3.960999024005639e-06, + "loss": 0.4881, + "step": 3843 + }, + { + "epoch": 0.58, + "grad_norm": 2.99467430813858, + "learning_rate": 3.958609927308505e-06, + "loss": 0.4048, + "step": 3844 + }, + { + "epoch": 0.58, + "grad_norm": 3.977211615799983, + "learning_rate": 3.9562210791283994e-06, + "loss": 0.4232, + "step": 3845 + }, + { + "epoch": 0.58, + "grad_norm": 3.7038739925427553, + "learning_rate": 3.953832480035396e-06, + "loss": 0.4704, + "step": 3846 + }, + { + "epoch": 0.58, + "grad_norm": 3.607936277232311, + "learning_rate": 3.951444130599507e-06, + "loss": 0.388, + "step": 3847 + }, + { + "epoch": 0.58, + "grad_norm": 3.8754332494342587, + "learning_rate": 3.949056031390691e-06, + "loss": 0.4067, + "step": 3848 + }, + { + "epoch": 0.58, + "grad_norm": 3.44140849446421, + "learning_rate": 3.946668182978841e-06, + "loss": 0.4482, + "step": 3849 + }, + { + "epoch": 0.58, + "grad_norm": 5.732917779973226, + "learning_rate": 3.944280585933794e-06, + "loss": 0.4214, + "step": 3850 + }, + { + "epoch": 0.58, + "grad_norm": 4.41208164454842, + "learning_rate": 3.9418932408253256e-06, + "loss": 0.4502, + "step": 3851 + }, + { + "epoch": 0.58, + "grad_norm": 3.653226027970726, + "learning_rate": 3.9395061482231474e-06, + "loss": 0.4408, + "step": 3852 + }, + { + "epoch": 0.58, + "grad_norm": 3.959581666976122, + "learning_rate": 3.93711930869692e-06, + "loss": 0.4843, + "step": 3853 + }, + { + "epoch": 0.58, + "grad_norm": 4.42181232619998, + "learning_rate": 3.934732722816234e-06, + "loss": 0.4754, + "step": 3854 + }, + { + "epoch": 0.58, + "grad_norm": 10.213007837532748, + "learning_rate": 3.932346391150627e-06, + "loss": 0.4678, + "step": 3855 + }, + { + "epoch": 0.58, + "grad_norm": 4.588453871587705, + "learning_rate": 3.929960314269569e-06, + "loss": 0.4584, + "step": 3856 + }, + { + "epoch": 0.58, + "grad_norm": 5.250239785513397, + "learning_rate": 3.927574492742473e-06, + "loss": 0.4564, + "step": 3857 + }, + { + "epoch": 0.58, + "grad_norm": 4.812969321742089, + "learning_rate": 3.925188927138694e-06, + "loss": 0.4453, + "step": 3858 + }, + { + "epoch": 0.58, + "grad_norm": 10.40932519580338, + "learning_rate": 3.922803618027519e-06, + "loss": 0.4488, + "step": 3859 + }, + { + "epoch": 0.58, + "grad_norm": 2.7899936831543393, + "learning_rate": 3.920418565978182e-06, + "loss": 0.4117, + "step": 3860 + }, + { + "epoch": 0.58, + "grad_norm": 4.8866379970939215, + "learning_rate": 3.918033771559846e-06, + "loss": 0.4553, + "step": 3861 + }, + { + "epoch": 0.58, + "grad_norm": 12.864507745898237, + "learning_rate": 3.91564923534162e-06, + "loss": 0.4683, + "step": 3862 + }, + { + "epoch": 0.58, + "grad_norm": 2.8245837484009333, + "learning_rate": 3.913264957892549e-06, + "loss": 0.4757, + "step": 3863 + }, + { + "epoch": 0.58, + "grad_norm": 13.744252201125041, + "learning_rate": 3.9108809397816154e-06, + "loss": 0.4266, + "step": 3864 + }, + { + "epoch": 0.58, + "grad_norm": 6.630564614036765, + "learning_rate": 3.908497181577743e-06, + "loss": 0.5326, + "step": 3865 + }, + { + "epoch": 0.58, + "grad_norm": 8.082092079702226, + "learning_rate": 3.90611368384979e-06, + "loss": 0.3823, + "step": 3866 + }, + { + "epoch": 0.58, + "grad_norm": 3.536295524373419, + "learning_rate": 3.9037304471665525e-06, + "loss": 0.4667, + "step": 3867 + }, + { + "epoch": 0.58, + "grad_norm": 2.903252018663176, + "learning_rate": 3.901347472096767e-06, + "loss": 0.37, + "step": 3868 + }, + { + "epoch": 0.58, + "grad_norm": 4.900086867129049, + "learning_rate": 3.898964759209105e-06, + "loss": 0.427, + "step": 3869 + }, + { + "epoch": 0.58, + "grad_norm": 3.8482026680460026, + "learning_rate": 3.896582309072175e-06, + "loss": 0.4362, + "step": 3870 + }, + { + "epoch": 0.58, + "grad_norm": 3.593407478794892, + "learning_rate": 3.894200122254528e-06, + "loss": 0.4486, + "step": 3871 + }, + { + "epoch": 0.58, + "grad_norm": 4.428488499865246, + "learning_rate": 3.891818199324644e-06, + "loss": 0.4899, + "step": 3872 + }, + { + "epoch": 0.58, + "grad_norm": 3.395171462252501, + "learning_rate": 3.889436540850948e-06, + "loss": 0.4333, + "step": 3873 + }, + { + "epoch": 0.58, + "grad_norm": 3.1575532850275114, + "learning_rate": 3.887055147401797e-06, + "loss": 0.4321, + "step": 3874 + }, + { + "epoch": 0.58, + "grad_norm": 10.605102161149329, + "learning_rate": 3.884674019545484e-06, + "loss": 0.4422, + "step": 3875 + }, + { + "epoch": 0.58, + "grad_norm": 3.7286572460990937, + "learning_rate": 3.882293157850245e-06, + "loss": 0.4364, + "step": 3876 + }, + { + "epoch": 0.58, + "grad_norm": 4.659070305451452, + "learning_rate": 3.879912562884244e-06, + "loss": 0.446, + "step": 3877 + }, + { + "epoch": 0.58, + "grad_norm": 3.679684806880758, + "learning_rate": 3.877532235215589e-06, + "loss": 0.3922, + "step": 3878 + }, + { + "epoch": 0.59, + "grad_norm": 2.8155908540711883, + "learning_rate": 3.875152175412319e-06, + "loss": 0.4801, + "step": 3879 + }, + { + "epoch": 0.59, + "grad_norm": 6.120111832659367, + "learning_rate": 3.872772384042408e-06, + "loss": 0.4474, + "step": 3880 + }, + { + "epoch": 0.59, + "grad_norm": 1.1073271249620888, + "learning_rate": 3.870392861673774e-06, + "loss": 0.4901, + "step": 3881 + }, + { + "epoch": 0.59, + "grad_norm": 2.7946935367066894, + "learning_rate": 3.86801360887426e-06, + "loss": 0.416, + "step": 3882 + }, + { + "epoch": 0.59, + "grad_norm": 6.472519954368805, + "learning_rate": 3.8656346262116565e-06, + "loss": 0.5222, + "step": 3883 + }, + { + "epoch": 0.59, + "grad_norm": 11.888011337517117, + "learning_rate": 3.8632559142536795e-06, + "loss": 0.4729, + "step": 3884 + }, + { + "epoch": 0.59, + "grad_norm": 7.823315182217647, + "learning_rate": 3.8608774735679835e-06, + "loss": 0.4506, + "step": 3885 + }, + { + "epoch": 0.59, + "grad_norm": 7.8900950332405815, + "learning_rate": 3.858499304722161e-06, + "loss": 0.4581, + "step": 3886 + }, + { + "epoch": 0.59, + "grad_norm": 6.040592637631079, + "learning_rate": 3.856121408283735e-06, + "loss": 0.4304, + "step": 3887 + }, + { + "epoch": 0.59, + "grad_norm": 4.432579107233802, + "learning_rate": 3.853743784820169e-06, + "loss": 0.4949, + "step": 3888 + }, + { + "epoch": 0.59, + "grad_norm": 3.3878343634866326, + "learning_rate": 3.851366434898857e-06, + "loss": 0.3906, + "step": 3889 + }, + { + "epoch": 0.59, + "grad_norm": 5.532765857145093, + "learning_rate": 3.8489893590871275e-06, + "loss": 0.439, + "step": 3890 + }, + { + "epoch": 0.59, + "grad_norm": 1.2204393937906575, + "learning_rate": 3.846612557952248e-06, + "loss": 0.5284, + "step": 3891 + }, + { + "epoch": 0.59, + "grad_norm": 8.42920845323881, + "learning_rate": 3.844236032061415e-06, + "loss": 0.461, + "step": 3892 + }, + { + "epoch": 0.59, + "grad_norm": 1.2324256957638833, + "learning_rate": 3.841859781981765e-06, + "loss": 0.5673, + "step": 3893 + }, + { + "epoch": 0.59, + "grad_norm": 5.554493836809629, + "learning_rate": 3.839483808280364e-06, + "loss": 0.4703, + "step": 3894 + }, + { + "epoch": 0.59, + "grad_norm": 5.325686909821763, + "learning_rate": 3.837108111524213e-06, + "loss": 0.3804, + "step": 3895 + }, + { + "epoch": 0.59, + "grad_norm": 4.64493375771464, + "learning_rate": 3.834732692280248e-06, + "loss": 0.4347, + "step": 3896 + }, + { + "epoch": 0.59, + "grad_norm": 4.31644062137482, + "learning_rate": 3.832357551115337e-06, + "loss": 0.4471, + "step": 3897 + }, + { + "epoch": 0.59, + "grad_norm": 3.903948417974065, + "learning_rate": 3.829982688596287e-06, + "loss": 0.447, + "step": 3898 + }, + { + "epoch": 0.59, + "grad_norm": 9.570794961224365, + "learning_rate": 3.827608105289832e-06, + "loss": 0.401, + "step": 3899 + }, + { + "epoch": 0.59, + "grad_norm": 5.224796085871885, + "learning_rate": 3.82523380176264e-06, + "loss": 0.4128, + "step": 3900 + }, + { + "epoch": 0.59, + "grad_norm": 9.996710572629503, + "learning_rate": 3.8228597785813186e-06, + "loss": 0.4227, + "step": 3901 + }, + { + "epoch": 0.59, + "grad_norm": 4.017857467008076, + "learning_rate": 3.8204860363124e-06, + "loss": 0.4734, + "step": 3902 + }, + { + "epoch": 0.59, + "grad_norm": 6.0340383974600345, + "learning_rate": 3.818112575522355e-06, + "loss": 0.3988, + "step": 3903 + }, + { + "epoch": 0.59, + "grad_norm": 6.063504383674613, + "learning_rate": 3.815739396777587e-06, + "loss": 0.4533, + "step": 3904 + }, + { + "epoch": 0.59, + "grad_norm": 11.546315356747908, + "learning_rate": 3.813366500644426e-06, + "loss": 0.3868, + "step": 3905 + }, + { + "epoch": 0.59, + "grad_norm": 3.4027131804055455, + "learning_rate": 3.810993887689145e-06, + "loss": 0.4346, + "step": 3906 + }, + { + "epoch": 0.59, + "grad_norm": 3.6435181230051685, + "learning_rate": 3.8086215584779395e-06, + "loss": 0.4405, + "step": 3907 + }, + { + "epoch": 0.59, + "grad_norm": 5.2694779293584, + "learning_rate": 3.806249513576945e-06, + "loss": 0.4424, + "step": 3908 + }, + { + "epoch": 0.59, + "grad_norm": 3.5783562057844054, + "learning_rate": 3.803877753552223e-06, + "loss": 0.4632, + "step": 3909 + }, + { + "epoch": 0.59, + "grad_norm": 1.3843775139673493, + "learning_rate": 3.801506278969769e-06, + "loss": 0.5327, + "step": 3910 + }, + { + "epoch": 0.59, + "grad_norm": 3.8359043089334834, + "learning_rate": 3.7991350903955138e-06, + "loss": 0.5134, + "step": 3911 + }, + { + "epoch": 0.59, + "grad_norm": 3.2393513672950855, + "learning_rate": 3.7967641883953144e-06, + "loss": 0.5469, + "step": 3912 + }, + { + "epoch": 0.59, + "grad_norm": 4.765021958793624, + "learning_rate": 3.7943935735349647e-06, + "loss": 0.3794, + "step": 3913 + }, + { + "epoch": 0.59, + "grad_norm": 4.164424688518929, + "learning_rate": 3.7920232463801866e-06, + "loss": 0.5157, + "step": 3914 + }, + { + "epoch": 0.59, + "grad_norm": 3.8468160581979056, + "learning_rate": 3.789653207496631e-06, + "loss": 0.4694, + "step": 3915 + }, + { + "epoch": 0.59, + "grad_norm": 12.44937779827826, + "learning_rate": 3.7872834574498894e-06, + "loss": 0.4222, + "step": 3916 + }, + { + "epoch": 0.59, + "grad_norm": 5.165782554198708, + "learning_rate": 3.7849139968054728e-06, + "loss": 0.407, + "step": 3917 + }, + { + "epoch": 0.59, + "grad_norm": 5.681458829866862, + "learning_rate": 3.782544826128833e-06, + "loss": 0.5007, + "step": 3918 + }, + { + "epoch": 0.59, + "grad_norm": 3.7520348590162493, + "learning_rate": 3.780175945985346e-06, + "loss": 0.5048, + "step": 3919 + }, + { + "epoch": 0.59, + "grad_norm": 3.6402701549190333, + "learning_rate": 3.77780735694032e-06, + "loss": 0.3853, + "step": 3920 + }, + { + "epoch": 0.59, + "grad_norm": 3.9457507661649633, + "learning_rate": 3.7754390595589962e-06, + "loss": 0.4238, + "step": 3921 + }, + { + "epoch": 0.59, + "grad_norm": 3.7949855926788367, + "learning_rate": 3.7730710544065413e-06, + "loss": 0.5078, + "step": 3922 + }, + { + "epoch": 0.59, + "grad_norm": 1.2221028319353835, + "learning_rate": 3.77070334204806e-06, + "loss": 0.5746, + "step": 3923 + }, + { + "epoch": 0.59, + "grad_norm": 4.586762219397688, + "learning_rate": 3.7683359230485794e-06, + "loss": 0.378, + "step": 3924 + }, + { + "epoch": 0.59, + "grad_norm": 3.3795651934166724, + "learning_rate": 3.76596879797306e-06, + "loss": 0.366, + "step": 3925 + }, + { + "epoch": 0.59, + "grad_norm": 3.355652889916826, + "learning_rate": 3.7636019673863926e-06, + "loss": 0.4032, + "step": 3926 + }, + { + "epoch": 0.59, + "grad_norm": 4.007924862278221, + "learning_rate": 3.761235431853395e-06, + "loss": 0.4933, + "step": 3927 + }, + { + "epoch": 0.59, + "grad_norm": 4.336640886607247, + "learning_rate": 3.75886919193882e-06, + "loss": 0.4459, + "step": 3928 + }, + { + "epoch": 0.59, + "grad_norm": 3.8036769742169887, + "learning_rate": 3.756503248207344e-06, + "loss": 0.4854, + "step": 3929 + }, + { + "epoch": 0.59, + "grad_norm": 1.179796315765809, + "learning_rate": 3.7541376012235753e-06, + "loss": 0.5288, + "step": 3930 + }, + { + "epoch": 0.59, + "grad_norm": 1.235615642290056, + "learning_rate": 3.751772251552052e-06, + "loss": 0.5509, + "step": 3931 + }, + { + "epoch": 0.59, + "grad_norm": 7.93599352931071, + "learning_rate": 3.7494071997572386e-06, + "loss": 0.4108, + "step": 3932 + }, + { + "epoch": 0.59, + "grad_norm": 3.151237142602862, + "learning_rate": 3.747042446403533e-06, + "loss": 0.4672, + "step": 3933 + }, + { + "epoch": 0.59, + "grad_norm": 3.906394689752917, + "learning_rate": 3.744677992055259e-06, + "loss": 0.4637, + "step": 3934 + }, + { + "epoch": 0.59, + "grad_norm": 8.416352854911965, + "learning_rate": 3.7423138372766654e-06, + "loss": 0.5008, + "step": 3935 + }, + { + "epoch": 0.59, + "grad_norm": 4.75958120818296, + "learning_rate": 3.7399499826319397e-06, + "loss": 0.4597, + "step": 3936 + }, + { + "epoch": 0.59, + "grad_norm": 3.8896375972382327, + "learning_rate": 3.737586428685186e-06, + "loss": 0.4212, + "step": 3937 + }, + { + "epoch": 0.59, + "grad_norm": 3.8731968571840034, + "learning_rate": 3.735223176000446e-06, + "loss": 0.4093, + "step": 3938 + }, + { + "epoch": 0.59, + "grad_norm": 3.862717593071882, + "learning_rate": 3.7328602251416835e-06, + "loss": 0.441, + "step": 3939 + }, + { + "epoch": 0.59, + "grad_norm": 3.9742213521317544, + "learning_rate": 3.7304975766727913e-06, + "loss": 0.4165, + "step": 3940 + }, + { + "epoch": 0.59, + "grad_norm": 12.01955298356408, + "learning_rate": 3.7281352311575947e-06, + "loss": 0.4028, + "step": 3941 + }, + { + "epoch": 0.59, + "grad_norm": 2.9754500304432985, + "learning_rate": 3.7257731891598402e-06, + "loss": 0.4317, + "step": 3942 + }, + { + "epoch": 0.59, + "grad_norm": 1.1195693288507145, + "learning_rate": 3.7234114512432064e-06, + "loss": 0.5294, + "step": 3943 + }, + { + "epoch": 0.59, + "grad_norm": 11.569465769621749, + "learning_rate": 3.721050017971298e-06, + "loss": 0.4284, + "step": 3944 + }, + { + "epoch": 0.6, + "grad_norm": 4.0491076696329795, + "learning_rate": 3.7186888899076427e-06, + "loss": 0.4152, + "step": 3945 + }, + { + "epoch": 0.6, + "grad_norm": 5.705513185628955, + "learning_rate": 3.7163280676157055e-06, + "loss": 0.3982, + "step": 3946 + }, + { + "epoch": 0.6, + "grad_norm": 3.52724256847801, + "learning_rate": 3.713967551658868e-06, + "loss": 0.4308, + "step": 3947 + }, + { + "epoch": 0.6, + "grad_norm": 5.2774077231419465, + "learning_rate": 3.7116073426004463e-06, + "loss": 0.475, + "step": 3948 + }, + { + "epoch": 0.6, + "grad_norm": 7.038592634621072, + "learning_rate": 3.709247441003678e-06, + "loss": 0.4843, + "step": 3949 + }, + { + "epoch": 0.6, + "grad_norm": 6.223512489428968, + "learning_rate": 3.7068878474317273e-06, + "loss": 0.4629, + "step": 3950 + }, + { + "epoch": 0.6, + "grad_norm": 4.919465917965498, + "learning_rate": 3.704528562447691e-06, + "loss": 0.4421, + "step": 3951 + }, + { + "epoch": 0.6, + "grad_norm": 3.4969707531047955, + "learning_rate": 3.7021695866145846e-06, + "loss": 0.48, + "step": 3952 + }, + { + "epoch": 0.6, + "grad_norm": 1.1035028946611682, + "learning_rate": 3.699810920495358e-06, + "loss": 0.4849, + "step": 3953 + }, + { + "epoch": 0.6, + "grad_norm": 4.0430254104808, + "learning_rate": 3.697452564652879e-06, + "loss": 0.4463, + "step": 3954 + }, + { + "epoch": 0.6, + "grad_norm": 6.691028679130676, + "learning_rate": 3.6950945196499454e-06, + "loss": 0.4156, + "step": 3955 + }, + { + "epoch": 0.6, + "grad_norm": 4.017633651799711, + "learning_rate": 3.6927367860492813e-06, + "loss": 0.4553, + "step": 3956 + }, + { + "epoch": 0.6, + "grad_norm": 3.5971774035056305, + "learning_rate": 3.6903793644135333e-06, + "loss": 0.4096, + "step": 3957 + }, + { + "epoch": 0.6, + "grad_norm": 4.376116494017634, + "learning_rate": 3.688022255305279e-06, + "loss": 0.463, + "step": 3958 + }, + { + "epoch": 0.6, + "grad_norm": 3.851463620455795, + "learning_rate": 3.685665459287018e-06, + "loss": 0.411, + "step": 3959 + }, + { + "epoch": 0.6, + "grad_norm": 4.385108898146386, + "learning_rate": 3.683308976921173e-06, + "loss": 0.4514, + "step": 3960 + }, + { + "epoch": 0.6, + "grad_norm": 2.3557357766478106, + "learning_rate": 3.6809528087700956e-06, + "loss": 0.4226, + "step": 3961 + }, + { + "epoch": 0.6, + "grad_norm": 4.370694491986353, + "learning_rate": 3.6785969553960598e-06, + "loss": 0.4686, + "step": 3962 + }, + { + "epoch": 0.6, + "grad_norm": 3.441701649246758, + "learning_rate": 3.6762414173612683e-06, + "loss": 0.3916, + "step": 3963 + }, + { + "epoch": 0.6, + "grad_norm": 7.068758494708756, + "learning_rate": 3.6738861952278456e-06, + "loss": 0.493, + "step": 3964 + }, + { + "epoch": 0.6, + "grad_norm": 3.9212312352478937, + "learning_rate": 3.6715312895578388e-06, + "loss": 0.4666, + "step": 3965 + }, + { + "epoch": 0.6, + "grad_norm": 5.644703293804203, + "learning_rate": 3.669176700913224e-06, + "loss": 0.4602, + "step": 3966 + }, + { + "epoch": 0.6, + "grad_norm": 4.067670912612484, + "learning_rate": 3.6668224298558973e-06, + "loss": 0.3769, + "step": 3967 + }, + { + "epoch": 0.6, + "grad_norm": 4.074620071399767, + "learning_rate": 3.6644684769476857e-06, + "loss": 0.4589, + "step": 3968 + }, + { + "epoch": 0.6, + "grad_norm": 2.785863552947082, + "learning_rate": 3.6621148427503326e-06, + "loss": 0.4504, + "step": 3969 + }, + { + "epoch": 0.6, + "grad_norm": 3.594619990847038, + "learning_rate": 3.6597615278255073e-06, + "loss": 0.4988, + "step": 3970 + }, + { + "epoch": 0.6, + "grad_norm": 3.993625777801499, + "learning_rate": 3.6574085327348074e-06, + "loss": 0.4303, + "step": 3971 + }, + { + "epoch": 0.6, + "grad_norm": 5.0432593125304725, + "learning_rate": 3.655055858039749e-06, + "loss": 0.5025, + "step": 3972 + }, + { + "epoch": 0.6, + "grad_norm": 5.284346288589723, + "learning_rate": 3.6527035043017756e-06, + "loss": 0.4497, + "step": 3973 + }, + { + "epoch": 0.6, + "grad_norm": 3.731856515998285, + "learning_rate": 3.6503514720822507e-06, + "loss": 0.4395, + "step": 3974 + }, + { + "epoch": 0.6, + "grad_norm": 3.2937291167291463, + "learning_rate": 3.6479997619424605e-06, + "loss": 0.5068, + "step": 3975 + }, + { + "epoch": 0.6, + "grad_norm": 1.1884893327885775, + "learning_rate": 3.645648374443621e-06, + "loss": 0.5428, + "step": 3976 + }, + { + "epoch": 0.6, + "grad_norm": 4.199422002868311, + "learning_rate": 3.6432973101468638e-06, + "loss": 0.4483, + "step": 3977 + }, + { + "epoch": 0.6, + "grad_norm": 2.2396888660069516, + "learning_rate": 3.640946569613247e-06, + "loss": 0.4094, + "step": 3978 + }, + { + "epoch": 0.6, + "grad_norm": 4.6751073395562734, + "learning_rate": 3.6385961534037506e-06, + "loss": 0.4748, + "step": 3979 + }, + { + "epoch": 0.6, + "grad_norm": 3.458976524699675, + "learning_rate": 3.6362460620792755e-06, + "loss": 0.443, + "step": 3980 + }, + { + "epoch": 0.6, + "grad_norm": 5.34339235804711, + "learning_rate": 3.6338962962006504e-06, + "loss": 0.4501, + "step": 3981 + }, + { + "epoch": 0.6, + "grad_norm": 4.791445218336222, + "learning_rate": 3.6315468563286204e-06, + "loss": 0.5081, + "step": 3982 + }, + { + "epoch": 0.6, + "grad_norm": 3.170620791205664, + "learning_rate": 3.6291977430238544e-06, + "loss": 0.4685, + "step": 3983 + }, + { + "epoch": 0.6, + "grad_norm": 4.685833756703963, + "learning_rate": 3.626848956846947e-06, + "loss": 0.4652, + "step": 3984 + }, + { + "epoch": 0.6, + "grad_norm": 2.7807688779558157, + "learning_rate": 3.624500498358407e-06, + "loss": 0.4613, + "step": 3985 + }, + { + "epoch": 0.6, + "grad_norm": 5.444401965086056, + "learning_rate": 3.6221523681186755e-06, + "loss": 0.4849, + "step": 3986 + }, + { + "epoch": 0.6, + "grad_norm": 5.108764509632119, + "learning_rate": 3.619804566688108e-06, + "loss": 0.4622, + "step": 3987 + }, + { + "epoch": 0.6, + "grad_norm": 2.707945294768867, + "learning_rate": 3.6174570946269794e-06, + "loss": 0.4689, + "step": 3988 + }, + { + "epoch": 0.6, + "grad_norm": 3.907174698943255, + "learning_rate": 3.615109952495495e-06, + "loss": 0.4331, + "step": 3989 + }, + { + "epoch": 0.6, + "grad_norm": 4.605965764594416, + "learning_rate": 3.612763140853771e-06, + "loss": 0.4421, + "step": 3990 + }, + { + "epoch": 0.6, + "grad_norm": 3.9770040486925007, + "learning_rate": 3.6104166602618552e-06, + "loss": 0.4087, + "step": 3991 + }, + { + "epoch": 0.6, + "grad_norm": 8.300240674285796, + "learning_rate": 3.6080705112797077e-06, + "loss": 0.4427, + "step": 3992 + }, + { + "epoch": 0.6, + "grad_norm": 2.9492693362347153, + "learning_rate": 3.605724694467212e-06, + "loss": 0.4301, + "step": 3993 + }, + { + "epoch": 0.6, + "grad_norm": 5.906306608948889, + "learning_rate": 3.6033792103841763e-06, + "loss": 0.3949, + "step": 3994 + }, + { + "epoch": 0.6, + "grad_norm": 4.811142046787412, + "learning_rate": 3.601034059590324e-06, + "loss": 0.4986, + "step": 3995 + }, + { + "epoch": 0.6, + "grad_norm": 3.8544245758084106, + "learning_rate": 3.5986892426453024e-06, + "loss": 0.4653, + "step": 3996 + }, + { + "epoch": 0.6, + "grad_norm": 2.925799766472331, + "learning_rate": 3.596344760108678e-06, + "loss": 0.4193, + "step": 3997 + }, + { + "epoch": 0.6, + "grad_norm": 2.502867745035287, + "learning_rate": 3.594000612539935e-06, + "loss": 0.4533, + "step": 3998 + }, + { + "epoch": 0.6, + "grad_norm": 4.428192361402524, + "learning_rate": 3.591656800498483e-06, + "loss": 0.4336, + "step": 3999 + }, + { + "epoch": 0.6, + "grad_norm": 7.569962374830117, + "learning_rate": 3.5893133245436475e-06, + "loss": 0.464, + "step": 4000 + }, + { + "epoch": 0.6, + "grad_norm": 2.7408903621180176, + "learning_rate": 3.5869701852346763e-06, + "loss": 0.3715, + "step": 4001 + }, + { + "epoch": 0.6, + "grad_norm": 20.385413649027072, + "learning_rate": 3.584627383130733e-06, + "loss": 0.4159, + "step": 4002 + }, + { + "epoch": 0.6, + "grad_norm": 9.788405223074752, + "learning_rate": 3.5822849187909027e-06, + "loss": 0.4541, + "step": 4003 + }, + { + "epoch": 0.6, + "grad_norm": 3.3787999623014033, + "learning_rate": 3.579942792774195e-06, + "loss": 0.5147, + "step": 4004 + }, + { + "epoch": 0.6, + "grad_norm": 2.851494378698466, + "learning_rate": 3.5776010056395294e-06, + "loss": 0.5066, + "step": 4005 + }, + { + "epoch": 0.6, + "grad_norm": 4.035042938643484, + "learning_rate": 3.5752595579457526e-06, + "loss": 0.3936, + "step": 4006 + }, + { + "epoch": 0.6, + "grad_norm": 3.8187556215828, + "learning_rate": 3.5729184502516255e-06, + "loss": 0.4363, + "step": 4007 + }, + { + "epoch": 0.6, + "grad_norm": 3.3644227028526674, + "learning_rate": 3.5705776831158277e-06, + "loss": 0.428, + "step": 4008 + }, + { + "epoch": 0.6, + "grad_norm": 4.347480314351162, + "learning_rate": 3.5682372570969628e-06, + "loss": 0.4233, + "step": 4009 + }, + { + "epoch": 0.6, + "grad_norm": 3.113569062661106, + "learning_rate": 3.565897172753545e-06, + "loss": 0.3856, + "step": 4010 + }, + { + "epoch": 0.6, + "grad_norm": 4.265739931296165, + "learning_rate": 3.563557430644017e-06, + "loss": 0.4245, + "step": 4011 + }, + { + "epoch": 0.61, + "grad_norm": 5.1991291856913096, + "learning_rate": 3.561218031326731e-06, + "loss": 0.4459, + "step": 4012 + }, + { + "epoch": 0.61, + "grad_norm": 5.077278530620917, + "learning_rate": 3.5588789753599594e-06, + "loss": 0.4508, + "step": 4013 + }, + { + "epoch": 0.61, + "grad_norm": 4.144920789452597, + "learning_rate": 3.5565402633018963e-06, + "loss": 0.4683, + "step": 4014 + }, + { + "epoch": 0.61, + "grad_norm": 4.784236318720534, + "learning_rate": 3.554201895710648e-06, + "loss": 0.4209, + "step": 4015 + }, + { + "epoch": 0.61, + "grad_norm": 8.603804583771765, + "learning_rate": 3.5518638731442454e-06, + "loss": 0.47, + "step": 4016 + }, + { + "epoch": 0.61, + "grad_norm": 3.918355727188468, + "learning_rate": 3.5495261961606332e-06, + "loss": 0.457, + "step": 4017 + }, + { + "epoch": 0.61, + "grad_norm": 1.1344735466323284, + "learning_rate": 3.547188865317671e-06, + "loss": 0.5404, + "step": 4018 + }, + { + "epoch": 0.61, + "grad_norm": 3.2138200586442682, + "learning_rate": 3.5448518811731424e-06, + "loss": 0.4468, + "step": 4019 + }, + { + "epoch": 0.61, + "grad_norm": 4.4180797151960425, + "learning_rate": 3.5425152442847405e-06, + "loss": 0.4581, + "step": 4020 + }, + { + "epoch": 0.61, + "grad_norm": 5.62687617142585, + "learning_rate": 3.540178955210084e-06, + "loss": 0.5304, + "step": 4021 + }, + { + "epoch": 0.61, + "grad_norm": 3.556292400368309, + "learning_rate": 3.537843014506702e-06, + "loss": 0.4269, + "step": 4022 + }, + { + "epoch": 0.61, + "grad_norm": 3.4310153502267564, + "learning_rate": 3.535507422732043e-06, + "loss": 0.4044, + "step": 4023 + }, + { + "epoch": 0.61, + "grad_norm": 3.8490371471988003, + "learning_rate": 3.5331721804434728e-06, + "loss": 0.4572, + "step": 4024 + }, + { + "epoch": 0.61, + "grad_norm": 3.392016365796824, + "learning_rate": 3.5308372881982687e-06, + "loss": 0.4306, + "step": 4025 + }, + { + "epoch": 0.61, + "grad_norm": 4.3600933707934075, + "learning_rate": 3.5285027465536353e-06, + "loss": 0.5201, + "step": 4026 + }, + { + "epoch": 0.61, + "grad_norm": 3.7862054681419726, + "learning_rate": 3.5261685560666836e-06, + "loss": 0.5053, + "step": 4027 + }, + { + "epoch": 0.61, + "grad_norm": 1.1972869741681706, + "learning_rate": 3.523834717294442e-06, + "loss": 0.5238, + "step": 4028 + }, + { + "epoch": 0.61, + "grad_norm": 2.6851024667049597, + "learning_rate": 3.5215012307938622e-06, + "loss": 0.4111, + "step": 4029 + }, + { + "epoch": 0.61, + "grad_norm": 4.687887414271254, + "learning_rate": 3.5191680971218024e-06, + "loss": 0.4187, + "step": 4030 + }, + { + "epoch": 0.61, + "grad_norm": 4.522284973057458, + "learning_rate": 3.5168353168350433e-06, + "loss": 0.4383, + "step": 4031 + }, + { + "epoch": 0.61, + "grad_norm": 4.648043688463352, + "learning_rate": 3.514502890490279e-06, + "loss": 0.4824, + "step": 4032 + }, + { + "epoch": 0.61, + "grad_norm": 3.66733954413022, + "learning_rate": 3.512170818644115e-06, + "loss": 0.4491, + "step": 4033 + }, + { + "epoch": 0.61, + "grad_norm": 4.356066978837546, + "learning_rate": 3.509839101853082e-06, + "loss": 0.4103, + "step": 4034 + }, + { + "epoch": 0.61, + "grad_norm": 4.116187965764531, + "learning_rate": 3.507507740673617e-06, + "loss": 0.4113, + "step": 4035 + }, + { + "epoch": 0.61, + "grad_norm": 3.5508064043655843, + "learning_rate": 3.505176735662077e-06, + "loss": 0.4832, + "step": 4036 + }, + { + "epoch": 0.61, + "grad_norm": 4.14323474333763, + "learning_rate": 3.502846087374731e-06, + "loss": 0.4411, + "step": 4037 + }, + { + "epoch": 0.61, + "grad_norm": 3.4688436849602327, + "learning_rate": 3.5005157963677625e-06, + "loss": 0.3867, + "step": 4038 + }, + { + "epoch": 0.61, + "grad_norm": 4.965346364036287, + "learning_rate": 3.4981858631972764e-06, + "loss": 0.4535, + "step": 4039 + }, + { + "epoch": 0.61, + "grad_norm": 19.753306391292035, + "learning_rate": 3.4958562884192827e-06, + "loss": 0.4672, + "step": 4040 + }, + { + "epoch": 0.61, + "grad_norm": 3.212642947465992, + "learning_rate": 3.493527072589714e-06, + "loss": 0.4556, + "step": 4041 + }, + { + "epoch": 0.61, + "grad_norm": 7.325872056679902, + "learning_rate": 3.4911982162644115e-06, + "loss": 0.4697, + "step": 4042 + }, + { + "epoch": 0.61, + "grad_norm": 2.4481126102536233, + "learning_rate": 3.488869719999131e-06, + "loss": 0.4224, + "step": 4043 + }, + { + "epoch": 0.61, + "grad_norm": 3.136604087663385, + "learning_rate": 3.4865415843495485e-06, + "loss": 0.4373, + "step": 4044 + }, + { + "epoch": 0.61, + "grad_norm": 7.354065009877868, + "learning_rate": 3.4842138098712463e-06, + "loss": 0.3881, + "step": 4045 + }, + { + "epoch": 0.61, + "grad_norm": 2.6080057548778397, + "learning_rate": 3.481886397119727e-06, + "loss": 0.4476, + "step": 4046 + }, + { + "epoch": 0.61, + "grad_norm": 4.865163645571122, + "learning_rate": 3.479559346650401e-06, + "loss": 0.4976, + "step": 4047 + }, + { + "epoch": 0.61, + "grad_norm": 3.2944238927361624, + "learning_rate": 3.4772326590185957e-06, + "loss": 0.4015, + "step": 4048 + }, + { + "epoch": 0.61, + "grad_norm": 3.400152120338766, + "learning_rate": 3.4749063347795514e-06, + "loss": 0.4205, + "step": 4049 + }, + { + "epoch": 0.61, + "grad_norm": 3.8344291810765423, + "learning_rate": 3.4725803744884197e-06, + "loss": 0.4682, + "step": 4050 + }, + { + "epoch": 0.61, + "grad_norm": 3.2424943774541988, + "learning_rate": 3.4702547787002706e-06, + "loss": 0.4721, + "step": 4051 + }, + { + "epoch": 0.61, + "grad_norm": 2.334924833238669, + "learning_rate": 3.4679295479700814e-06, + "loss": 0.4279, + "step": 4052 + }, + { + "epoch": 0.61, + "grad_norm": 5.141393375792844, + "learning_rate": 3.465604682852744e-06, + "loss": 0.4881, + "step": 4053 + }, + { + "epoch": 0.61, + "grad_norm": 2.643999399469282, + "learning_rate": 3.4632801839030656e-06, + "loss": 0.4852, + "step": 4054 + }, + { + "epoch": 0.61, + "grad_norm": 3.0040675409439754, + "learning_rate": 3.4609560516757602e-06, + "loss": 0.4923, + "step": 4055 + }, + { + "epoch": 0.61, + "grad_norm": 2.8225482656878365, + "learning_rate": 3.4586322867254628e-06, + "loss": 0.4855, + "step": 4056 + }, + { + "epoch": 0.61, + "grad_norm": 3.21572189714957, + "learning_rate": 3.456308889606714e-06, + "loss": 0.381, + "step": 4057 + }, + { + "epoch": 0.61, + "grad_norm": 5.625519854635449, + "learning_rate": 3.4539858608739667e-06, + "loss": 0.4962, + "step": 4058 + }, + { + "epoch": 0.61, + "grad_norm": 2.399521888680049, + "learning_rate": 3.451663201081591e-06, + "loss": 0.4338, + "step": 4059 + }, + { + "epoch": 0.61, + "grad_norm": 3.495567250438975, + "learning_rate": 3.4493409107838618e-06, + "loss": 0.3367, + "step": 4060 + }, + { + "epoch": 0.61, + "grad_norm": 2.8580752812641745, + "learning_rate": 3.447018990534975e-06, + "loss": 0.4818, + "step": 4061 + }, + { + "epoch": 0.61, + "grad_norm": 3.5408231594346247, + "learning_rate": 3.4446974408890295e-06, + "loss": 0.3608, + "step": 4062 + }, + { + "epoch": 0.61, + "grad_norm": 3.239499267516946, + "learning_rate": 3.442376262400038e-06, + "loss": 0.4519, + "step": 4063 + }, + { + "epoch": 0.61, + "grad_norm": 4.55063718568584, + "learning_rate": 3.44005545562193e-06, + "loss": 0.5071, + "step": 4064 + }, + { + "epoch": 0.61, + "grad_norm": 5.133845257128605, + "learning_rate": 3.437735021108539e-06, + "loss": 0.4319, + "step": 4065 + }, + { + "epoch": 0.61, + "grad_norm": 2.707557216301683, + "learning_rate": 3.435414959413614e-06, + "loss": 0.4029, + "step": 4066 + }, + { + "epoch": 0.61, + "grad_norm": 3.023728042445161, + "learning_rate": 3.433095271090815e-06, + "loss": 0.3984, + "step": 4067 + }, + { + "epoch": 0.61, + "grad_norm": 3.4128677451251845, + "learning_rate": 3.430775956693707e-06, + "loss": 0.4201, + "step": 4068 + }, + { + "epoch": 0.61, + "grad_norm": 2.9119567478435466, + "learning_rate": 3.428457016775777e-06, + "loss": 0.4666, + "step": 4069 + }, + { + "epoch": 0.61, + "grad_norm": 4.0184967300669365, + "learning_rate": 3.4261384518904116e-06, + "loss": 0.4419, + "step": 4070 + }, + { + "epoch": 0.61, + "grad_norm": 2.7946789774822434, + "learning_rate": 3.423820262590915e-06, + "loss": 0.3978, + "step": 4071 + }, + { + "epoch": 0.61, + "grad_norm": 4.314063419130091, + "learning_rate": 3.4215024494304982e-06, + "loss": 0.4918, + "step": 4072 + }, + { + "epoch": 0.61, + "grad_norm": 3.0664962220843424, + "learning_rate": 3.4191850129622816e-06, + "loss": 0.5281, + "step": 4073 + }, + { + "epoch": 0.61, + "grad_norm": 3.063403964501148, + "learning_rate": 3.4168679537393013e-06, + "loss": 0.4897, + "step": 4074 + }, + { + "epoch": 0.61, + "grad_norm": 2.9832900125881534, + "learning_rate": 3.4145512723144967e-06, + "loss": 0.3958, + "step": 4075 + }, + { + "epoch": 0.61, + "grad_norm": 3.6376765553586967, + "learning_rate": 3.412234969240722e-06, + "loss": 0.3613, + "step": 4076 + }, + { + "epoch": 0.61, + "grad_norm": 2.614262544507862, + "learning_rate": 3.409919045070739e-06, + "loss": 0.4669, + "step": 4077 + }, + { + "epoch": 0.62, + "grad_norm": 5.471682755427336, + "learning_rate": 3.407603500357215e-06, + "loss": 0.4645, + "step": 4078 + }, + { + "epoch": 0.62, + "grad_norm": 3.856667770090094, + "learning_rate": 3.4052883356527367e-06, + "loss": 0.4205, + "step": 4079 + }, + { + "epoch": 0.62, + "grad_norm": 2.9295399918661236, + "learning_rate": 3.40297355150979e-06, + "loss": 0.4011, + "step": 4080 + }, + { + "epoch": 0.62, + "grad_norm": 3.8220652952320067, + "learning_rate": 3.4006591484807784e-06, + "loss": 0.4016, + "step": 4081 + }, + { + "epoch": 0.62, + "grad_norm": 5.096546809974063, + "learning_rate": 3.398345127118008e-06, + "loss": 0.5163, + "step": 4082 + }, + { + "epoch": 0.62, + "grad_norm": 6.295085389338741, + "learning_rate": 3.3960314879736954e-06, + "loss": 0.3845, + "step": 4083 + }, + { + "epoch": 0.62, + "grad_norm": 4.078405828436352, + "learning_rate": 3.3937182315999682e-06, + "loss": 0.4138, + "step": 4084 + }, + { + "epoch": 0.62, + "grad_norm": 4.819176125989123, + "learning_rate": 3.391405358548859e-06, + "loss": 0.3911, + "step": 4085 + }, + { + "epoch": 0.62, + "grad_norm": 1.088530077992319, + "learning_rate": 3.3890928693723156e-06, + "loss": 0.5482, + "step": 4086 + }, + { + "epoch": 0.62, + "grad_norm": 3.9535732122171208, + "learning_rate": 3.3867807646221872e-06, + "loss": 0.4568, + "step": 4087 + }, + { + "epoch": 0.62, + "grad_norm": 11.693660561605986, + "learning_rate": 3.3844690448502325e-06, + "loss": 0.4372, + "step": 4088 + }, + { + "epoch": 0.62, + "grad_norm": 2.7542625662105626, + "learning_rate": 3.3821577106081215e-06, + "loss": 0.42, + "step": 4089 + }, + { + "epoch": 0.62, + "grad_norm": 3.133255499297002, + "learning_rate": 3.379846762447428e-06, + "loss": 0.3861, + "step": 4090 + }, + { + "epoch": 0.62, + "grad_norm": 3.8231785986383025, + "learning_rate": 3.37753620091964e-06, + "loss": 0.4052, + "step": 4091 + }, + { + "epoch": 0.62, + "grad_norm": 3.357325787470231, + "learning_rate": 3.375226026576146e-06, + "loss": 0.4904, + "step": 4092 + }, + { + "epoch": 0.62, + "grad_norm": 3.593308397152471, + "learning_rate": 3.372916239968246e-06, + "loss": 0.4301, + "step": 4093 + }, + { + "epoch": 0.62, + "grad_norm": 3.7230942151229764, + "learning_rate": 3.370606841647148e-06, + "loss": 0.4815, + "step": 4094 + }, + { + "epoch": 0.62, + "grad_norm": 2.907366375839802, + "learning_rate": 3.3682978321639654e-06, + "loss": 0.4591, + "step": 4095 + }, + { + "epoch": 0.62, + "grad_norm": 3.359502140478121, + "learning_rate": 3.365989212069717e-06, + "loss": 0.5102, + "step": 4096 + }, + { + "epoch": 0.62, + "grad_norm": 5.161584237022204, + "learning_rate": 3.363680981915336e-06, + "loss": 0.4307, + "step": 4097 + }, + { + "epoch": 0.62, + "grad_norm": 3.8795737103518575, + "learning_rate": 3.3613731422516534e-06, + "loss": 0.4453, + "step": 4098 + }, + { + "epoch": 0.62, + "grad_norm": 2.9846780106631647, + "learning_rate": 3.359065693629415e-06, + "loss": 0.4085, + "step": 4099 + }, + { + "epoch": 0.62, + "grad_norm": 2.7881602507502006, + "learning_rate": 3.3567586365992687e-06, + "loss": 0.4043, + "step": 4100 + }, + { + "epoch": 0.62, + "grad_norm": 2.670572860799817, + "learning_rate": 3.3544519717117683e-06, + "loss": 0.4712, + "step": 4101 + }, + { + "epoch": 0.62, + "grad_norm": 3.3892969376016806, + "learning_rate": 3.3521456995173774e-06, + "loss": 0.4163, + "step": 4102 + }, + { + "epoch": 0.62, + "grad_norm": 3.5974569106702914, + "learning_rate": 3.349839820566462e-06, + "loss": 0.4476, + "step": 4103 + }, + { + "epoch": 0.62, + "grad_norm": 15.369537354780407, + "learning_rate": 3.347534335409299e-06, + "loss": 0.5016, + "step": 4104 + }, + { + "epoch": 0.62, + "grad_norm": 3.441457117957543, + "learning_rate": 3.3452292445960676e-06, + "loss": 0.4207, + "step": 4105 + }, + { + "epoch": 0.62, + "grad_norm": 4.83868381301182, + "learning_rate": 3.3429245486768538e-06, + "loss": 0.402, + "step": 4106 + }, + { + "epoch": 0.62, + "grad_norm": 2.938646553091695, + "learning_rate": 3.3406202482016493e-06, + "loss": 0.4418, + "step": 4107 + }, + { + "epoch": 0.62, + "grad_norm": 2.979804355797596, + "learning_rate": 3.3383163437203507e-06, + "loss": 0.4139, + "step": 4108 + }, + { + "epoch": 0.62, + "grad_norm": 3.3365716750619128, + "learning_rate": 3.3360128357827647e-06, + "loss": 0.4409, + "step": 4109 + }, + { + "epoch": 0.62, + "grad_norm": 2.44924014142774, + "learning_rate": 3.3337097249385973e-06, + "loss": 0.4408, + "step": 4110 + }, + { + "epoch": 0.62, + "grad_norm": 3.9480677648974996, + "learning_rate": 3.331407011737462e-06, + "loss": 0.4514, + "step": 4111 + }, + { + "epoch": 0.62, + "grad_norm": 3.1013596536723944, + "learning_rate": 3.329104696728879e-06, + "loss": 0.4645, + "step": 4112 + }, + { + "epoch": 0.62, + "grad_norm": 3.4205428794364803, + "learning_rate": 3.3268027804622695e-06, + "loss": 0.5026, + "step": 4113 + }, + { + "epoch": 0.62, + "grad_norm": 4.2625849021849, + "learning_rate": 3.324501263486965e-06, + "loss": 0.4282, + "step": 4114 + }, + { + "epoch": 0.62, + "grad_norm": 4.483131019450267, + "learning_rate": 3.322200146352198e-06, + "loss": 0.4547, + "step": 4115 + }, + { + "epoch": 0.62, + "grad_norm": 5.511018207964915, + "learning_rate": 3.319899429607104e-06, + "loss": 0.4586, + "step": 4116 + }, + { + "epoch": 0.62, + "grad_norm": 6.329424894334045, + "learning_rate": 3.31759911380073e-06, + "loss": 0.4567, + "step": 4117 + }, + { + "epoch": 0.62, + "grad_norm": 3.8689168702447008, + "learning_rate": 3.315299199482019e-06, + "loss": 0.4458, + "step": 4118 + }, + { + "epoch": 0.62, + "grad_norm": 3.374575116351191, + "learning_rate": 3.3129996871998237e-06, + "loss": 0.446, + "step": 4119 + }, + { + "epoch": 0.62, + "grad_norm": 5.8174257575364745, + "learning_rate": 3.310700577502899e-06, + "loss": 0.4529, + "step": 4120 + }, + { + "epoch": 0.62, + "grad_norm": 3.1905396767409777, + "learning_rate": 3.3084018709399005e-06, + "loss": 0.4846, + "step": 4121 + }, + { + "epoch": 0.62, + "grad_norm": 3.2684225263685573, + "learning_rate": 3.3061035680593967e-06, + "loss": 0.4467, + "step": 4122 + }, + { + "epoch": 0.62, + "grad_norm": 2.819086736697005, + "learning_rate": 3.3038056694098485e-06, + "loss": 0.3351, + "step": 4123 + }, + { + "epoch": 0.62, + "grad_norm": 4.169389399500903, + "learning_rate": 3.3015081755396305e-06, + "loss": 0.4266, + "step": 4124 + }, + { + "epoch": 0.62, + "grad_norm": 3.6418206259424655, + "learning_rate": 3.2992110869970127e-06, + "loss": 0.4635, + "step": 4125 + }, + { + "epoch": 0.62, + "grad_norm": 1.2632004760848792, + "learning_rate": 3.2969144043301704e-06, + "loss": 0.5505, + "step": 4126 + }, + { + "epoch": 0.62, + "grad_norm": 4.021828557340513, + "learning_rate": 3.294618128087188e-06, + "loss": 0.4407, + "step": 4127 + }, + { + "epoch": 0.62, + "grad_norm": 3.5059832414780914, + "learning_rate": 3.292322258816044e-06, + "loss": 0.504, + "step": 4128 + }, + { + "epoch": 0.62, + "grad_norm": 1.3972618637035412, + "learning_rate": 3.2900267970646273e-06, + "loss": 0.5657, + "step": 4129 + }, + { + "epoch": 0.62, + "grad_norm": 4.783531084916077, + "learning_rate": 3.2877317433807243e-06, + "loss": 0.492, + "step": 4130 + }, + { + "epoch": 0.62, + "grad_norm": 4.557317078228341, + "learning_rate": 3.285437098312024e-06, + "loss": 0.4335, + "step": 4131 + }, + { + "epoch": 0.62, + "grad_norm": 2.9531228885242755, + "learning_rate": 3.2831428624061246e-06, + "loss": 0.4381, + "step": 4132 + }, + { + "epoch": 0.62, + "grad_norm": 2.967109994450408, + "learning_rate": 3.2808490362105178e-06, + "loss": 0.4152, + "step": 4133 + }, + { + "epoch": 0.62, + "grad_norm": 2.8485199355576016, + "learning_rate": 3.278555620272604e-06, + "loss": 0.4971, + "step": 4134 + }, + { + "epoch": 0.62, + "grad_norm": 2.6443138578268917, + "learning_rate": 3.2762626151396832e-06, + "loss": 0.4597, + "step": 4135 + }, + { + "epoch": 0.62, + "grad_norm": 3.667453266928034, + "learning_rate": 3.2739700213589543e-06, + "loss": 0.4726, + "step": 4136 + }, + { + "epoch": 0.62, + "grad_norm": 3.8378470931115167, + "learning_rate": 3.2716778394775255e-06, + "loss": 0.3917, + "step": 4137 + }, + { + "epoch": 0.62, + "grad_norm": 2.5455667890253646, + "learning_rate": 3.269386070042399e-06, + "loss": 0.4655, + "step": 4138 + }, + { + "epoch": 0.62, + "grad_norm": 4.970105256220969, + "learning_rate": 3.2670947136004856e-06, + "loss": 0.4013, + "step": 4139 + }, + { + "epoch": 0.62, + "grad_norm": 14.130624816058997, + "learning_rate": 3.264803770698592e-06, + "loss": 0.4824, + "step": 4140 + }, + { + "epoch": 0.62, + "grad_norm": 2.767995597014194, + "learning_rate": 3.262513241883427e-06, + "loss": 0.4123, + "step": 4141 + }, + { + "epoch": 0.62, + "grad_norm": 4.067819550933766, + "learning_rate": 3.260223127701604e-06, + "loss": 0.4174, + "step": 4142 + }, + { + "epoch": 0.62, + "grad_norm": 8.810689746904155, + "learning_rate": 3.2579334286996313e-06, + "loss": 0.4471, + "step": 4143 + }, + { + "epoch": 0.63, + "grad_norm": 3.100034766633999, + "learning_rate": 3.2556441454239274e-06, + "loss": 0.4174, + "step": 4144 + }, + { + "epoch": 0.63, + "grad_norm": 3.5966439255199, + "learning_rate": 3.2533552784208032e-06, + "loss": 0.4261, + "step": 4145 + }, + { + "epoch": 0.63, + "grad_norm": 4.090498322363859, + "learning_rate": 3.2510668282364734e-06, + "loss": 0.4081, + "step": 4146 + }, + { + "epoch": 0.63, + "grad_norm": 3.1893001675459978, + "learning_rate": 3.2487787954170536e-06, + "loss": 0.3901, + "step": 4147 + }, + { + "epoch": 0.63, + "grad_norm": 7.830068279838418, + "learning_rate": 3.2464911805085577e-06, + "loss": 0.4452, + "step": 4148 + }, + { + "epoch": 0.63, + "grad_norm": 2.7752334817967057, + "learning_rate": 3.244203984056904e-06, + "loss": 0.4624, + "step": 4149 + }, + { + "epoch": 0.63, + "grad_norm": 11.920636569242394, + "learning_rate": 3.2419172066079073e-06, + "loss": 0.4285, + "step": 4150 + }, + { + "epoch": 0.63, + "grad_norm": 2.4336455029864017, + "learning_rate": 3.2396308487072826e-06, + "loss": 0.4268, + "step": 4151 + }, + { + "epoch": 0.63, + "grad_norm": 3.6445613135995427, + "learning_rate": 3.2373449109006476e-06, + "loss": 0.4519, + "step": 4152 + }, + { + "epoch": 0.63, + "grad_norm": 5.988567928380297, + "learning_rate": 3.235059393733515e-06, + "loss": 0.4561, + "step": 4153 + }, + { + "epoch": 0.63, + "grad_norm": 2.3286880896559583, + "learning_rate": 3.2327742977513034e-06, + "loss": 0.4349, + "step": 4154 + }, + { + "epoch": 0.63, + "grad_norm": 5.859370437972936, + "learning_rate": 3.2304896234993255e-06, + "loss": 0.427, + "step": 4155 + }, + { + "epoch": 0.63, + "grad_norm": 2.814227704538722, + "learning_rate": 3.2282053715227934e-06, + "loss": 0.4425, + "step": 4156 + }, + { + "epoch": 0.63, + "grad_norm": 4.049519594715914, + "learning_rate": 3.225921542366825e-06, + "loss": 0.3947, + "step": 4157 + }, + { + "epoch": 0.63, + "grad_norm": 3.270789754081606, + "learning_rate": 3.223638136576429e-06, + "loss": 0.471, + "step": 4158 + }, + { + "epoch": 0.63, + "grad_norm": 3.180175986416722, + "learning_rate": 3.221355154696518e-06, + "loss": 0.4119, + "step": 4159 + }, + { + "epoch": 0.63, + "grad_norm": 2.8695469719514453, + "learning_rate": 3.2190725972719033e-06, + "loss": 0.4502, + "step": 4160 + }, + { + "epoch": 0.63, + "grad_norm": 3.4934928216042325, + "learning_rate": 3.2167904648472896e-06, + "loss": 0.4214, + "step": 4161 + }, + { + "epoch": 0.63, + "grad_norm": 14.465713287539895, + "learning_rate": 3.2145087579672894e-06, + "loss": 0.4326, + "step": 4162 + }, + { + "epoch": 0.63, + "grad_norm": 4.804850285086972, + "learning_rate": 3.212227477176405e-06, + "loss": 0.3765, + "step": 4163 + }, + { + "epoch": 0.63, + "grad_norm": 3.8894928684584356, + "learning_rate": 3.2099466230190436e-06, + "loss": 0.4364, + "step": 4164 + }, + { + "epoch": 0.63, + "grad_norm": 5.586257041590815, + "learning_rate": 3.207666196039505e-06, + "loss": 0.5021, + "step": 4165 + }, + { + "epoch": 0.63, + "grad_norm": 5.784656500382422, + "learning_rate": 3.2053861967819894e-06, + "loss": 0.4536, + "step": 4166 + }, + { + "epoch": 0.63, + "grad_norm": 3.9891940845795726, + "learning_rate": 3.2031066257905973e-06, + "loss": 0.5142, + "step": 4167 + }, + { + "epoch": 0.63, + "grad_norm": 1.2457393398365733, + "learning_rate": 3.200827483609324e-06, + "loss": 0.5392, + "step": 4168 + }, + { + "epoch": 0.63, + "grad_norm": 7.277139420881489, + "learning_rate": 3.1985487707820635e-06, + "loss": 0.5737, + "step": 4169 + }, + { + "epoch": 0.63, + "grad_norm": 9.162792230031295, + "learning_rate": 3.196270487852607e-06, + "loss": 0.457, + "step": 4170 + }, + { + "epoch": 0.63, + "grad_norm": 2.779906764390809, + "learning_rate": 3.1939926353646412e-06, + "loss": 0.4938, + "step": 4171 + }, + { + "epoch": 0.63, + "grad_norm": 6.451719390696094, + "learning_rate": 3.191715213861756e-06, + "loss": 0.4721, + "step": 4172 + }, + { + "epoch": 0.63, + "grad_norm": 4.568459318857308, + "learning_rate": 3.1894382238874307e-06, + "loss": 0.3876, + "step": 4173 + }, + { + "epoch": 0.63, + "grad_norm": 4.727431008057013, + "learning_rate": 3.1871616659850493e-06, + "loss": 0.4209, + "step": 4174 + }, + { + "epoch": 0.63, + "grad_norm": 2.140329912955659, + "learning_rate": 3.1848855406978872e-06, + "loss": 0.4011, + "step": 4175 + }, + { + "epoch": 0.63, + "grad_norm": 5.1335665617801824, + "learning_rate": 3.182609848569118e-06, + "loss": 0.369, + "step": 4176 + }, + { + "epoch": 0.63, + "grad_norm": 2.292738467481602, + "learning_rate": 3.1803345901418126e-06, + "loss": 0.4024, + "step": 4177 + }, + { + "epoch": 0.63, + "grad_norm": 3.0084688940698974, + "learning_rate": 3.1780597659589363e-06, + "loss": 0.4461, + "step": 4178 + }, + { + "epoch": 0.63, + "grad_norm": 4.145930763914243, + "learning_rate": 3.175785376563356e-06, + "loss": 0.4897, + "step": 4179 + }, + { + "epoch": 0.63, + "grad_norm": 2.6364328485092097, + "learning_rate": 3.1735114224978295e-06, + "loss": 0.4418, + "step": 4180 + }, + { + "epoch": 0.63, + "grad_norm": 2.9085466026260534, + "learning_rate": 3.1712379043050116e-06, + "loss": 0.4193, + "step": 4181 + }, + { + "epoch": 0.63, + "grad_norm": 3.1657466834124266, + "learning_rate": 3.168964822527456e-06, + "loss": 0.5386, + "step": 4182 + }, + { + "epoch": 0.63, + "grad_norm": 3.8090696852334833, + "learning_rate": 3.166692177707607e-06, + "loss": 0.4494, + "step": 4183 + }, + { + "epoch": 0.63, + "grad_norm": 3.654037777930578, + "learning_rate": 3.164419970387812e-06, + "loss": 0.3667, + "step": 4184 + }, + { + "epoch": 0.63, + "grad_norm": 6.375401668600867, + "learning_rate": 3.162148201110308e-06, + "loss": 0.4186, + "step": 4185 + }, + { + "epoch": 0.63, + "grad_norm": 3.733822214842829, + "learning_rate": 3.1598768704172283e-06, + "loss": 0.4368, + "step": 4186 + }, + { + "epoch": 0.63, + "grad_norm": 5.203142104926271, + "learning_rate": 3.1576059788506052e-06, + "loss": 0.4143, + "step": 4187 + }, + { + "epoch": 0.63, + "grad_norm": 2.5106096053421543, + "learning_rate": 3.1553355269523594e-06, + "loss": 0.4308, + "step": 4188 + }, + { + "epoch": 0.63, + "grad_norm": 3.180485644778623, + "learning_rate": 3.1530655152643157e-06, + "loss": 0.4143, + "step": 4189 + }, + { + "epoch": 0.63, + "grad_norm": 4.312917387630994, + "learning_rate": 3.1507959443281872e-06, + "loss": 0.3748, + "step": 4190 + }, + { + "epoch": 0.63, + "grad_norm": 6.675023479696254, + "learning_rate": 3.148526814685581e-06, + "loss": 0.4486, + "step": 4191 + }, + { + "epoch": 0.63, + "grad_norm": 4.935785844122816, + "learning_rate": 3.1462581268780058e-06, + "loss": 0.462, + "step": 4192 + }, + { + "epoch": 0.63, + "grad_norm": 3.3755335751767, + "learning_rate": 3.1439898814468585e-06, + "loss": 0.4368, + "step": 4193 + }, + { + "epoch": 0.63, + "grad_norm": 2.319869731786069, + "learning_rate": 3.1417220789334326e-06, + "loss": 0.4003, + "step": 4194 + }, + { + "epoch": 0.63, + "grad_norm": 4.9281100697810745, + "learning_rate": 3.1394547198789173e-06, + "loss": 0.4332, + "step": 4195 + }, + { + "epoch": 0.63, + "grad_norm": 3.50118220624619, + "learning_rate": 3.1371878048243897e-06, + "loss": 0.4153, + "step": 4196 + }, + { + "epoch": 0.63, + "grad_norm": 3.597950823402391, + "learning_rate": 3.134921334310832e-06, + "loss": 0.4746, + "step": 4197 + }, + { + "epoch": 0.63, + "grad_norm": 4.823342534201827, + "learning_rate": 3.1326553088791107e-06, + "loss": 0.4348, + "step": 4198 + }, + { + "epoch": 0.63, + "grad_norm": 3.7286051322202436, + "learning_rate": 3.1303897290699904e-06, + "loss": 0.3972, + "step": 4199 + }, + { + "epoch": 0.63, + "grad_norm": 4.57304316688605, + "learning_rate": 3.1281245954241286e-06, + "loss": 0.3477, + "step": 4200 + }, + { + "epoch": 0.63, + "grad_norm": 4.843591605815458, + "learning_rate": 3.1258599084820734e-06, + "loss": 0.4924, + "step": 4201 + }, + { + "epoch": 0.63, + "grad_norm": 4.446061494156459, + "learning_rate": 3.1235956687842733e-06, + "loss": 0.4437, + "step": 4202 + }, + { + "epoch": 0.63, + "grad_norm": 3.2842103653263885, + "learning_rate": 3.1213318768710634e-06, + "loss": 0.4514, + "step": 4203 + }, + { + "epoch": 0.63, + "grad_norm": 4.078509101844589, + "learning_rate": 3.119068533282675e-06, + "loss": 0.4097, + "step": 4204 + }, + { + "epoch": 0.63, + "grad_norm": 3.8761872484029896, + "learning_rate": 3.1168056385592318e-06, + "loss": 0.4196, + "step": 4205 + }, + { + "epoch": 0.63, + "grad_norm": 2.9881416998584487, + "learning_rate": 3.114543193240748e-06, + "loss": 0.4511, + "step": 4206 + }, + { + "epoch": 0.63, + "grad_norm": 3.2859200436626455, + "learning_rate": 3.112281197867136e-06, + "loss": 0.3775, + "step": 4207 + }, + { + "epoch": 0.63, + "grad_norm": 3.3738781816485077, + "learning_rate": 3.1100196529781977e-06, + "loss": 0.397, + "step": 4208 + }, + { + "epoch": 0.63, + "grad_norm": 5.993913074115008, + "learning_rate": 3.107758559113625e-06, + "loss": 0.4757, + "step": 4209 + }, + { + "epoch": 0.63, + "grad_norm": 2.829888873644678, + "learning_rate": 3.1054979168130074e-06, + "loss": 0.405, + "step": 4210 + }, + { + "epoch": 0.64, + "grad_norm": 4.029030500269111, + "learning_rate": 3.103237726615822e-06, + "loss": 0.4492, + "step": 4211 + }, + { + "epoch": 0.64, + "grad_norm": 4.0634323904280745, + "learning_rate": 3.1009779890614418e-06, + "loss": 0.4344, + "step": 4212 + }, + { + "epoch": 0.64, + "grad_norm": 2.68774751470287, + "learning_rate": 3.098718704689129e-06, + "loss": 0.4587, + "step": 4213 + }, + { + "epoch": 0.64, + "grad_norm": 5.484813580150848, + "learning_rate": 3.0964598740380354e-06, + "loss": 0.4554, + "step": 4214 + }, + { + "epoch": 0.64, + "grad_norm": 4.148140489452931, + "learning_rate": 3.0942014976472134e-06, + "loss": 0.4763, + "step": 4215 + }, + { + "epoch": 0.64, + "grad_norm": 3.5920501665135247, + "learning_rate": 3.0919435760555972e-06, + "loss": 0.4823, + "step": 4216 + }, + { + "epoch": 0.64, + "grad_norm": 2.4204315391585736, + "learning_rate": 3.0896861098020192e-06, + "loss": 0.3781, + "step": 4217 + }, + { + "epoch": 0.64, + "grad_norm": 2.902008321270663, + "learning_rate": 3.0874290994251988e-06, + "loss": 0.3859, + "step": 4218 + }, + { + "epoch": 0.64, + "grad_norm": 3.3183353791392496, + "learning_rate": 3.085172545463747e-06, + "loss": 0.4651, + "step": 4219 + }, + { + "epoch": 0.64, + "grad_norm": 3.349838259522261, + "learning_rate": 3.082916448456171e-06, + "loss": 0.4811, + "step": 4220 + }, + { + "epoch": 0.64, + "grad_norm": 11.414992933107373, + "learning_rate": 3.080660808940862e-06, + "loss": 0.434, + "step": 4221 + }, + { + "epoch": 0.64, + "grad_norm": 3.2349218711913075, + "learning_rate": 3.078405627456107e-06, + "loss": 0.4148, + "step": 4222 + }, + { + "epoch": 0.64, + "grad_norm": 5.484004309073225, + "learning_rate": 3.076150904540081e-06, + "loss": 0.447, + "step": 4223 + }, + { + "epoch": 0.64, + "grad_norm": 13.8554465107056, + "learning_rate": 3.073896640730849e-06, + "loss": 0.4097, + "step": 4224 + }, + { + "epoch": 0.64, + "grad_norm": 5.4628825410170725, + "learning_rate": 3.071642836566371e-06, + "loss": 0.4373, + "step": 4225 + }, + { + "epoch": 0.64, + "grad_norm": 3.473157458929054, + "learning_rate": 3.0693894925844906e-06, + "loss": 0.4502, + "step": 4226 + }, + { + "epoch": 0.64, + "grad_norm": 2.9053544485036134, + "learning_rate": 3.0671366093229495e-06, + "loss": 0.4394, + "step": 4227 + }, + { + "epoch": 0.64, + "grad_norm": 2.850874630445069, + "learning_rate": 3.0648841873193725e-06, + "loss": 0.4164, + "step": 4228 + }, + { + "epoch": 0.64, + "grad_norm": 5.574953364336203, + "learning_rate": 3.062632227111276e-06, + "loss": 0.4255, + "step": 4229 + }, + { + "epoch": 0.64, + "grad_norm": 2.2038555013274514, + "learning_rate": 3.060380729236069e-06, + "loss": 0.4083, + "step": 4230 + }, + { + "epoch": 0.64, + "grad_norm": 3.87574092147822, + "learning_rate": 3.0581296942310464e-06, + "loss": 0.394, + "step": 4231 + }, + { + "epoch": 0.64, + "grad_norm": 4.504919368964063, + "learning_rate": 3.0558791226333974e-06, + "loss": 0.4633, + "step": 4232 + }, + { + "epoch": 0.64, + "grad_norm": 5.033485624101838, + "learning_rate": 3.053629014980196e-06, + "loss": 0.4561, + "step": 4233 + }, + { + "epoch": 0.64, + "grad_norm": 2.7090215797176054, + "learning_rate": 3.051379371808406e-06, + "loss": 0.4361, + "step": 4234 + }, + { + "epoch": 0.64, + "grad_norm": 4.131954816212699, + "learning_rate": 3.0491301936548847e-06, + "loss": 0.4176, + "step": 4235 + }, + { + "epoch": 0.64, + "grad_norm": 2.681442399990167, + "learning_rate": 3.0468814810563714e-06, + "loss": 0.4007, + "step": 4236 + }, + { + "epoch": 0.64, + "grad_norm": 2.83941352189141, + "learning_rate": 3.044633234549502e-06, + "loss": 0.468, + "step": 4237 + }, + { + "epoch": 0.64, + "grad_norm": 3.3232015998886277, + "learning_rate": 3.0423854546707966e-06, + "loss": 0.4862, + "step": 4238 + }, + { + "epoch": 0.64, + "grad_norm": 3.9000610673829677, + "learning_rate": 3.0401381419566624e-06, + "loss": 0.4197, + "step": 4239 + }, + { + "epoch": 0.64, + "grad_norm": 3.2137154805579864, + "learning_rate": 3.0378912969433994e-06, + "loss": 0.5219, + "step": 4240 + }, + { + "epoch": 0.64, + "grad_norm": 7.03641470545588, + "learning_rate": 3.035644920167193e-06, + "loss": 0.4653, + "step": 4241 + }, + { + "epoch": 0.64, + "grad_norm": 3.669601911310437, + "learning_rate": 3.03339901216412e-06, + "loss": 0.4383, + "step": 4242 + }, + { + "epoch": 0.64, + "grad_norm": 3.5535364004142087, + "learning_rate": 3.0311535734701413e-06, + "loss": 0.4829, + "step": 4243 + }, + { + "epoch": 0.64, + "grad_norm": 24.805328316506102, + "learning_rate": 3.028908604621107e-06, + "loss": 0.5084, + "step": 4244 + }, + { + "epoch": 0.64, + "grad_norm": 4.3106147611164465, + "learning_rate": 3.0266641061527584e-06, + "loss": 0.4117, + "step": 4245 + }, + { + "epoch": 0.64, + "grad_norm": 1.2639217482381109, + "learning_rate": 3.0244200786007204e-06, + "loss": 0.518, + "step": 4246 + }, + { + "epoch": 0.64, + "grad_norm": 2.9907223549953996, + "learning_rate": 3.0221765225005078e-06, + "loss": 0.429, + "step": 4247 + }, + { + "epoch": 0.64, + "grad_norm": 3.999994271494147, + "learning_rate": 3.0199334383875223e-06, + "loss": 0.4165, + "step": 4248 + }, + { + "epoch": 0.64, + "grad_norm": 1.07551204483128, + "learning_rate": 3.0176908267970494e-06, + "loss": 0.5264, + "step": 4249 + }, + { + "epoch": 0.64, + "grad_norm": 6.004382197632611, + "learning_rate": 3.015448688264271e-06, + "loss": 0.3438, + "step": 4250 + }, + { + "epoch": 0.64, + "grad_norm": 4.722289554512023, + "learning_rate": 3.013207023324246e-06, + "loss": 0.4296, + "step": 4251 + }, + { + "epoch": 0.64, + "grad_norm": 3.860736230168339, + "learning_rate": 3.010965832511928e-06, + "loss": 0.4479, + "step": 4252 + }, + { + "epoch": 0.64, + "grad_norm": 6.783670247345435, + "learning_rate": 3.0087251163621516e-06, + "loss": 0.4088, + "step": 4253 + }, + { + "epoch": 0.64, + "grad_norm": 3.2907179369922095, + "learning_rate": 3.0064848754096398e-06, + "loss": 0.4316, + "step": 4254 + }, + { + "epoch": 0.64, + "grad_norm": 3.5934671897463204, + "learning_rate": 3.004245110189007e-06, + "loss": 0.4279, + "step": 4255 + }, + { + "epoch": 0.64, + "grad_norm": 3.7973344784980134, + "learning_rate": 3.002005821234746e-06, + "loss": 0.4924, + "step": 4256 + }, + { + "epoch": 0.64, + "grad_norm": 3.26083950210593, + "learning_rate": 2.999767009081243e-06, + "loss": 0.4468, + "step": 4257 + }, + { + "epoch": 0.64, + "grad_norm": 4.255745570860005, + "learning_rate": 2.9975286742627667e-06, + "loss": 0.4909, + "step": 4258 + }, + { + "epoch": 0.64, + "grad_norm": 7.819666463823498, + "learning_rate": 2.9952908173134704e-06, + "loss": 0.5017, + "step": 4259 + }, + { + "epoch": 0.64, + "grad_norm": 2.763800155319947, + "learning_rate": 2.993053438767399e-06, + "loss": 0.4321, + "step": 4260 + }, + { + "epoch": 0.64, + "grad_norm": 3.5531038028014956, + "learning_rate": 2.9908165391584775e-06, + "loss": 0.4306, + "step": 4261 + }, + { + "epoch": 0.64, + "grad_norm": 3.8285004096833473, + "learning_rate": 2.9885801190205215e-06, + "loss": 0.4659, + "step": 4262 + }, + { + "epoch": 0.64, + "grad_norm": 12.687664967667553, + "learning_rate": 2.986344178887228e-06, + "loss": 0.4262, + "step": 4263 + }, + { + "epoch": 0.64, + "grad_norm": 3.6207045711248687, + "learning_rate": 2.98410871929218e-06, + "loss": 0.3788, + "step": 4264 + }, + { + "epoch": 0.64, + "grad_norm": 3.606990772419031, + "learning_rate": 2.98187374076885e-06, + "loss": 0.4473, + "step": 4265 + }, + { + "epoch": 0.64, + "grad_norm": 4.727685269673953, + "learning_rate": 2.979639243850588e-06, + "loss": 0.4185, + "step": 4266 + }, + { + "epoch": 0.64, + "grad_norm": 4.0124699786689035, + "learning_rate": 2.9774052290706386e-06, + "loss": 0.432, + "step": 4267 + }, + { + "epoch": 0.64, + "grad_norm": 5.221983650996144, + "learning_rate": 2.975171696962124e-06, + "loss": 0.341, + "step": 4268 + }, + { + "epoch": 0.64, + "grad_norm": 3.4573217062743313, + "learning_rate": 2.9729386480580535e-06, + "loss": 0.4124, + "step": 4269 + }, + { + "epoch": 0.64, + "grad_norm": 7.8209838307559965, + "learning_rate": 2.9707060828913226e-06, + "loss": 0.4969, + "step": 4270 + }, + { + "epoch": 0.64, + "grad_norm": 1.1869242059904859, + "learning_rate": 2.9684740019947073e-06, + "loss": 0.5589, + "step": 4271 + }, + { + "epoch": 0.64, + "grad_norm": 2.844529840321809, + "learning_rate": 2.9662424059008742e-06, + "loss": 0.4462, + "step": 4272 + }, + { + "epoch": 0.64, + "grad_norm": 1.1814585118069272, + "learning_rate": 2.96401129514237e-06, + "loss": 0.5118, + "step": 4273 + }, + { + "epoch": 0.64, + "grad_norm": 5.49959202083849, + "learning_rate": 2.961780670251624e-06, + "loss": 0.4241, + "step": 4274 + }, + { + "epoch": 0.64, + "grad_norm": 4.6489546747736, + "learning_rate": 2.959550531760954e-06, + "loss": 0.4209, + "step": 4275 + }, + { + "epoch": 0.64, + "grad_norm": 5.031413034018773, + "learning_rate": 2.9573208802025577e-06, + "loss": 0.4754, + "step": 4276 + }, + { + "epoch": 0.65, + "grad_norm": 6.6903598897626875, + "learning_rate": 2.9550917161085213e-06, + "loss": 0.435, + "step": 4277 + }, + { + "epoch": 0.65, + "grad_norm": 3.6026312676211463, + "learning_rate": 2.9528630400108097e-06, + "loss": 0.4949, + "step": 4278 + }, + { + "epoch": 0.65, + "grad_norm": 5.476945756331593, + "learning_rate": 2.950634852441274e-06, + "loss": 0.439, + "step": 4279 + }, + { + "epoch": 0.65, + "grad_norm": 5.8043460578808395, + "learning_rate": 2.9484071539316494e-06, + "loss": 0.451, + "step": 4280 + }, + { + "epoch": 0.65, + "grad_norm": 7.850384099935864, + "learning_rate": 2.9461799450135502e-06, + "loss": 0.4102, + "step": 4281 + }, + { + "epoch": 0.65, + "grad_norm": 2.700714102998691, + "learning_rate": 2.943953226218481e-06, + "loss": 0.3756, + "step": 4282 + }, + { + "epoch": 0.65, + "grad_norm": 4.31944729888374, + "learning_rate": 2.941726998077823e-06, + "loss": 0.4065, + "step": 4283 + }, + { + "epoch": 0.65, + "grad_norm": 3.7162586105225186, + "learning_rate": 2.939501261122841e-06, + "loss": 0.4639, + "step": 4284 + }, + { + "epoch": 0.65, + "grad_norm": 8.807495758188896, + "learning_rate": 2.9372760158846884e-06, + "loss": 0.5127, + "step": 4285 + }, + { + "epoch": 0.65, + "grad_norm": 4.71399331007614, + "learning_rate": 2.935051262894394e-06, + "loss": 0.4317, + "step": 4286 + }, + { + "epoch": 0.65, + "grad_norm": 4.190015487512295, + "learning_rate": 2.9328270026828737e-06, + "loss": 0.4286, + "step": 4287 + }, + { + "epoch": 0.65, + "grad_norm": 1.016634790335939, + "learning_rate": 2.9306032357809232e-06, + "loss": 0.5187, + "step": 4288 + }, + { + "epoch": 0.65, + "grad_norm": 5.579473369660197, + "learning_rate": 2.928379962719221e-06, + "loss": 0.4945, + "step": 4289 + }, + { + "epoch": 0.65, + "grad_norm": 5.78229745485092, + "learning_rate": 2.926157184028331e-06, + "loss": 0.4599, + "step": 4290 + }, + { + "epoch": 0.65, + "grad_norm": 5.323466247480093, + "learning_rate": 2.923934900238694e-06, + "loss": 0.4777, + "step": 4291 + }, + { + "epoch": 0.65, + "grad_norm": 5.814833020144376, + "learning_rate": 2.921713111880637e-06, + "loss": 0.4136, + "step": 4292 + }, + { + "epoch": 0.65, + "grad_norm": 7.457495168334471, + "learning_rate": 2.919491819484366e-06, + "loss": 0.5199, + "step": 4293 + }, + { + "epoch": 0.65, + "grad_norm": 7.14076624129154, + "learning_rate": 2.9172710235799686e-06, + "loss": 0.4343, + "step": 4294 + }, + { + "epoch": 0.65, + "grad_norm": 3.560090442871984, + "learning_rate": 2.915050724697417e-06, + "loss": 0.4318, + "step": 4295 + }, + { + "epoch": 0.65, + "grad_norm": 10.667521784069349, + "learning_rate": 2.9128309233665585e-06, + "loss": 0.4769, + "step": 4296 + }, + { + "epoch": 0.65, + "grad_norm": 5.871134562263709, + "learning_rate": 2.910611620117132e-06, + "loss": 0.4954, + "step": 4297 + }, + { + "epoch": 0.65, + "grad_norm": 3.761334207200338, + "learning_rate": 2.908392815478748e-06, + "loss": 0.4229, + "step": 4298 + }, + { + "epoch": 0.65, + "grad_norm": 3.60594305620509, + "learning_rate": 2.9061745099808997e-06, + "loss": 0.4777, + "step": 4299 + }, + { + "epoch": 0.65, + "grad_norm": 4.356389774295121, + "learning_rate": 2.903956704152967e-06, + "loss": 0.4686, + "step": 4300 + }, + { + "epoch": 0.65, + "grad_norm": 4.832368351611848, + "learning_rate": 2.901739398524202e-06, + "loss": 0.4261, + "step": 4301 + }, + { + "epoch": 0.65, + "grad_norm": 4.44051124362793, + "learning_rate": 2.8995225936237466e-06, + "loss": 0.4674, + "step": 4302 + }, + { + "epoch": 0.65, + "grad_norm": 3.327171468363169, + "learning_rate": 2.897306289980616e-06, + "loss": 0.4292, + "step": 4303 + }, + { + "epoch": 0.65, + "grad_norm": 4.848054489487304, + "learning_rate": 2.8950904881237065e-06, + "loss": 0.4074, + "step": 4304 + }, + { + "epoch": 0.65, + "grad_norm": 3.7426854888982626, + "learning_rate": 2.8928751885817995e-06, + "loss": 0.4426, + "step": 4305 + }, + { + "epoch": 0.65, + "grad_norm": 3.9630468565067294, + "learning_rate": 2.890660391883553e-06, + "loss": 0.4283, + "step": 4306 + }, + { + "epoch": 0.65, + "grad_norm": 1.1947282656835776, + "learning_rate": 2.8884460985575043e-06, + "loss": 0.5508, + "step": 4307 + }, + { + "epoch": 0.65, + "grad_norm": 3.419579978689222, + "learning_rate": 2.886232309132072e-06, + "loss": 0.4615, + "step": 4308 + }, + { + "epoch": 0.65, + "grad_norm": 5.029674828211915, + "learning_rate": 2.8840190241355526e-06, + "loss": 0.4432, + "step": 4309 + }, + { + "epoch": 0.65, + "grad_norm": 5.046092568138356, + "learning_rate": 2.881806244096127e-06, + "loss": 0.4131, + "step": 4310 + }, + { + "epoch": 0.65, + "grad_norm": 4.70295951788894, + "learning_rate": 2.879593969541849e-06, + "loss": 0.4169, + "step": 4311 + }, + { + "epoch": 0.65, + "grad_norm": 3.9876104565843082, + "learning_rate": 2.8773822010006593e-06, + "loss": 0.4699, + "step": 4312 + }, + { + "epoch": 0.65, + "grad_norm": 4.241154178734816, + "learning_rate": 2.875170939000371e-06, + "loss": 0.4659, + "step": 4313 + }, + { + "epoch": 0.65, + "grad_norm": 3.186116935460794, + "learning_rate": 2.8729601840686777e-06, + "loss": 0.3915, + "step": 4314 + }, + { + "epoch": 0.65, + "grad_norm": 5.440808919833905, + "learning_rate": 2.870749936733157e-06, + "loss": 0.4459, + "step": 4315 + }, + { + "epoch": 0.65, + "grad_norm": 9.255718688164043, + "learning_rate": 2.868540197521258e-06, + "loss": 0.4706, + "step": 4316 + }, + { + "epoch": 0.65, + "grad_norm": 3.9417146766522637, + "learning_rate": 2.866330966960316e-06, + "loss": 0.4347, + "step": 4317 + }, + { + "epoch": 0.65, + "grad_norm": 5.697306060566815, + "learning_rate": 2.8641222455775388e-06, + "loss": 0.4401, + "step": 4318 + }, + { + "epoch": 0.65, + "grad_norm": 4.365444123149797, + "learning_rate": 2.861914033900015e-06, + "loss": 0.4505, + "step": 4319 + }, + { + "epoch": 0.65, + "grad_norm": 6.3409113103619354, + "learning_rate": 2.859706332454713e-06, + "loss": 0.4882, + "step": 4320 + }, + { + "epoch": 0.65, + "grad_norm": 1.1306698484462991, + "learning_rate": 2.857499141768476e-06, + "loss": 0.5001, + "step": 4321 + }, + { + "epoch": 0.65, + "grad_norm": 4.410574765447539, + "learning_rate": 2.8552924623680273e-06, + "loss": 0.4975, + "step": 4322 + }, + { + "epoch": 0.65, + "grad_norm": 5.563479156474254, + "learning_rate": 2.8530862947799707e-06, + "loss": 0.3978, + "step": 4323 + }, + { + "epoch": 0.65, + "grad_norm": 4.292523661253694, + "learning_rate": 2.8508806395307825e-06, + "loss": 0.4633, + "step": 4324 + }, + { + "epoch": 0.65, + "grad_norm": 4.447121391229444, + "learning_rate": 2.848675497146823e-06, + "loss": 0.4792, + "step": 4325 + }, + { + "epoch": 0.65, + "grad_norm": 3.3254022648522366, + "learning_rate": 2.846470868154324e-06, + "loss": 0.4306, + "step": 4326 + }, + { + "epoch": 0.65, + "grad_norm": 7.314454355077301, + "learning_rate": 2.8442667530793975e-06, + "loss": 0.4715, + "step": 4327 + }, + { + "epoch": 0.65, + "grad_norm": 1.3589335789412338, + "learning_rate": 2.8420631524480356e-06, + "loss": 0.5793, + "step": 4328 + }, + { + "epoch": 0.65, + "grad_norm": 5.290905484796616, + "learning_rate": 2.8398600667861032e-06, + "loss": 0.4674, + "step": 4329 + }, + { + "epoch": 0.65, + "grad_norm": 5.110200939111929, + "learning_rate": 2.8376574966193432e-06, + "loss": 0.4259, + "step": 4330 + }, + { + "epoch": 0.65, + "grad_norm": 8.372939338726948, + "learning_rate": 2.8354554424733776e-06, + "loss": 0.4123, + "step": 4331 + }, + { + "epoch": 0.65, + "grad_norm": 25.30634865411913, + "learning_rate": 2.833253904873702e-06, + "loss": 0.4843, + "step": 4332 + }, + { + "epoch": 0.65, + "grad_norm": 5.905317262299432, + "learning_rate": 2.831052884345693e-06, + "loss": 0.4428, + "step": 4333 + }, + { + "epoch": 0.65, + "grad_norm": 3.246732088165109, + "learning_rate": 2.828852381414599e-06, + "loss": 0.4258, + "step": 4334 + }, + { + "epoch": 0.65, + "grad_norm": 4.341307430365894, + "learning_rate": 2.8266523966055516e-06, + "loss": 0.3687, + "step": 4335 + }, + { + "epoch": 0.65, + "grad_norm": 4.875475913022291, + "learning_rate": 2.824452930443551e-06, + "loss": 0.4731, + "step": 4336 + }, + { + "epoch": 0.65, + "grad_norm": 3.099843840421281, + "learning_rate": 2.822253983453477e-06, + "loss": 0.4175, + "step": 4337 + }, + { + "epoch": 0.65, + "grad_norm": 3.805262952802066, + "learning_rate": 2.8200555561600884e-06, + "loss": 0.4217, + "step": 4338 + }, + { + "epoch": 0.65, + "grad_norm": 3.8417496624621927, + "learning_rate": 2.8178576490880137e-06, + "loss": 0.4693, + "step": 4339 + }, + { + "epoch": 0.65, + "grad_norm": 4.315259502344015, + "learning_rate": 2.8156602627617647e-06, + "loss": 0.4518, + "step": 4340 + }, + { + "epoch": 0.65, + "grad_norm": 9.701438759857595, + "learning_rate": 2.8134633977057236e-06, + "loss": 0.4394, + "step": 4341 + }, + { + "epoch": 0.65, + "grad_norm": 5.963209728896554, + "learning_rate": 2.811267054444149e-06, + "loss": 0.4053, + "step": 4342 + }, + { + "epoch": 0.66, + "grad_norm": 4.50805954808813, + "learning_rate": 2.8090712335011755e-06, + "loss": 0.4224, + "step": 4343 + }, + { + "epoch": 0.66, + "grad_norm": 5.351187008453021, + "learning_rate": 2.806875935400811e-06, + "loss": 0.4946, + "step": 4344 + }, + { + "epoch": 0.66, + "grad_norm": 3.7530632582233223, + "learning_rate": 2.804681160666947e-06, + "loss": 0.4078, + "step": 4345 + }, + { + "epoch": 0.66, + "grad_norm": 3.631067963612568, + "learning_rate": 2.8024869098233386e-06, + "loss": 0.3993, + "step": 4346 + }, + { + "epoch": 0.66, + "grad_norm": 5.8057556953557485, + "learning_rate": 2.8002931833936213e-06, + "loss": 0.4569, + "step": 4347 + }, + { + "epoch": 0.66, + "grad_norm": 5.794464173769737, + "learning_rate": 2.798099981901309e-06, + "loss": 0.3834, + "step": 4348 + }, + { + "epoch": 0.66, + "grad_norm": 5.773465212202362, + "learning_rate": 2.795907305869783e-06, + "loss": 0.4113, + "step": 4349 + }, + { + "epoch": 0.66, + "grad_norm": 7.628533675439235, + "learning_rate": 2.793715155822305e-06, + "loss": 0.4923, + "step": 4350 + }, + { + "epoch": 0.66, + "grad_norm": 5.011423465785355, + "learning_rate": 2.7915235322820096e-06, + "loss": 0.4717, + "step": 4351 + }, + { + "epoch": 0.66, + "grad_norm": 8.12729451346462, + "learning_rate": 2.789332435771904e-06, + "loss": 0.4027, + "step": 4352 + }, + { + "epoch": 0.66, + "grad_norm": 1.1433392242730365, + "learning_rate": 2.7871418668148703e-06, + "loss": 0.494, + "step": 4353 + }, + { + "epoch": 0.66, + "grad_norm": 3.336123069759631, + "learning_rate": 2.7849518259336637e-06, + "loss": 0.4031, + "step": 4354 + }, + { + "epoch": 0.66, + "grad_norm": 3.9470886230195013, + "learning_rate": 2.7827623136509197e-06, + "loss": 0.4852, + "step": 4355 + }, + { + "epoch": 0.66, + "grad_norm": 4.250727188240011, + "learning_rate": 2.78057333048914e-06, + "loss": 0.3852, + "step": 4356 + }, + { + "epoch": 0.66, + "grad_norm": 12.560849701238379, + "learning_rate": 2.7783848769707016e-06, + "loss": 0.4303, + "step": 4357 + }, + { + "epoch": 0.66, + "grad_norm": 4.212088990541189, + "learning_rate": 2.7761969536178598e-06, + "loss": 0.42, + "step": 4358 + }, + { + "epoch": 0.66, + "grad_norm": 5.567913330313754, + "learning_rate": 2.7740095609527363e-06, + "loss": 0.4302, + "step": 4359 + }, + { + "epoch": 0.66, + "grad_norm": 4.809994176497501, + "learning_rate": 2.771822699497333e-06, + "loss": 0.4466, + "step": 4360 + }, + { + "epoch": 0.66, + "grad_norm": 6.0247448050094, + "learning_rate": 2.7696363697735213e-06, + "loss": 0.4645, + "step": 4361 + }, + { + "epoch": 0.66, + "grad_norm": 3.960530562498612, + "learning_rate": 2.767450572303043e-06, + "loss": 0.4404, + "step": 4362 + }, + { + "epoch": 0.66, + "grad_norm": 4.103447009062216, + "learning_rate": 2.765265307607522e-06, + "loss": 0.4861, + "step": 4363 + }, + { + "epoch": 0.66, + "grad_norm": 7.246512240991744, + "learning_rate": 2.763080576208445e-06, + "loss": 0.446, + "step": 4364 + }, + { + "epoch": 0.66, + "grad_norm": 6.448272992424974, + "learning_rate": 2.760896378627177e-06, + "loss": 0.4706, + "step": 4365 + }, + { + "epoch": 0.66, + "grad_norm": 6.829550773135516, + "learning_rate": 2.758712715384954e-06, + "loss": 0.4737, + "step": 4366 + }, + { + "epoch": 0.66, + "grad_norm": 6.2126655343411805, + "learning_rate": 2.756529587002883e-06, + "loss": 0.3792, + "step": 4367 + }, + { + "epoch": 0.66, + "grad_norm": 4.437739359027148, + "learning_rate": 2.7543469940019486e-06, + "loss": 0.4204, + "step": 4368 + }, + { + "epoch": 0.66, + "grad_norm": 6.212829142047479, + "learning_rate": 2.7521649369030006e-06, + "loss": 0.4356, + "step": 4369 + }, + { + "epoch": 0.66, + "grad_norm": 3.9906867974911813, + "learning_rate": 2.7499834162267684e-06, + "loss": 0.4764, + "step": 4370 + }, + { + "epoch": 0.66, + "grad_norm": 17.452710708997856, + "learning_rate": 2.7478024324938474e-06, + "loss": 0.4624, + "step": 4371 + }, + { + "epoch": 0.66, + "grad_norm": 4.7540841766833495, + "learning_rate": 2.7456219862247047e-06, + "loss": 0.3688, + "step": 4372 + }, + { + "epoch": 0.66, + "grad_norm": 9.268272033177547, + "learning_rate": 2.7434420779396864e-06, + "loss": 0.4398, + "step": 4373 + }, + { + "epoch": 0.66, + "grad_norm": 4.52362885239163, + "learning_rate": 2.7412627081589993e-06, + "loss": 0.3822, + "step": 4374 + }, + { + "epoch": 0.66, + "grad_norm": 3.8909266895296435, + "learning_rate": 2.739083877402734e-06, + "loss": 0.46, + "step": 4375 + }, + { + "epoch": 0.66, + "grad_norm": 4.953537456540554, + "learning_rate": 2.7369055861908418e-06, + "loss": 0.439, + "step": 4376 + }, + { + "epoch": 0.66, + "grad_norm": 5.12068434109332, + "learning_rate": 2.7347278350431505e-06, + "loss": 0.4173, + "step": 4377 + }, + { + "epoch": 0.66, + "grad_norm": 4.715467588011136, + "learning_rate": 2.7325506244793583e-06, + "loss": 0.459, + "step": 4378 + }, + { + "epoch": 0.66, + "grad_norm": 4.25848876181839, + "learning_rate": 2.730373955019031e-06, + "loss": 0.411, + "step": 4379 + }, + { + "epoch": 0.66, + "grad_norm": 8.45368503260071, + "learning_rate": 2.728197827181614e-06, + "loss": 0.4415, + "step": 4380 + }, + { + "epoch": 0.66, + "grad_norm": 1.2061540680364977, + "learning_rate": 2.7260222414864145e-06, + "loss": 0.5208, + "step": 4381 + }, + { + "epoch": 0.66, + "grad_norm": 3.794568301926648, + "learning_rate": 2.723847198452612e-06, + "loss": 0.4405, + "step": 4382 + }, + { + "epoch": 0.66, + "grad_norm": 2.870938639791854, + "learning_rate": 2.7216726985992627e-06, + "loss": 0.4574, + "step": 4383 + }, + { + "epoch": 0.66, + "grad_norm": 3.479505958164214, + "learning_rate": 2.7194987424452834e-06, + "loss": 0.4528, + "step": 4384 + }, + { + "epoch": 0.66, + "grad_norm": 11.514641412860714, + "learning_rate": 2.7173253305094717e-06, + "loss": 0.4463, + "step": 4385 + }, + { + "epoch": 0.66, + "grad_norm": 2.9313806145047487, + "learning_rate": 2.7151524633104874e-06, + "loss": 0.3647, + "step": 4386 + }, + { + "epoch": 0.66, + "grad_norm": 4.60966959573462, + "learning_rate": 2.7129801413668626e-06, + "loss": 0.4488, + "step": 4387 + }, + { + "epoch": 0.66, + "grad_norm": 7.496082134423018, + "learning_rate": 2.710808365197e-06, + "loss": 0.528, + "step": 4388 + }, + { + "epoch": 0.66, + "grad_norm": 3.980087493830957, + "learning_rate": 2.70863713531917e-06, + "loss": 0.4756, + "step": 4389 + }, + { + "epoch": 0.66, + "grad_norm": 4.446874548239667, + "learning_rate": 2.7064664522515172e-06, + "loss": 0.4268, + "step": 4390 + }, + { + "epoch": 0.66, + "grad_norm": 5.00240919918048, + "learning_rate": 2.7042963165120516e-06, + "loss": 0.4837, + "step": 4391 + }, + { + "epoch": 0.66, + "grad_norm": 3.7764154699034873, + "learning_rate": 2.702126728618652e-06, + "loss": 0.3895, + "step": 4392 + }, + { + "epoch": 0.66, + "grad_norm": 4.053744569789017, + "learning_rate": 2.699957689089071e-06, + "loss": 0.4383, + "step": 4393 + }, + { + "epoch": 0.66, + "grad_norm": 4.497921837348814, + "learning_rate": 2.6977891984409254e-06, + "loss": 0.4618, + "step": 4394 + }, + { + "epoch": 0.66, + "grad_norm": 1.1559608750588002, + "learning_rate": 2.695621257191705e-06, + "loss": 0.4985, + "step": 4395 + }, + { + "epoch": 0.66, + "grad_norm": 4.174164904502262, + "learning_rate": 2.693453865858767e-06, + "loss": 0.4354, + "step": 4396 + }, + { + "epoch": 0.66, + "grad_norm": 5.682379889659037, + "learning_rate": 2.6912870249593333e-06, + "loss": 0.4576, + "step": 4397 + }, + { + "epoch": 0.66, + "grad_norm": 35.94340271548921, + "learning_rate": 2.6891207350105025e-06, + "loss": 0.4143, + "step": 4398 + }, + { + "epoch": 0.66, + "grad_norm": 5.154744498229646, + "learning_rate": 2.6869549965292364e-06, + "loss": 0.4779, + "step": 4399 + }, + { + "epoch": 0.66, + "grad_norm": 3.1675618714759293, + "learning_rate": 2.6847898100323645e-06, + "loss": 0.4336, + "step": 4400 + }, + { + "epoch": 0.66, + "grad_norm": 5.161446751526091, + "learning_rate": 2.6826251760365887e-06, + "loss": 0.4038, + "step": 4401 + }, + { + "epoch": 0.66, + "grad_norm": 3.2678951590518412, + "learning_rate": 2.680461095058472e-06, + "loss": 0.4443, + "step": 4402 + }, + { + "epoch": 0.66, + "grad_norm": 5.550965756267644, + "learning_rate": 2.678297567614455e-06, + "loss": 0.4793, + "step": 4403 + }, + { + "epoch": 0.66, + "grad_norm": 4.952472736138958, + "learning_rate": 2.6761345942208384e-06, + "loss": 0.3951, + "step": 4404 + }, + { + "epoch": 0.66, + "grad_norm": 3.9276301742694, + "learning_rate": 2.6739721753937963e-06, + "loss": 0.4009, + "step": 4405 + }, + { + "epoch": 0.66, + "grad_norm": 1.1446247743952453, + "learning_rate": 2.6718103116493666e-06, + "loss": 0.5652, + "step": 4406 + }, + { + "epoch": 0.66, + "grad_norm": 4.9940060241387165, + "learning_rate": 2.669649003503453e-06, + "loss": 0.474, + "step": 4407 + }, + { + "epoch": 0.66, + "grad_norm": 3.6215339534667277, + "learning_rate": 2.6674882514718335e-06, + "loss": 0.426, + "step": 4408 + }, + { + "epoch": 0.67, + "grad_norm": 5.529765672138517, + "learning_rate": 2.665328056070147e-06, + "loss": 0.4699, + "step": 4409 + }, + { + "epoch": 0.67, + "grad_norm": 4.598942845530387, + "learning_rate": 2.663168417813903e-06, + "loss": 0.4835, + "step": 4410 + }, + { + "epoch": 0.67, + "grad_norm": 16.0907759285874, + "learning_rate": 2.6610093372184764e-06, + "loss": 0.4519, + "step": 4411 + }, + { + "epoch": 0.67, + "grad_norm": 8.56009402126643, + "learning_rate": 2.658850814799111e-06, + "loss": 0.4197, + "step": 4412 + }, + { + "epoch": 0.67, + "grad_norm": 5.886579430888448, + "learning_rate": 2.6566928510709135e-06, + "loss": 0.427, + "step": 4413 + }, + { + "epoch": 0.67, + "grad_norm": 7.965329351102854, + "learning_rate": 2.6545354465488593e-06, + "loss": 0.4249, + "step": 4414 + }, + { + "epoch": 0.67, + "grad_norm": 28.381606758235975, + "learning_rate": 2.652378601747795e-06, + "loss": 0.4896, + "step": 4415 + }, + { + "epoch": 0.67, + "grad_norm": 6.034779291478902, + "learning_rate": 2.650222317182426e-06, + "loss": 0.4481, + "step": 4416 + }, + { + "epoch": 0.67, + "grad_norm": 4.931396386376095, + "learning_rate": 2.648066593367327e-06, + "loss": 0.4271, + "step": 4417 + }, + { + "epoch": 0.67, + "grad_norm": 6.7103258570970015, + "learning_rate": 2.6459114308169414e-06, + "loss": 0.3913, + "step": 4418 + }, + { + "epoch": 0.67, + "grad_norm": 4.829445458272366, + "learning_rate": 2.643756830045574e-06, + "loss": 0.4875, + "step": 4419 + }, + { + "epoch": 0.67, + "grad_norm": 5.599875800177118, + "learning_rate": 2.6416027915674027e-06, + "loss": 0.4439, + "step": 4420 + }, + { + "epoch": 0.67, + "grad_norm": 2.9642682234151705, + "learning_rate": 2.6394493158964628e-06, + "loss": 0.414, + "step": 4421 + }, + { + "epoch": 0.67, + "grad_norm": 7.582960253276647, + "learning_rate": 2.637296403546661e-06, + "loss": 0.3911, + "step": 4422 + }, + { + "epoch": 0.67, + "grad_norm": 4.390873301554247, + "learning_rate": 2.6351440550317665e-06, + "loss": 0.4043, + "step": 4423 + }, + { + "epoch": 0.67, + "grad_norm": 4.533277350610243, + "learning_rate": 2.6329922708654124e-06, + "loss": 0.3888, + "step": 4424 + }, + { + "epoch": 0.67, + "grad_norm": 10.150295216726759, + "learning_rate": 2.630841051561106e-06, + "loss": 0.4754, + "step": 4425 + }, + { + "epoch": 0.67, + "grad_norm": 272.026482859545, + "learning_rate": 2.62869039763221e-06, + "loss": 0.53, + "step": 4426 + }, + { + "epoch": 0.67, + "grad_norm": 4.047501765964948, + "learning_rate": 2.6265403095919547e-06, + "loss": 0.4107, + "step": 4427 + }, + { + "epoch": 0.67, + "grad_norm": 4.213980613919312, + "learning_rate": 2.6243907879534397e-06, + "loss": 0.4918, + "step": 4428 + }, + { + "epoch": 0.67, + "grad_norm": 7.305880841344576, + "learning_rate": 2.622241833229623e-06, + "loss": 0.4531, + "step": 4429 + }, + { + "epoch": 0.67, + "grad_norm": 3.923891070606426, + "learning_rate": 2.6200934459333337e-06, + "loss": 0.4043, + "step": 4430 + }, + { + "epoch": 0.67, + "grad_norm": 4.487964336511981, + "learning_rate": 2.617945626577262e-06, + "loss": 0.4389, + "step": 4431 + }, + { + "epoch": 0.67, + "grad_norm": 3.980207956292584, + "learning_rate": 2.615798375673958e-06, + "loss": 0.4549, + "step": 4432 + }, + { + "epoch": 0.67, + "grad_norm": 6.962797961768096, + "learning_rate": 2.613651693735848e-06, + "loss": 0.4402, + "step": 4433 + }, + { + "epoch": 0.67, + "grad_norm": 4.628685374014429, + "learning_rate": 2.611505581275212e-06, + "loss": 0.3865, + "step": 4434 + }, + { + "epoch": 0.67, + "grad_norm": 4.997354702589493, + "learning_rate": 2.6093600388041977e-06, + "loss": 0.3422, + "step": 4435 + }, + { + "epoch": 0.67, + "grad_norm": 7.3507032031446435, + "learning_rate": 2.6072150668348175e-06, + "loss": 0.4204, + "step": 4436 + }, + { + "epoch": 0.67, + "grad_norm": 2.872216878208923, + "learning_rate": 2.6050706658789437e-06, + "loss": 0.3605, + "step": 4437 + }, + { + "epoch": 0.67, + "grad_norm": 4.144199736648732, + "learning_rate": 2.6029268364483206e-06, + "loss": 0.4193, + "step": 4438 + }, + { + "epoch": 0.67, + "grad_norm": 4.865539887399338, + "learning_rate": 2.6007835790545493e-06, + "loss": 0.43, + "step": 4439 + }, + { + "epoch": 0.67, + "grad_norm": 4.516676269724278, + "learning_rate": 2.598640894209093e-06, + "loss": 0.4787, + "step": 4440 + }, + { + "epoch": 0.67, + "grad_norm": 6.09768123760752, + "learning_rate": 2.5964987824232857e-06, + "loss": 0.413, + "step": 4441 + }, + { + "epoch": 0.67, + "grad_norm": 1.0510665768935583, + "learning_rate": 2.594357244208317e-06, + "loss": 0.5387, + "step": 4442 + }, + { + "epoch": 0.67, + "grad_norm": 6.866635429938676, + "learning_rate": 2.592216280075246e-06, + "loss": 0.4361, + "step": 4443 + }, + { + "epoch": 0.67, + "grad_norm": 4.679483229501885, + "learning_rate": 2.590075890534991e-06, + "loss": 0.4692, + "step": 4444 + }, + { + "epoch": 0.67, + "grad_norm": 11.926879515943446, + "learning_rate": 2.5879360760983306e-06, + "loss": 0.3772, + "step": 4445 + }, + { + "epoch": 0.67, + "grad_norm": 4.510184199589595, + "learning_rate": 2.585796837275914e-06, + "loss": 0.4201, + "step": 4446 + }, + { + "epoch": 0.67, + "grad_norm": 9.921457635212152, + "learning_rate": 2.5836581745782474e-06, + "loss": 0.5154, + "step": 4447 + }, + { + "epoch": 0.67, + "grad_norm": 4.476561580027541, + "learning_rate": 2.581520088515699e-06, + "loss": 0.3951, + "step": 4448 + }, + { + "epoch": 0.67, + "grad_norm": 1.1988205705169679, + "learning_rate": 2.5793825795985027e-06, + "loss": 0.5207, + "step": 4449 + }, + { + "epoch": 0.67, + "grad_norm": 4.519278380788434, + "learning_rate": 2.57724564833675e-06, + "loss": 0.4473, + "step": 4450 + }, + { + "epoch": 0.67, + "grad_norm": 1.0648684055111914, + "learning_rate": 2.575109295240401e-06, + "loss": 0.5368, + "step": 4451 + }, + { + "epoch": 0.67, + "grad_norm": 3.4232756253397914, + "learning_rate": 2.5729735208192726e-06, + "loss": 0.4183, + "step": 4452 + }, + { + "epoch": 0.67, + "grad_norm": 10.871397854742682, + "learning_rate": 2.570838325583047e-06, + "loss": 0.4544, + "step": 4453 + }, + { + "epoch": 0.67, + "grad_norm": 4.288503909523776, + "learning_rate": 2.568703710041266e-06, + "loss": 0.4885, + "step": 4454 + }, + { + "epoch": 0.67, + "grad_norm": 6.676419313709423, + "learning_rate": 2.566569674703332e-06, + "loss": 0.5154, + "step": 4455 + }, + { + "epoch": 0.67, + "grad_norm": 6.893021268469413, + "learning_rate": 2.564436220078513e-06, + "loss": 0.4583, + "step": 4456 + }, + { + "epoch": 0.67, + "grad_norm": 6.178885096044598, + "learning_rate": 2.5623033466759345e-06, + "loss": 0.3964, + "step": 4457 + }, + { + "epoch": 0.67, + "grad_norm": 5.302400830747199, + "learning_rate": 2.5601710550045856e-06, + "loss": 0.4631, + "step": 4458 + }, + { + "epoch": 0.67, + "grad_norm": 12.107473780230713, + "learning_rate": 2.5580393455733153e-06, + "loss": 0.4786, + "step": 4459 + }, + { + "epoch": 0.67, + "grad_norm": 1.1321055198611474, + "learning_rate": 2.5559082188908317e-06, + "loss": 0.5672, + "step": 4460 + }, + { + "epoch": 0.67, + "grad_norm": 7.2708641438076835, + "learning_rate": 2.553777675465711e-06, + "loss": 0.4397, + "step": 4461 + }, + { + "epoch": 0.67, + "grad_norm": 6.767372584261381, + "learning_rate": 2.551647715806381e-06, + "loss": 0.4031, + "step": 4462 + }, + { + "epoch": 0.67, + "grad_norm": 5.253141579549303, + "learning_rate": 2.549518340421139e-06, + "loss": 0.402, + "step": 4463 + }, + { + "epoch": 0.67, + "grad_norm": 3.9464314145639103, + "learning_rate": 2.547389549818137e-06, + "loss": 0.4202, + "step": 4464 + }, + { + "epoch": 0.67, + "grad_norm": 31.907346259845525, + "learning_rate": 2.5452613445053847e-06, + "loss": 0.411, + "step": 4465 + }, + { + "epoch": 0.67, + "grad_norm": 5.070057855700322, + "learning_rate": 2.5431337249907628e-06, + "loss": 0.4431, + "step": 4466 + }, + { + "epoch": 0.67, + "grad_norm": 6.733684069782123, + "learning_rate": 2.541006691782001e-06, + "loss": 0.3722, + "step": 4467 + }, + { + "epoch": 0.67, + "grad_norm": 25.448426624984773, + "learning_rate": 2.538880245386698e-06, + "loss": 0.4109, + "step": 4468 + }, + { + "epoch": 0.67, + "grad_norm": 4.188092790602591, + "learning_rate": 2.536754386312307e-06, + "loss": 0.432, + "step": 4469 + }, + { + "epoch": 0.67, + "grad_norm": 1.0657586111854107, + "learning_rate": 2.5346291150661407e-06, + "loss": 0.5393, + "step": 4470 + }, + { + "epoch": 0.67, + "grad_norm": 6.254217490570659, + "learning_rate": 2.5325044321553737e-06, + "loss": 0.4649, + "step": 4471 + }, + { + "epoch": 0.67, + "grad_norm": 8.626629848650046, + "learning_rate": 2.5303803380870394e-06, + "loss": 0.4522, + "step": 4472 + }, + { + "epoch": 0.67, + "grad_norm": 3.4184715142252413, + "learning_rate": 2.5282568333680337e-06, + "loss": 0.4436, + "step": 4473 + }, + { + "epoch": 0.67, + "grad_norm": 7.868419182342017, + "learning_rate": 2.5261339185051063e-06, + "loss": 0.4389, + "step": 4474 + }, + { + "epoch": 0.67, + "grad_norm": 5.502172559428933, + "learning_rate": 2.5240115940048686e-06, + "loss": 0.4089, + "step": 4475 + }, + { + "epoch": 0.68, + "grad_norm": 4.708852952818557, + "learning_rate": 2.5218898603737952e-06, + "loss": 0.3746, + "step": 4476 + }, + { + "epoch": 0.68, + "grad_norm": 5.452830154847227, + "learning_rate": 2.5197687181182105e-06, + "loss": 0.4767, + "step": 4477 + }, + { + "epoch": 0.68, + "grad_norm": 4.047366366421238, + "learning_rate": 2.51764816774431e-06, + "loss": 0.4036, + "step": 4478 + }, + { + "epoch": 0.68, + "grad_norm": 5.636987678485354, + "learning_rate": 2.515528209758137e-06, + "loss": 0.3874, + "step": 4479 + }, + { + "epoch": 0.68, + "grad_norm": 11.944147760602071, + "learning_rate": 2.5134088446655958e-06, + "loss": 0.4867, + "step": 4480 + }, + { + "epoch": 0.68, + "grad_norm": 4.44987372891256, + "learning_rate": 2.5112900729724565e-06, + "loss": 0.4286, + "step": 4481 + }, + { + "epoch": 0.68, + "grad_norm": 8.706559371110334, + "learning_rate": 2.5091718951843384e-06, + "loss": 0.4185, + "step": 4482 + }, + { + "epoch": 0.68, + "grad_norm": 6.152327862547834, + "learning_rate": 2.5070543118067244e-06, + "loss": 0.4392, + "step": 4483 + }, + { + "epoch": 0.68, + "grad_norm": 7.456112949079817, + "learning_rate": 2.5049373233449532e-06, + "loss": 0.3905, + "step": 4484 + }, + { + "epoch": 0.68, + "grad_norm": 9.19055494127556, + "learning_rate": 2.5028209303042207e-06, + "loss": 0.4376, + "step": 4485 + }, + { + "epoch": 0.68, + "grad_norm": 5.420412062691334, + "learning_rate": 2.500705133189585e-06, + "loss": 0.459, + "step": 4486 + }, + { + "epoch": 0.68, + "grad_norm": 3.4612743290333983, + "learning_rate": 2.4985899325059578e-06, + "loss": 0.5248, + "step": 4487 + }, + { + "epoch": 0.68, + "grad_norm": 5.686372145597631, + "learning_rate": 2.496475328758111e-06, + "loss": 0.3779, + "step": 4488 + }, + { + "epoch": 0.68, + "grad_norm": 4.155366349644389, + "learning_rate": 2.494361322450674e-06, + "loss": 0.414, + "step": 4489 + }, + { + "epoch": 0.68, + "grad_norm": 8.712826376929609, + "learning_rate": 2.4922479140881283e-06, + "loss": 0.5018, + "step": 4490 + }, + { + "epoch": 0.68, + "grad_norm": 4.523784571382723, + "learning_rate": 2.490135104174823e-06, + "loss": 0.4569, + "step": 4491 + }, + { + "epoch": 0.68, + "grad_norm": 1.1600711872342322, + "learning_rate": 2.4880228932149546e-06, + "loss": 0.517, + "step": 4492 + }, + { + "epoch": 0.68, + "grad_norm": 4.4214046008623615, + "learning_rate": 2.485911281712582e-06, + "loss": 0.4492, + "step": 4493 + }, + { + "epoch": 0.68, + "grad_norm": 7.115657271223559, + "learning_rate": 2.4838002701716184e-06, + "loss": 0.4548, + "step": 4494 + }, + { + "epoch": 0.68, + "grad_norm": 4.662059536090669, + "learning_rate": 2.4816898590958338e-06, + "loss": 0.4475, + "step": 4495 + }, + { + "epoch": 0.68, + "grad_norm": 4.481344999517694, + "learning_rate": 2.4795800489888593e-06, + "loss": 0.3711, + "step": 4496 + }, + { + "epoch": 0.68, + "grad_norm": 3.391467438549771, + "learning_rate": 2.4774708403541768e-06, + "loss": 0.4438, + "step": 4497 + }, + { + "epoch": 0.68, + "grad_norm": 5.528469680813858, + "learning_rate": 2.47536223369513e-06, + "loss": 0.4724, + "step": 4498 + }, + { + "epoch": 0.68, + "grad_norm": 3.4375114190833433, + "learning_rate": 2.4732542295149143e-06, + "loss": 0.3832, + "step": 4499 + }, + { + "epoch": 0.68, + "grad_norm": 6.308241352733989, + "learning_rate": 2.471146828316582e-06, + "loss": 0.4379, + "step": 4500 + }, + { + "epoch": 0.68, + "grad_norm": 4.4481875596949845, + "learning_rate": 2.4690400306030453e-06, + "loss": 0.4491, + "step": 4501 + }, + { + "epoch": 0.68, + "grad_norm": 4.158016159275677, + "learning_rate": 2.4669338368770674e-06, + "loss": 0.4347, + "step": 4502 + }, + { + "epoch": 0.68, + "grad_norm": 7.304517886631863, + "learning_rate": 2.4648282476412727e-06, + "loss": 0.4053, + "step": 4503 + }, + { + "epoch": 0.68, + "grad_norm": 4.412040272731639, + "learning_rate": 2.4627232633981368e-06, + "loss": 0.4673, + "step": 4504 + }, + { + "epoch": 0.68, + "grad_norm": 4.499352600200761, + "learning_rate": 2.460618884649993e-06, + "loss": 0.4697, + "step": 4505 + }, + { + "epoch": 0.68, + "grad_norm": 4.643633887993586, + "learning_rate": 2.4585151118990286e-06, + "loss": 0.4103, + "step": 4506 + }, + { + "epoch": 0.68, + "grad_norm": 4.798532353278695, + "learning_rate": 2.4564119456472856e-06, + "loss": 0.3388, + "step": 4507 + }, + { + "epoch": 0.68, + "grad_norm": 4.767016457125586, + "learning_rate": 2.4543093863966675e-06, + "loss": 0.4759, + "step": 4508 + }, + { + "epoch": 0.68, + "grad_norm": 5.2930411129763115, + "learning_rate": 2.452207434648926e-06, + "loss": 0.4265, + "step": 4509 + }, + { + "epoch": 0.68, + "grad_norm": 10.266145038678747, + "learning_rate": 2.450106090905669e-06, + "loss": 0.4678, + "step": 4510 + }, + { + "epoch": 0.68, + "grad_norm": 4.509971414357076, + "learning_rate": 2.4480053556683638e-06, + "loss": 0.4519, + "step": 4511 + }, + { + "epoch": 0.68, + "grad_norm": 3.55741596874126, + "learning_rate": 2.445905229438325e-06, + "loss": 0.4723, + "step": 4512 + }, + { + "epoch": 0.68, + "grad_norm": 6.199393472643776, + "learning_rate": 2.4438057127167307e-06, + "loss": 0.3809, + "step": 4513 + }, + { + "epoch": 0.68, + "grad_norm": 8.364261609779675, + "learning_rate": 2.441706806004607e-06, + "loss": 0.4307, + "step": 4514 + }, + { + "epoch": 0.68, + "grad_norm": 5.270565185147571, + "learning_rate": 2.439608509802836e-06, + "loss": 0.4114, + "step": 4515 + }, + { + "epoch": 0.68, + "grad_norm": 6.329635907436552, + "learning_rate": 2.437510824612155e-06, + "loss": 0.4027, + "step": 4516 + }, + { + "epoch": 0.68, + "grad_norm": 7.783217983060687, + "learning_rate": 2.4354137509331526e-06, + "loss": 0.4066, + "step": 4517 + }, + { + "epoch": 0.68, + "grad_norm": 5.266490816961862, + "learning_rate": 2.4333172892662784e-06, + "loss": 0.4658, + "step": 4518 + }, + { + "epoch": 0.68, + "grad_norm": 5.124930718978582, + "learning_rate": 2.4312214401118282e-06, + "loss": 0.4626, + "step": 4519 + }, + { + "epoch": 0.68, + "grad_norm": 5.9660690622702095, + "learning_rate": 2.4291262039699544e-06, + "loss": 0.4937, + "step": 4520 + }, + { + "epoch": 0.68, + "grad_norm": 6.305849853422665, + "learning_rate": 2.4270315813406664e-06, + "loss": 0.4454, + "step": 4521 + }, + { + "epoch": 0.68, + "grad_norm": 7.698637588666971, + "learning_rate": 2.424937572723821e-06, + "loss": 0.4753, + "step": 4522 + }, + { + "epoch": 0.68, + "grad_norm": 5.586873932601037, + "learning_rate": 2.422844178619135e-06, + "loss": 0.4649, + "step": 4523 + }, + { + "epoch": 0.68, + "grad_norm": 4.092495400873869, + "learning_rate": 2.4207513995261745e-06, + "loss": 0.4092, + "step": 4524 + }, + { + "epoch": 0.68, + "grad_norm": 5.3905903417965355, + "learning_rate": 2.418659235944357e-06, + "loss": 0.3599, + "step": 4525 + }, + { + "epoch": 0.68, + "grad_norm": 4.656486039008912, + "learning_rate": 2.41656768837296e-06, + "loss": 0.4161, + "step": 4526 + }, + { + "epoch": 0.68, + "grad_norm": 3.8032244113971356, + "learning_rate": 2.4144767573111074e-06, + "loss": 0.4164, + "step": 4527 + }, + { + "epoch": 0.68, + "grad_norm": 4.687471682575265, + "learning_rate": 2.412386443257779e-06, + "loss": 0.48, + "step": 4528 + }, + { + "epoch": 0.68, + "grad_norm": 6.466082416505405, + "learning_rate": 2.410296746711806e-06, + "loss": 0.4881, + "step": 4529 + }, + { + "epoch": 0.68, + "grad_norm": 5.772450169476161, + "learning_rate": 2.408207668171872e-06, + "loss": 0.3472, + "step": 4530 + }, + { + "epoch": 0.68, + "grad_norm": 5.307142647415721, + "learning_rate": 2.406119208136517e-06, + "loss": 0.4412, + "step": 4531 + }, + { + "epoch": 0.68, + "grad_norm": 16.53462810148362, + "learning_rate": 2.4040313671041276e-06, + "loss": 0.4462, + "step": 4532 + }, + { + "epoch": 0.68, + "grad_norm": 7.730368474049808, + "learning_rate": 2.4019441455729494e-06, + "loss": 0.492, + "step": 4533 + }, + { + "epoch": 0.68, + "grad_norm": 5.6306245641289125, + "learning_rate": 2.3998575440410733e-06, + "loss": 0.4854, + "step": 4534 + }, + { + "epoch": 0.68, + "grad_norm": 3.6512486764189913, + "learning_rate": 2.3977715630064442e-06, + "loss": 0.4513, + "step": 4535 + }, + { + "epoch": 0.68, + "grad_norm": 6.175449154591465, + "learning_rate": 2.395686202966864e-06, + "loss": 0.4322, + "step": 4536 + }, + { + "epoch": 0.68, + "grad_norm": 24.026666226977785, + "learning_rate": 2.393601464419977e-06, + "loss": 0.4016, + "step": 4537 + }, + { + "epoch": 0.68, + "grad_norm": 6.131896151842517, + "learning_rate": 2.3915173478632916e-06, + "loss": 0.3715, + "step": 4538 + }, + { + "epoch": 0.68, + "grad_norm": 5.129544868409922, + "learning_rate": 2.389433853794156e-06, + "loss": 0.4685, + "step": 4539 + }, + { + "epoch": 0.68, + "grad_norm": 6.333273276297152, + "learning_rate": 2.3873509827097756e-06, + "loss": 0.4946, + "step": 4540 + }, + { + "epoch": 0.68, + "grad_norm": 4.398083876655644, + "learning_rate": 2.385268735107207e-06, + "loss": 0.3865, + "step": 4541 + }, + { + "epoch": 0.69, + "grad_norm": 1.045336038096149, + "learning_rate": 2.3831871114833537e-06, + "loss": 0.5099, + "step": 4542 + }, + { + "epoch": 0.69, + "grad_norm": 1.184042356912598, + "learning_rate": 2.3811061123349783e-06, + "loss": 0.5092, + "step": 4543 + }, + { + "epoch": 0.69, + "grad_norm": 8.186751493810831, + "learning_rate": 2.3790257381586882e-06, + "loss": 0.4478, + "step": 4544 + }, + { + "epoch": 0.69, + "grad_norm": 8.306576889678205, + "learning_rate": 2.376945989450941e-06, + "loss": 0.4906, + "step": 4545 + }, + { + "epoch": 0.69, + "grad_norm": 5.917671598270758, + "learning_rate": 2.3748668667080516e-06, + "loss": 0.4311, + "step": 4546 + }, + { + "epoch": 0.69, + "grad_norm": 4.709369186023426, + "learning_rate": 2.372788370426179e-06, + "loss": 0.4034, + "step": 4547 + }, + { + "epoch": 0.69, + "grad_norm": 6.028036620918392, + "learning_rate": 2.3707105011013326e-06, + "loss": 0.4585, + "step": 4548 + }, + { + "epoch": 0.69, + "grad_norm": 5.153543576664654, + "learning_rate": 2.368633259229379e-06, + "loss": 0.4197, + "step": 4549 + }, + { + "epoch": 0.69, + "grad_norm": 5.799360530377895, + "learning_rate": 2.366556645306029e-06, + "loss": 0.3663, + "step": 4550 + }, + { + "epoch": 0.69, + "grad_norm": 5.829285559413873, + "learning_rate": 2.3644806598268446e-06, + "loss": 0.4434, + "step": 4551 + }, + { + "epoch": 0.69, + "grad_norm": 7.146776232131839, + "learning_rate": 2.3624053032872385e-06, + "loss": 0.3757, + "step": 4552 + }, + { + "epoch": 0.69, + "grad_norm": 6.729286985052793, + "learning_rate": 2.360330576182471e-06, + "loss": 0.3892, + "step": 4553 + }, + { + "epoch": 0.69, + "grad_norm": 4.133138191483974, + "learning_rate": 2.3582564790076586e-06, + "loss": 0.47, + "step": 4554 + }, + { + "epoch": 0.69, + "grad_norm": 15.039260666038281, + "learning_rate": 2.356183012257759e-06, + "loss": 0.5098, + "step": 4555 + }, + { + "epoch": 0.69, + "grad_norm": 4.564456473143087, + "learning_rate": 2.3541101764275883e-06, + "loss": 0.4014, + "step": 4556 + }, + { + "epoch": 0.69, + "grad_norm": 6.030094906203111, + "learning_rate": 2.352037972011805e-06, + "loss": 0.4528, + "step": 4557 + }, + { + "epoch": 0.69, + "grad_norm": 5.959836056600525, + "learning_rate": 2.3499663995049165e-06, + "loss": 0.4613, + "step": 4558 + }, + { + "epoch": 0.69, + "grad_norm": 4.7974239036737965, + "learning_rate": 2.3478954594012884e-06, + "loss": 0.4467, + "step": 4559 + }, + { + "epoch": 0.69, + "grad_norm": 6.6174852007861995, + "learning_rate": 2.3458251521951236e-06, + "loss": 0.4046, + "step": 4560 + }, + { + "epoch": 0.69, + "grad_norm": 6.9198416112721945, + "learning_rate": 2.3437554783804838e-06, + "loss": 0.3758, + "step": 4561 + }, + { + "epoch": 0.69, + "grad_norm": 6.414526658694877, + "learning_rate": 2.341686438451274e-06, + "loss": 0.4066, + "step": 4562 + }, + { + "epoch": 0.69, + "grad_norm": 4.366281314133032, + "learning_rate": 2.3396180329012488e-06, + "loss": 0.46, + "step": 4563 + }, + { + "epoch": 0.69, + "grad_norm": 4.94041075223834, + "learning_rate": 2.3375502622240115e-06, + "loss": 0.4072, + "step": 4564 + }, + { + "epoch": 0.69, + "grad_norm": 4.28504549223454, + "learning_rate": 2.3354831269130133e-06, + "loss": 0.4463, + "step": 4565 + }, + { + "epoch": 0.69, + "grad_norm": 4.68620075171947, + "learning_rate": 2.3334166274615582e-06, + "loss": 0.399, + "step": 4566 + }, + { + "epoch": 0.69, + "grad_norm": 4.307452964421236, + "learning_rate": 2.331350764362793e-06, + "loss": 0.4453, + "step": 4567 + }, + { + "epoch": 0.69, + "grad_norm": 8.037296210984081, + "learning_rate": 2.3292855381097134e-06, + "loss": 0.4667, + "step": 4568 + }, + { + "epoch": 0.69, + "grad_norm": 3.935676980907501, + "learning_rate": 2.3272209491951674e-06, + "loss": 0.4206, + "step": 4569 + }, + { + "epoch": 0.69, + "grad_norm": 3.223737573691994, + "learning_rate": 2.325156998111844e-06, + "loss": 0.4843, + "step": 4570 + }, + { + "epoch": 0.69, + "grad_norm": 3.426790935369577, + "learning_rate": 2.323093685352289e-06, + "loss": 0.4588, + "step": 4571 + }, + { + "epoch": 0.69, + "grad_norm": 8.597001989728083, + "learning_rate": 2.3210310114088876e-06, + "loss": 0.4185, + "step": 4572 + }, + { + "epoch": 0.69, + "grad_norm": 6.297678665704269, + "learning_rate": 2.3189689767738737e-06, + "loss": 0.4298, + "step": 4573 + }, + { + "epoch": 0.69, + "grad_norm": 4.8997794861676125, + "learning_rate": 2.3169075819393355e-06, + "loss": 0.4147, + "step": 4574 + }, + { + "epoch": 0.69, + "grad_norm": 4.999493847655554, + "learning_rate": 2.3148468273972013e-06, + "loss": 0.4574, + "step": 4575 + }, + { + "epoch": 0.69, + "grad_norm": 4.1911402605255725, + "learning_rate": 2.3127867136392487e-06, + "loss": 0.4347, + "step": 4576 + }, + { + "epoch": 0.69, + "grad_norm": 5.165114242751637, + "learning_rate": 2.3107272411571024e-06, + "loss": 0.4886, + "step": 4577 + }, + { + "epoch": 0.69, + "grad_norm": 6.972705143940993, + "learning_rate": 2.3086684104422333e-06, + "loss": 0.4216, + "step": 4578 + }, + { + "epoch": 0.69, + "grad_norm": 4.926095429899442, + "learning_rate": 2.306610221985964e-06, + "loss": 0.4322, + "step": 4579 + }, + { + "epoch": 0.69, + "grad_norm": 10.489777705254749, + "learning_rate": 2.3045526762794546e-06, + "loss": 0.4706, + "step": 4580 + }, + { + "epoch": 0.69, + "grad_norm": 4.204448492801335, + "learning_rate": 2.3024957738137227e-06, + "loss": 0.4188, + "step": 4581 + }, + { + "epoch": 0.69, + "grad_norm": 3.802353210661846, + "learning_rate": 2.3004395150796244e-06, + "loss": 0.3693, + "step": 4582 + }, + { + "epoch": 0.69, + "grad_norm": 4.5014865840948906, + "learning_rate": 2.298383900567863e-06, + "loss": 0.4722, + "step": 4583 + }, + { + "epoch": 0.69, + "grad_norm": 3.7536278404125865, + "learning_rate": 2.2963289307689933e-06, + "loss": 0.3982, + "step": 4584 + }, + { + "epoch": 0.69, + "grad_norm": 5.249145251195088, + "learning_rate": 2.294274606173411e-06, + "loss": 0.4896, + "step": 4585 + }, + { + "epoch": 0.69, + "grad_norm": 1.197223032904191, + "learning_rate": 2.2922209272713598e-06, + "loss": 0.5714, + "step": 4586 + }, + { + "epoch": 0.69, + "grad_norm": 1.1648321594125401, + "learning_rate": 2.290167894552928e-06, + "loss": 0.4932, + "step": 4587 + }, + { + "epoch": 0.69, + "grad_norm": 8.463588051263578, + "learning_rate": 2.2881155085080505e-06, + "loss": 0.4655, + "step": 4588 + }, + { + "epoch": 0.69, + "grad_norm": 10.667121994737903, + "learning_rate": 2.28606376962651e-06, + "loss": 0.4012, + "step": 4589 + }, + { + "epoch": 0.69, + "grad_norm": 3.090926567888159, + "learning_rate": 2.2840126783979307e-06, + "loss": 0.4878, + "step": 4590 + }, + { + "epoch": 0.69, + "grad_norm": 3.6082281652327035, + "learning_rate": 2.281962235311787e-06, + "loss": 0.4537, + "step": 4591 + }, + { + "epoch": 0.69, + "grad_norm": 6.7634535940592695, + "learning_rate": 2.279912440857395e-06, + "loss": 0.4316, + "step": 4592 + }, + { + "epoch": 0.69, + "grad_norm": 5.169632495348908, + "learning_rate": 2.277863295523914e-06, + "loss": 0.4203, + "step": 4593 + }, + { + "epoch": 0.69, + "grad_norm": 6.690919204361506, + "learning_rate": 2.2758147998003575e-06, + "loss": 0.4343, + "step": 4594 + }, + { + "epoch": 0.69, + "grad_norm": 29.483921318362853, + "learning_rate": 2.273766954175572e-06, + "loss": 0.4786, + "step": 4595 + }, + { + "epoch": 0.69, + "grad_norm": 7.751363819404805, + "learning_rate": 2.2717197591382595e-06, + "loss": 0.3788, + "step": 4596 + }, + { + "epoch": 0.69, + "grad_norm": 6.5950124046572824, + "learning_rate": 2.26967321517696e-06, + "loss": 0.4025, + "step": 4597 + }, + { + "epoch": 0.69, + "grad_norm": 3.9530483590891334, + "learning_rate": 2.26762732278006e-06, + "loss": 0.4013, + "step": 4598 + }, + { + "epoch": 0.69, + "grad_norm": 5.24545903136202, + "learning_rate": 2.2655820824357904e-06, + "loss": 0.4219, + "step": 4599 + }, + { + "epoch": 0.69, + "grad_norm": 6.563241387138226, + "learning_rate": 2.2635374946322257e-06, + "loss": 0.4323, + "step": 4600 + }, + { + "epoch": 0.69, + "grad_norm": 4.882516292669679, + "learning_rate": 2.2614935598572896e-06, + "loss": 0.453, + "step": 4601 + }, + { + "epoch": 0.69, + "grad_norm": 7.078845839984108, + "learning_rate": 2.2594502785987434e-06, + "loss": 0.3548, + "step": 4602 + }, + { + "epoch": 0.69, + "grad_norm": 5.250606849565479, + "learning_rate": 2.257407651344193e-06, + "loss": 0.4155, + "step": 4603 + }, + { + "epoch": 0.69, + "grad_norm": 7.420533821994416, + "learning_rate": 2.255365678581095e-06, + "loss": 0.3934, + "step": 4604 + }, + { + "epoch": 0.69, + "grad_norm": 4.886899516945829, + "learning_rate": 2.2533243607967416e-06, + "loss": 0.4555, + "step": 4605 + }, + { + "epoch": 0.69, + "grad_norm": 4.86707533288496, + "learning_rate": 2.2512836984782753e-06, + "loss": 0.4299, + "step": 4606 + }, + { + "epoch": 0.69, + "grad_norm": 26.974018865404727, + "learning_rate": 2.249243692112678e-06, + "loss": 0.4535, + "step": 4607 + }, + { + "epoch": 0.7, + "grad_norm": 5.997298008021307, + "learning_rate": 2.2472043421867734e-06, + "loss": 0.4571, + "step": 4608 + }, + { + "epoch": 0.7, + "grad_norm": 9.293712151444383, + "learning_rate": 2.2451656491872357e-06, + "loss": 0.4524, + "step": 4609 + }, + { + "epoch": 0.7, + "grad_norm": 3.9687203949358665, + "learning_rate": 2.243127613600577e-06, + "loss": 0.4269, + "step": 4610 + }, + { + "epoch": 0.7, + "grad_norm": 7.966021983752036, + "learning_rate": 2.241090235913152e-06, + "loss": 0.3689, + "step": 4611 + }, + { + "epoch": 0.7, + "grad_norm": 4.989376092084924, + "learning_rate": 2.2390535166111603e-06, + "loss": 0.3786, + "step": 4612 + }, + { + "epoch": 0.7, + "grad_norm": 3.968203036394358, + "learning_rate": 2.237017456180643e-06, + "loss": 0.3963, + "step": 4613 + }, + { + "epoch": 0.7, + "grad_norm": 9.036780402997223, + "learning_rate": 2.234982055107488e-06, + "loss": 0.4976, + "step": 4614 + }, + { + "epoch": 0.7, + "grad_norm": 10.410156394099785, + "learning_rate": 2.2329473138774192e-06, + "loss": 0.5123, + "step": 4615 + }, + { + "epoch": 0.7, + "grad_norm": 10.948106820397602, + "learning_rate": 2.23091323297601e-06, + "loss": 0.4463, + "step": 4616 + }, + { + "epoch": 0.7, + "grad_norm": 5.800167084295893, + "learning_rate": 2.2288798128886722e-06, + "loss": 0.3882, + "step": 4617 + }, + { + "epoch": 0.7, + "grad_norm": 12.135761369676365, + "learning_rate": 2.2268470541006574e-06, + "loss": 0.411, + "step": 4618 + }, + { + "epoch": 0.7, + "grad_norm": 7.420117036253311, + "learning_rate": 2.2248149570970672e-06, + "loss": 0.4848, + "step": 4619 + }, + { + "epoch": 0.7, + "grad_norm": 1.1551565530720522, + "learning_rate": 2.2227835223628387e-06, + "loss": 0.5533, + "step": 4620 + }, + { + "epoch": 0.7, + "grad_norm": 23.697108492652568, + "learning_rate": 2.220752750382752e-06, + "loss": 0.3574, + "step": 4621 + }, + { + "epoch": 0.7, + "grad_norm": 3.9671962894469255, + "learning_rate": 2.218722641641431e-06, + "loss": 0.4243, + "step": 4622 + }, + { + "epoch": 0.7, + "grad_norm": 4.353322658454584, + "learning_rate": 2.216693196623338e-06, + "loss": 0.3787, + "step": 4623 + }, + { + "epoch": 0.7, + "grad_norm": 13.83502814193874, + "learning_rate": 2.2146644158127827e-06, + "loss": 0.3777, + "step": 4624 + }, + { + "epoch": 0.7, + "grad_norm": 10.693264209919535, + "learning_rate": 2.212636299693909e-06, + "loss": 0.3688, + "step": 4625 + }, + { + "epoch": 0.7, + "grad_norm": 7.086954309172666, + "learning_rate": 2.2106088487507104e-06, + "loss": 0.4273, + "step": 4626 + }, + { + "epoch": 0.7, + "grad_norm": 6.085475982982225, + "learning_rate": 2.2085820634670148e-06, + "loss": 0.38, + "step": 4627 + }, + { + "epoch": 0.7, + "grad_norm": 7.61605925906109, + "learning_rate": 2.206555944326492e-06, + "loss": 0.4852, + "step": 4628 + }, + { + "epoch": 0.7, + "grad_norm": 8.11558863841789, + "learning_rate": 2.2045304918126574e-06, + "loss": 0.4336, + "step": 4629 + }, + { + "epoch": 0.7, + "grad_norm": 5.854839164108935, + "learning_rate": 2.202505706408862e-06, + "loss": 0.4531, + "step": 4630 + }, + { + "epoch": 0.7, + "grad_norm": 8.452764270677095, + "learning_rate": 2.2004815885983026e-06, + "loss": 0.4723, + "step": 4631 + }, + { + "epoch": 0.7, + "grad_norm": 6.205039843815364, + "learning_rate": 2.1984581388640136e-06, + "loss": 0.4369, + "step": 4632 + }, + { + "epoch": 0.7, + "grad_norm": 11.060438635453224, + "learning_rate": 2.1964353576888687e-06, + "loss": 0.4104, + "step": 4633 + }, + { + "epoch": 0.7, + "grad_norm": 4.634745901049915, + "learning_rate": 2.194413245555585e-06, + "loss": 0.4607, + "step": 4634 + }, + { + "epoch": 0.7, + "grad_norm": 6.584523491819987, + "learning_rate": 2.1923918029467165e-06, + "loss": 0.4561, + "step": 4635 + }, + { + "epoch": 0.7, + "grad_norm": 35.25405147532874, + "learning_rate": 2.190371030344664e-06, + "loss": 0.398, + "step": 4636 + }, + { + "epoch": 0.7, + "grad_norm": 4.715199306268774, + "learning_rate": 2.1883509282316613e-06, + "loss": 0.45, + "step": 4637 + }, + { + "epoch": 0.7, + "grad_norm": 4.231615177859496, + "learning_rate": 2.1863314970897837e-06, + "loss": 0.4158, + "step": 4638 + }, + { + "epoch": 0.7, + "grad_norm": 8.552721319654022, + "learning_rate": 2.184312737400951e-06, + "loss": 0.5289, + "step": 4639 + }, + { + "epoch": 0.7, + "grad_norm": 5.34897053490368, + "learning_rate": 2.182294649646916e-06, + "loss": 0.4341, + "step": 4640 + }, + { + "epoch": 0.7, + "grad_norm": 8.437392369300287, + "learning_rate": 2.1802772343092787e-06, + "loss": 0.4257, + "step": 4641 + }, + { + "epoch": 0.7, + "grad_norm": 20.056076856889685, + "learning_rate": 2.178260491869472e-06, + "loss": 0.4329, + "step": 4642 + }, + { + "epoch": 0.7, + "grad_norm": 5.844687318809765, + "learning_rate": 2.1762444228087708e-06, + "loss": 0.3964, + "step": 4643 + }, + { + "epoch": 0.7, + "grad_norm": 5.603557886156365, + "learning_rate": 2.1742290276082893e-06, + "loss": 0.4116, + "step": 4644 + }, + { + "epoch": 0.7, + "grad_norm": 16.2170742957076, + "learning_rate": 2.1722143067489797e-06, + "loss": 0.384, + "step": 4645 + }, + { + "epoch": 0.7, + "grad_norm": 7.220376346298095, + "learning_rate": 2.170200260711637e-06, + "loss": 0.4162, + "step": 4646 + }, + { + "epoch": 0.7, + "grad_norm": 13.467417263954843, + "learning_rate": 2.1681868899768916e-06, + "loss": 0.4421, + "step": 4647 + }, + { + "epoch": 0.7, + "grad_norm": 4.269462910136216, + "learning_rate": 2.166174195025211e-06, + "loss": 0.4284, + "step": 4648 + }, + { + "epoch": 0.7, + "grad_norm": 6.004013252437515, + "learning_rate": 2.1641621763369086e-06, + "loss": 0.4043, + "step": 4649 + }, + { + "epoch": 0.7, + "grad_norm": 6.560257411420245, + "learning_rate": 2.1621508343921275e-06, + "loss": 0.416, + "step": 4650 + }, + { + "epoch": 0.7, + "grad_norm": 6.020505422365288, + "learning_rate": 2.1601401696708583e-06, + "loss": 0.4375, + "step": 4651 + }, + { + "epoch": 0.7, + "grad_norm": 8.201869938330477, + "learning_rate": 2.1581301826529232e-06, + "loss": 0.4585, + "step": 4652 + }, + { + "epoch": 0.7, + "grad_norm": 7.889826556007674, + "learning_rate": 2.1561208738179833e-06, + "loss": 0.4516, + "step": 4653 + }, + { + "epoch": 0.7, + "grad_norm": 5.619533407874556, + "learning_rate": 2.154112243645543e-06, + "loss": 0.4502, + "step": 4654 + }, + { + "epoch": 0.7, + "grad_norm": 8.023614709868001, + "learning_rate": 2.152104292614939e-06, + "loss": 0.4321, + "step": 4655 + }, + { + "epoch": 0.7, + "grad_norm": 6.210272671405171, + "learning_rate": 2.1500970212053492e-06, + "loss": 0.3809, + "step": 4656 + }, + { + "epoch": 0.7, + "grad_norm": 7.441683276674121, + "learning_rate": 2.148090429895787e-06, + "loss": 0.4349, + "step": 4657 + }, + { + "epoch": 0.7, + "grad_norm": 9.767996467563625, + "learning_rate": 2.1460845191651036e-06, + "loss": 0.4178, + "step": 4658 + }, + { + "epoch": 0.7, + "grad_norm": 15.117822916745567, + "learning_rate": 2.1440792894919932e-06, + "loss": 0.4851, + "step": 4659 + }, + { + "epoch": 0.7, + "grad_norm": 7.285538550767005, + "learning_rate": 2.1420747413549805e-06, + "loss": 0.4687, + "step": 4660 + }, + { + "epoch": 0.7, + "grad_norm": 6.930910529415885, + "learning_rate": 2.140070875232428e-06, + "loss": 0.4365, + "step": 4661 + }, + { + "epoch": 0.7, + "grad_norm": 4.070733697446871, + "learning_rate": 2.138067691602543e-06, + "loss": 0.414, + "step": 4662 + }, + { + "epoch": 0.7, + "grad_norm": 6.177455473795536, + "learning_rate": 2.1360651909433596e-06, + "loss": 0.4108, + "step": 4663 + }, + { + "epoch": 0.7, + "grad_norm": 7.0297085019450165, + "learning_rate": 2.134063373732759e-06, + "loss": 0.4308, + "step": 4664 + }, + { + "epoch": 0.7, + "grad_norm": 3.3707977985592805, + "learning_rate": 2.1320622404484503e-06, + "loss": 0.3829, + "step": 4665 + }, + { + "epoch": 0.7, + "grad_norm": 7.208764243613324, + "learning_rate": 2.130061791567984e-06, + "loss": 0.3999, + "step": 4666 + }, + { + "epoch": 0.7, + "grad_norm": 5.44023721421205, + "learning_rate": 2.1280620275687487e-06, + "loss": 0.4692, + "step": 4667 + }, + { + "epoch": 0.7, + "grad_norm": 6.36590258757543, + "learning_rate": 2.1260629489279662e-06, + "loss": 0.4444, + "step": 4668 + }, + { + "epoch": 0.7, + "grad_norm": 12.946079389636454, + "learning_rate": 2.1240645561226964e-06, + "loss": 0.4797, + "step": 4669 + }, + { + "epoch": 0.7, + "grad_norm": 4.261322914908133, + "learning_rate": 2.122066849629834e-06, + "loss": 0.3824, + "step": 4670 + }, + { + "epoch": 0.7, + "grad_norm": 7.135524966736439, + "learning_rate": 2.1200698299261106e-06, + "loss": 0.4369, + "step": 4671 + }, + { + "epoch": 0.7, + "grad_norm": 7.396645568956617, + "learning_rate": 2.118073497488098e-06, + "loss": 0.4547, + "step": 4672 + }, + { + "epoch": 0.7, + "grad_norm": 8.597303668527404, + "learning_rate": 2.1160778527921953e-06, + "loss": 0.4345, + "step": 4673 + }, + { + "epoch": 0.7, + "grad_norm": 6.8010262912373305, + "learning_rate": 2.114082896314648e-06, + "loss": 0.4077, + "step": 4674 + }, + { + "epoch": 0.71, + "grad_norm": 5.933496550250718, + "learning_rate": 2.112088628531529e-06, + "loss": 0.4337, + "step": 4675 + }, + { + "epoch": 0.71, + "grad_norm": 4.899411637486555, + "learning_rate": 2.1100950499187483e-06, + "loss": 0.4384, + "step": 4676 + }, + { + "epoch": 0.71, + "grad_norm": 12.286007187349762, + "learning_rate": 2.108102160952057e-06, + "loss": 0.4037, + "step": 4677 + }, + { + "epoch": 0.71, + "grad_norm": 4.896029011362263, + "learning_rate": 2.106109962107035e-06, + "loss": 0.4439, + "step": 4678 + }, + { + "epoch": 0.71, + "grad_norm": 8.518024639038835, + "learning_rate": 2.1041184538590996e-06, + "loss": 0.4726, + "step": 4679 + }, + { + "epoch": 0.71, + "grad_norm": 8.87395562555042, + "learning_rate": 2.1021276366835052e-06, + "loss": 0.419, + "step": 4680 + }, + { + "epoch": 0.71, + "grad_norm": 12.02543737973597, + "learning_rate": 2.1001375110553373e-06, + "loss": 0.368, + "step": 4681 + }, + { + "epoch": 0.71, + "grad_norm": 3.734900180804966, + "learning_rate": 2.0981480774495226e-06, + "loss": 0.413, + "step": 4682 + }, + { + "epoch": 0.71, + "grad_norm": 4.542658841616869, + "learning_rate": 2.0961593363408154e-06, + "loss": 0.3926, + "step": 4683 + }, + { + "epoch": 0.71, + "grad_norm": 7.464696121585491, + "learning_rate": 2.0941712882038116e-06, + "loss": 0.4628, + "step": 4684 + }, + { + "epoch": 0.71, + "grad_norm": 4.402154659081515, + "learning_rate": 2.0921839335129374e-06, + "loss": 0.4044, + "step": 4685 + }, + { + "epoch": 0.71, + "grad_norm": 1.2150549324737538, + "learning_rate": 2.090197272742452e-06, + "loss": 0.56, + "step": 4686 + }, + { + "epoch": 0.71, + "grad_norm": 7.893079830497851, + "learning_rate": 2.088211306366455e-06, + "loss": 0.4764, + "step": 4687 + }, + { + "epoch": 0.71, + "grad_norm": 6.499264425402562, + "learning_rate": 2.086226034858874e-06, + "loss": 0.4563, + "step": 4688 + }, + { + "epoch": 0.71, + "grad_norm": 18.238104301765745, + "learning_rate": 2.084241458693476e-06, + "loss": 0.4623, + "step": 4689 + }, + { + "epoch": 0.71, + "grad_norm": 4.503142197456145, + "learning_rate": 2.08225757834386e-06, + "loss": 0.3976, + "step": 4690 + }, + { + "epoch": 0.71, + "grad_norm": 6.057851564852268, + "learning_rate": 2.080274394283457e-06, + "loss": 0.4715, + "step": 4691 + }, + { + "epoch": 0.71, + "grad_norm": 3.898048866615284, + "learning_rate": 2.0782919069855336e-06, + "loss": 0.4023, + "step": 4692 + }, + { + "epoch": 0.71, + "grad_norm": 5.596477407889757, + "learning_rate": 2.076310116923188e-06, + "loss": 0.4809, + "step": 4693 + }, + { + "epoch": 0.71, + "grad_norm": 5.451967831340048, + "learning_rate": 2.0743290245693586e-06, + "loss": 0.4034, + "step": 4694 + }, + { + "epoch": 0.71, + "grad_norm": 4.600355110987674, + "learning_rate": 2.0723486303968104e-06, + "loss": 0.4291, + "step": 4695 + }, + { + "epoch": 0.71, + "grad_norm": 8.61573714106362, + "learning_rate": 2.070368934878142e-06, + "loss": 0.4572, + "step": 4696 + }, + { + "epoch": 0.71, + "grad_norm": 5.264408809762894, + "learning_rate": 2.0683899384857913e-06, + "loss": 0.4349, + "step": 4697 + }, + { + "epoch": 0.71, + "grad_norm": 5.935668233463539, + "learning_rate": 2.066411641692022e-06, + "loss": 0.4133, + "step": 4698 + }, + { + "epoch": 0.71, + "grad_norm": 5.690459249479889, + "learning_rate": 2.0644340449689374e-06, + "loss": 0.4289, + "step": 4699 + }, + { + "epoch": 0.71, + "grad_norm": 5.421532254128499, + "learning_rate": 2.0624571487884692e-06, + "loss": 0.3993, + "step": 4700 + }, + { + "epoch": 0.71, + "grad_norm": 7.313477050650719, + "learning_rate": 2.0604809536223814e-06, + "loss": 0.4838, + "step": 4701 + }, + { + "epoch": 0.71, + "grad_norm": 7.683965706031938, + "learning_rate": 2.058505459942276e-06, + "loss": 0.4449, + "step": 4702 + }, + { + "epoch": 0.71, + "grad_norm": 6.534991503362, + "learning_rate": 2.056530668219583e-06, + "loss": 0.4444, + "step": 4703 + }, + { + "epoch": 0.71, + "grad_norm": 5.414881440044616, + "learning_rate": 2.0545565789255656e-06, + "loss": 0.4789, + "step": 4704 + }, + { + "epoch": 0.71, + "grad_norm": 5.302244049254322, + "learning_rate": 2.05258319253132e-06, + "loss": 0.3543, + "step": 4705 + }, + { + "epoch": 0.71, + "grad_norm": 12.63342888608853, + "learning_rate": 2.050610509507773e-06, + "loss": 0.481, + "step": 4706 + }, + { + "epoch": 0.71, + "grad_norm": 5.080221815325946, + "learning_rate": 2.048638530325689e-06, + "loss": 0.4668, + "step": 4707 + }, + { + "epoch": 0.71, + "grad_norm": 5.660873750762749, + "learning_rate": 2.046667255455656e-06, + "loss": 0.3461, + "step": 4708 + }, + { + "epoch": 0.71, + "grad_norm": 4.846244225820388, + "learning_rate": 2.0446966853681026e-06, + "loss": 0.4274, + "step": 4709 + }, + { + "epoch": 0.71, + "grad_norm": 5.238483083439394, + "learning_rate": 2.0427268205332833e-06, + "loss": 0.457, + "step": 4710 + }, + { + "epoch": 0.71, + "grad_norm": 7.306997425363786, + "learning_rate": 2.040757661421284e-06, + "loss": 0.4035, + "step": 4711 + }, + { + "epoch": 0.71, + "grad_norm": 6.168392162710345, + "learning_rate": 2.038789208502029e-06, + "loss": 0.3801, + "step": 4712 + }, + { + "epoch": 0.71, + "grad_norm": 6.511669164384748, + "learning_rate": 2.036821462245266e-06, + "loss": 0.4544, + "step": 4713 + }, + { + "epoch": 0.71, + "grad_norm": 3.6854478787518623, + "learning_rate": 2.034854423120578e-06, + "loss": 0.4399, + "step": 4714 + }, + { + "epoch": 0.71, + "grad_norm": 3.5988742302303782, + "learning_rate": 2.032888091597379e-06, + "loss": 0.4744, + "step": 4715 + }, + { + "epoch": 0.71, + "grad_norm": 3.8593936874751247, + "learning_rate": 2.0309224681449118e-06, + "loss": 0.4042, + "step": 4716 + }, + { + "epoch": 0.71, + "grad_norm": 21.22083884726939, + "learning_rate": 2.028957553232256e-06, + "loss": 0.4897, + "step": 4717 + }, + { + "epoch": 0.71, + "grad_norm": 3.901116152699715, + "learning_rate": 2.0269933473283137e-06, + "loss": 0.415, + "step": 4718 + }, + { + "epoch": 0.71, + "grad_norm": 5.2067091800392715, + "learning_rate": 2.025029850901827e-06, + "loss": 0.3993, + "step": 4719 + }, + { + "epoch": 0.71, + "grad_norm": 5.7798294752347745, + "learning_rate": 2.023067064421362e-06, + "loss": 0.467, + "step": 4720 + }, + { + "epoch": 0.71, + "grad_norm": 5.366255035940329, + "learning_rate": 2.0211049883553162e-06, + "loss": 0.4067, + "step": 4721 + }, + { + "epoch": 0.71, + "grad_norm": 3.9227577268451377, + "learning_rate": 2.019143623171922e-06, + "loss": 0.4161, + "step": 4722 + }, + { + "epoch": 0.71, + "grad_norm": 4.10171748518996, + "learning_rate": 2.0171829693392353e-06, + "loss": 0.4384, + "step": 4723 + }, + { + "epoch": 0.71, + "grad_norm": 12.501246466865606, + "learning_rate": 2.01522302732515e-06, + "loss": 0.4197, + "step": 4724 + }, + { + "epoch": 0.71, + "grad_norm": 3.912877247628547, + "learning_rate": 2.0132637975973832e-06, + "loss": 0.4512, + "step": 4725 + }, + { + "epoch": 0.71, + "grad_norm": 1.3154409210532136, + "learning_rate": 2.0113052806234868e-06, + "loss": 0.5244, + "step": 4726 + }, + { + "epoch": 0.71, + "grad_norm": 6.250235380106761, + "learning_rate": 2.0093474768708395e-06, + "loss": 0.4354, + "step": 4727 + }, + { + "epoch": 0.71, + "grad_norm": 6.106157840111753, + "learning_rate": 2.0073903868066495e-06, + "loss": 0.4321, + "step": 4728 + }, + { + "epoch": 0.71, + "grad_norm": 7.181806441718165, + "learning_rate": 2.0054340108979598e-06, + "loss": 0.4835, + "step": 4729 + }, + { + "epoch": 0.71, + "grad_norm": 4.168744364264677, + "learning_rate": 2.003478349611637e-06, + "loss": 0.3424, + "step": 4730 + }, + { + "epoch": 0.71, + "grad_norm": 7.076747303519995, + "learning_rate": 2.001523403414379e-06, + "loss": 0.4094, + "step": 4731 + }, + { + "epoch": 0.71, + "grad_norm": 0.9906013701034028, + "learning_rate": 1.999569172772716e-06, + "loss": 0.4802, + "step": 4732 + }, + { + "epoch": 0.71, + "grad_norm": 3.8285734828874447, + "learning_rate": 1.997615658153002e-06, + "loss": 0.449, + "step": 4733 + }, + { + "epoch": 0.71, + "grad_norm": 3.8344524000735345, + "learning_rate": 1.9956628600214267e-06, + "loss": 0.4346, + "step": 4734 + }, + { + "epoch": 0.71, + "grad_norm": 13.735080444926478, + "learning_rate": 1.9937107788440035e-06, + "loss": 0.3805, + "step": 4735 + }, + { + "epoch": 0.71, + "grad_norm": 4.264136044196781, + "learning_rate": 1.9917594150865744e-06, + "loss": 0.3887, + "step": 4736 + }, + { + "epoch": 0.71, + "grad_norm": 3.9343279246266087, + "learning_rate": 1.9898087692148164e-06, + "loss": 0.3166, + "step": 4737 + }, + { + "epoch": 0.71, + "grad_norm": 4.337342481763093, + "learning_rate": 1.987858841694229e-06, + "loss": 0.399, + "step": 4738 + }, + { + "epoch": 0.71, + "grad_norm": 3.4241735249813416, + "learning_rate": 1.9859096329901422e-06, + "loss": 0.3852, + "step": 4739 + }, + { + "epoch": 0.71, + "grad_norm": 5.7575853911919115, + "learning_rate": 1.983961143567715e-06, + "loss": 0.4134, + "step": 4740 + }, + { + "epoch": 0.72, + "grad_norm": 4.446791940639822, + "learning_rate": 1.9820133738919324e-06, + "loss": 0.409, + "step": 4741 + }, + { + "epoch": 0.72, + "grad_norm": 5.796601320489815, + "learning_rate": 1.980066324427613e-06, + "loss": 0.5017, + "step": 4742 + }, + { + "epoch": 0.72, + "grad_norm": 9.945047528469804, + "learning_rate": 1.9781199956393963e-06, + "loss": 0.4521, + "step": 4743 + }, + { + "epoch": 0.72, + "grad_norm": 3.874048737161528, + "learning_rate": 1.9761743879917585e-06, + "loss": 0.4128, + "step": 4744 + }, + { + "epoch": 0.72, + "grad_norm": 2.9873889417082498, + "learning_rate": 1.9742295019489952e-06, + "loss": 0.4534, + "step": 4745 + }, + { + "epoch": 0.72, + "grad_norm": 4.639668549414138, + "learning_rate": 1.9722853379752333e-06, + "loss": 0.4191, + "step": 4746 + }, + { + "epoch": 0.72, + "grad_norm": 3.6971203170595976, + "learning_rate": 1.9703418965344303e-06, + "loss": 0.417, + "step": 4747 + }, + { + "epoch": 0.72, + "grad_norm": 20.39474374729187, + "learning_rate": 1.968399178090366e-06, + "loss": 0.4511, + "step": 4748 + }, + { + "epoch": 0.72, + "grad_norm": 4.04790137541763, + "learning_rate": 1.966457183106652e-06, + "loss": 0.4986, + "step": 4749 + }, + { + "epoch": 0.72, + "grad_norm": 3.6056859627574367, + "learning_rate": 1.9645159120467237e-06, + "loss": 0.4548, + "step": 4750 + }, + { + "epoch": 0.72, + "grad_norm": 4.224561677316631, + "learning_rate": 1.962575365373845e-06, + "loss": 0.4647, + "step": 4751 + }, + { + "epoch": 0.72, + "grad_norm": 6.127696206630603, + "learning_rate": 1.96063554355111e-06, + "loss": 0.4852, + "step": 4752 + }, + { + "epoch": 0.72, + "grad_norm": 5.88517042410116, + "learning_rate": 1.9586964470414342e-06, + "loss": 0.4456, + "step": 4753 + }, + { + "epoch": 0.72, + "grad_norm": 10.66025798061986, + "learning_rate": 1.9567580763075666e-06, + "loss": 0.4512, + "step": 4754 + }, + { + "epoch": 0.72, + "grad_norm": 5.417383929690043, + "learning_rate": 1.954820431812077e-06, + "loss": 0.4331, + "step": 4755 + }, + { + "epoch": 0.72, + "grad_norm": 4.786966869059034, + "learning_rate": 1.9528835140173634e-06, + "loss": 0.369, + "step": 4756 + }, + { + "epoch": 0.72, + "grad_norm": 4.571297753914703, + "learning_rate": 1.9509473233856546e-06, + "loss": 0.4642, + "step": 4757 + }, + { + "epoch": 0.72, + "grad_norm": 5.8844154931955295, + "learning_rate": 1.9490118603789987e-06, + "loss": 0.4878, + "step": 4758 + }, + { + "epoch": 0.72, + "grad_norm": 5.044764298217436, + "learning_rate": 1.9470771254592772e-06, + "loss": 0.4965, + "step": 4759 + }, + { + "epoch": 0.72, + "grad_norm": 14.163962241445091, + "learning_rate": 1.945143119088195e-06, + "loss": 0.4583, + "step": 4760 + }, + { + "epoch": 0.72, + "grad_norm": 5.51364752690002, + "learning_rate": 1.9432098417272797e-06, + "loss": 0.465, + "step": 4761 + }, + { + "epoch": 0.72, + "grad_norm": 6.102156915083765, + "learning_rate": 1.941277293837891e-06, + "loss": 0.4587, + "step": 4762 + }, + { + "epoch": 0.72, + "grad_norm": 3.632436543277974, + "learning_rate": 1.9393454758812075e-06, + "loss": 0.3707, + "step": 4763 + }, + { + "epoch": 0.72, + "grad_norm": 1.1629162868207483, + "learning_rate": 1.937414388318243e-06, + "loss": 0.4965, + "step": 4764 + }, + { + "epoch": 0.72, + "grad_norm": 6.2781670246465335, + "learning_rate": 1.9354840316098283e-06, + "loss": 0.4051, + "step": 4765 + }, + { + "epoch": 0.72, + "grad_norm": 14.332777738099624, + "learning_rate": 1.9335544062166227e-06, + "loss": 0.4171, + "step": 4766 + }, + { + "epoch": 0.72, + "grad_norm": 21.23823239045441, + "learning_rate": 1.9316255125991145e-06, + "loss": 0.3867, + "step": 4767 + }, + { + "epoch": 0.72, + "grad_norm": 7.054718638541873, + "learning_rate": 1.929697351217611e-06, + "loss": 0.4448, + "step": 4768 + }, + { + "epoch": 0.72, + "grad_norm": 1.2764466745029368, + "learning_rate": 1.9277699225322517e-06, + "loss": 0.5246, + "step": 4769 + }, + { + "epoch": 0.72, + "grad_norm": 17.999192601220216, + "learning_rate": 1.9258432270029947e-06, + "loss": 0.4023, + "step": 4770 + }, + { + "epoch": 0.72, + "grad_norm": 6.10815055361747, + "learning_rate": 1.923917265089626e-06, + "loss": 0.433, + "step": 4771 + }, + { + "epoch": 0.72, + "grad_norm": 4.907788718316416, + "learning_rate": 1.9219920372517587e-06, + "loss": 0.367, + "step": 4772 + }, + { + "epoch": 0.72, + "grad_norm": 4.395287114142184, + "learning_rate": 1.920067543948829e-06, + "loss": 0.4652, + "step": 4773 + }, + { + "epoch": 0.72, + "grad_norm": 4.80292021307846, + "learning_rate": 1.918143785640093e-06, + "loss": 0.3906, + "step": 4774 + }, + { + "epoch": 0.72, + "grad_norm": 6.4021567765020295, + "learning_rate": 1.91622076278464e-06, + "loss": 0.3641, + "step": 4775 + }, + { + "epoch": 0.72, + "grad_norm": 10.016041904029297, + "learning_rate": 1.9142984758413774e-06, + "loss": 0.4161, + "step": 4776 + }, + { + "epoch": 0.72, + "grad_norm": 3.3499910086681997, + "learning_rate": 1.912376925269041e-06, + "loss": 0.3903, + "step": 4777 + }, + { + "epoch": 0.72, + "grad_norm": 9.137413852974838, + "learning_rate": 1.9104561115261886e-06, + "loss": 0.4768, + "step": 4778 + }, + { + "epoch": 0.72, + "grad_norm": 18.316567464908562, + "learning_rate": 1.9085360350712007e-06, + "loss": 0.3942, + "step": 4779 + }, + { + "epoch": 0.72, + "grad_norm": 6.117784915751582, + "learning_rate": 1.9066166963622867e-06, + "loss": 0.4866, + "step": 4780 + }, + { + "epoch": 0.72, + "grad_norm": 6.983007106790981, + "learning_rate": 1.904698095857474e-06, + "loss": 0.4503, + "step": 4781 + }, + { + "epoch": 0.72, + "grad_norm": 5.499873402984663, + "learning_rate": 1.9027802340146207e-06, + "loss": 0.4329, + "step": 4782 + }, + { + "epoch": 0.72, + "grad_norm": 4.676540606912331, + "learning_rate": 1.9008631112914022e-06, + "loss": 0.4497, + "step": 4783 + }, + { + "epoch": 0.72, + "grad_norm": 5.376980655451133, + "learning_rate": 1.8989467281453206e-06, + "loss": 0.4122, + "step": 4784 + }, + { + "epoch": 0.72, + "grad_norm": 6.388283138070941, + "learning_rate": 1.8970310850337015e-06, + "loss": 0.425, + "step": 4785 + }, + { + "epoch": 0.72, + "grad_norm": 4.168339183284806, + "learning_rate": 1.8951161824136904e-06, + "loss": 0.4904, + "step": 4786 + }, + { + "epoch": 0.72, + "grad_norm": 4.79316188216168, + "learning_rate": 1.8932020207422641e-06, + "loss": 0.4421, + "step": 4787 + }, + { + "epoch": 0.72, + "grad_norm": 7.7934992858567425, + "learning_rate": 1.8912886004762148e-06, + "loss": 0.3934, + "step": 4788 + }, + { + "epoch": 0.72, + "grad_norm": 4.110173722074461, + "learning_rate": 1.8893759220721591e-06, + "loss": 0.398, + "step": 4789 + }, + { + "epoch": 0.72, + "grad_norm": 4.777282508696908, + "learning_rate": 1.8874639859865419e-06, + "loss": 0.4125, + "step": 4790 + }, + { + "epoch": 0.72, + "grad_norm": 12.09581046049798, + "learning_rate": 1.8855527926756228e-06, + "loss": 0.3868, + "step": 4791 + }, + { + "epoch": 0.72, + "grad_norm": 10.297436907399447, + "learning_rate": 1.8836423425954925e-06, + "loss": 0.4236, + "step": 4792 + }, + { + "epoch": 0.72, + "grad_norm": 2.5108793158951612, + "learning_rate": 1.8817326362020588e-06, + "loss": 0.405, + "step": 4793 + }, + { + "epoch": 0.72, + "grad_norm": 5.250977099788029, + "learning_rate": 1.8798236739510505e-06, + "loss": 0.4438, + "step": 4794 + }, + { + "epoch": 0.72, + "grad_norm": 11.346898801866713, + "learning_rate": 1.8779154562980266e-06, + "loss": 0.4057, + "step": 4795 + }, + { + "epoch": 0.72, + "grad_norm": 4.871366371493626, + "learning_rate": 1.8760079836983613e-06, + "loss": 0.4721, + "step": 4796 + }, + { + "epoch": 0.72, + "grad_norm": 4.471619437773189, + "learning_rate": 1.8741012566072537e-06, + "loss": 0.4488, + "step": 4797 + }, + { + "epoch": 0.72, + "grad_norm": 3.7314066834232316, + "learning_rate": 1.872195275479724e-06, + "loss": 0.4699, + "step": 4798 + }, + { + "epoch": 0.72, + "grad_norm": 6.2041254733788405, + "learning_rate": 1.8702900407706136e-06, + "loss": 0.4332, + "step": 4799 + }, + { + "epoch": 0.72, + "grad_norm": 8.20921586563363, + "learning_rate": 1.8683855529345902e-06, + "loss": 0.4201, + "step": 4800 + }, + { + "epoch": 0.72, + "grad_norm": 7.4733403520552235, + "learning_rate": 1.8664818124261375e-06, + "loss": 0.4338, + "step": 4801 + }, + { + "epoch": 0.72, + "grad_norm": 8.848958990117113, + "learning_rate": 1.8645788196995669e-06, + "loss": 0.4331, + "step": 4802 + }, + { + "epoch": 0.72, + "grad_norm": 4.765926051250804, + "learning_rate": 1.8626765752090058e-06, + "loss": 0.4234, + "step": 4803 + }, + { + "epoch": 0.72, + "grad_norm": 3.8996676426468473, + "learning_rate": 1.8607750794084039e-06, + "loss": 0.441, + "step": 4804 + }, + { + "epoch": 0.72, + "grad_norm": 5.764143348480458, + "learning_rate": 1.858874332751537e-06, + "loss": 0.397, + "step": 4805 + }, + { + "epoch": 0.72, + "grad_norm": 10.842235171457089, + "learning_rate": 1.8569743356919973e-06, + "loss": 0.4874, + "step": 4806 + }, + { + "epoch": 0.73, + "grad_norm": 3.893625141478686, + "learning_rate": 1.8550750886831987e-06, + "loss": 0.3924, + "step": 4807 + }, + { + "epoch": 0.73, + "grad_norm": 6.680053428108346, + "learning_rate": 1.8531765921783778e-06, + "loss": 0.4088, + "step": 4808 + }, + { + "epoch": 0.73, + "grad_norm": 6.1158080825436985, + "learning_rate": 1.8512788466305899e-06, + "loss": 0.3828, + "step": 4809 + }, + { + "epoch": 0.73, + "grad_norm": 6.916174875721781, + "learning_rate": 1.8493818524927143e-06, + "loss": 0.4532, + "step": 4810 + }, + { + "epoch": 0.73, + "grad_norm": 18.113534331027083, + "learning_rate": 1.8474856102174471e-06, + "loss": 0.4893, + "step": 4811 + }, + { + "epoch": 0.73, + "grad_norm": 6.054129068271102, + "learning_rate": 1.84559012025731e-06, + "loss": 0.3585, + "step": 4812 + }, + { + "epoch": 0.73, + "grad_norm": 4.695444230783209, + "learning_rate": 1.843695383064641e-06, + "loss": 0.4027, + "step": 4813 + }, + { + "epoch": 0.73, + "grad_norm": 5.753104053450278, + "learning_rate": 1.8418013990915966e-06, + "loss": 0.3926, + "step": 4814 + }, + { + "epoch": 0.73, + "grad_norm": 5.585469979832173, + "learning_rate": 1.839908168790161e-06, + "loss": 0.4142, + "step": 4815 + }, + { + "epoch": 0.73, + "grad_norm": 6.043318930118474, + "learning_rate": 1.8380156926121312e-06, + "loss": 0.4595, + "step": 4816 + }, + { + "epoch": 0.73, + "grad_norm": 9.875248160862492, + "learning_rate": 1.8361239710091294e-06, + "loss": 0.4641, + "step": 4817 + }, + { + "epoch": 0.73, + "grad_norm": 0.9573754336354109, + "learning_rate": 1.834233004432594e-06, + "loss": 0.4947, + "step": 4818 + }, + { + "epoch": 0.73, + "grad_norm": 5.896341779314252, + "learning_rate": 1.8323427933337855e-06, + "loss": 0.404, + "step": 4819 + }, + { + "epoch": 0.73, + "grad_norm": 19.914681773464604, + "learning_rate": 1.830453338163783e-06, + "loss": 0.4253, + "step": 4820 + }, + { + "epoch": 0.73, + "grad_norm": 3.844182332281133, + "learning_rate": 1.828564639373483e-06, + "loss": 0.4165, + "step": 4821 + }, + { + "epoch": 0.73, + "grad_norm": 5.249741511633091, + "learning_rate": 1.8266766974136086e-06, + "loss": 0.3703, + "step": 4822 + }, + { + "epoch": 0.73, + "grad_norm": 7.7952526084534, + "learning_rate": 1.824789512734696e-06, + "loss": 0.4048, + "step": 4823 + }, + { + "epoch": 0.73, + "grad_norm": 7.329583271609569, + "learning_rate": 1.8229030857870995e-06, + "loss": 0.4291, + "step": 4824 + }, + { + "epoch": 0.73, + "grad_norm": 6.424959433592154, + "learning_rate": 1.8210174170210005e-06, + "loss": 0.4428, + "step": 4825 + }, + { + "epoch": 0.73, + "grad_norm": 3.8269199898893205, + "learning_rate": 1.8191325068863897e-06, + "loss": 0.4203, + "step": 4826 + }, + { + "epoch": 0.73, + "grad_norm": 3.989762975483705, + "learning_rate": 1.817248355833085e-06, + "loss": 0.4354, + "step": 4827 + }, + { + "epoch": 0.73, + "grad_norm": 8.670025855796599, + "learning_rate": 1.815364964310719e-06, + "loss": 0.4262, + "step": 4828 + }, + { + "epoch": 0.73, + "grad_norm": 6.372761417585162, + "learning_rate": 1.8134823327687412e-06, + "loss": 0.4699, + "step": 4829 + }, + { + "epoch": 0.73, + "grad_norm": 4.809787111876093, + "learning_rate": 1.8116004616564264e-06, + "loss": 0.4748, + "step": 4830 + }, + { + "epoch": 0.73, + "grad_norm": 4.421935693195781, + "learning_rate": 1.809719351422861e-06, + "loss": 0.4401, + "step": 4831 + }, + { + "epoch": 0.73, + "grad_norm": 6.311108126361975, + "learning_rate": 1.8078390025169534e-06, + "loss": 0.4088, + "step": 4832 + }, + { + "epoch": 0.73, + "grad_norm": 6.2955653274612215, + "learning_rate": 1.8059594153874288e-06, + "loss": 0.4529, + "step": 4833 + }, + { + "epoch": 0.73, + "grad_norm": 7.293111965725443, + "learning_rate": 1.8040805904828307e-06, + "loss": 0.3747, + "step": 4834 + }, + { + "epoch": 0.73, + "grad_norm": 6.522337220497181, + "learning_rate": 1.8022025282515238e-06, + "loss": 0.4361, + "step": 4835 + }, + { + "epoch": 0.73, + "grad_norm": 7.979629153985799, + "learning_rate": 1.8003252291416856e-06, + "loss": 0.3609, + "step": 4836 + }, + { + "epoch": 0.73, + "grad_norm": 5.896113590177915, + "learning_rate": 1.7984486936013169e-06, + "loss": 0.4897, + "step": 4837 + }, + { + "epoch": 0.73, + "grad_norm": 4.0867163934319795, + "learning_rate": 1.7965729220782325e-06, + "loss": 0.3431, + "step": 4838 + }, + { + "epoch": 0.73, + "grad_norm": 4.891413896978916, + "learning_rate": 1.7946979150200638e-06, + "loss": 0.3887, + "step": 4839 + }, + { + "epoch": 0.73, + "grad_norm": 23.492161178521677, + "learning_rate": 1.792823672874266e-06, + "loss": 0.444, + "step": 4840 + }, + { + "epoch": 0.73, + "grad_norm": 5.968469908086001, + "learning_rate": 1.7909501960881055e-06, + "loss": 0.4346, + "step": 4841 + }, + { + "epoch": 0.73, + "grad_norm": 6.057792926634205, + "learning_rate": 1.7890774851086685e-06, + "loss": 0.3707, + "step": 4842 + }, + { + "epoch": 0.73, + "grad_norm": 6.0387093934563225, + "learning_rate": 1.787205540382858e-06, + "loss": 0.4249, + "step": 4843 + }, + { + "epoch": 0.73, + "grad_norm": 1.2843482238104174, + "learning_rate": 1.7853343623573926e-06, + "loss": 0.531, + "step": 4844 + }, + { + "epoch": 0.73, + "grad_norm": 4.551262326249063, + "learning_rate": 1.7834639514788137e-06, + "loss": 0.4359, + "step": 4845 + }, + { + "epoch": 0.73, + "grad_norm": 1.0251030905347258, + "learning_rate": 1.7815943081934711e-06, + "loss": 0.4867, + "step": 4846 + }, + { + "epoch": 0.73, + "grad_norm": 1.2468523902854127, + "learning_rate": 1.7797254329475406e-06, + "loss": 0.5672, + "step": 4847 + }, + { + "epoch": 0.73, + "grad_norm": 8.934355350139679, + "learning_rate": 1.7778573261870075e-06, + "loss": 0.4741, + "step": 4848 + }, + { + "epoch": 0.73, + "grad_norm": 6.150816999915261, + "learning_rate": 1.7759899883576748e-06, + "loss": 0.4424, + "step": 4849 + }, + { + "epoch": 0.73, + "grad_norm": 5.839801670311012, + "learning_rate": 1.7741234199051665e-06, + "loss": 0.3926, + "step": 4850 + }, + { + "epoch": 0.73, + "grad_norm": 7.607223532359424, + "learning_rate": 1.7722576212749175e-06, + "loss": 0.4142, + "step": 4851 + }, + { + "epoch": 0.73, + "grad_norm": 9.059926435240088, + "learning_rate": 1.770392592912184e-06, + "loss": 0.372, + "step": 4852 + }, + { + "epoch": 0.73, + "grad_norm": 8.545666886737184, + "learning_rate": 1.7685283352620342e-06, + "loss": 0.4919, + "step": 4853 + }, + { + "epoch": 0.73, + "grad_norm": 7.0488808107560805, + "learning_rate": 1.7666648487693544e-06, + "loss": 0.4825, + "step": 4854 + }, + { + "epoch": 0.73, + "grad_norm": 5.1692074926481, + "learning_rate": 1.764802133878847e-06, + "loss": 0.4811, + "step": 4855 + }, + { + "epoch": 0.73, + "grad_norm": 5.920473605852468, + "learning_rate": 1.7629401910350264e-06, + "loss": 0.4232, + "step": 4856 + }, + { + "epoch": 0.73, + "grad_norm": 5.767986532710334, + "learning_rate": 1.7610790206822303e-06, + "loss": 0.3185, + "step": 4857 + }, + { + "epoch": 0.73, + "grad_norm": 8.979722557519594, + "learning_rate": 1.7592186232646069e-06, + "loss": 0.4315, + "step": 4858 + }, + { + "epoch": 0.73, + "grad_norm": 3.8077919084509935, + "learning_rate": 1.7573589992261185e-06, + "loss": 0.417, + "step": 4859 + }, + { + "epoch": 0.73, + "grad_norm": 11.594822457211778, + "learning_rate": 1.755500149010549e-06, + "loss": 0.3718, + "step": 4860 + }, + { + "epoch": 0.73, + "grad_norm": 1.5021175619209115, + "learning_rate": 1.75364207306149e-06, + "loss": 0.5242, + "step": 4861 + }, + { + "epoch": 0.73, + "grad_norm": 28.36490192142048, + "learning_rate": 1.7517847718223557e-06, + "loss": 0.4318, + "step": 4862 + }, + { + "epoch": 0.73, + "grad_norm": 16.472748306922306, + "learning_rate": 1.7499282457363702e-06, + "loss": 0.463, + "step": 4863 + }, + { + "epoch": 0.73, + "grad_norm": 7.323775283283541, + "learning_rate": 1.7480724952465733e-06, + "loss": 0.4751, + "step": 4864 + }, + { + "epoch": 0.73, + "grad_norm": 7.40553883348144, + "learning_rate": 1.7462175207958238e-06, + "loss": 0.4378, + "step": 4865 + }, + { + "epoch": 0.73, + "grad_norm": 8.00406488243665, + "learning_rate": 1.7443633228267898e-06, + "loss": 0.4043, + "step": 4866 + }, + { + "epoch": 0.73, + "grad_norm": 6.197980994180602, + "learning_rate": 1.742509901781958e-06, + "loss": 0.4632, + "step": 4867 + }, + { + "epoch": 0.73, + "grad_norm": 5.178142734831738, + "learning_rate": 1.740657258103627e-06, + "loss": 0.4512, + "step": 4868 + }, + { + "epoch": 0.73, + "grad_norm": 1.118999819289833, + "learning_rate": 1.7388053922339103e-06, + "loss": 0.5419, + "step": 4869 + }, + { + "epoch": 0.73, + "grad_norm": 7.840371729404817, + "learning_rate": 1.7369543046147398e-06, + "loss": 0.4184, + "step": 4870 + }, + { + "epoch": 0.73, + "grad_norm": 4.533382509410105, + "learning_rate": 1.7351039956878546e-06, + "loss": 0.3232, + "step": 4871 + }, + { + "epoch": 0.73, + "grad_norm": 7.302847347435059, + "learning_rate": 1.7332544658948159e-06, + "loss": 0.4842, + "step": 4872 + }, + { + "epoch": 0.73, + "grad_norm": 8.041976038676482, + "learning_rate": 1.7314057156769925e-06, + "loss": 0.4303, + "step": 4873 + }, + { + "epoch": 0.74, + "grad_norm": 5.571689330323869, + "learning_rate": 1.729557745475569e-06, + "loss": 0.4163, + "step": 4874 + }, + { + "epoch": 0.74, + "grad_norm": 7.224163560404418, + "learning_rate": 1.7277105557315477e-06, + "loss": 0.4514, + "step": 4875 + }, + { + "epoch": 0.74, + "grad_norm": 17.02968199332374, + "learning_rate": 1.725864146885739e-06, + "loss": 0.4865, + "step": 4876 + }, + { + "epoch": 0.74, + "grad_norm": 3.9187941838507596, + "learning_rate": 1.7240185193787701e-06, + "loss": 0.3722, + "step": 4877 + }, + { + "epoch": 0.74, + "grad_norm": 4.39109024770403, + "learning_rate": 1.7221736736510802e-06, + "loss": 0.4453, + "step": 4878 + }, + { + "epoch": 0.74, + "grad_norm": 16.67848950841273, + "learning_rate": 1.7203296101429217e-06, + "loss": 0.3983, + "step": 4879 + }, + { + "epoch": 0.74, + "grad_norm": 5.892557021435531, + "learning_rate": 1.718486329294365e-06, + "loss": 0.5005, + "step": 4880 + }, + { + "epoch": 0.74, + "grad_norm": 5.117090748996902, + "learning_rate": 1.716643831545286e-06, + "loss": 0.4444, + "step": 4881 + }, + { + "epoch": 0.74, + "grad_norm": 5.718355933694574, + "learning_rate": 1.7148021173353819e-06, + "loss": 0.3732, + "step": 4882 + }, + { + "epoch": 0.74, + "grad_norm": 8.1676876210267, + "learning_rate": 1.7129611871041563e-06, + "loss": 0.4147, + "step": 4883 + }, + { + "epoch": 0.74, + "grad_norm": 1.088968575755412, + "learning_rate": 1.7111210412909274e-06, + "loss": 0.5055, + "step": 4884 + }, + { + "epoch": 0.74, + "grad_norm": 8.187910030590789, + "learning_rate": 1.7092816803348306e-06, + "loss": 0.469, + "step": 4885 + }, + { + "epoch": 0.74, + "grad_norm": 5.215746213669689, + "learning_rate": 1.7074431046748075e-06, + "loss": 0.404, + "step": 4886 + }, + { + "epoch": 0.74, + "grad_norm": 6.966191084117976, + "learning_rate": 1.7056053147496149e-06, + "loss": 0.4984, + "step": 4887 + }, + { + "epoch": 0.74, + "grad_norm": 3.789996401071698, + "learning_rate": 1.7037683109978253e-06, + "loss": 0.4302, + "step": 4888 + }, + { + "epoch": 0.74, + "grad_norm": 6.661061526998839, + "learning_rate": 1.7019320938578189e-06, + "loss": 0.4155, + "step": 4889 + }, + { + "epoch": 0.74, + "grad_norm": 5.186022501292075, + "learning_rate": 1.7000966637677902e-06, + "loss": 0.3907, + "step": 4890 + }, + { + "epoch": 0.74, + "grad_norm": 10.010487626288365, + "learning_rate": 1.6982620211657464e-06, + "loss": 0.4232, + "step": 4891 + }, + { + "epoch": 0.74, + "grad_norm": 5.195608680303167, + "learning_rate": 1.6964281664895027e-06, + "loss": 0.4556, + "step": 4892 + }, + { + "epoch": 0.74, + "grad_norm": 4.663817486152631, + "learning_rate": 1.6945951001766947e-06, + "loss": 0.4809, + "step": 4893 + }, + { + "epoch": 0.74, + "grad_norm": 5.954266927858951, + "learning_rate": 1.6927628226647607e-06, + "loss": 0.5132, + "step": 4894 + }, + { + "epoch": 0.74, + "grad_norm": 4.641869878517609, + "learning_rate": 1.6909313343909584e-06, + "loss": 0.4287, + "step": 4895 + }, + { + "epoch": 0.74, + "grad_norm": 6.803206334854586, + "learning_rate": 1.6891006357923522e-06, + "loss": 0.3854, + "step": 4896 + }, + { + "epoch": 0.74, + "grad_norm": 18.952405855327473, + "learning_rate": 1.687270727305817e-06, + "loss": 0.4007, + "step": 4897 + }, + { + "epoch": 0.74, + "grad_norm": 12.207564145502667, + "learning_rate": 1.6854416093680458e-06, + "loss": 0.4533, + "step": 4898 + }, + { + "epoch": 0.74, + "grad_norm": 6.330269943490551, + "learning_rate": 1.6836132824155355e-06, + "loss": 0.512, + "step": 4899 + }, + { + "epoch": 0.74, + "grad_norm": 8.348833866728041, + "learning_rate": 1.6817857468846e-06, + "loss": 0.397, + "step": 4900 + }, + { + "epoch": 0.74, + "grad_norm": 9.423152856202949, + "learning_rate": 1.6799590032113606e-06, + "loss": 0.4674, + "step": 4901 + }, + { + "epoch": 0.74, + "grad_norm": 5.1333136289443475, + "learning_rate": 1.6781330518317512e-06, + "loss": 0.4347, + "step": 4902 + }, + { + "epoch": 0.74, + "grad_norm": 7.823223427455497, + "learning_rate": 1.6763078931815159e-06, + "loss": 0.3793, + "step": 4903 + }, + { + "epoch": 0.74, + "grad_norm": 7.455143246369038, + "learning_rate": 1.674483527696209e-06, + "loss": 0.4417, + "step": 4904 + }, + { + "epoch": 0.74, + "grad_norm": 9.247116094823998, + "learning_rate": 1.6726599558111988e-06, + "loss": 0.4591, + "step": 4905 + }, + { + "epoch": 0.74, + "grad_norm": 7.2975895748970405, + "learning_rate": 1.6708371779616617e-06, + "loss": 0.4598, + "step": 4906 + }, + { + "epoch": 0.74, + "grad_norm": 4.650598619127493, + "learning_rate": 1.6690151945825817e-06, + "loss": 0.4714, + "step": 4907 + }, + { + "epoch": 0.74, + "grad_norm": 7.184568054265106, + "learning_rate": 1.6671940061087611e-06, + "loss": 0.4474, + "step": 4908 + }, + { + "epoch": 0.74, + "grad_norm": 10.388266695649065, + "learning_rate": 1.6653736129748038e-06, + "loss": 0.4171, + "step": 4909 + }, + { + "epoch": 0.74, + "grad_norm": 18.78637382925074, + "learning_rate": 1.6635540156151308e-06, + "loss": 0.4044, + "step": 4910 + }, + { + "epoch": 0.74, + "grad_norm": 8.804404357101728, + "learning_rate": 1.6617352144639692e-06, + "loss": 0.4127, + "step": 4911 + }, + { + "epoch": 0.74, + "grad_norm": 5.298417331355293, + "learning_rate": 1.6599172099553573e-06, + "loss": 0.5037, + "step": 4912 + }, + { + "epoch": 0.74, + "grad_norm": 22.9245769578651, + "learning_rate": 1.658100002523143e-06, + "loss": 0.4455, + "step": 4913 + }, + { + "epoch": 0.74, + "grad_norm": 7.128143686680953, + "learning_rate": 1.6562835926009823e-06, + "loss": 0.5221, + "step": 4914 + }, + { + "epoch": 0.74, + "grad_norm": 8.87987630690349, + "learning_rate": 1.6544679806223468e-06, + "loss": 0.4389, + "step": 4915 + }, + { + "epoch": 0.74, + "grad_norm": 5.997562544096999, + "learning_rate": 1.652653167020511e-06, + "loss": 0.5207, + "step": 4916 + }, + { + "epoch": 0.74, + "grad_norm": 5.6756898675156, + "learning_rate": 1.6508391522285598e-06, + "loss": 0.3716, + "step": 4917 + }, + { + "epoch": 0.74, + "grad_norm": 11.629126002075862, + "learning_rate": 1.6490259366793931e-06, + "loss": 0.4882, + "step": 4918 + }, + { + "epoch": 0.74, + "grad_norm": 4.511961143783831, + "learning_rate": 1.6472135208057128e-06, + "loss": 0.4109, + "step": 4919 + }, + { + "epoch": 0.74, + "grad_norm": 10.787115792259728, + "learning_rate": 1.6454019050400367e-06, + "loss": 0.4362, + "step": 4920 + }, + { + "epoch": 0.74, + "grad_norm": 17.995833660736523, + "learning_rate": 1.6435910898146862e-06, + "loss": 0.3676, + "step": 4921 + }, + { + "epoch": 0.74, + "grad_norm": 9.40433157762816, + "learning_rate": 1.6417810755617918e-06, + "loss": 0.3838, + "step": 4922 + }, + { + "epoch": 0.74, + "grad_norm": 6.033522241669954, + "learning_rate": 1.6399718627132987e-06, + "loss": 0.4589, + "step": 4923 + }, + { + "epoch": 0.74, + "grad_norm": 5.0192212072313405, + "learning_rate": 1.6381634517009555e-06, + "loss": 0.4903, + "step": 4924 + }, + { + "epoch": 0.74, + "grad_norm": 5.480625025617363, + "learning_rate": 1.6363558429563197e-06, + "loss": 0.3969, + "step": 4925 + }, + { + "epoch": 0.74, + "grad_norm": 7.098339693090555, + "learning_rate": 1.6345490369107603e-06, + "loss": 0.4634, + "step": 4926 + }, + { + "epoch": 0.74, + "grad_norm": 11.193848370471887, + "learning_rate": 1.6327430339954498e-06, + "loss": 0.4342, + "step": 4927 + }, + { + "epoch": 0.74, + "grad_norm": 52.726233316108406, + "learning_rate": 1.6309378346413763e-06, + "loss": 0.3572, + "step": 4928 + }, + { + "epoch": 0.74, + "grad_norm": 5.340193376500938, + "learning_rate": 1.6291334392793278e-06, + "loss": 0.4048, + "step": 4929 + }, + { + "epoch": 0.74, + "grad_norm": 6.051044141667209, + "learning_rate": 1.6273298483399092e-06, + "loss": 0.3871, + "step": 4930 + }, + { + "epoch": 0.74, + "grad_norm": 26.84123581530738, + "learning_rate": 1.6255270622535268e-06, + "loss": 0.4584, + "step": 4931 + }, + { + "epoch": 0.74, + "grad_norm": 15.725892856004375, + "learning_rate": 1.6237250814503952e-06, + "loss": 0.3978, + "step": 4932 + }, + { + "epoch": 0.74, + "grad_norm": 4.428397990579211, + "learning_rate": 1.6219239063605425e-06, + "loss": 0.413, + "step": 4933 + }, + { + "epoch": 0.74, + "grad_norm": 5.529300400335759, + "learning_rate": 1.6201235374137975e-06, + "loss": 0.4498, + "step": 4934 + }, + { + "epoch": 0.74, + "grad_norm": 4.814173658846859, + "learning_rate": 1.618323975039801e-06, + "loss": 0.4679, + "step": 4935 + }, + { + "epoch": 0.74, + "grad_norm": 13.615970551633417, + "learning_rate": 1.6165252196679998e-06, + "loss": 0.4909, + "step": 4936 + }, + { + "epoch": 0.74, + "grad_norm": 8.1922503619048, + "learning_rate": 1.614727271727647e-06, + "loss": 0.426, + "step": 4937 + }, + { + "epoch": 0.74, + "grad_norm": 6.108320072326528, + "learning_rate": 1.612930131647807e-06, + "loss": 0.3736, + "step": 4938 + }, + { + "epoch": 0.74, + "grad_norm": 1.12896918113066, + "learning_rate": 1.6111337998573456e-06, + "loss": 0.5228, + "step": 4939 + }, + { + "epoch": 0.75, + "grad_norm": 1.2678537297782337, + "learning_rate": 1.6093382767849424e-06, + "loss": 0.5425, + "step": 4940 + }, + { + "epoch": 0.75, + "grad_norm": 12.649758765802616, + "learning_rate": 1.6075435628590785e-06, + "loss": 0.3955, + "step": 4941 + }, + { + "epoch": 0.75, + "grad_norm": 4.478793926993321, + "learning_rate": 1.6057496585080434e-06, + "loss": 0.4447, + "step": 4942 + }, + { + "epoch": 0.75, + "grad_norm": 8.320495025362742, + "learning_rate": 1.603956564159936e-06, + "loss": 0.4795, + "step": 4943 + }, + { + "epoch": 0.75, + "grad_norm": 5.116434211852291, + "learning_rate": 1.6021642802426568e-06, + "loss": 0.4213, + "step": 4944 + }, + { + "epoch": 0.75, + "grad_norm": 27.316673231305767, + "learning_rate": 1.6003728071839198e-06, + "loss": 0.3771, + "step": 4945 + }, + { + "epoch": 0.75, + "grad_norm": 34.85074357696168, + "learning_rate": 1.598582145411239e-06, + "loss": 0.4135, + "step": 4946 + }, + { + "epoch": 0.75, + "grad_norm": 5.0812517510222825, + "learning_rate": 1.5967922953519382e-06, + "loss": 0.4112, + "step": 4947 + }, + { + "epoch": 0.75, + "grad_norm": 3.6985521819593163, + "learning_rate": 1.5950032574331464e-06, + "loss": 0.3631, + "step": 4948 + }, + { + "epoch": 0.75, + "grad_norm": 11.654332732581707, + "learning_rate": 1.593215032081798e-06, + "loss": 0.4314, + "step": 4949 + }, + { + "epoch": 0.75, + "grad_norm": 5.00186330803566, + "learning_rate": 1.5914276197246365e-06, + "loss": 0.397, + "step": 4950 + }, + { + "epoch": 0.75, + "grad_norm": 8.64873041810846, + "learning_rate": 1.589641020788209e-06, + "loss": 0.3741, + "step": 4951 + }, + { + "epoch": 0.75, + "grad_norm": 6.434797175090717, + "learning_rate": 1.5878552356988674e-06, + "loss": 0.4011, + "step": 4952 + }, + { + "epoch": 0.75, + "grad_norm": 3.6503700571785704, + "learning_rate": 1.5860702648827736e-06, + "loss": 0.4605, + "step": 4953 + }, + { + "epoch": 0.75, + "grad_norm": 4.212988445695651, + "learning_rate": 1.5842861087658895e-06, + "loss": 0.4288, + "step": 4954 + }, + { + "epoch": 0.75, + "grad_norm": 3.8616210004920544, + "learning_rate": 1.5825027677739891e-06, + "loss": 0.4097, + "step": 4955 + }, + { + "epoch": 0.75, + "grad_norm": 5.879344358693332, + "learning_rate": 1.5807202423326463e-06, + "loss": 0.4339, + "step": 4956 + }, + { + "epoch": 0.75, + "grad_norm": 4.488216598804553, + "learning_rate": 1.5789385328672407e-06, + "loss": 0.4498, + "step": 4957 + }, + { + "epoch": 0.75, + "grad_norm": 4.873919797969425, + "learning_rate": 1.5771576398029625e-06, + "loss": 0.4168, + "step": 4958 + }, + { + "epoch": 0.75, + "grad_norm": 5.826562207919402, + "learning_rate": 1.5753775635648022e-06, + "loss": 0.4974, + "step": 4959 + }, + { + "epoch": 0.75, + "grad_norm": 17.696820792300283, + "learning_rate": 1.5735983045775567e-06, + "loss": 0.4892, + "step": 4960 + }, + { + "epoch": 0.75, + "grad_norm": 13.478672577520692, + "learning_rate": 1.5718198632658272e-06, + "loss": 0.4541, + "step": 4961 + }, + { + "epoch": 0.75, + "grad_norm": 5.68228939997883, + "learning_rate": 1.5700422400540194e-06, + "loss": 0.4658, + "step": 4962 + }, + { + "epoch": 0.75, + "grad_norm": 3.772958624788832, + "learning_rate": 1.5682654353663474e-06, + "loss": 0.4275, + "step": 4963 + }, + { + "epoch": 0.75, + "grad_norm": 11.524743331608585, + "learning_rate": 1.5664894496268246e-06, + "loss": 0.428, + "step": 4964 + }, + { + "epoch": 0.75, + "grad_norm": 7.532414852531679, + "learning_rate": 1.564714283259275e-06, + "loss": 0.4226, + "step": 4965 + }, + { + "epoch": 0.75, + "grad_norm": 4.58502843547674, + "learning_rate": 1.562939936687322e-06, + "loss": 0.4098, + "step": 4966 + }, + { + "epoch": 0.75, + "grad_norm": 5.281653260809573, + "learning_rate": 1.5611664103343933e-06, + "loss": 0.4738, + "step": 4967 + }, + { + "epoch": 0.75, + "grad_norm": 3.0186521786628306, + "learning_rate": 1.5593937046237268e-06, + "loss": 0.4278, + "step": 4968 + }, + { + "epoch": 0.75, + "grad_norm": 6.664192832521856, + "learning_rate": 1.5576218199783578e-06, + "loss": 0.418, + "step": 4969 + }, + { + "epoch": 0.75, + "grad_norm": 1.110019828106071, + "learning_rate": 1.5558507568211289e-06, + "loss": 0.5295, + "step": 4970 + }, + { + "epoch": 0.75, + "grad_norm": 4.346694597508153, + "learning_rate": 1.5540805155746863e-06, + "loss": 0.4122, + "step": 4971 + }, + { + "epoch": 0.75, + "grad_norm": 5.4587951647396435, + "learning_rate": 1.5523110966614778e-06, + "loss": 0.4142, + "step": 4972 + }, + { + "epoch": 0.75, + "grad_norm": 18.4464459571236, + "learning_rate": 1.5505425005037606e-06, + "loss": 0.5019, + "step": 4973 + }, + { + "epoch": 0.75, + "grad_norm": 4.433047318968047, + "learning_rate": 1.5487747275235888e-06, + "loss": 0.4582, + "step": 4974 + }, + { + "epoch": 0.75, + "grad_norm": 7.583302793967848, + "learning_rate": 1.5470077781428254e-06, + "loss": 0.4551, + "step": 4975 + }, + { + "epoch": 0.75, + "grad_norm": 7.920971170604424, + "learning_rate": 1.5452416527831348e-06, + "loss": 0.396, + "step": 4976 + }, + { + "epoch": 0.75, + "grad_norm": 15.03584059436499, + "learning_rate": 1.5434763518659817e-06, + "loss": 0.4102, + "step": 4977 + }, + { + "epoch": 0.75, + "grad_norm": 5.455199832648352, + "learning_rate": 1.5417118758126408e-06, + "loss": 0.5157, + "step": 4978 + }, + { + "epoch": 0.75, + "grad_norm": 1.3688125223528294, + "learning_rate": 1.5399482250441827e-06, + "loss": 0.5449, + "step": 4979 + }, + { + "epoch": 0.75, + "grad_norm": 4.913251830182728, + "learning_rate": 1.5381853999814882e-06, + "loss": 0.4201, + "step": 4980 + }, + { + "epoch": 0.75, + "grad_norm": 5.102867226372371, + "learning_rate": 1.5364234010452355e-06, + "loss": 0.4513, + "step": 4981 + }, + { + "epoch": 0.75, + "grad_norm": 5.336885961993578, + "learning_rate": 1.534662228655907e-06, + "loss": 0.4167, + "step": 4982 + }, + { + "epoch": 0.75, + "grad_norm": 7.660676716155712, + "learning_rate": 1.5329018832337895e-06, + "loss": 0.4492, + "step": 4983 + }, + { + "epoch": 0.75, + "grad_norm": 5.574517719236944, + "learning_rate": 1.5311423651989681e-06, + "loss": 0.449, + "step": 4984 + }, + { + "epoch": 0.75, + "grad_norm": 4.337382435500911, + "learning_rate": 1.5293836749713386e-06, + "loss": 0.4748, + "step": 4985 + }, + { + "epoch": 0.75, + "grad_norm": 4.953432348233978, + "learning_rate": 1.5276258129705918e-06, + "loss": 0.403, + "step": 4986 + }, + { + "epoch": 0.75, + "grad_norm": 5.968175989762074, + "learning_rate": 1.5258687796162213e-06, + "loss": 0.4911, + "step": 4987 + }, + { + "epoch": 0.75, + "grad_norm": 17.076921614142, + "learning_rate": 1.5241125753275294e-06, + "loss": 0.4593, + "step": 4988 + }, + { + "epoch": 0.75, + "grad_norm": 21.29709436044458, + "learning_rate": 1.5223572005236127e-06, + "loss": 0.3512, + "step": 4989 + }, + { + "epoch": 0.75, + "grad_norm": 8.886619188293746, + "learning_rate": 1.5206026556233761e-06, + "loss": 0.4307, + "step": 4990 + }, + { + "epoch": 0.75, + "grad_norm": 5.704069095500053, + "learning_rate": 1.518848941045522e-06, + "loss": 0.4009, + "step": 4991 + }, + { + "epoch": 0.75, + "grad_norm": 34.07310361399107, + "learning_rate": 1.5170960572085552e-06, + "loss": 0.3855, + "step": 4992 + }, + { + "epoch": 0.75, + "grad_norm": 7.31798468522245, + "learning_rate": 1.5153440045307865e-06, + "loss": 0.4079, + "step": 4993 + }, + { + "epoch": 0.75, + "grad_norm": 6.825764117579442, + "learning_rate": 1.5135927834303238e-06, + "loss": 0.4062, + "step": 4994 + }, + { + "epoch": 0.75, + "grad_norm": 5.7469029121473705, + "learning_rate": 1.511842394325077e-06, + "loss": 0.4039, + "step": 4995 + }, + { + "epoch": 0.75, + "grad_norm": 4.482634405815571, + "learning_rate": 1.51009283763276e-06, + "loss": 0.3795, + "step": 4996 + }, + { + "epoch": 0.75, + "grad_norm": 6.924154988394382, + "learning_rate": 1.5083441137708837e-06, + "loss": 0.3926, + "step": 4997 + }, + { + "epoch": 0.75, + "grad_norm": 6.011807234411183, + "learning_rate": 1.506596223156767e-06, + "loss": 0.4554, + "step": 4998 + }, + { + "epoch": 0.75, + "grad_norm": 1.2554318549154682, + "learning_rate": 1.5048491662075244e-06, + "loss": 0.5372, + "step": 4999 + }, + { + "epoch": 0.75, + "grad_norm": 7.727035800579789, + "learning_rate": 1.5031029433400706e-06, + "loss": 0.4526, + "step": 5000 + }, + { + "epoch": 0.75, + "grad_norm": 0.9668285819469707, + "learning_rate": 1.5013575549711274e-06, + "loss": 0.4905, + "step": 5001 + }, + { + "epoch": 0.75, + "grad_norm": 7.305456759648467, + "learning_rate": 1.499613001517211e-06, + "loss": 0.4602, + "step": 5002 + }, + { + "epoch": 0.75, + "grad_norm": 7.64271793865173, + "learning_rate": 1.497869283394644e-06, + "loss": 0.3659, + "step": 5003 + }, + { + "epoch": 0.75, + "grad_norm": 6.469227620734533, + "learning_rate": 1.4961264010195454e-06, + "loss": 0.4103, + "step": 5004 + }, + { + "epoch": 0.75, + "grad_norm": 8.257764161899248, + "learning_rate": 1.4943843548078353e-06, + "loss": 0.4562, + "step": 5005 + }, + { + "epoch": 0.76, + "grad_norm": 10.573707041627797, + "learning_rate": 1.4926431451752361e-06, + "loss": 0.4107, + "step": 5006 + }, + { + "epoch": 0.76, + "grad_norm": 7.692833025163236, + "learning_rate": 1.4909027725372676e-06, + "loss": 0.4144, + "step": 5007 + }, + { + "epoch": 0.76, + "grad_norm": 1.1615584357344928, + "learning_rate": 1.4891632373092551e-06, + "loss": 0.5538, + "step": 5008 + }, + { + "epoch": 0.76, + "grad_norm": 14.037220153784945, + "learning_rate": 1.4874245399063197e-06, + "loss": 0.4295, + "step": 5009 + }, + { + "epoch": 0.76, + "grad_norm": 4.991245378434636, + "learning_rate": 1.4856866807433807e-06, + "loss": 0.4175, + "step": 5010 + }, + { + "epoch": 0.76, + "grad_norm": 7.047828441359134, + "learning_rate": 1.483949660235165e-06, + "loss": 0.4211, + "step": 5011 + }, + { + "epoch": 0.76, + "grad_norm": 9.10353848545864, + "learning_rate": 1.48221347879619e-06, + "loss": 0.3995, + "step": 5012 + }, + { + "epoch": 0.76, + "grad_norm": 4.078817742004709, + "learning_rate": 1.4804781368407822e-06, + "loss": 0.4085, + "step": 5013 + }, + { + "epoch": 0.76, + "grad_norm": 6.93248203287576, + "learning_rate": 1.4787436347830602e-06, + "loss": 0.4385, + "step": 5014 + }, + { + "epoch": 0.76, + "grad_norm": 4.2386796232560195, + "learning_rate": 1.4770099730369447e-06, + "loss": 0.4073, + "step": 5015 + }, + { + "epoch": 0.76, + "grad_norm": 8.731620232758024, + "learning_rate": 1.4752771520161584e-06, + "loss": 0.4584, + "step": 5016 + }, + { + "epoch": 0.76, + "grad_norm": 4.705219357918715, + "learning_rate": 1.4735451721342203e-06, + "loss": 0.4428, + "step": 5017 + }, + { + "epoch": 0.76, + "grad_norm": 4.374967374081161, + "learning_rate": 1.471814033804449e-06, + "loss": 0.4582, + "step": 5018 + }, + { + "epoch": 0.76, + "grad_norm": 4.57975153571692, + "learning_rate": 1.4700837374399634e-06, + "loss": 0.4491, + "step": 5019 + }, + { + "epoch": 0.76, + "grad_norm": 6.067021484946328, + "learning_rate": 1.468354283453679e-06, + "loss": 0.4487, + "step": 5020 + }, + { + "epoch": 0.76, + "grad_norm": 7.771526750438657, + "learning_rate": 1.466625672258316e-06, + "loss": 0.4155, + "step": 5021 + }, + { + "epoch": 0.76, + "grad_norm": 5.463939821230833, + "learning_rate": 1.4648979042663853e-06, + "loss": 0.4636, + "step": 5022 + }, + { + "epoch": 0.76, + "grad_norm": 11.856508054285326, + "learning_rate": 1.4631709798902055e-06, + "loss": 0.4192, + "step": 5023 + }, + { + "epoch": 0.76, + "grad_norm": 9.846500647665819, + "learning_rate": 1.4614448995418868e-06, + "loss": 0.4378, + "step": 5024 + }, + { + "epoch": 0.76, + "grad_norm": 5.0311535692938625, + "learning_rate": 1.4597196636333389e-06, + "loss": 0.3613, + "step": 5025 + }, + { + "epoch": 0.76, + "grad_norm": 1.2251153346105808, + "learning_rate": 1.4579952725762753e-06, + "loss": 0.5423, + "step": 5026 + }, + { + "epoch": 0.76, + "grad_norm": 5.890956368901254, + "learning_rate": 1.4562717267822013e-06, + "loss": 0.4588, + "step": 5027 + }, + { + "epoch": 0.76, + "grad_norm": 4.421942934626455, + "learning_rate": 1.454549026662425e-06, + "loss": 0.4605, + "step": 5028 + }, + { + "epoch": 0.76, + "grad_norm": 19.777961898317834, + "learning_rate": 1.4528271726280518e-06, + "loss": 0.4693, + "step": 5029 + }, + { + "epoch": 0.76, + "grad_norm": 9.164174319310765, + "learning_rate": 1.4511061650899822e-06, + "loss": 0.4213, + "step": 5030 + }, + { + "epoch": 0.76, + "grad_norm": 181.86978784583428, + "learning_rate": 1.4493860044589187e-06, + "loss": 0.4255, + "step": 5031 + }, + { + "epoch": 0.76, + "grad_norm": 6.21550242859919, + "learning_rate": 1.447666691145357e-06, + "loss": 0.4102, + "step": 5032 + }, + { + "epoch": 0.76, + "grad_norm": 10.418637999509029, + "learning_rate": 1.4459482255595975e-06, + "loss": 0.4263, + "step": 5033 + }, + { + "epoch": 0.76, + "grad_norm": 5.003351528918016, + "learning_rate": 1.4442306081117325e-06, + "loss": 0.4699, + "step": 5034 + }, + { + "epoch": 0.76, + "grad_norm": 10.40828557545126, + "learning_rate": 1.4425138392116516e-06, + "loss": 0.4436, + "step": 5035 + }, + { + "epoch": 0.76, + "grad_norm": 7.154325875127878, + "learning_rate": 1.4407979192690486e-06, + "loss": 0.3938, + "step": 5036 + }, + { + "epoch": 0.76, + "grad_norm": 14.159195327938118, + "learning_rate": 1.439082848693406e-06, + "loss": 0.4731, + "step": 5037 + }, + { + "epoch": 0.76, + "grad_norm": 5.848309158793154, + "learning_rate": 1.4373686278940107e-06, + "loss": 0.4148, + "step": 5038 + }, + { + "epoch": 0.76, + "grad_norm": 14.440524526535384, + "learning_rate": 1.4356552572799432e-06, + "loss": 0.3897, + "step": 5039 + }, + { + "epoch": 0.76, + "grad_norm": 4.117808057537764, + "learning_rate": 1.4339427372600811e-06, + "loss": 0.3786, + "step": 5040 + }, + { + "epoch": 0.76, + "grad_norm": 5.339388721116429, + "learning_rate": 1.4322310682430996e-06, + "loss": 0.454, + "step": 5041 + }, + { + "epoch": 0.76, + "grad_norm": 6.8734836291596215, + "learning_rate": 1.4305202506374695e-06, + "loss": 0.4473, + "step": 5042 + }, + { + "epoch": 0.76, + "grad_norm": 4.825666242375968, + "learning_rate": 1.428810284851463e-06, + "loss": 0.482, + "step": 5043 + }, + { + "epoch": 0.76, + "grad_norm": 55.66188834973104, + "learning_rate": 1.4271011712931443e-06, + "loss": 0.4147, + "step": 5044 + }, + { + "epoch": 0.76, + "grad_norm": 8.982597282498034, + "learning_rate": 1.4253929103703745e-06, + "loss": 0.3752, + "step": 5045 + }, + { + "epoch": 0.76, + "grad_norm": 12.134829373252874, + "learning_rate": 1.4236855024908153e-06, + "loss": 0.4382, + "step": 5046 + }, + { + "epoch": 0.76, + "grad_norm": 4.638815280767644, + "learning_rate": 1.4219789480619183e-06, + "loss": 0.4752, + "step": 5047 + }, + { + "epoch": 0.76, + "grad_norm": 6.848971597262186, + "learning_rate": 1.4202732474909397e-06, + "loss": 0.4162, + "step": 5048 + }, + { + "epoch": 0.76, + "grad_norm": 7.618388263151876, + "learning_rate": 1.4185684011849243e-06, + "loss": 0.4437, + "step": 5049 + }, + { + "epoch": 0.76, + "grad_norm": 11.510118289733407, + "learning_rate": 1.4168644095507155e-06, + "loss": 0.4332, + "step": 5050 + }, + { + "epoch": 0.76, + "grad_norm": 1.0487746346455564, + "learning_rate": 1.4151612729949566e-06, + "loss": 0.4938, + "step": 5051 + }, + { + "epoch": 0.76, + "grad_norm": 8.430452058097448, + "learning_rate": 1.4134589919240815e-06, + "loss": 0.4029, + "step": 5052 + }, + { + "epoch": 0.76, + "grad_norm": 5.594057396851826, + "learning_rate": 1.4117575667443224e-06, + "loss": 0.4821, + "step": 5053 + }, + { + "epoch": 0.76, + "grad_norm": 5.324014851793637, + "learning_rate": 1.4100569978617068e-06, + "loss": 0.3887, + "step": 5054 + }, + { + "epoch": 0.76, + "grad_norm": 42.432809571795296, + "learning_rate": 1.4083572856820564e-06, + "loss": 0.4324, + "step": 5055 + }, + { + "epoch": 0.76, + "grad_norm": 7.378214528511265, + "learning_rate": 1.4066584306109937e-06, + "loss": 0.4643, + "step": 5056 + }, + { + "epoch": 0.76, + "grad_norm": 4.132300953441496, + "learning_rate": 1.4049604330539285e-06, + "loss": 0.3994, + "step": 5057 + }, + { + "epoch": 0.76, + "grad_norm": 8.704928136765426, + "learning_rate": 1.403263293416075e-06, + "loss": 0.5244, + "step": 5058 + }, + { + "epoch": 0.76, + "grad_norm": 8.865145634010185, + "learning_rate": 1.4015670121024355e-06, + "loss": 0.4691, + "step": 5059 + }, + { + "epoch": 0.76, + "grad_norm": 4.873181629234607, + "learning_rate": 1.3998715895178093e-06, + "loss": 0.445, + "step": 5060 + }, + { + "epoch": 0.76, + "grad_norm": 28.163526929394067, + "learning_rate": 1.3981770260667948e-06, + "loss": 0.4537, + "step": 5061 + }, + { + "epoch": 0.76, + "grad_norm": 6.531773599097027, + "learning_rate": 1.3964833221537794e-06, + "loss": 0.4047, + "step": 5062 + }, + { + "epoch": 0.76, + "grad_norm": 5.100363959231573, + "learning_rate": 1.3947904781829497e-06, + "loss": 0.4087, + "step": 5063 + }, + { + "epoch": 0.76, + "grad_norm": 8.537724691517257, + "learning_rate": 1.3930984945582837e-06, + "loss": 0.4265, + "step": 5064 + }, + { + "epoch": 0.76, + "grad_norm": 5.603927719488869, + "learning_rate": 1.3914073716835557e-06, + "loss": 0.4516, + "step": 5065 + }, + { + "epoch": 0.76, + "grad_norm": 5.544240876025431, + "learning_rate": 1.389717109962338e-06, + "loss": 0.4305, + "step": 5066 + }, + { + "epoch": 0.76, + "grad_norm": 4.480323024762036, + "learning_rate": 1.3880277097979899e-06, + "loss": 0.3711, + "step": 5067 + }, + { + "epoch": 0.76, + "grad_norm": 7.702112300801673, + "learning_rate": 1.3863391715936736e-06, + "loss": 0.4476, + "step": 5068 + }, + { + "epoch": 0.76, + "grad_norm": 1.1843117534125258, + "learning_rate": 1.3846514957523389e-06, + "loss": 0.5532, + "step": 5069 + }, + { + "epoch": 0.76, + "grad_norm": 1.024133521290889, + "learning_rate": 1.3829646826767312e-06, + "loss": 0.5096, + "step": 5070 + }, + { + "epoch": 0.76, + "grad_norm": 6.677878441041741, + "learning_rate": 1.3812787327693945e-06, + "loss": 0.435, + "step": 5071 + }, + { + "epoch": 0.77, + "grad_norm": 4.956339524483958, + "learning_rate": 1.37959364643266e-06, + "loss": 0.4537, + "step": 5072 + }, + { + "epoch": 0.77, + "grad_norm": 27.865882478510272, + "learning_rate": 1.377909424068659e-06, + "loss": 0.4076, + "step": 5073 + }, + { + "epoch": 0.77, + "grad_norm": 5.615725433438222, + "learning_rate": 1.3762260660793125e-06, + "loss": 0.4513, + "step": 5074 + }, + { + "epoch": 0.77, + "grad_norm": 6.99264071019721, + "learning_rate": 1.374543572866337e-06, + "loss": 0.4499, + "step": 5075 + }, + { + "epoch": 0.77, + "grad_norm": 5.101540467122868, + "learning_rate": 1.3728619448312413e-06, + "loss": 0.4315, + "step": 5076 + }, + { + "epoch": 0.77, + "grad_norm": 4.209007994735339, + "learning_rate": 1.3711811823753286e-06, + "loss": 0.381, + "step": 5077 + }, + { + "epoch": 0.77, + "grad_norm": 10.545759366064242, + "learning_rate": 1.369501285899697e-06, + "loss": 0.4285, + "step": 5078 + }, + { + "epoch": 0.77, + "grad_norm": 8.404897518354886, + "learning_rate": 1.3678222558052357e-06, + "loss": 0.4278, + "step": 5079 + }, + { + "epoch": 0.77, + "grad_norm": 5.274680194953741, + "learning_rate": 1.366144092492626e-06, + "loss": 0.4319, + "step": 5080 + }, + { + "epoch": 0.77, + "grad_norm": 4.252560613176315, + "learning_rate": 1.364466796362348e-06, + "loss": 0.3952, + "step": 5081 + }, + { + "epoch": 0.77, + "grad_norm": 6.587293480920098, + "learning_rate": 1.362790367814668e-06, + "loss": 0.4472, + "step": 5082 + }, + { + "epoch": 0.77, + "grad_norm": 5.9490719415231865, + "learning_rate": 1.3611148072496515e-06, + "loss": 0.4503, + "step": 5083 + }, + { + "epoch": 0.77, + "grad_norm": 6.045786535039181, + "learning_rate": 1.3594401150671521e-06, + "loss": 0.3873, + "step": 5084 + }, + { + "epoch": 0.77, + "grad_norm": 5.506321741753398, + "learning_rate": 1.3577662916668167e-06, + "loss": 0.4566, + "step": 5085 + }, + { + "epoch": 0.77, + "grad_norm": 5.8837960273195815, + "learning_rate": 1.3560933374480877e-06, + "loss": 0.435, + "step": 5086 + }, + { + "epoch": 0.77, + "grad_norm": 6.092460221328097, + "learning_rate": 1.3544212528101986e-06, + "loss": 0.5137, + "step": 5087 + }, + { + "epoch": 0.77, + "grad_norm": 6.0347044119138, + "learning_rate": 1.3527500381521746e-06, + "loss": 0.448, + "step": 5088 + }, + { + "epoch": 0.77, + "grad_norm": 5.930244687050622, + "learning_rate": 1.3510796938728331e-06, + "loss": 0.3529, + "step": 5089 + }, + { + "epoch": 0.77, + "grad_norm": 9.329309031948746, + "learning_rate": 1.3494102203707848e-06, + "loss": 0.4177, + "step": 5090 + }, + { + "epoch": 0.77, + "grad_norm": 5.620250468507199, + "learning_rate": 1.3477416180444337e-06, + "loss": 0.4571, + "step": 5091 + }, + { + "epoch": 0.77, + "grad_norm": 5.955158178794182, + "learning_rate": 1.346073887291972e-06, + "loss": 0.4233, + "step": 5092 + }, + { + "epoch": 0.77, + "grad_norm": 4.703983476991287, + "learning_rate": 1.3444070285113903e-06, + "loss": 0.4612, + "step": 5093 + }, + { + "epoch": 0.77, + "grad_norm": 6.573998628513427, + "learning_rate": 1.342741042100465e-06, + "loss": 0.4237, + "step": 5094 + }, + { + "epoch": 0.77, + "grad_norm": 7.575709227372103, + "learning_rate": 1.3410759284567654e-06, + "loss": 0.3889, + "step": 5095 + }, + { + "epoch": 0.77, + "grad_norm": 6.122960875552425, + "learning_rate": 1.339411687977657e-06, + "loss": 0.3971, + "step": 5096 + }, + { + "epoch": 0.77, + "grad_norm": 11.590237955023165, + "learning_rate": 1.3377483210602916e-06, + "loss": 0.4994, + "step": 5097 + }, + { + "epoch": 0.77, + "grad_norm": 6.60530443659417, + "learning_rate": 1.3360858281016148e-06, + "loss": 0.4718, + "step": 5098 + }, + { + "epoch": 0.77, + "grad_norm": 7.644522660749432, + "learning_rate": 1.3344242094983634e-06, + "loss": 0.4426, + "step": 5099 + }, + { + "epoch": 0.77, + "grad_norm": 10.694368078447145, + "learning_rate": 1.332763465647064e-06, + "loss": 0.3835, + "step": 5100 + }, + { + "epoch": 0.77, + "grad_norm": 8.313994520571706, + "learning_rate": 1.3311035969440394e-06, + "loss": 0.3112, + "step": 5101 + }, + { + "epoch": 0.77, + "grad_norm": 6.283374724321883, + "learning_rate": 1.3294446037853976e-06, + "loss": 0.4149, + "step": 5102 + }, + { + "epoch": 0.77, + "grad_norm": 4.196433765836003, + "learning_rate": 1.327786486567042e-06, + "loss": 0.3418, + "step": 5103 + }, + { + "epoch": 0.77, + "grad_norm": 5.0053539570023124, + "learning_rate": 1.3261292456846648e-06, + "loss": 0.4472, + "step": 5104 + }, + { + "epoch": 0.77, + "grad_norm": 4.455400762496257, + "learning_rate": 1.3244728815337472e-06, + "loss": 0.4125, + "step": 5105 + }, + { + "epoch": 0.77, + "grad_norm": 7.852037869214728, + "learning_rate": 1.3228173945095668e-06, + "loss": 0.4249, + "step": 5106 + }, + { + "epoch": 0.77, + "grad_norm": 6.08036676535131, + "learning_rate": 1.3211627850071872e-06, + "loss": 0.3972, + "step": 5107 + }, + { + "epoch": 0.77, + "grad_norm": 5.6794212779192295, + "learning_rate": 1.3195090534214628e-06, + "loss": 0.3449, + "step": 5108 + }, + { + "epoch": 0.77, + "grad_norm": 3.8932722797065846, + "learning_rate": 1.3178562001470414e-06, + "loss": 0.4247, + "step": 5109 + }, + { + "epoch": 0.77, + "grad_norm": 8.23750522876511, + "learning_rate": 1.3162042255783591e-06, + "loss": 0.4437, + "step": 5110 + }, + { + "epoch": 0.77, + "grad_norm": 69.25474528353405, + "learning_rate": 1.314553130109642e-06, + "loss": 0.3696, + "step": 5111 + }, + { + "epoch": 0.77, + "grad_norm": 5.391821194467162, + "learning_rate": 1.3129029141349075e-06, + "loss": 0.4452, + "step": 5112 + }, + { + "epoch": 0.77, + "grad_norm": 7.4782162064933555, + "learning_rate": 1.3112535780479607e-06, + "loss": 0.4021, + "step": 5113 + }, + { + "epoch": 0.77, + "grad_norm": 8.155508232891005, + "learning_rate": 1.3096051222424023e-06, + "loss": 0.4565, + "step": 5114 + }, + { + "epoch": 0.77, + "grad_norm": 20.104237032047223, + "learning_rate": 1.3079575471116157e-06, + "loss": 0.4195, + "step": 5115 + }, + { + "epoch": 0.77, + "grad_norm": 20.615915478594456, + "learning_rate": 1.3063108530487812e-06, + "loss": 0.4159, + "step": 5116 + }, + { + "epoch": 0.77, + "grad_norm": 7.457310352825753, + "learning_rate": 1.304665040446863e-06, + "loss": 0.4336, + "step": 5117 + }, + { + "epoch": 0.77, + "grad_norm": 1.047668525756197, + "learning_rate": 1.3030201096986172e-06, + "loss": 0.5261, + "step": 5118 + }, + { + "epoch": 0.77, + "grad_norm": 13.476392258921688, + "learning_rate": 1.301376061196592e-06, + "loss": 0.4793, + "step": 5119 + }, + { + "epoch": 0.77, + "grad_norm": 1.2967073747642202, + "learning_rate": 1.2997328953331194e-06, + "loss": 0.5636, + "step": 5120 + }, + { + "epoch": 0.77, + "grad_norm": 7.272901466599276, + "learning_rate": 1.2980906125003273e-06, + "loss": 0.4419, + "step": 5121 + }, + { + "epoch": 0.77, + "grad_norm": 5.032222843301946, + "learning_rate": 1.296449213090128e-06, + "loss": 0.4813, + "step": 5122 + }, + { + "epoch": 0.77, + "grad_norm": 12.105906031826008, + "learning_rate": 1.2948086974942242e-06, + "loss": 0.5116, + "step": 5123 + }, + { + "epoch": 0.77, + "grad_norm": 5.168090665655384, + "learning_rate": 1.293169066104109e-06, + "loss": 0.4129, + "step": 5124 + }, + { + "epoch": 0.77, + "grad_norm": 7.982034978506171, + "learning_rate": 1.2915303193110619e-06, + "loss": 0.3922, + "step": 5125 + }, + { + "epoch": 0.77, + "grad_norm": 6.729913984841636, + "learning_rate": 1.289892457506155e-06, + "loss": 0.3865, + "step": 5126 + }, + { + "epoch": 0.77, + "grad_norm": 6.713053274031575, + "learning_rate": 1.2882554810802466e-06, + "loss": 0.4657, + "step": 5127 + }, + { + "epoch": 0.77, + "grad_norm": 9.168304109565556, + "learning_rate": 1.286619390423982e-06, + "loss": 0.4183, + "step": 5128 + }, + { + "epoch": 0.77, + "grad_norm": 5.869869976406264, + "learning_rate": 1.2849841859278017e-06, + "loss": 0.3842, + "step": 5129 + }, + { + "epoch": 0.77, + "grad_norm": 4.898132562050409, + "learning_rate": 1.2833498679819261e-06, + "loss": 0.362, + "step": 5130 + }, + { + "epoch": 0.77, + "grad_norm": 12.084542099062338, + "learning_rate": 1.2817164369763718e-06, + "loss": 0.4165, + "step": 5131 + }, + { + "epoch": 0.77, + "grad_norm": 16.603211518560276, + "learning_rate": 1.280083893300939e-06, + "loss": 0.5093, + "step": 5132 + }, + { + "epoch": 0.77, + "grad_norm": 9.411280345203798, + "learning_rate": 1.2784522373452174e-06, + "loss": 0.3904, + "step": 5133 + }, + { + "epoch": 0.77, + "grad_norm": 29.509160866334966, + "learning_rate": 1.2768214694985848e-06, + "loss": 0.4202, + "step": 5134 + }, + { + "epoch": 0.77, + "grad_norm": 8.066588868172394, + "learning_rate": 1.2751915901502055e-06, + "loss": 0.4333, + "step": 5135 + }, + { + "epoch": 0.77, + "grad_norm": 13.68333296425764, + "learning_rate": 1.273562599689037e-06, + "loss": 0.4025, + "step": 5136 + }, + { + "epoch": 0.77, + "grad_norm": 6.386057410407292, + "learning_rate": 1.271934498503819e-06, + "loss": 0.4212, + "step": 5137 + }, + { + "epoch": 0.77, + "grad_norm": 8.350984280334277, + "learning_rate": 1.27030728698308e-06, + "loss": 0.3799, + "step": 5138 + }, + { + "epoch": 0.78, + "grad_norm": 10.86664175747827, + "learning_rate": 1.2686809655151406e-06, + "loss": 0.3936, + "step": 5139 + }, + { + "epoch": 0.78, + "grad_norm": 24.64420114527087, + "learning_rate": 1.2670555344881014e-06, + "loss": 0.4639, + "step": 5140 + }, + { + "epoch": 0.78, + "grad_norm": 21.063636411691398, + "learning_rate": 1.265430994289859e-06, + "loss": 0.4485, + "step": 5141 + }, + { + "epoch": 0.78, + "grad_norm": 48.66010014128539, + "learning_rate": 1.2638073453080919e-06, + "loss": 0.357, + "step": 5142 + }, + { + "epoch": 0.78, + "grad_norm": 8.281473066130745, + "learning_rate": 1.2621845879302646e-06, + "loss": 0.3863, + "step": 5143 + }, + { + "epoch": 0.78, + "grad_norm": 5.739456548021198, + "learning_rate": 1.2605627225436352e-06, + "loss": 0.4571, + "step": 5144 + }, + { + "epoch": 0.78, + "grad_norm": 7.785867311520014, + "learning_rate": 1.2589417495352436e-06, + "loss": 0.411, + "step": 5145 + }, + { + "epoch": 0.78, + "grad_norm": 19.675627993697038, + "learning_rate": 1.2573216692919176e-06, + "loss": 0.3885, + "step": 5146 + }, + { + "epoch": 0.78, + "grad_norm": 7.8680192078513445, + "learning_rate": 1.255702482200274e-06, + "loss": 0.4342, + "step": 5147 + }, + { + "epoch": 0.78, + "grad_norm": 4.959885106418071, + "learning_rate": 1.254084188646713e-06, + "loss": 0.34, + "step": 5148 + }, + { + "epoch": 0.78, + "grad_norm": 23.189940167183142, + "learning_rate": 1.2524667890174264e-06, + "loss": 0.4411, + "step": 5149 + }, + { + "epoch": 0.78, + "grad_norm": 6.612172820829139, + "learning_rate": 1.250850283698387e-06, + "loss": 0.4543, + "step": 5150 + }, + { + "epoch": 0.78, + "grad_norm": 9.925550668017921, + "learning_rate": 1.2492346730753613e-06, + "loss": 0.386, + "step": 5151 + }, + { + "epoch": 0.78, + "grad_norm": 9.26038152259737, + "learning_rate": 1.2476199575338955e-06, + "loss": 0.4198, + "step": 5152 + }, + { + "epoch": 0.78, + "grad_norm": 6.848273321239681, + "learning_rate": 1.246006137459323e-06, + "loss": 0.407, + "step": 5153 + }, + { + "epoch": 0.78, + "grad_norm": 11.15514675183654, + "learning_rate": 1.2443932132367698e-06, + "loss": 0.4604, + "step": 5154 + }, + { + "epoch": 0.78, + "grad_norm": 4.164009496871939, + "learning_rate": 1.2427811852511396e-06, + "loss": 0.3594, + "step": 5155 + }, + { + "epoch": 0.78, + "grad_norm": 7.873862003696068, + "learning_rate": 1.2411700538871291e-06, + "loss": 0.5251, + "step": 5156 + }, + { + "epoch": 0.78, + "grad_norm": 10.445269651283365, + "learning_rate": 1.239559819529218e-06, + "loss": 0.4922, + "step": 5157 + }, + { + "epoch": 0.78, + "grad_norm": 8.748615963388067, + "learning_rate": 1.2379504825616705e-06, + "loss": 0.4075, + "step": 5158 + }, + { + "epoch": 0.78, + "grad_norm": 7.341510109603943, + "learning_rate": 1.2363420433685397e-06, + "loss": 0.4819, + "step": 5159 + }, + { + "epoch": 0.78, + "grad_norm": 1.3017595339647985, + "learning_rate": 1.234734502333661e-06, + "loss": 0.54, + "step": 5160 + }, + { + "epoch": 0.78, + "grad_norm": 4.537145110478899, + "learning_rate": 1.2331278598406603e-06, + "loss": 0.4541, + "step": 5161 + }, + { + "epoch": 0.78, + "grad_norm": 7.117118839961999, + "learning_rate": 1.2315221162729457e-06, + "loss": 0.4469, + "step": 5162 + }, + { + "epoch": 0.78, + "grad_norm": 6.540606619211511, + "learning_rate": 1.2299172720137086e-06, + "loss": 0.3367, + "step": 5163 + }, + { + "epoch": 0.78, + "grad_norm": 16.212249645585914, + "learning_rate": 1.2283133274459331e-06, + "loss": 0.4083, + "step": 5164 + }, + { + "epoch": 0.78, + "grad_norm": 1.1453264676266575, + "learning_rate": 1.2267102829523791e-06, + "loss": 0.541, + "step": 5165 + }, + { + "epoch": 0.78, + "grad_norm": 9.24754173471259, + "learning_rate": 1.2251081389156015e-06, + "loss": 0.3938, + "step": 5166 + }, + { + "epoch": 0.78, + "grad_norm": 8.920056837344895, + "learning_rate": 1.223506895717933e-06, + "loss": 0.4108, + "step": 5167 + }, + { + "epoch": 0.78, + "grad_norm": 10.74138231909456, + "learning_rate": 1.2219065537414942e-06, + "loss": 0.4541, + "step": 5168 + }, + { + "epoch": 0.78, + "grad_norm": 10.493548631984039, + "learning_rate": 1.2203071133681898e-06, + "loss": 0.3815, + "step": 5169 + }, + { + "epoch": 0.78, + "grad_norm": 39.58590795316855, + "learning_rate": 1.2187085749797089e-06, + "loss": 0.4503, + "step": 5170 + }, + { + "epoch": 0.78, + "grad_norm": 13.659835420170028, + "learning_rate": 1.2171109389575293e-06, + "loss": 0.4047, + "step": 5171 + }, + { + "epoch": 0.78, + "grad_norm": 7.260350055767581, + "learning_rate": 1.2155142056829083e-06, + "loss": 0.4899, + "step": 5172 + }, + { + "epoch": 0.78, + "grad_norm": 6.403695723549134, + "learning_rate": 1.2139183755368889e-06, + "loss": 0.3754, + "step": 5173 + }, + { + "epoch": 0.78, + "grad_norm": 13.818665836170146, + "learning_rate": 1.2123234489003015e-06, + "loss": 0.4229, + "step": 5174 + }, + { + "epoch": 0.78, + "grad_norm": 6.346491825637881, + "learning_rate": 1.2107294261537572e-06, + "loss": 0.4204, + "step": 5175 + }, + { + "epoch": 0.78, + "grad_norm": 8.639800028249642, + "learning_rate": 1.2091363076776557e-06, + "loss": 0.3931, + "step": 5176 + }, + { + "epoch": 0.78, + "grad_norm": 8.108770899807388, + "learning_rate": 1.207544093852177e-06, + "loss": 0.4301, + "step": 5177 + }, + { + "epoch": 0.78, + "grad_norm": 18.90795717953795, + "learning_rate": 1.2059527850572843e-06, + "loss": 0.4846, + "step": 5178 + }, + { + "epoch": 0.78, + "grad_norm": 7.544912438320002, + "learning_rate": 1.2043623816727312e-06, + "loss": 0.3919, + "step": 5179 + }, + { + "epoch": 0.78, + "grad_norm": 6.472741349013857, + "learning_rate": 1.2027728840780488e-06, + "loss": 0.3832, + "step": 5180 + }, + { + "epoch": 0.78, + "grad_norm": 8.692862650437506, + "learning_rate": 1.2011842926525546e-06, + "loss": 0.4579, + "step": 5181 + }, + { + "epoch": 0.78, + "grad_norm": 8.07395618371269, + "learning_rate": 1.1995966077753496e-06, + "loss": 0.4485, + "step": 5182 + }, + { + "epoch": 0.78, + "grad_norm": 11.765022936239674, + "learning_rate": 1.1980098298253167e-06, + "loss": 0.3811, + "step": 5183 + }, + { + "epoch": 0.78, + "grad_norm": 21.99215706969242, + "learning_rate": 1.1964239591811277e-06, + "loss": 0.4788, + "step": 5184 + }, + { + "epoch": 0.78, + "grad_norm": 11.826477716857916, + "learning_rate": 1.1948389962212308e-06, + "loss": 0.4322, + "step": 5185 + }, + { + "epoch": 0.78, + "grad_norm": 7.096983445717911, + "learning_rate": 1.1932549413238643e-06, + "loss": 0.4465, + "step": 5186 + }, + { + "epoch": 0.78, + "grad_norm": 6.026818728888764, + "learning_rate": 1.1916717948670449e-06, + "loss": 0.4769, + "step": 5187 + }, + { + "epoch": 0.78, + "grad_norm": 10.27559121547826, + "learning_rate": 1.190089557228573e-06, + "loss": 0.4182, + "step": 5188 + }, + { + "epoch": 0.78, + "grad_norm": 15.101294881154143, + "learning_rate": 1.1885082287860361e-06, + "loss": 0.4571, + "step": 5189 + }, + { + "epoch": 0.78, + "grad_norm": 10.248892158745734, + "learning_rate": 1.1869278099167997e-06, + "loss": 0.4346, + "step": 5190 + }, + { + "epoch": 0.78, + "grad_norm": 4.91246491225219, + "learning_rate": 1.1853483009980166e-06, + "loss": 0.4192, + "step": 5191 + }, + { + "epoch": 0.78, + "grad_norm": 12.274843248254083, + "learning_rate": 1.1837697024066192e-06, + "loss": 0.4277, + "step": 5192 + }, + { + "epoch": 0.78, + "grad_norm": 29.499369614079253, + "learning_rate": 1.1821920145193232e-06, + "loss": 0.4382, + "step": 5193 + }, + { + "epoch": 0.78, + "grad_norm": 1.2177193721271373, + "learning_rate": 1.180615237712629e-06, + "loss": 0.5203, + "step": 5194 + }, + { + "epoch": 0.78, + "grad_norm": 7.000334475433054, + "learning_rate": 1.1790393723628157e-06, + "loss": 0.4305, + "step": 5195 + }, + { + "epoch": 0.78, + "grad_norm": 1.2452708378447501, + "learning_rate": 1.17746441884595e-06, + "loss": 0.5829, + "step": 5196 + }, + { + "epoch": 0.78, + "grad_norm": 12.920142900737668, + "learning_rate": 1.1758903775378777e-06, + "loss": 0.4838, + "step": 5197 + }, + { + "epoch": 0.78, + "grad_norm": 6.337086902962379, + "learning_rate": 1.1743172488142257e-06, + "loss": 0.4407, + "step": 5198 + }, + { + "epoch": 0.78, + "grad_norm": 9.083253470462495, + "learning_rate": 1.1727450330504082e-06, + "loss": 0.371, + "step": 5199 + }, + { + "epoch": 0.78, + "grad_norm": 14.57793039208313, + "learning_rate": 1.1711737306216147e-06, + "loss": 0.4439, + "step": 5200 + }, + { + "epoch": 0.78, + "grad_norm": 5.370808857700116, + "learning_rate": 1.1696033419028242e-06, + "loss": 0.4751, + "step": 5201 + }, + { + "epoch": 0.78, + "grad_norm": 9.85052341898441, + "learning_rate": 1.1680338672687913e-06, + "loss": 0.4178, + "step": 5202 + }, + { + "epoch": 0.78, + "grad_norm": 6.62725604523496, + "learning_rate": 1.1664653070940562e-06, + "loss": 0.4499, + "step": 5203 + }, + { + "epoch": 0.78, + "grad_norm": 11.785066592062842, + "learning_rate": 1.1648976617529383e-06, + "loss": 0.459, + "step": 5204 + }, + { + "epoch": 0.79, + "grad_norm": 7.139168944944306, + "learning_rate": 1.1633309316195402e-06, + "loss": 0.4385, + "step": 5205 + }, + { + "epoch": 0.79, + "grad_norm": 11.987025374891225, + "learning_rate": 1.1617651170677474e-06, + "loss": 0.3551, + "step": 5206 + }, + { + "epoch": 0.79, + "grad_norm": 6.091033276487138, + "learning_rate": 1.1602002184712242e-06, + "loss": 0.4036, + "step": 5207 + }, + { + "epoch": 0.79, + "grad_norm": 7.028489284680846, + "learning_rate": 1.158636236203417e-06, + "loss": 0.4398, + "step": 5208 + }, + { + "epoch": 0.79, + "grad_norm": 7.596042492868288, + "learning_rate": 1.157073170637556e-06, + "loss": 0.4539, + "step": 5209 + }, + { + "epoch": 0.79, + "grad_norm": 12.258058163483097, + "learning_rate": 1.1555110221466487e-06, + "loss": 0.4554, + "step": 5210 + }, + { + "epoch": 0.79, + "grad_norm": 9.713158450557584, + "learning_rate": 1.1539497911034875e-06, + "loss": 0.3848, + "step": 5211 + }, + { + "epoch": 0.79, + "grad_norm": 6.268720137388348, + "learning_rate": 1.1523894778806439e-06, + "loss": 0.4247, + "step": 5212 + }, + { + "epoch": 0.79, + "grad_norm": 6.767642548762897, + "learning_rate": 1.1508300828504682e-06, + "loss": 0.4178, + "step": 5213 + }, + { + "epoch": 0.79, + "grad_norm": 9.216853147507242, + "learning_rate": 1.1492716063850973e-06, + "loss": 0.4653, + "step": 5214 + }, + { + "epoch": 0.79, + "grad_norm": 19.075985686223902, + "learning_rate": 1.1477140488564443e-06, + "loss": 0.4205, + "step": 5215 + }, + { + "epoch": 0.79, + "grad_norm": 21.48094994487318, + "learning_rate": 1.1461574106362043e-06, + "loss": 0.3891, + "step": 5216 + }, + { + "epoch": 0.79, + "grad_norm": 9.502345181472593, + "learning_rate": 1.1446016920958524e-06, + "loss": 0.4421, + "step": 5217 + }, + { + "epoch": 0.79, + "grad_norm": 17.610433753667458, + "learning_rate": 1.1430468936066442e-06, + "loss": 0.4814, + "step": 5218 + }, + { + "epoch": 0.79, + "grad_norm": 1.1727431148548835, + "learning_rate": 1.1414930155396186e-06, + "loss": 0.492, + "step": 5219 + }, + { + "epoch": 0.79, + "grad_norm": 12.525349032783764, + "learning_rate": 1.1399400582655911e-06, + "loss": 0.4542, + "step": 5220 + }, + { + "epoch": 0.79, + "grad_norm": 6.544124345077458, + "learning_rate": 1.138388022155158e-06, + "loss": 0.4135, + "step": 5221 + }, + { + "epoch": 0.79, + "grad_norm": 7.973888666074085, + "learning_rate": 1.1368369075786995e-06, + "loss": 0.3604, + "step": 5222 + }, + { + "epoch": 0.79, + "grad_norm": 16.320872691536408, + "learning_rate": 1.1352867149063706e-06, + "loss": 0.4852, + "step": 5223 + }, + { + "epoch": 0.79, + "grad_norm": 12.007480316573227, + "learning_rate": 1.1337374445081106e-06, + "loss": 0.5231, + "step": 5224 + }, + { + "epoch": 0.79, + "grad_norm": 5.569547556532935, + "learning_rate": 1.132189096753637e-06, + "loss": 0.4011, + "step": 5225 + }, + { + "epoch": 0.79, + "grad_norm": 9.438710607587812, + "learning_rate": 1.1306416720124459e-06, + "loss": 0.4146, + "step": 5226 + }, + { + "epoch": 0.79, + "grad_norm": 10.600343731643779, + "learning_rate": 1.1290951706538145e-06, + "loss": 0.409, + "step": 5227 + }, + { + "epoch": 0.79, + "grad_norm": 9.926458272949903, + "learning_rate": 1.1275495930467988e-06, + "loss": 0.397, + "step": 5228 + }, + { + "epoch": 0.79, + "grad_norm": 5.639140339002207, + "learning_rate": 1.1260049395602368e-06, + "loss": 0.4193, + "step": 5229 + }, + { + "epoch": 0.79, + "grad_norm": 14.94909099458609, + "learning_rate": 1.1244612105627434e-06, + "loss": 0.4862, + "step": 5230 + }, + { + "epoch": 0.79, + "grad_norm": 8.995619107407077, + "learning_rate": 1.1229184064227117e-06, + "loss": 0.475, + "step": 5231 + }, + { + "epoch": 0.79, + "grad_norm": 14.3032674304945, + "learning_rate": 1.1213765275083193e-06, + "loss": 0.4597, + "step": 5232 + }, + { + "epoch": 0.79, + "grad_norm": 5.039345703647712, + "learning_rate": 1.1198355741875171e-06, + "loss": 0.443, + "step": 5233 + }, + { + "epoch": 0.79, + "grad_norm": 1.3641022482825955, + "learning_rate": 1.1182955468280404e-06, + "loss": 0.4717, + "step": 5234 + }, + { + "epoch": 0.79, + "grad_norm": 20.235760096401627, + "learning_rate": 1.1167564457973989e-06, + "loss": 0.3645, + "step": 5235 + }, + { + "epoch": 0.79, + "grad_norm": 6.450642579799381, + "learning_rate": 1.1152182714628828e-06, + "loss": 0.4189, + "step": 5236 + }, + { + "epoch": 0.79, + "grad_norm": 19.491593805563863, + "learning_rate": 1.1136810241915635e-06, + "loss": 0.4447, + "step": 5237 + }, + { + "epoch": 0.79, + "grad_norm": 20.41090531956967, + "learning_rate": 1.112144704350288e-06, + "loss": 0.4328, + "step": 5238 + }, + { + "epoch": 0.79, + "grad_norm": 6.505361119913244, + "learning_rate": 1.1106093123056837e-06, + "loss": 0.4498, + "step": 5239 + }, + { + "epoch": 0.79, + "grad_norm": 1.1849355129116315, + "learning_rate": 1.109074848424156e-06, + "loss": 0.526, + "step": 5240 + }, + { + "epoch": 0.79, + "grad_norm": 5.862573424603288, + "learning_rate": 1.1075413130718877e-06, + "loss": 0.4209, + "step": 5241 + }, + { + "epoch": 0.79, + "grad_norm": 12.907877673102453, + "learning_rate": 1.106008706614843e-06, + "loss": 0.488, + "step": 5242 + }, + { + "epoch": 0.79, + "grad_norm": 8.700250926595706, + "learning_rate": 1.1044770294187612e-06, + "loss": 0.3887, + "step": 5243 + }, + { + "epoch": 0.79, + "grad_norm": 15.731433854698702, + "learning_rate": 1.1029462818491632e-06, + "loss": 0.3796, + "step": 5244 + }, + { + "epoch": 0.79, + "grad_norm": 6.050652082431767, + "learning_rate": 1.1014164642713448e-06, + "loss": 0.3696, + "step": 5245 + }, + { + "epoch": 0.79, + "grad_norm": 7.80600062796878, + "learning_rate": 1.0998875770503802e-06, + "loss": 0.4105, + "step": 5246 + }, + { + "epoch": 0.79, + "grad_norm": 16.827187558745475, + "learning_rate": 1.0983596205511254e-06, + "loss": 0.434, + "step": 5247 + }, + { + "epoch": 0.79, + "grad_norm": 10.94698906524837, + "learning_rate": 1.0968325951382087e-06, + "loss": 0.4323, + "step": 5248 + }, + { + "epoch": 0.79, + "grad_norm": 6.989165068700943, + "learning_rate": 1.0953065011760417e-06, + "loss": 0.3795, + "step": 5249 + }, + { + "epoch": 0.79, + "grad_norm": 9.315884966656627, + "learning_rate": 1.0937813390288093e-06, + "loss": 0.5044, + "step": 5250 + }, + { + "epoch": 0.79, + "grad_norm": 19.3752936298468, + "learning_rate": 1.092257109060476e-06, + "loss": 0.4636, + "step": 5251 + }, + { + "epoch": 0.79, + "grad_norm": 4.742785348959713, + "learning_rate": 1.0907338116347842e-06, + "loss": 0.4063, + "step": 5252 + }, + { + "epoch": 0.79, + "grad_norm": 8.197864785968546, + "learning_rate": 1.089211447115251e-06, + "loss": 0.4279, + "step": 5253 + }, + { + "epoch": 0.79, + "grad_norm": 42.55599065405896, + "learning_rate": 1.087690015865176e-06, + "loss": 0.4398, + "step": 5254 + }, + { + "epoch": 0.79, + "grad_norm": 17.435792625543357, + "learning_rate": 1.0861695182476318e-06, + "loss": 0.4745, + "step": 5255 + }, + { + "epoch": 0.79, + "grad_norm": 8.294964717592858, + "learning_rate": 1.0846499546254685e-06, + "loss": 0.4321, + "step": 5256 + }, + { + "epoch": 0.79, + "grad_norm": 9.080013680961672, + "learning_rate": 1.0831313253613162e-06, + "loss": 0.4354, + "step": 5257 + }, + { + "epoch": 0.79, + "grad_norm": 8.34560421743313, + "learning_rate": 1.0816136308175778e-06, + "loss": 0.445, + "step": 5258 + }, + { + "epoch": 0.79, + "grad_norm": 6.427217639397913, + "learning_rate": 1.0800968713564387e-06, + "loss": 0.4083, + "step": 5259 + }, + { + "epoch": 0.79, + "grad_norm": 6.464687147707914, + "learning_rate": 1.0785810473398562e-06, + "loss": 0.4389, + "step": 5260 + }, + { + "epoch": 0.79, + "grad_norm": 9.30510846547853, + "learning_rate": 1.0770661591295662e-06, + "loss": 0.4677, + "step": 5261 + }, + { + "epoch": 0.79, + "grad_norm": 7.060799733776654, + "learning_rate": 1.0755522070870806e-06, + "loss": 0.4414, + "step": 5262 + }, + { + "epoch": 0.79, + "grad_norm": 6.122451529670742, + "learning_rate": 1.074039191573688e-06, + "loss": 0.4667, + "step": 5263 + }, + { + "epoch": 0.79, + "grad_norm": 9.790589408615965, + "learning_rate": 1.072527112950456e-06, + "loss": 0.4312, + "step": 5264 + }, + { + "epoch": 0.79, + "grad_norm": 9.188513790568354, + "learning_rate": 1.071015971578226e-06, + "loss": 0.4594, + "step": 5265 + }, + { + "epoch": 0.79, + "grad_norm": 9.293406749758283, + "learning_rate": 1.0695057678176146e-06, + "loss": 0.4654, + "step": 5266 + }, + { + "epoch": 0.79, + "grad_norm": 6.401803872182206, + "learning_rate": 1.067996502029019e-06, + "loss": 0.3949, + "step": 5267 + }, + { + "epoch": 0.79, + "grad_norm": 14.206695496936929, + "learning_rate": 1.066488174572607e-06, + "loss": 0.4492, + "step": 5268 + }, + { + "epoch": 0.79, + "grad_norm": 11.086724535628768, + "learning_rate": 1.0649807858083289e-06, + "loss": 0.3833, + "step": 5269 + }, + { + "epoch": 0.79, + "grad_norm": 9.2338792586553, + "learning_rate": 1.0634743360959054e-06, + "loss": 0.4675, + "step": 5270 + }, + { + "epoch": 0.8, + "grad_norm": 14.200366538782356, + "learning_rate": 1.0619688257948342e-06, + "loss": 0.4065, + "step": 5271 + }, + { + "epoch": 0.8, + "grad_norm": 12.759278601939412, + "learning_rate": 1.0604642552643923e-06, + "loss": 0.3823, + "step": 5272 + }, + { + "epoch": 0.8, + "grad_norm": 8.41425315419526, + "learning_rate": 1.0589606248636291e-06, + "loss": 0.4213, + "step": 5273 + }, + { + "epoch": 0.8, + "grad_norm": 10.556358613651867, + "learning_rate": 1.0574579349513708e-06, + "loss": 0.4393, + "step": 5274 + }, + { + "epoch": 0.8, + "grad_norm": 12.935282220300795, + "learning_rate": 1.0559561858862177e-06, + "loss": 0.4972, + "step": 5275 + }, + { + "epoch": 0.8, + "grad_norm": 10.833717266170387, + "learning_rate": 1.0544553780265466e-06, + "loss": 0.3769, + "step": 5276 + }, + { + "epoch": 0.8, + "grad_norm": 13.544550508415306, + "learning_rate": 1.0529555117305117e-06, + "loss": 0.3664, + "step": 5277 + }, + { + "epoch": 0.8, + "grad_norm": 33.131326730641916, + "learning_rate": 1.0514565873560384e-06, + "loss": 0.4159, + "step": 5278 + }, + { + "epoch": 0.8, + "grad_norm": 11.837387050213831, + "learning_rate": 1.0499586052608323e-06, + "loss": 0.3898, + "step": 5279 + }, + { + "epoch": 0.8, + "grad_norm": 8.371320740145254, + "learning_rate": 1.04846156580237e-06, + "loss": 0.4659, + "step": 5280 + }, + { + "epoch": 0.8, + "grad_norm": 8.046284978154636, + "learning_rate": 1.0469654693379027e-06, + "loss": 0.3309, + "step": 5281 + }, + { + "epoch": 0.8, + "grad_norm": 7.520986960420434, + "learning_rate": 1.0454703162244623e-06, + "loss": 0.3909, + "step": 5282 + }, + { + "epoch": 0.8, + "grad_norm": 7.199677140028825, + "learning_rate": 1.0439761068188482e-06, + "loss": 0.4164, + "step": 5283 + }, + { + "epoch": 0.8, + "grad_norm": 8.653736334657397, + "learning_rate": 1.0424828414776405e-06, + "loss": 0.4403, + "step": 5284 + }, + { + "epoch": 0.8, + "grad_norm": 6.427586901550289, + "learning_rate": 1.0409905205571914e-06, + "loss": 0.4399, + "step": 5285 + }, + { + "epoch": 0.8, + "grad_norm": 6.473819357641133, + "learning_rate": 1.0394991444136265e-06, + "loss": 0.3068, + "step": 5286 + }, + { + "epoch": 0.8, + "grad_norm": 9.3930709411111, + "learning_rate": 1.0380087134028482e-06, + "loss": 0.4176, + "step": 5287 + }, + { + "epoch": 0.8, + "grad_norm": 7.1545566685279125, + "learning_rate": 1.0365192278805313e-06, + "loss": 0.3902, + "step": 5288 + }, + { + "epoch": 0.8, + "grad_norm": 8.234105407720222, + "learning_rate": 1.0350306882021287e-06, + "loss": 0.4429, + "step": 5289 + }, + { + "epoch": 0.8, + "grad_norm": 9.771297465224404, + "learning_rate": 1.0335430947228642e-06, + "loss": 0.4064, + "step": 5290 + }, + { + "epoch": 0.8, + "grad_norm": 12.936314675266622, + "learning_rate": 1.032056447797734e-06, + "loss": 0.3565, + "step": 5291 + }, + { + "epoch": 0.8, + "grad_norm": 10.408937629863596, + "learning_rate": 1.0305707477815152e-06, + "loss": 0.3856, + "step": 5292 + }, + { + "epoch": 0.8, + "grad_norm": 15.813999713890535, + "learning_rate": 1.0290859950287512e-06, + "loss": 0.4508, + "step": 5293 + }, + { + "epoch": 0.8, + "grad_norm": 8.966289492204469, + "learning_rate": 1.027602189893766e-06, + "loss": 0.4801, + "step": 5294 + }, + { + "epoch": 0.8, + "grad_norm": 15.975350251276435, + "learning_rate": 1.0261193327306534e-06, + "loss": 0.3629, + "step": 5295 + }, + { + "epoch": 0.8, + "grad_norm": 19.66338686977912, + "learning_rate": 1.0246374238932816e-06, + "loss": 0.4264, + "step": 5296 + }, + { + "epoch": 0.8, + "grad_norm": 8.153297592693, + "learning_rate": 1.0231564637352931e-06, + "loss": 0.4036, + "step": 5297 + }, + { + "epoch": 0.8, + "grad_norm": 4.814403688561901, + "learning_rate": 1.021676452610102e-06, + "loss": 0.4095, + "step": 5298 + }, + { + "epoch": 0.8, + "grad_norm": 15.548988967711908, + "learning_rate": 1.0201973908709011e-06, + "loss": 0.4349, + "step": 5299 + }, + { + "epoch": 0.8, + "grad_norm": 20.660775469645763, + "learning_rate": 1.0187192788706518e-06, + "loss": 0.3837, + "step": 5300 + }, + { + "epoch": 0.8, + "grad_norm": 13.31224617058704, + "learning_rate": 1.017242116962089e-06, + "loss": 0.4714, + "step": 5301 + }, + { + "epoch": 0.8, + "grad_norm": 12.529008240697914, + "learning_rate": 1.0157659054977237e-06, + "loss": 0.3807, + "step": 5302 + }, + { + "epoch": 0.8, + "grad_norm": 9.68972179040758, + "learning_rate": 1.0142906448298379e-06, + "loss": 0.4383, + "step": 5303 + }, + { + "epoch": 0.8, + "grad_norm": 9.764145934663592, + "learning_rate": 1.0128163353104885e-06, + "loss": 0.4468, + "step": 5304 + }, + { + "epoch": 0.8, + "grad_norm": 14.013593383993157, + "learning_rate": 1.0113429772915035e-06, + "loss": 0.4578, + "step": 5305 + }, + { + "epoch": 0.8, + "grad_norm": 52.972378245581666, + "learning_rate": 1.0098705711244833e-06, + "loss": 0.4233, + "step": 5306 + }, + { + "epoch": 0.8, + "grad_norm": 8.003729375059885, + "learning_rate": 1.0083991171608054e-06, + "loss": 0.443, + "step": 5307 + }, + { + "epoch": 0.8, + "grad_norm": 9.2713120334391, + "learning_rate": 1.0069286157516155e-06, + "loss": 0.4061, + "step": 5308 + }, + { + "epoch": 0.8, + "grad_norm": 10.6198649402066, + "learning_rate": 1.0054590672478338e-06, + "loss": 0.4388, + "step": 5309 + }, + { + "epoch": 0.8, + "grad_norm": 12.993781945864539, + "learning_rate": 1.0039904720001525e-06, + "loss": 0.4329, + "step": 5310 + }, + { + "epoch": 0.8, + "grad_norm": 14.130819494909543, + "learning_rate": 1.0025228303590363e-06, + "loss": 0.395, + "step": 5311 + }, + { + "epoch": 0.8, + "grad_norm": 6.844447558354887, + "learning_rate": 1.0010561426747246e-06, + "loss": 0.4606, + "step": 5312 + }, + { + "epoch": 0.8, + "grad_norm": 7.733595716993202, + "learning_rate": 9.995904092972247e-07, + "loss": 0.4685, + "step": 5313 + }, + { + "epoch": 0.8, + "grad_norm": 8.073245529057157, + "learning_rate": 9.98125630576322e-07, + "loss": 0.4309, + "step": 5314 + }, + { + "epoch": 0.8, + "grad_norm": 13.480674402508539, + "learning_rate": 9.966618068615692e-07, + "loss": 0.4629, + "step": 5315 + }, + { + "epoch": 0.8, + "grad_norm": 7.050572746636589, + "learning_rate": 9.951989385022914e-07, + "loss": 0.4178, + "step": 5316 + }, + { + "epoch": 0.8, + "grad_norm": 4.916632033503475, + "learning_rate": 9.937370258475897e-07, + "loss": 0.3576, + "step": 5317 + }, + { + "epoch": 0.8, + "grad_norm": 46.572830132848544, + "learning_rate": 9.922760692463323e-07, + "loss": 0.4319, + "step": 5318 + }, + { + "epoch": 0.8, + "grad_norm": 7.253967507060426, + "learning_rate": 9.908160690471641e-07, + "loss": 0.3997, + "step": 5319 + }, + { + "epoch": 0.8, + "grad_norm": 10.20017756424348, + "learning_rate": 9.893570255984968e-07, + "loss": 0.4085, + "step": 5320 + }, + { + "epoch": 0.8, + "grad_norm": 13.998402059686661, + "learning_rate": 9.878989392485173e-07, + "loss": 0.4871, + "step": 5321 + }, + { + "epoch": 0.8, + "grad_norm": 12.1145248316699, + "learning_rate": 9.86441810345183e-07, + "loss": 0.4949, + "step": 5322 + }, + { + "epoch": 0.8, + "grad_norm": 6.155262799365879, + "learning_rate": 9.849856392362201e-07, + "loss": 0.4464, + "step": 5323 + }, + { + "epoch": 0.8, + "grad_norm": 7.3441905632034965, + "learning_rate": 9.83530426269132e-07, + "loss": 0.4947, + "step": 5324 + }, + { + "epoch": 0.8, + "grad_norm": 28.563408218828048, + "learning_rate": 9.820761717911897e-07, + "loss": 0.4785, + "step": 5325 + }, + { + "epoch": 0.8, + "grad_norm": 9.342751265067179, + "learning_rate": 9.806228761494341e-07, + "loss": 0.4825, + "step": 5326 + }, + { + "epoch": 0.8, + "grad_norm": 49.722750149805314, + "learning_rate": 9.79170539690682e-07, + "loss": 0.4701, + "step": 5327 + }, + { + "epoch": 0.8, + "grad_norm": 7.53220764843252, + "learning_rate": 9.777191627615163e-07, + "loss": 0.4701, + "step": 5328 + }, + { + "epoch": 0.8, + "grad_norm": 78.49228166441343, + "learning_rate": 9.762687457082954e-07, + "loss": 0.4688, + "step": 5329 + }, + { + "epoch": 0.8, + "grad_norm": 29.672214054558914, + "learning_rate": 9.748192888771452e-07, + "loss": 0.407, + "step": 5330 + }, + { + "epoch": 0.8, + "grad_norm": 1.1171786664690846, + "learning_rate": 9.733707926139636e-07, + "loss": 0.5409, + "step": 5331 + }, + { + "epoch": 0.8, + "grad_norm": 9.807406188305423, + "learning_rate": 9.719232572644189e-07, + "loss": 0.3971, + "step": 5332 + }, + { + "epoch": 0.8, + "grad_norm": 13.134096030758567, + "learning_rate": 9.704766831739514e-07, + "loss": 0.4528, + "step": 5333 + }, + { + "epoch": 0.8, + "grad_norm": 10.771346155826658, + "learning_rate": 9.690310706877698e-07, + "loss": 0.4757, + "step": 5334 + }, + { + "epoch": 0.8, + "grad_norm": 11.64095514484084, + "learning_rate": 9.67586420150856e-07, + "loss": 0.4665, + "step": 5335 + }, + { + "epoch": 0.8, + "grad_norm": 7.841373354329896, + "learning_rate": 9.661427319079603e-07, + "loss": 0.4369, + "step": 5336 + }, + { + "epoch": 0.8, + "grad_norm": 23.246526163948307, + "learning_rate": 9.647000063036048e-07, + "loss": 0.4107, + "step": 5337 + }, + { + "epoch": 0.81, + "grad_norm": 14.568157543815765, + "learning_rate": 9.63258243682081e-07, + "loss": 0.4377, + "step": 5338 + }, + { + "epoch": 0.81, + "grad_norm": 5.636343147116894, + "learning_rate": 9.61817444387449e-07, + "loss": 0.4075, + "step": 5339 + }, + { + "epoch": 0.81, + "grad_norm": 8.27642767753732, + "learning_rate": 9.603776087635435e-07, + "loss": 0.4763, + "step": 5340 + }, + { + "epoch": 0.81, + "grad_norm": 9.859837587409997, + "learning_rate": 9.589387371539638e-07, + "loss": 0.5496, + "step": 5341 + }, + { + "epoch": 0.81, + "grad_norm": 14.78797014793003, + "learning_rate": 9.575008299020838e-07, + "loss": 0.4167, + "step": 5342 + }, + { + "epoch": 0.81, + "grad_norm": 5.809449159255145, + "learning_rate": 9.560638873510452e-07, + "loss": 0.4425, + "step": 5343 + }, + { + "epoch": 0.81, + "grad_norm": 6.327736070113407, + "learning_rate": 9.546279098437584e-07, + "loss": 0.4135, + "step": 5344 + }, + { + "epoch": 0.81, + "grad_norm": 6.317090082233126, + "learning_rate": 9.531928977229055e-07, + "loss": 0.3851, + "step": 5345 + }, + { + "epoch": 0.81, + "grad_norm": 7.298722133531391, + "learning_rate": 9.517588513309356e-07, + "loss": 0.4234, + "step": 5346 + }, + { + "epoch": 0.81, + "grad_norm": 141.77190223924276, + "learning_rate": 9.503257710100717e-07, + "loss": 0.5149, + "step": 5347 + }, + { + "epoch": 0.81, + "grad_norm": 11.4353560413999, + "learning_rate": 9.488936571023022e-07, + "loss": 0.4062, + "step": 5348 + }, + { + "epoch": 0.81, + "grad_norm": 8.947462834547668, + "learning_rate": 9.474625099493862e-07, + "loss": 0.4298, + "step": 5349 + }, + { + "epoch": 0.81, + "grad_norm": 10.987739151364714, + "learning_rate": 9.460323298928531e-07, + "loss": 0.4521, + "step": 5350 + }, + { + "epoch": 0.81, + "grad_norm": 26.889159092654957, + "learning_rate": 9.446031172739995e-07, + "loss": 0.4861, + "step": 5351 + }, + { + "epoch": 0.81, + "grad_norm": 19.90265312681175, + "learning_rate": 9.431748724338946e-07, + "loss": 0.4991, + "step": 5352 + }, + { + "epoch": 0.81, + "grad_norm": 7.186931072508379, + "learning_rate": 9.417475957133726e-07, + "loss": 0.347, + "step": 5353 + }, + { + "epoch": 0.81, + "grad_norm": 8.623916292344132, + "learning_rate": 9.403212874530382e-07, + "loss": 0.4723, + "step": 5354 + }, + { + "epoch": 0.81, + "grad_norm": 8.473699289582015, + "learning_rate": 9.388959479932658e-07, + "loss": 0.3714, + "step": 5355 + }, + { + "epoch": 0.81, + "grad_norm": 8.991104637246604, + "learning_rate": 9.374715776741967e-07, + "loss": 0.4417, + "step": 5356 + }, + { + "epoch": 0.81, + "grad_norm": 18.34837033996337, + "learning_rate": 9.360481768357449e-07, + "loss": 0.4074, + "step": 5357 + }, + { + "epoch": 0.81, + "grad_norm": 9.168149168833336, + "learning_rate": 9.346257458175884e-07, + "loss": 0.4204, + "step": 5358 + }, + { + "epoch": 0.81, + "grad_norm": 16.718114171699025, + "learning_rate": 9.332042849591749e-07, + "loss": 0.4634, + "step": 5359 + }, + { + "epoch": 0.81, + "grad_norm": 8.54269076672843, + "learning_rate": 9.317837945997238e-07, + "loss": 0.3795, + "step": 5360 + }, + { + "epoch": 0.81, + "grad_norm": 12.963376025408735, + "learning_rate": 9.303642750782177e-07, + "loss": 0.3941, + "step": 5361 + }, + { + "epoch": 0.81, + "grad_norm": 8.727492164126, + "learning_rate": 9.28945726733414e-07, + "loss": 0.4527, + "step": 5362 + }, + { + "epoch": 0.81, + "grad_norm": 17.435572675800458, + "learning_rate": 9.275281499038325e-07, + "loss": 0.4318, + "step": 5363 + }, + { + "epoch": 0.81, + "grad_norm": 7.354787508507699, + "learning_rate": 9.261115449277619e-07, + "loss": 0.4289, + "step": 5364 + }, + { + "epoch": 0.81, + "grad_norm": 16.05186992055341, + "learning_rate": 9.24695912143263e-07, + "loss": 0.5115, + "step": 5365 + }, + { + "epoch": 0.81, + "grad_norm": 7.906203933418942, + "learning_rate": 9.232812518881618e-07, + "loss": 0.4823, + "step": 5366 + }, + { + "epoch": 0.81, + "grad_norm": 18.532784926667624, + "learning_rate": 9.218675645000508e-07, + "loss": 0.3802, + "step": 5367 + }, + { + "epoch": 0.81, + "grad_norm": 13.150237916566377, + "learning_rate": 9.20454850316293e-07, + "loss": 0.455, + "step": 5368 + }, + { + "epoch": 0.81, + "grad_norm": 11.821623203235147, + "learning_rate": 9.190431096740172e-07, + "loss": 0.3826, + "step": 5369 + }, + { + "epoch": 0.81, + "grad_norm": 57.15329305542216, + "learning_rate": 9.176323429101219e-07, + "loss": 0.3859, + "step": 5370 + }, + { + "epoch": 0.81, + "grad_norm": 10.500499509022175, + "learning_rate": 9.162225503612704e-07, + "loss": 0.4201, + "step": 5371 + }, + { + "epoch": 0.81, + "grad_norm": 11.458023785866128, + "learning_rate": 9.148137323638978e-07, + "loss": 0.3961, + "step": 5372 + }, + { + "epoch": 0.81, + "grad_norm": 19.575331499971533, + "learning_rate": 9.13405889254203e-07, + "loss": 0.4376, + "step": 5373 + }, + { + "epoch": 0.81, + "grad_norm": 14.570755550071617, + "learning_rate": 9.119990213681512e-07, + "loss": 0.4522, + "step": 5374 + }, + { + "epoch": 0.81, + "grad_norm": 26.613532275491732, + "learning_rate": 9.105931290414799e-07, + "loss": 0.4517, + "step": 5375 + }, + { + "epoch": 0.81, + "grad_norm": 23.17720518315366, + "learning_rate": 9.091882126096885e-07, + "loss": 0.4643, + "step": 5376 + }, + { + "epoch": 0.81, + "grad_norm": 5.807516051988936, + "learning_rate": 9.077842724080482e-07, + "loss": 0.4292, + "step": 5377 + }, + { + "epoch": 0.81, + "grad_norm": 6.669734192521455, + "learning_rate": 9.063813087715933e-07, + "loss": 0.4309, + "step": 5378 + }, + { + "epoch": 0.81, + "grad_norm": 5.961110951705839, + "learning_rate": 9.049793220351272e-07, + "loss": 0.4317, + "step": 5379 + }, + { + "epoch": 0.81, + "grad_norm": 10.45197552929036, + "learning_rate": 9.035783125332199e-07, + "loss": 0.3876, + "step": 5380 + }, + { + "epoch": 0.81, + "grad_norm": 11.301394583195993, + "learning_rate": 9.021782806002055e-07, + "loss": 0.4791, + "step": 5381 + }, + { + "epoch": 0.81, + "grad_norm": 11.663360197406467, + "learning_rate": 9.007792265701903e-07, + "loss": 0.3954, + "step": 5382 + }, + { + "epoch": 0.81, + "grad_norm": 8.999743232797956, + "learning_rate": 8.993811507770428e-07, + "loss": 0.4135, + "step": 5383 + }, + { + "epoch": 0.81, + "grad_norm": 9.125528599395487, + "learning_rate": 8.979840535543988e-07, + "loss": 0.4115, + "step": 5384 + }, + { + "epoch": 0.81, + "grad_norm": 14.42854521659178, + "learning_rate": 8.965879352356632e-07, + "loss": 0.4148, + "step": 5385 + }, + { + "epoch": 0.81, + "grad_norm": 23.552832492318323, + "learning_rate": 8.951927961540025e-07, + "loss": 0.4123, + "step": 5386 + }, + { + "epoch": 0.81, + "grad_norm": 7.359465691146914, + "learning_rate": 8.937986366423551e-07, + "loss": 0.4304, + "step": 5387 + }, + { + "epoch": 0.81, + "grad_norm": 8.943569126365619, + "learning_rate": 8.924054570334212e-07, + "loss": 0.4343, + "step": 5388 + }, + { + "epoch": 0.81, + "grad_norm": 11.692673723055693, + "learning_rate": 8.910132576596697e-07, + "loss": 0.4283, + "step": 5389 + }, + { + "epoch": 0.81, + "grad_norm": 10.886952991639738, + "learning_rate": 8.896220388533338e-07, + "loss": 0.4901, + "step": 5390 + }, + { + "epoch": 0.81, + "grad_norm": 7.149600853929758, + "learning_rate": 8.882318009464124e-07, + "loss": 0.4219, + "step": 5391 + }, + { + "epoch": 0.81, + "grad_norm": 1.0314381093565235, + "learning_rate": 8.868425442706747e-07, + "loss": 0.49, + "step": 5392 + }, + { + "epoch": 0.81, + "grad_norm": 11.238530004192857, + "learning_rate": 8.8545426915765e-07, + "loss": 0.4258, + "step": 5393 + }, + { + "epoch": 0.81, + "grad_norm": 15.31924123839077, + "learning_rate": 8.840669759386355e-07, + "loss": 0.4736, + "step": 5394 + }, + { + "epoch": 0.81, + "grad_norm": 7.941482194418099, + "learning_rate": 8.826806649446973e-07, + "loss": 0.4757, + "step": 5395 + }, + { + "epoch": 0.81, + "grad_norm": 13.217421501621535, + "learning_rate": 8.812953365066607e-07, + "loss": 0.4531, + "step": 5396 + }, + { + "epoch": 0.81, + "grad_norm": 20.988005390500383, + "learning_rate": 8.799109909551234e-07, + "loss": 0.4171, + "step": 5397 + }, + { + "epoch": 0.81, + "grad_norm": 5.609200048094525, + "learning_rate": 8.785276286204436e-07, + "loss": 0.4233, + "step": 5398 + }, + { + "epoch": 0.81, + "grad_norm": 19.681995742839995, + "learning_rate": 8.771452498327454e-07, + "loss": 0.3887, + "step": 5399 + }, + { + "epoch": 0.81, + "grad_norm": 6.540920297998119, + "learning_rate": 8.757638549219216e-07, + "loss": 0.3774, + "step": 5400 + }, + { + "epoch": 0.81, + "grad_norm": 11.855411721353846, + "learning_rate": 8.743834442176269e-07, + "loss": 0.3917, + "step": 5401 + }, + { + "epoch": 0.81, + "grad_norm": 12.478983553372718, + "learning_rate": 8.730040180492822e-07, + "loss": 0.4504, + "step": 5402 + }, + { + "epoch": 0.81, + "grad_norm": 9.894161809961167, + "learning_rate": 8.716255767460729e-07, + "loss": 0.5123, + "step": 5403 + }, + { + "epoch": 0.82, + "grad_norm": 1.0773921881952275, + "learning_rate": 8.702481206369485e-07, + "loss": 0.5369, + "step": 5404 + }, + { + "epoch": 0.82, + "grad_norm": 9.460680299698193, + "learning_rate": 8.688716500506278e-07, + "loss": 0.4928, + "step": 5405 + }, + { + "epoch": 0.82, + "grad_norm": 14.941661007730284, + "learning_rate": 8.674961653155889e-07, + "loss": 0.4299, + "step": 5406 + }, + { + "epoch": 0.82, + "grad_norm": 12.470646639276142, + "learning_rate": 8.661216667600785e-07, + "loss": 0.4447, + "step": 5407 + }, + { + "epoch": 0.82, + "grad_norm": 7.8558013261313215, + "learning_rate": 8.647481547121067e-07, + "loss": 0.3873, + "step": 5408 + }, + { + "epoch": 0.82, + "grad_norm": 9.246643880539626, + "learning_rate": 8.633756294994461e-07, + "loss": 0.4499, + "step": 5409 + }, + { + "epoch": 0.82, + "grad_norm": 11.355628594910776, + "learning_rate": 8.62004091449638e-07, + "loss": 0.4973, + "step": 5410 + }, + { + "epoch": 0.82, + "grad_norm": 11.294057088049206, + "learning_rate": 8.606335408899841e-07, + "loss": 0.361, + "step": 5411 + }, + { + "epoch": 0.82, + "grad_norm": 20.095617129834707, + "learning_rate": 8.592639781475537e-07, + "loss": 0.4067, + "step": 5412 + }, + { + "epoch": 0.82, + "grad_norm": 6.774971684403115, + "learning_rate": 8.578954035491782e-07, + "loss": 0.3653, + "step": 5413 + }, + { + "epoch": 0.82, + "grad_norm": 22.3761901239624, + "learning_rate": 8.565278174214541e-07, + "loss": 0.4877, + "step": 5414 + }, + { + "epoch": 0.82, + "grad_norm": 14.323027601414552, + "learning_rate": 8.551612200907411e-07, + "loss": 0.379, + "step": 5415 + }, + { + "epoch": 0.82, + "grad_norm": 12.014425899888368, + "learning_rate": 8.537956118831626e-07, + "loss": 0.3544, + "step": 5416 + }, + { + "epoch": 0.82, + "grad_norm": 11.983591973531155, + "learning_rate": 8.524309931246094e-07, + "loss": 0.4074, + "step": 5417 + }, + { + "epoch": 0.82, + "grad_norm": 13.098253115895954, + "learning_rate": 8.51067364140733e-07, + "loss": 0.426, + "step": 5418 + }, + { + "epoch": 0.82, + "grad_norm": 9.278898689836112, + "learning_rate": 8.49704725256948e-07, + "loss": 0.4144, + "step": 5419 + }, + { + "epoch": 0.82, + "grad_norm": 7.729294212717467, + "learning_rate": 8.483430767984358e-07, + "loss": 0.4441, + "step": 5420 + }, + { + "epoch": 0.82, + "grad_norm": 8.560971225962868, + "learning_rate": 8.469824190901382e-07, + "loss": 0.421, + "step": 5421 + }, + { + "epoch": 0.82, + "grad_norm": 14.92554607386126, + "learning_rate": 8.45622752456764e-07, + "loss": 0.449, + "step": 5422 + }, + { + "epoch": 0.82, + "grad_norm": 12.751843861618113, + "learning_rate": 8.442640772227834e-07, + "loss": 0.4509, + "step": 5423 + }, + { + "epoch": 0.82, + "grad_norm": 5.6499411260161745, + "learning_rate": 8.429063937124294e-07, + "loss": 0.4114, + "step": 5424 + }, + { + "epoch": 0.82, + "grad_norm": 10.880109251472874, + "learning_rate": 8.415497022496994e-07, + "loss": 0.369, + "step": 5425 + }, + { + "epoch": 0.82, + "grad_norm": 9.010374038016272, + "learning_rate": 8.401940031583523e-07, + "loss": 0.4386, + "step": 5426 + }, + { + "epoch": 0.82, + "grad_norm": 5.3201261174502195, + "learning_rate": 8.388392967619152e-07, + "loss": 0.4033, + "step": 5427 + }, + { + "epoch": 0.82, + "grad_norm": 13.96327718756873, + "learning_rate": 8.374855833836726e-07, + "loss": 0.4673, + "step": 5428 + }, + { + "epoch": 0.82, + "grad_norm": 6.885553946939302, + "learning_rate": 8.361328633466737e-07, + "loss": 0.473, + "step": 5429 + }, + { + "epoch": 0.82, + "grad_norm": 17.758939212542643, + "learning_rate": 8.347811369737335e-07, + "loss": 0.4137, + "step": 5430 + }, + { + "epoch": 0.82, + "grad_norm": 7.630900636313524, + "learning_rate": 8.334304045874248e-07, + "loss": 0.4273, + "step": 5431 + }, + { + "epoch": 0.82, + "grad_norm": 7.094728582479015, + "learning_rate": 8.320806665100889e-07, + "loss": 0.426, + "step": 5432 + }, + { + "epoch": 0.82, + "grad_norm": 7.909052902383191, + "learning_rate": 8.307319230638255e-07, + "loss": 0.4948, + "step": 5433 + }, + { + "epoch": 0.82, + "grad_norm": 8.188368797120107, + "learning_rate": 8.293841745704967e-07, + "loss": 0.4242, + "step": 5434 + }, + { + "epoch": 0.82, + "grad_norm": 17.74081302461166, + "learning_rate": 8.280374213517323e-07, + "loss": 0.4164, + "step": 5435 + }, + { + "epoch": 0.82, + "grad_norm": 46.89130617455893, + "learning_rate": 8.266916637289191e-07, + "loss": 0.4618, + "step": 5436 + }, + { + "epoch": 0.82, + "grad_norm": 7.479381634809684, + "learning_rate": 8.25346902023208e-07, + "loss": 0.4521, + "step": 5437 + }, + { + "epoch": 0.82, + "grad_norm": 10.354713366541333, + "learning_rate": 8.240031365555134e-07, + "loss": 0.3983, + "step": 5438 + }, + { + "epoch": 0.82, + "grad_norm": 5.092164241310773, + "learning_rate": 8.226603676465094e-07, + "loss": 0.4554, + "step": 5439 + }, + { + "epoch": 0.82, + "grad_norm": 1.119249123186006, + "learning_rate": 8.213185956166364e-07, + "loss": 0.5143, + "step": 5440 + }, + { + "epoch": 0.82, + "grad_norm": 8.121095829200007, + "learning_rate": 8.199778207860925e-07, + "loss": 0.4416, + "step": 5441 + }, + { + "epoch": 0.82, + "grad_norm": 6.375719252416167, + "learning_rate": 8.18638043474842e-07, + "loss": 0.4097, + "step": 5442 + }, + { + "epoch": 0.82, + "grad_norm": 6.2311877621263765, + "learning_rate": 8.172992640026072e-07, + "loss": 0.4376, + "step": 5443 + }, + { + "epoch": 0.82, + "grad_norm": 9.215199756003829, + "learning_rate": 8.159614826888734e-07, + "loss": 0.404, + "step": 5444 + }, + { + "epoch": 0.82, + "grad_norm": 7.256454462751428, + "learning_rate": 8.146246998528912e-07, + "loss": 0.452, + "step": 5445 + }, + { + "epoch": 0.82, + "grad_norm": 10.516756407954842, + "learning_rate": 8.132889158136681e-07, + "loss": 0.374, + "step": 5446 + }, + { + "epoch": 0.82, + "grad_norm": 10.117380328944872, + "learning_rate": 8.119541308899742e-07, + "loss": 0.4284, + "step": 5447 + }, + { + "epoch": 0.82, + "grad_norm": 7.562529523511456, + "learning_rate": 8.106203454003442e-07, + "loss": 0.3677, + "step": 5448 + }, + { + "epoch": 0.82, + "grad_norm": 9.869982190998707, + "learning_rate": 8.092875596630722e-07, + "loss": 0.3814, + "step": 5449 + }, + { + "epoch": 0.82, + "grad_norm": 7.372778564510488, + "learning_rate": 8.079557739962129e-07, + "loss": 0.4585, + "step": 5450 + }, + { + "epoch": 0.82, + "grad_norm": 9.398518841347423, + "learning_rate": 8.066249887175837e-07, + "loss": 0.4737, + "step": 5451 + }, + { + "epoch": 0.82, + "grad_norm": 1.0279437760131922, + "learning_rate": 8.052952041447614e-07, + "loss": 0.4857, + "step": 5452 + }, + { + "epoch": 0.82, + "grad_norm": 7.196641811159402, + "learning_rate": 8.039664205950876e-07, + "loss": 0.3688, + "step": 5453 + }, + { + "epoch": 0.82, + "grad_norm": 10.303075909876602, + "learning_rate": 8.02638638385661e-07, + "loss": 0.4225, + "step": 5454 + }, + { + "epoch": 0.82, + "grad_norm": 9.992061594200957, + "learning_rate": 8.013118578333451e-07, + "loss": 0.4252, + "step": 5455 + }, + { + "epoch": 0.82, + "grad_norm": 9.462497695830738, + "learning_rate": 7.999860792547609e-07, + "loss": 0.5213, + "step": 5456 + }, + { + "epoch": 0.82, + "grad_norm": 8.308784006368848, + "learning_rate": 7.986613029662915e-07, + "loss": 0.4743, + "step": 5457 + }, + { + "epoch": 0.82, + "grad_norm": 8.457188862158105, + "learning_rate": 7.973375292840835e-07, + "loss": 0.3949, + "step": 5458 + }, + { + "epoch": 0.82, + "grad_norm": 12.912844136822514, + "learning_rate": 7.960147585240396e-07, + "loss": 0.4804, + "step": 5459 + }, + { + "epoch": 0.82, + "grad_norm": 20.159357480109776, + "learning_rate": 7.946929910018264e-07, + "loss": 0.4379, + "step": 5460 + }, + { + "epoch": 0.82, + "grad_norm": 7.395285434474368, + "learning_rate": 7.933722270328698e-07, + "loss": 0.3389, + "step": 5461 + }, + { + "epoch": 0.82, + "grad_norm": 11.39123696763879, + "learning_rate": 7.920524669323554e-07, + "loss": 0.459, + "step": 5462 + }, + { + "epoch": 0.82, + "grad_norm": 8.572361982268138, + "learning_rate": 7.907337110152324e-07, + "loss": 0.376, + "step": 5463 + }, + { + "epoch": 0.82, + "grad_norm": 7.425311075581576, + "learning_rate": 7.894159595962065e-07, + "loss": 0.3773, + "step": 5464 + }, + { + "epoch": 0.82, + "grad_norm": 10.060761149606728, + "learning_rate": 7.88099212989748e-07, + "loss": 0.4358, + "step": 5465 + }, + { + "epoch": 0.82, + "grad_norm": 5.917067035064541, + "learning_rate": 7.86783471510083e-07, + "loss": 0.4486, + "step": 5466 + }, + { + "epoch": 0.82, + "grad_norm": 6.3500289954649105, + "learning_rate": 7.854687354711993e-07, + "loss": 0.4813, + "step": 5467 + }, + { + "epoch": 0.82, + "grad_norm": 23.2102581882762, + "learning_rate": 7.841550051868469e-07, + "loss": 0.4012, + "step": 5468 + }, + { + "epoch": 0.82, + "grad_norm": 12.970681113446565, + "learning_rate": 7.828422809705316e-07, + "loss": 0.5034, + "step": 5469 + }, + { + "epoch": 0.83, + "grad_norm": 12.726628095879253, + "learning_rate": 7.815305631355241e-07, + "loss": 0.466, + "step": 5470 + }, + { + "epoch": 0.83, + "grad_norm": 10.588893959587628, + "learning_rate": 7.802198519948517e-07, + "loss": 0.4592, + "step": 5471 + }, + { + "epoch": 0.83, + "grad_norm": 9.701525593299186, + "learning_rate": 7.789101478613015e-07, + "loss": 0.4248, + "step": 5472 + }, + { + "epoch": 0.83, + "grad_norm": 1.1242544254659539, + "learning_rate": 7.776014510474211e-07, + "loss": 0.4944, + "step": 5473 + }, + { + "epoch": 0.83, + "grad_norm": 11.148906692264298, + "learning_rate": 7.762937618655164e-07, + "loss": 0.4348, + "step": 5474 + }, + { + "epoch": 0.83, + "grad_norm": 5.885614452350353, + "learning_rate": 7.74987080627656e-07, + "loss": 0.472, + "step": 5475 + }, + { + "epoch": 0.83, + "grad_norm": 10.520510608247074, + "learning_rate": 7.73681407645665e-07, + "loss": 0.5044, + "step": 5476 + }, + { + "epoch": 0.83, + "grad_norm": 9.961601093829659, + "learning_rate": 7.72376743231128e-07, + "loss": 0.3834, + "step": 5477 + }, + { + "epoch": 0.83, + "grad_norm": 9.950733669095644, + "learning_rate": 7.71073087695392e-07, + "loss": 0.4509, + "step": 5478 + }, + { + "epoch": 0.83, + "grad_norm": 11.714956890221021, + "learning_rate": 7.697704413495577e-07, + "loss": 0.4424, + "step": 5479 + }, + { + "epoch": 0.83, + "grad_norm": 11.013422939968912, + "learning_rate": 7.684688045044919e-07, + "loss": 0.5245, + "step": 5480 + }, + { + "epoch": 0.83, + "grad_norm": 24.399389602411276, + "learning_rate": 7.671681774708145e-07, + "loss": 0.4571, + "step": 5481 + }, + { + "epoch": 0.83, + "grad_norm": 15.102592354188358, + "learning_rate": 7.658685605589067e-07, + "loss": 0.4404, + "step": 5482 + }, + { + "epoch": 0.83, + "grad_norm": 24.226653396307476, + "learning_rate": 7.645699540789103e-07, + "loss": 0.401, + "step": 5483 + }, + { + "epoch": 0.83, + "grad_norm": 1.2530389737284793, + "learning_rate": 7.632723583407231e-07, + "loss": 0.5657, + "step": 5484 + }, + { + "epoch": 0.83, + "grad_norm": 7.265824344163817, + "learning_rate": 7.619757736540034e-07, + "loss": 0.3777, + "step": 5485 + }, + { + "epoch": 0.83, + "grad_norm": 10.832633498463991, + "learning_rate": 7.606802003281682e-07, + "loss": 0.422, + "step": 5486 + }, + { + "epoch": 0.83, + "grad_norm": 13.691435534783574, + "learning_rate": 7.593856386723902e-07, + "loss": 0.4799, + "step": 5487 + }, + { + "epoch": 0.83, + "grad_norm": 7.2553935562674745, + "learning_rate": 7.580920889956073e-07, + "loss": 0.3921, + "step": 5488 + }, + { + "epoch": 0.83, + "grad_norm": 9.938674433190346, + "learning_rate": 7.56799551606508e-07, + "loss": 0.4451, + "step": 5489 + }, + { + "epoch": 0.83, + "grad_norm": 1.1597192554984277, + "learning_rate": 7.555080268135461e-07, + "loss": 0.4974, + "step": 5490 + }, + { + "epoch": 0.83, + "grad_norm": 4.908251298624413, + "learning_rate": 7.542175149249298e-07, + "loss": 0.4622, + "step": 5491 + }, + { + "epoch": 0.83, + "grad_norm": 10.112911582078047, + "learning_rate": 7.529280162486246e-07, + "loss": 0.4138, + "step": 5492 + }, + { + "epoch": 0.83, + "grad_norm": 9.804774232295548, + "learning_rate": 7.516395310923585e-07, + "loss": 0.4225, + "step": 5493 + }, + { + "epoch": 0.83, + "grad_norm": 36.71552951418733, + "learning_rate": 7.503520597636149e-07, + "loss": 0.445, + "step": 5494 + }, + { + "epoch": 0.83, + "grad_norm": 7.42487675676883, + "learning_rate": 7.490656025696346e-07, + "loss": 0.4624, + "step": 5495 + }, + { + "epoch": 0.83, + "grad_norm": 10.14948136468449, + "learning_rate": 7.477801598174183e-07, + "loss": 0.4288, + "step": 5496 + }, + { + "epoch": 0.83, + "grad_norm": 8.59903812856341, + "learning_rate": 7.464957318137217e-07, + "loss": 0.4074, + "step": 5497 + }, + { + "epoch": 0.83, + "grad_norm": 1.1234741500879277, + "learning_rate": 7.452123188650629e-07, + "loss": 0.5009, + "step": 5498 + }, + { + "epoch": 0.83, + "grad_norm": 19.074137615173424, + "learning_rate": 7.439299212777135e-07, + "loss": 0.3728, + "step": 5499 + }, + { + "epoch": 0.83, + "grad_norm": 6.071198476218797, + "learning_rate": 7.426485393577054e-07, + "loss": 0.404, + "step": 5500 + }, + { + "epoch": 0.83, + "grad_norm": 154.97052663751754, + "learning_rate": 7.413681734108275e-07, + "loss": 0.4382, + "step": 5501 + }, + { + "epoch": 0.83, + "grad_norm": 7.052756220088788, + "learning_rate": 7.40088823742624e-07, + "loss": 0.3785, + "step": 5502 + }, + { + "epoch": 0.83, + "grad_norm": 8.857804008926568, + "learning_rate": 7.388104906584015e-07, + "loss": 0.4741, + "step": 5503 + }, + { + "epoch": 0.83, + "grad_norm": 7.871414569430011, + "learning_rate": 7.375331744632181e-07, + "loss": 0.4818, + "step": 5504 + }, + { + "epoch": 0.83, + "grad_norm": 11.571135352751194, + "learning_rate": 7.362568754618943e-07, + "loss": 0.3936, + "step": 5505 + }, + { + "epoch": 0.83, + "grad_norm": 8.838243795593858, + "learning_rate": 7.34981593959005e-07, + "loss": 0.4177, + "step": 5506 + }, + { + "epoch": 0.83, + "grad_norm": 10.847496226071302, + "learning_rate": 7.337073302588832e-07, + "loss": 0.3739, + "step": 5507 + }, + { + "epoch": 0.83, + "grad_norm": 11.256660253561318, + "learning_rate": 7.324340846656187e-07, + "loss": 0.4332, + "step": 5508 + }, + { + "epoch": 0.83, + "grad_norm": 8.836791684332274, + "learning_rate": 7.31161857483057e-07, + "loss": 0.424, + "step": 5509 + }, + { + "epoch": 0.83, + "grad_norm": 7.574437513010073, + "learning_rate": 7.29890649014805e-07, + "loss": 0.4879, + "step": 5510 + }, + { + "epoch": 0.83, + "grad_norm": 8.273232357956443, + "learning_rate": 7.286204595642216e-07, + "loss": 0.3898, + "step": 5511 + }, + { + "epoch": 0.83, + "grad_norm": 12.348302459052665, + "learning_rate": 7.273512894344231e-07, + "loss": 0.3724, + "step": 5512 + }, + { + "epoch": 0.83, + "grad_norm": 9.039736219026485, + "learning_rate": 7.260831389282874e-07, + "loss": 0.3652, + "step": 5513 + }, + { + "epoch": 0.83, + "grad_norm": 11.035678171431481, + "learning_rate": 7.248160083484423e-07, + "loss": 0.4277, + "step": 5514 + }, + { + "epoch": 0.83, + "grad_norm": 11.685298921132938, + "learning_rate": 7.235498979972777e-07, + "loss": 0.3904, + "step": 5515 + }, + { + "epoch": 0.83, + "grad_norm": 6.510818244015671, + "learning_rate": 7.222848081769374e-07, + "loss": 0.3888, + "step": 5516 + }, + { + "epoch": 0.83, + "grad_norm": 7.001664522982363, + "learning_rate": 7.21020739189322e-07, + "loss": 0.4183, + "step": 5517 + }, + { + "epoch": 0.83, + "grad_norm": 7.3508437427253845, + "learning_rate": 7.197576913360877e-07, + "loss": 0.4485, + "step": 5518 + }, + { + "epoch": 0.83, + "grad_norm": 1.210353999241169, + "learning_rate": 7.184956649186475e-07, + "loss": 0.5199, + "step": 5519 + }, + { + "epoch": 0.83, + "grad_norm": 21.3927844333565, + "learning_rate": 7.17234660238173e-07, + "loss": 0.4009, + "step": 5520 + }, + { + "epoch": 0.83, + "grad_norm": 1.1176423681492933, + "learning_rate": 7.159746775955889e-07, + "loss": 0.481, + "step": 5521 + }, + { + "epoch": 0.83, + "grad_norm": 10.959402327989377, + "learning_rate": 7.147157172915764e-07, + "loss": 0.4296, + "step": 5522 + }, + { + "epoch": 0.83, + "grad_norm": 9.153409803049946, + "learning_rate": 7.134577796265746e-07, + "loss": 0.4894, + "step": 5523 + }, + { + "epoch": 0.83, + "grad_norm": 21.96269428148758, + "learning_rate": 7.122008649007761e-07, + "loss": 0.4399, + "step": 5524 + }, + { + "epoch": 0.83, + "grad_norm": 8.381761205361828, + "learning_rate": 7.109449734141328e-07, + "loss": 0.4067, + "step": 5525 + }, + { + "epoch": 0.83, + "grad_norm": 14.266455533689362, + "learning_rate": 7.096901054663491e-07, + "loss": 0.4407, + "step": 5526 + }, + { + "epoch": 0.83, + "grad_norm": 19.85916593862597, + "learning_rate": 7.084362613568852e-07, + "loss": 0.536, + "step": 5527 + }, + { + "epoch": 0.83, + "grad_norm": 14.701483063656545, + "learning_rate": 7.071834413849599e-07, + "loss": 0.4339, + "step": 5528 + }, + { + "epoch": 0.83, + "grad_norm": 13.620920958147204, + "learning_rate": 7.059316458495446e-07, + "loss": 0.3993, + "step": 5529 + }, + { + "epoch": 0.83, + "grad_norm": 7.3607575525173115, + "learning_rate": 7.046808750493683e-07, + "loss": 0.4513, + "step": 5530 + }, + { + "epoch": 0.83, + "grad_norm": 11.131029464001363, + "learning_rate": 7.034311292829144e-07, + "loss": 0.4068, + "step": 5531 + }, + { + "epoch": 0.83, + "grad_norm": 10.303302720631565, + "learning_rate": 7.021824088484202e-07, + "loss": 0.4474, + "step": 5532 + }, + { + "epoch": 0.83, + "grad_norm": 10.091218172696102, + "learning_rate": 7.009347140438821e-07, + "loss": 0.3944, + "step": 5533 + }, + { + "epoch": 0.83, + "grad_norm": 10.956084420562494, + "learning_rate": 6.996880451670484e-07, + "loss": 0.4214, + "step": 5534 + }, + { + "epoch": 0.83, + "grad_norm": 11.378505163929583, + "learning_rate": 6.984424025154252e-07, + "loss": 0.4888, + "step": 5535 + }, + { + "epoch": 0.83, + "grad_norm": 5.43662923810866, + "learning_rate": 6.971977863862717e-07, + "loss": 0.4234, + "step": 5536 + }, + { + "epoch": 0.84, + "grad_norm": 8.400655742768366, + "learning_rate": 6.95954197076601e-07, + "loss": 0.3862, + "step": 5537 + }, + { + "epoch": 0.84, + "grad_norm": 5.147481212659574, + "learning_rate": 6.947116348831856e-07, + "loss": 0.485, + "step": 5538 + }, + { + "epoch": 0.84, + "grad_norm": 8.704196513212135, + "learning_rate": 6.934701001025479e-07, + "loss": 0.4243, + "step": 5539 + }, + { + "epoch": 0.84, + "grad_norm": 7.044711986869853, + "learning_rate": 6.922295930309691e-07, + "loss": 0.4583, + "step": 5540 + }, + { + "epoch": 0.84, + "grad_norm": 9.599515668062049, + "learning_rate": 6.909901139644831e-07, + "loss": 0.4241, + "step": 5541 + }, + { + "epoch": 0.84, + "grad_norm": 19.41049883453156, + "learning_rate": 6.897516631988787e-07, + "loss": 0.4753, + "step": 5542 + }, + { + "epoch": 0.84, + "grad_norm": 10.379022583398443, + "learning_rate": 6.885142410296991e-07, + "loss": 0.3576, + "step": 5543 + }, + { + "epoch": 0.84, + "grad_norm": 10.666542407232468, + "learning_rate": 6.872778477522412e-07, + "loss": 0.3911, + "step": 5544 + }, + { + "epoch": 0.84, + "grad_norm": 8.493434267439428, + "learning_rate": 6.860424836615604e-07, + "loss": 0.4475, + "step": 5545 + }, + { + "epoch": 0.84, + "grad_norm": 10.392206695963077, + "learning_rate": 6.848081490524616e-07, + "loss": 0.4431, + "step": 5546 + }, + { + "epoch": 0.84, + "grad_norm": 8.928015810422853, + "learning_rate": 6.835748442195062e-07, + "loss": 0.3892, + "step": 5547 + }, + { + "epoch": 0.84, + "grad_norm": 10.121054453843085, + "learning_rate": 6.823425694570101e-07, + "loss": 0.3908, + "step": 5548 + }, + { + "epoch": 0.84, + "grad_norm": 16.506259053732673, + "learning_rate": 6.811113250590429e-07, + "loss": 0.4186, + "step": 5549 + }, + { + "epoch": 0.84, + "grad_norm": 7.609310226575656, + "learning_rate": 6.798811113194286e-07, + "loss": 0.4155, + "step": 5550 + }, + { + "epoch": 0.84, + "grad_norm": 14.296086674112372, + "learning_rate": 6.786519285317455e-07, + "loss": 0.3894, + "step": 5551 + }, + { + "epoch": 0.84, + "grad_norm": 8.827772775845816, + "learning_rate": 6.77423776989325e-07, + "loss": 0.489, + "step": 5552 + }, + { + "epoch": 0.84, + "grad_norm": 10.082914081382292, + "learning_rate": 6.761966569852523e-07, + "loss": 0.4827, + "step": 5553 + }, + { + "epoch": 0.84, + "grad_norm": 25.851530539859105, + "learning_rate": 6.749705688123665e-07, + "loss": 0.3872, + "step": 5554 + }, + { + "epoch": 0.84, + "grad_norm": 8.137169488944476, + "learning_rate": 6.737455127632625e-07, + "loss": 0.4535, + "step": 5555 + }, + { + "epoch": 0.84, + "grad_norm": 1.4446082073226152, + "learning_rate": 6.725214891302873e-07, + "loss": 0.5627, + "step": 5556 + }, + { + "epoch": 0.84, + "grad_norm": 30.26879326773742, + "learning_rate": 6.712984982055393e-07, + "loss": 0.4265, + "step": 5557 + }, + { + "epoch": 0.84, + "grad_norm": 6.98885768070102, + "learning_rate": 6.700765402808751e-07, + "loss": 0.402, + "step": 5558 + }, + { + "epoch": 0.84, + "grad_norm": 6.670090435308791, + "learning_rate": 6.688556156479021e-07, + "loss": 0.3914, + "step": 5559 + }, + { + "epoch": 0.84, + "grad_norm": 8.643360809748664, + "learning_rate": 6.676357245979792e-07, + "loss": 0.4076, + "step": 5560 + }, + { + "epoch": 0.84, + "grad_norm": 7.172960413519421, + "learning_rate": 6.664168674222238e-07, + "loss": 0.4046, + "step": 5561 + }, + { + "epoch": 0.84, + "grad_norm": 8.234849693930766, + "learning_rate": 6.651990444115014e-07, + "loss": 0.4168, + "step": 5562 + }, + { + "epoch": 0.84, + "grad_norm": 12.464880376582926, + "learning_rate": 6.639822558564346e-07, + "loss": 0.3797, + "step": 5563 + }, + { + "epoch": 0.84, + "grad_norm": 28.060458943835744, + "learning_rate": 6.627665020473966e-07, + "loss": 0.4054, + "step": 5564 + }, + { + "epoch": 0.84, + "grad_norm": 18.681770129167187, + "learning_rate": 6.61551783274515e-07, + "loss": 0.4515, + "step": 5565 + }, + { + "epoch": 0.84, + "grad_norm": 19.642670452947392, + "learning_rate": 6.603380998276699e-07, + "loss": 0.4188, + "step": 5566 + }, + { + "epoch": 0.84, + "grad_norm": 12.646286709544395, + "learning_rate": 6.591254519964924e-07, + "loss": 0.4279, + "step": 5567 + }, + { + "epoch": 0.84, + "grad_norm": 10.285966288290398, + "learning_rate": 6.579138400703716e-07, + "loss": 0.4581, + "step": 5568 + }, + { + "epoch": 0.84, + "grad_norm": 27.18120613476007, + "learning_rate": 6.567032643384441e-07, + "loss": 0.4479, + "step": 5569 + }, + { + "epoch": 0.84, + "grad_norm": 8.970165211670107, + "learning_rate": 6.554937250896015e-07, + "loss": 0.4604, + "step": 5570 + }, + { + "epoch": 0.84, + "grad_norm": 7.083272600231045, + "learning_rate": 6.542852226124891e-07, + "loss": 0.4189, + "step": 5571 + }, + { + "epoch": 0.84, + "grad_norm": 7.233899474832337, + "learning_rate": 6.530777571955021e-07, + "loss": 0.3937, + "step": 5572 + }, + { + "epoch": 0.84, + "grad_norm": 11.219075539303324, + "learning_rate": 6.518713291267914e-07, + "loss": 0.4151, + "step": 5573 + }, + { + "epoch": 0.84, + "grad_norm": 7.182827857968582, + "learning_rate": 6.506659386942576e-07, + "loss": 0.4477, + "step": 5574 + }, + { + "epoch": 0.84, + "grad_norm": 12.858608685374824, + "learning_rate": 6.494615861855536e-07, + "loss": 0.4441, + "step": 5575 + }, + { + "epoch": 0.84, + "grad_norm": 13.343200874300361, + "learning_rate": 6.482582718880887e-07, + "loss": 0.412, + "step": 5576 + }, + { + "epoch": 0.84, + "grad_norm": 19.636744994451625, + "learning_rate": 6.470559960890188e-07, + "loss": 0.4431, + "step": 5577 + }, + { + "epoch": 0.84, + "grad_norm": 24.97897313528571, + "learning_rate": 6.458547590752562e-07, + "loss": 0.4364, + "step": 5578 + }, + { + "epoch": 0.84, + "grad_norm": 20.482634387962907, + "learning_rate": 6.446545611334631e-07, + "loss": 0.4387, + "step": 5579 + }, + { + "epoch": 0.84, + "grad_norm": 13.992366881230474, + "learning_rate": 6.434554025500533e-07, + "loss": 0.4134, + "step": 5580 + }, + { + "epoch": 0.84, + "grad_norm": 10.465793524023145, + "learning_rate": 6.422572836111957e-07, + "loss": 0.4027, + "step": 5581 + }, + { + "epoch": 0.84, + "grad_norm": 7.805258766383043, + "learning_rate": 6.410602046028075e-07, + "loss": 0.4268, + "step": 5582 + }, + { + "epoch": 0.84, + "grad_norm": 6.343037177447048, + "learning_rate": 6.398641658105603e-07, + "loss": 0.3878, + "step": 5583 + }, + { + "epoch": 0.84, + "grad_norm": 5.144721756354544, + "learning_rate": 6.386691675198764e-07, + "loss": 0.4519, + "step": 5584 + }, + { + "epoch": 0.84, + "grad_norm": 8.88427942944416, + "learning_rate": 6.374752100159287e-07, + "loss": 0.4038, + "step": 5585 + }, + { + "epoch": 0.84, + "grad_norm": 16.222589234824692, + "learning_rate": 6.362822935836449e-07, + "loss": 0.4105, + "step": 5586 + }, + { + "epoch": 0.84, + "grad_norm": 21.00806368749471, + "learning_rate": 6.350904185077006e-07, + "loss": 0.4303, + "step": 5587 + }, + { + "epoch": 0.84, + "grad_norm": 7.679518334885346, + "learning_rate": 6.338995850725255e-07, + "loss": 0.4337, + "step": 5588 + }, + { + "epoch": 0.84, + "grad_norm": 1.2148912408716686, + "learning_rate": 6.327097935622988e-07, + "loss": 0.5058, + "step": 5589 + }, + { + "epoch": 0.84, + "grad_norm": 10.117702754912248, + "learning_rate": 6.315210442609515e-07, + "loss": 0.4407, + "step": 5590 + }, + { + "epoch": 0.84, + "grad_norm": 8.923260996879476, + "learning_rate": 6.303333374521687e-07, + "loss": 0.3506, + "step": 5591 + }, + { + "epoch": 0.84, + "grad_norm": 13.262861983087861, + "learning_rate": 6.291466734193818e-07, + "loss": 0.4489, + "step": 5592 + }, + { + "epoch": 0.84, + "grad_norm": 6.180824567897123, + "learning_rate": 6.27961052445778e-07, + "loss": 0.3984, + "step": 5593 + }, + { + "epoch": 0.84, + "grad_norm": 8.675765574744439, + "learning_rate": 6.26776474814293e-07, + "loss": 0.4685, + "step": 5594 + }, + { + "epoch": 0.84, + "grad_norm": 1.1748222223127798, + "learning_rate": 6.255929408076128e-07, + "loss": 0.5008, + "step": 5595 + }, + { + "epoch": 0.84, + "grad_norm": 9.337558067729937, + "learning_rate": 6.244104507081778e-07, + "loss": 0.4225, + "step": 5596 + }, + { + "epoch": 0.84, + "grad_norm": 18.087740993517055, + "learning_rate": 6.232290047981743e-07, + "loss": 0.4671, + "step": 5597 + }, + { + "epoch": 0.84, + "grad_norm": 5.789449449626573, + "learning_rate": 6.220486033595452e-07, + "loss": 0.5024, + "step": 5598 + }, + { + "epoch": 0.84, + "grad_norm": 12.990060295659438, + "learning_rate": 6.208692466739802e-07, + "loss": 0.4763, + "step": 5599 + }, + { + "epoch": 0.84, + "grad_norm": 11.045174501733419, + "learning_rate": 6.196909350229197e-07, + "loss": 0.4317, + "step": 5600 + }, + { + "epoch": 0.84, + "grad_norm": 15.685068973750885, + "learning_rate": 6.185136686875565e-07, + "loss": 0.3601, + "step": 5601 + }, + { + "epoch": 0.84, + "grad_norm": 8.968061600331163, + "learning_rate": 6.173374479488315e-07, + "loss": 0.4069, + "step": 5602 + }, + { + "epoch": 0.85, + "grad_norm": 14.901855997486388, + "learning_rate": 6.161622730874406e-07, + "loss": 0.4526, + "step": 5603 + }, + { + "epoch": 0.85, + "grad_norm": 8.530263102298733, + "learning_rate": 6.149881443838252e-07, + "loss": 0.4611, + "step": 5604 + }, + { + "epoch": 0.85, + "grad_norm": 11.326347898171674, + "learning_rate": 6.138150621181782e-07, + "loss": 0.457, + "step": 5605 + }, + { + "epoch": 0.85, + "grad_norm": 8.351497553305427, + "learning_rate": 6.126430265704459e-07, + "loss": 0.4792, + "step": 5606 + }, + { + "epoch": 0.85, + "grad_norm": 10.50017692252538, + "learning_rate": 6.114720380203204e-07, + "loss": 0.4717, + "step": 5607 + }, + { + "epoch": 0.85, + "grad_norm": 6.374019015791695, + "learning_rate": 6.103020967472484e-07, + "loss": 0.4664, + "step": 5608 + }, + { + "epoch": 0.85, + "grad_norm": 1.2256092850377978, + "learning_rate": 6.091332030304231e-07, + "loss": 0.5165, + "step": 5609 + }, + { + "epoch": 0.85, + "grad_norm": 24.563565165765784, + "learning_rate": 6.079653571487875e-07, + "loss": 0.3516, + "step": 5610 + }, + { + "epoch": 0.85, + "grad_norm": 6.611685673779102, + "learning_rate": 6.067985593810389e-07, + "loss": 0.3764, + "step": 5611 + }, + { + "epoch": 0.85, + "grad_norm": 13.915056033764408, + "learning_rate": 6.056328100056197e-07, + "loss": 0.3612, + "step": 5612 + }, + { + "epoch": 0.85, + "grad_norm": 7.533147516021703, + "learning_rate": 6.044681093007249e-07, + "loss": 0.4666, + "step": 5613 + }, + { + "epoch": 0.85, + "grad_norm": 13.976486293622372, + "learning_rate": 6.033044575442975e-07, + "loss": 0.4071, + "step": 5614 + }, + { + "epoch": 0.85, + "grad_norm": 10.51258030395662, + "learning_rate": 6.021418550140307e-07, + "loss": 0.4768, + "step": 5615 + }, + { + "epoch": 0.85, + "grad_norm": 5.669433300531895, + "learning_rate": 6.009803019873689e-07, + "loss": 0.406, + "step": 5616 + }, + { + "epoch": 0.85, + "grad_norm": 58.62598027916851, + "learning_rate": 5.998197987415033e-07, + "loss": 0.4652, + "step": 5617 + }, + { + "epoch": 0.85, + "grad_norm": 10.438668231595424, + "learning_rate": 5.98660345553378e-07, + "loss": 0.4624, + "step": 5618 + }, + { + "epoch": 0.85, + "grad_norm": 6.280991923028768, + "learning_rate": 5.975019426996837e-07, + "loss": 0.5044, + "step": 5619 + }, + { + "epoch": 0.85, + "grad_norm": 5.383664618119922, + "learning_rate": 5.963445904568599e-07, + "loss": 0.3666, + "step": 5620 + }, + { + "epoch": 0.85, + "grad_norm": 17.08530571967915, + "learning_rate": 5.95188289101099e-07, + "loss": 0.4836, + "step": 5621 + }, + { + "epoch": 0.85, + "grad_norm": 6.776040278474298, + "learning_rate": 5.940330389083399e-07, + "loss": 0.5028, + "step": 5622 + }, + { + "epoch": 0.85, + "grad_norm": 10.611438755357886, + "learning_rate": 5.928788401542706e-07, + "loss": 0.4252, + "step": 5623 + }, + { + "epoch": 0.85, + "grad_norm": 6.6247140275729794, + "learning_rate": 5.917256931143289e-07, + "loss": 0.3918, + "step": 5624 + }, + { + "epoch": 0.85, + "grad_norm": 39.4693845621563, + "learning_rate": 5.905735980637001e-07, + "loss": 0.3979, + "step": 5625 + }, + { + "epoch": 0.85, + "grad_norm": 9.758711497395522, + "learning_rate": 5.894225552773225e-07, + "loss": 0.4984, + "step": 5626 + }, + { + "epoch": 0.85, + "grad_norm": 12.665761726618195, + "learning_rate": 5.882725650298787e-07, + "loss": 0.4493, + "step": 5627 + }, + { + "epoch": 0.85, + "grad_norm": 1.2716606821246144, + "learning_rate": 5.871236275958031e-07, + "loss": 0.5166, + "step": 5628 + }, + { + "epoch": 0.85, + "grad_norm": 32.90641965638817, + "learning_rate": 5.85975743249278e-07, + "loss": 0.4303, + "step": 5629 + }, + { + "epoch": 0.85, + "grad_norm": 9.647120613957467, + "learning_rate": 5.84828912264232e-07, + "loss": 0.402, + "step": 5630 + }, + { + "epoch": 0.85, + "grad_norm": 8.04465802355464, + "learning_rate": 5.836831349143473e-07, + "loss": 0.5031, + "step": 5631 + }, + { + "epoch": 0.85, + "grad_norm": 30.9031687896907, + "learning_rate": 5.825384114730498e-07, + "loss": 0.3993, + "step": 5632 + }, + { + "epoch": 0.85, + "grad_norm": 7.1514732297621535, + "learning_rate": 5.81394742213518e-07, + "loss": 0.4274, + "step": 5633 + }, + { + "epoch": 0.85, + "grad_norm": 6.150250806909699, + "learning_rate": 5.802521274086754e-07, + "loss": 0.4055, + "step": 5634 + }, + { + "epoch": 0.85, + "grad_norm": 9.354934230114342, + "learning_rate": 5.791105673311959e-07, + "loss": 0.4529, + "step": 5635 + }, + { + "epoch": 0.85, + "grad_norm": 13.724247803012537, + "learning_rate": 5.779700622535001e-07, + "loss": 0.4861, + "step": 5636 + }, + { + "epoch": 0.85, + "grad_norm": 17.093153718017877, + "learning_rate": 5.768306124477574e-07, + "loss": 0.4123, + "step": 5637 + }, + { + "epoch": 0.85, + "grad_norm": 7.234835527298902, + "learning_rate": 5.756922181858881e-07, + "loss": 0.4443, + "step": 5638 + }, + { + "epoch": 0.85, + "grad_norm": 5.9940439274297175, + "learning_rate": 5.745548797395567e-07, + "loss": 0.4067, + "step": 5639 + }, + { + "epoch": 0.85, + "grad_norm": 8.470793971543523, + "learning_rate": 5.73418597380177e-07, + "loss": 0.4463, + "step": 5640 + }, + { + "epoch": 0.85, + "grad_norm": 8.262088099724101, + "learning_rate": 5.722833713789122e-07, + "loss": 0.3918, + "step": 5641 + }, + { + "epoch": 0.85, + "grad_norm": 15.084521902574652, + "learning_rate": 5.711492020066711e-07, + "loss": 0.4437, + "step": 5642 + }, + { + "epoch": 0.85, + "grad_norm": 6.452488994968562, + "learning_rate": 5.700160895341133e-07, + "loss": 0.4064, + "step": 5643 + }, + { + "epoch": 0.85, + "grad_norm": 16.007820558833274, + "learning_rate": 5.688840342316431e-07, + "loss": 0.3816, + "step": 5644 + }, + { + "epoch": 0.85, + "grad_norm": 10.8515851911073, + "learning_rate": 5.677530363694139e-07, + "loss": 0.3804, + "step": 5645 + }, + { + "epoch": 0.85, + "grad_norm": 8.536552095346227, + "learning_rate": 5.666230962173275e-07, + "loss": 0.4057, + "step": 5646 + }, + { + "epoch": 0.85, + "grad_norm": 11.59117474445883, + "learning_rate": 5.654942140450304e-07, + "loss": 0.465, + "step": 5647 + }, + { + "epoch": 0.85, + "grad_norm": 7.086468860328539, + "learning_rate": 5.643663901219215e-07, + "loss": 0.3988, + "step": 5648 + }, + { + "epoch": 0.85, + "grad_norm": 9.272839910671308, + "learning_rate": 5.632396247171429e-07, + "loss": 0.4426, + "step": 5649 + }, + { + "epoch": 0.85, + "grad_norm": 7.882336577814514, + "learning_rate": 5.621139180995849e-07, + "loss": 0.4801, + "step": 5650 + }, + { + "epoch": 0.85, + "grad_norm": 4.777267751360537, + "learning_rate": 5.609892705378878e-07, + "loss": 0.4407, + "step": 5651 + }, + { + "epoch": 0.85, + "grad_norm": 1.178585486453405, + "learning_rate": 5.59865682300435e-07, + "loss": 0.4972, + "step": 5652 + }, + { + "epoch": 0.85, + "grad_norm": 6.565976527795983, + "learning_rate": 5.587431536553611e-07, + "loss": 0.3992, + "step": 5653 + }, + { + "epoch": 0.85, + "grad_norm": 7.176518295115487, + "learning_rate": 5.576216848705452e-07, + "loss": 0.4311, + "step": 5654 + }, + { + "epoch": 0.85, + "grad_norm": 118.30770608181587, + "learning_rate": 5.565012762136135e-07, + "loss": 0.4091, + "step": 5655 + }, + { + "epoch": 0.85, + "grad_norm": 17.948241797255406, + "learning_rate": 5.553819279519418e-07, + "loss": 0.4411, + "step": 5656 + }, + { + "epoch": 0.85, + "grad_norm": 8.898729305891955, + "learning_rate": 5.542636403526503e-07, + "loss": 0.4756, + "step": 5657 + }, + { + "epoch": 0.85, + "grad_norm": 6.941702124890432, + "learning_rate": 5.531464136826064e-07, + "loss": 0.3906, + "step": 5658 + }, + { + "epoch": 0.85, + "grad_norm": 12.289334493421135, + "learning_rate": 5.52030248208425e-07, + "loss": 0.4139, + "step": 5659 + }, + { + "epoch": 0.85, + "grad_norm": 10.592136752729822, + "learning_rate": 5.509151441964666e-07, + "loss": 0.3875, + "step": 5660 + }, + { + "epoch": 0.85, + "grad_norm": 17.93025674198459, + "learning_rate": 5.498011019128407e-07, + "loss": 0.4285, + "step": 5661 + }, + { + "epoch": 0.85, + "grad_norm": 9.020766477949305, + "learning_rate": 5.486881216234008e-07, + "loss": 0.3646, + "step": 5662 + }, + { + "epoch": 0.85, + "grad_norm": 84.18799922370366, + "learning_rate": 5.475762035937499e-07, + "loss": 0.4056, + "step": 5663 + }, + { + "epoch": 0.85, + "grad_norm": 9.437349271202502, + "learning_rate": 5.464653480892351e-07, + "loss": 0.4835, + "step": 5664 + }, + { + "epoch": 0.85, + "grad_norm": 8.29269975766321, + "learning_rate": 5.45355555374949e-07, + "loss": 0.4377, + "step": 5665 + }, + { + "epoch": 0.85, + "grad_norm": 6.721969389061686, + "learning_rate": 5.442468257157346e-07, + "loss": 0.3924, + "step": 5666 + }, + { + "epoch": 0.85, + "grad_norm": 6.421541353933005, + "learning_rate": 5.431391593761764e-07, + "loss": 0.3613, + "step": 5667 + }, + { + "epoch": 0.85, + "grad_norm": 10.937045483059979, + "learning_rate": 5.420325566206102e-07, + "loss": 0.4931, + "step": 5668 + }, + { + "epoch": 0.86, + "grad_norm": 6.579533209791268, + "learning_rate": 5.409270177131143e-07, + "loss": 0.4375, + "step": 5669 + }, + { + "epoch": 0.86, + "grad_norm": 11.613406449542559, + "learning_rate": 5.398225429175142e-07, + "loss": 0.4224, + "step": 5670 + }, + { + "epoch": 0.86, + "grad_norm": 12.797931599574172, + "learning_rate": 5.387191324973806e-07, + "loss": 0.4917, + "step": 5671 + }, + { + "epoch": 0.86, + "grad_norm": 8.60346318774248, + "learning_rate": 5.376167867160326e-07, + "loss": 0.329, + "step": 5672 + }, + { + "epoch": 0.86, + "grad_norm": 13.170520445355194, + "learning_rate": 5.365155058365312e-07, + "loss": 0.3761, + "step": 5673 + }, + { + "epoch": 0.86, + "grad_norm": 5.249832483312413, + "learning_rate": 5.35415290121688e-07, + "loss": 0.4405, + "step": 5674 + }, + { + "epoch": 0.86, + "grad_norm": 7.567058518041789, + "learning_rate": 5.343161398340568e-07, + "loss": 0.434, + "step": 5675 + }, + { + "epoch": 0.86, + "grad_norm": 10.647481148075773, + "learning_rate": 5.332180552359406e-07, + "loss": 0.3875, + "step": 5676 + }, + { + "epoch": 0.86, + "grad_norm": 6.7095035020816125, + "learning_rate": 5.321210365893842e-07, + "loss": 0.3879, + "step": 5677 + }, + { + "epoch": 0.86, + "grad_norm": 9.239415784637044, + "learning_rate": 5.310250841561792e-07, + "loss": 0.4681, + "step": 5678 + }, + { + "epoch": 0.86, + "grad_norm": 7.866506980445486, + "learning_rate": 5.299301981978655e-07, + "loss": 0.4115, + "step": 5679 + }, + { + "epoch": 0.86, + "grad_norm": 14.942293573647621, + "learning_rate": 5.28836378975725e-07, + "loss": 0.4461, + "step": 5680 + }, + { + "epoch": 0.86, + "grad_norm": 18.9784239148245, + "learning_rate": 5.277436267507868e-07, + "loss": 0.3994, + "step": 5681 + }, + { + "epoch": 0.86, + "grad_norm": 9.025737393460759, + "learning_rate": 5.26651941783825e-07, + "loss": 0.449, + "step": 5682 + }, + { + "epoch": 0.86, + "grad_norm": 10.36701485000101, + "learning_rate": 5.255613243353575e-07, + "loss": 0.4616, + "step": 5683 + }, + { + "epoch": 0.86, + "grad_norm": 9.033426528217516, + "learning_rate": 5.244717746656513e-07, + "loss": 0.4316, + "step": 5684 + }, + { + "epoch": 0.86, + "grad_norm": 5.979353206368469, + "learning_rate": 5.233832930347149e-07, + "loss": 0.4736, + "step": 5685 + }, + { + "epoch": 0.86, + "grad_norm": 6.487124641691687, + "learning_rate": 5.222958797023036e-07, + "loss": 0.4213, + "step": 5686 + }, + { + "epoch": 0.86, + "grad_norm": 9.479140295090433, + "learning_rate": 5.212095349279179e-07, + "loss": 0.4897, + "step": 5687 + }, + { + "epoch": 0.86, + "grad_norm": 14.612605987349252, + "learning_rate": 5.201242589708011e-07, + "loss": 0.4276, + "step": 5688 + }, + { + "epoch": 0.86, + "grad_norm": 16.4484802234681, + "learning_rate": 5.190400520899452e-07, + "loss": 0.414, + "step": 5689 + }, + { + "epoch": 0.86, + "grad_norm": 6.033005193540101, + "learning_rate": 5.179569145440827e-07, + "loss": 0.4161, + "step": 5690 + }, + { + "epoch": 0.86, + "grad_norm": 5.195047327074397, + "learning_rate": 5.168748465916961e-07, + "loss": 0.4376, + "step": 5691 + }, + { + "epoch": 0.86, + "grad_norm": 8.181609040873978, + "learning_rate": 5.15793848491008e-07, + "loss": 0.4019, + "step": 5692 + }, + { + "epoch": 0.86, + "grad_norm": 11.950056359607112, + "learning_rate": 5.147139204999884e-07, + "loss": 0.414, + "step": 5693 + }, + { + "epoch": 0.86, + "grad_norm": 6.482197610920505, + "learning_rate": 5.136350628763498e-07, + "loss": 0.3904, + "step": 5694 + }, + { + "epoch": 0.86, + "grad_norm": 7.979316039537605, + "learning_rate": 5.125572758775505e-07, + "loss": 0.4267, + "step": 5695 + }, + { + "epoch": 0.86, + "grad_norm": 7.746346609027996, + "learning_rate": 5.114805597607942e-07, + "loss": 0.4052, + "step": 5696 + }, + { + "epoch": 0.86, + "grad_norm": 11.75101781866843, + "learning_rate": 5.104049147830281e-07, + "loss": 0.4353, + "step": 5697 + }, + { + "epoch": 0.86, + "grad_norm": 8.191257214619394, + "learning_rate": 5.093303412009426e-07, + "loss": 0.3919, + "step": 5698 + }, + { + "epoch": 0.86, + "grad_norm": 5.164935376401182, + "learning_rate": 5.082568392709758e-07, + "loss": 0.4103, + "step": 5699 + }, + { + "epoch": 0.86, + "grad_norm": 27.3851666585, + "learning_rate": 5.071844092493051e-07, + "loss": 0.3764, + "step": 5700 + }, + { + "epoch": 0.86, + "grad_norm": 7.973146889492861, + "learning_rate": 5.061130513918577e-07, + "loss": 0.461, + "step": 5701 + }, + { + "epoch": 0.86, + "grad_norm": 6.467336534638455, + "learning_rate": 5.050427659543006e-07, + "loss": 0.4437, + "step": 5702 + }, + { + "epoch": 0.86, + "grad_norm": 1.1466089920999127, + "learning_rate": 5.039735531920453e-07, + "loss": 0.5031, + "step": 5703 + }, + { + "epoch": 0.86, + "grad_norm": 9.949478849548411, + "learning_rate": 5.029054133602512e-07, + "loss": 0.4773, + "step": 5704 + }, + { + "epoch": 0.86, + "grad_norm": 8.72311772445394, + "learning_rate": 5.018383467138177e-07, + "loss": 0.4232, + "step": 5705 + }, + { + "epoch": 0.86, + "grad_norm": 8.80274400159251, + "learning_rate": 5.007723535073888e-07, + "loss": 0.4374, + "step": 5706 + }, + { + "epoch": 0.86, + "grad_norm": 6.8926207686313745, + "learning_rate": 4.997074339953528e-07, + "loss": 0.4657, + "step": 5707 + }, + { + "epoch": 0.86, + "grad_norm": 27.695058194721316, + "learning_rate": 4.986435884318413e-07, + "loss": 0.4128, + "step": 5708 + }, + { + "epoch": 0.86, + "grad_norm": 8.430352442889728, + "learning_rate": 4.975808170707314e-07, + "loss": 0.4633, + "step": 5709 + }, + { + "epoch": 0.86, + "grad_norm": 6.54661209309096, + "learning_rate": 4.965191201656417e-07, + "loss": 0.4296, + "step": 5710 + }, + { + "epoch": 0.86, + "grad_norm": 12.390199704193106, + "learning_rate": 4.954584979699356e-07, + "loss": 0.362, + "step": 5711 + }, + { + "epoch": 0.86, + "grad_norm": 79.10493654650362, + "learning_rate": 4.943989507367203e-07, + "loss": 0.4672, + "step": 5712 + }, + { + "epoch": 0.86, + "grad_norm": 37.57192897845998, + "learning_rate": 4.933404787188439e-07, + "loss": 0.3725, + "step": 5713 + }, + { + "epoch": 0.86, + "grad_norm": 8.227387249385961, + "learning_rate": 4.922830821689023e-07, + "loss": 0.3913, + "step": 5714 + }, + { + "epoch": 0.86, + "grad_norm": 8.530733664575141, + "learning_rate": 4.912267613392307e-07, + "loss": 0.4059, + "step": 5715 + }, + { + "epoch": 0.86, + "grad_norm": 12.794248794476655, + "learning_rate": 4.901715164819099e-07, + "loss": 0.4061, + "step": 5716 + }, + { + "epoch": 0.86, + "grad_norm": 15.931633078597596, + "learning_rate": 4.891173478487626e-07, + "loss": 0.3957, + "step": 5717 + }, + { + "epoch": 0.86, + "grad_norm": 12.677477570007873, + "learning_rate": 4.88064255691355e-07, + "loss": 0.4207, + "step": 5718 + }, + { + "epoch": 0.86, + "grad_norm": 6.055336745677284, + "learning_rate": 4.870122402609984e-07, + "loss": 0.3826, + "step": 5719 + }, + { + "epoch": 0.86, + "grad_norm": 15.305161504303738, + "learning_rate": 4.859613018087434e-07, + "loss": 0.4033, + "step": 5720 + }, + { + "epoch": 0.86, + "grad_norm": 10.299566564219118, + "learning_rate": 4.849114405853872e-07, + "loss": 0.4242, + "step": 5721 + }, + { + "epoch": 0.86, + "grad_norm": 8.560275551971317, + "learning_rate": 4.838626568414684e-07, + "loss": 0.4527, + "step": 5722 + }, + { + "epoch": 0.86, + "grad_norm": 11.34603021045852, + "learning_rate": 4.828149508272661e-07, + "loss": 0.3828, + "step": 5723 + }, + { + "epoch": 0.86, + "grad_norm": 1.2624194968561506, + "learning_rate": 4.817683227928083e-07, + "loss": 0.4778, + "step": 5724 + }, + { + "epoch": 0.86, + "grad_norm": 8.74923902802455, + "learning_rate": 4.807227729878583e-07, + "loss": 0.3932, + "step": 5725 + }, + { + "epoch": 0.86, + "grad_norm": 6.131562637663482, + "learning_rate": 4.796783016619295e-07, + "loss": 0.4531, + "step": 5726 + }, + { + "epoch": 0.86, + "grad_norm": 9.802769459316018, + "learning_rate": 4.786349090642711e-07, + "loss": 0.4297, + "step": 5727 + }, + { + "epoch": 0.86, + "grad_norm": 9.03984817667625, + "learning_rate": 4.775925954438798e-07, + "loss": 0.4012, + "step": 5728 + }, + { + "epoch": 0.86, + "grad_norm": 11.357935921487051, + "learning_rate": 4.7655136104949264e-07, + "loss": 0.4575, + "step": 5729 + }, + { + "epoch": 0.86, + "grad_norm": 8.05842934765025, + "learning_rate": 4.7551120612958803e-07, + "loss": 0.4345, + "step": 5730 + }, + { + "epoch": 0.86, + "grad_norm": 10.445141783594204, + "learning_rate": 4.7447213093239077e-07, + "loss": 0.328, + "step": 5731 + }, + { + "epoch": 0.86, + "grad_norm": 9.431366136855896, + "learning_rate": 4.7343413570586415e-07, + "loss": 0.4746, + "step": 5732 + }, + { + "epoch": 0.86, + "grad_norm": 9.549995759024563, + "learning_rate": 4.72397220697714e-07, + "loss": 0.3851, + "step": 5733 + }, + { + "epoch": 0.86, + "grad_norm": 7.1655662236003606, + "learning_rate": 4.7136138615539116e-07, + "loss": 0.4136, + "step": 5734 + }, + { + "epoch": 0.87, + "grad_norm": 10.755325717763183, + "learning_rate": 4.703266323260858e-07, + "loss": 0.4618, + "step": 5735 + }, + { + "epoch": 0.87, + "grad_norm": 16.90871352422481, + "learning_rate": 4.692929594567325e-07, + "loss": 0.4791, + "step": 5736 + }, + { + "epoch": 0.87, + "grad_norm": 18.251547897826573, + "learning_rate": 4.682603677940062e-07, + "loss": 0.4507, + "step": 5737 + }, + { + "epoch": 0.87, + "grad_norm": 12.050366827926503, + "learning_rate": 4.672288575843226e-07, + "loss": 0.3917, + "step": 5738 + }, + { + "epoch": 0.87, + "grad_norm": 6.534185912978316, + "learning_rate": 4.661984290738436e-07, + "loss": 0.3498, + "step": 5739 + }, + { + "epoch": 0.87, + "grad_norm": 9.068298797850447, + "learning_rate": 4.651690825084687e-07, + "loss": 0.4057, + "step": 5740 + }, + { + "epoch": 0.87, + "grad_norm": 9.56245523703993, + "learning_rate": 4.6414081813384193e-07, + "loss": 0.358, + "step": 5741 + }, + { + "epoch": 0.87, + "grad_norm": 11.15730740651848, + "learning_rate": 4.63113636195347e-07, + "loss": 0.4554, + "step": 5742 + }, + { + "epoch": 0.87, + "grad_norm": 6.9321532360970775, + "learning_rate": 4.6208753693810946e-07, + "loss": 0.3842, + "step": 5743 + }, + { + "epoch": 0.87, + "grad_norm": 61.35444380576177, + "learning_rate": 4.6106252060699965e-07, + "loss": 0.4292, + "step": 5744 + }, + { + "epoch": 0.87, + "grad_norm": 7.2972561593342276, + "learning_rate": 4.6003858744662564e-07, + "loss": 0.4482, + "step": 5745 + }, + { + "epoch": 0.87, + "grad_norm": 1.084508613070108, + "learning_rate": 4.5901573770133935e-07, + "loss": 0.5078, + "step": 5746 + }, + { + "epoch": 0.87, + "grad_norm": 7.7949200066925775, + "learning_rate": 4.5799397161523316e-07, + "loss": 0.4514, + "step": 5747 + }, + { + "epoch": 0.87, + "grad_norm": 13.620322355015267, + "learning_rate": 4.569732894321405e-07, + "loss": 0.3886, + "step": 5748 + }, + { + "epoch": 0.87, + "grad_norm": 5.945537482859955, + "learning_rate": 4.559536913956375e-07, + "loss": 0.4519, + "step": 5749 + }, + { + "epoch": 0.87, + "grad_norm": 17.524320864370292, + "learning_rate": 4.549351777490402e-07, + "loss": 0.4646, + "step": 5750 + }, + { + "epoch": 0.87, + "grad_norm": 7.610888430315229, + "learning_rate": 4.539177487354063e-07, + "loss": 0.4867, + "step": 5751 + }, + { + "epoch": 0.87, + "grad_norm": 7.1163939932395195, + "learning_rate": 4.529014045975355e-07, + "loss": 0.4037, + "step": 5752 + }, + { + "epoch": 0.87, + "grad_norm": 5.113875827951045, + "learning_rate": 4.5188614557796605e-07, + "loss": 0.3851, + "step": 5753 + }, + { + "epoch": 0.87, + "grad_norm": 5.385745072195075, + "learning_rate": 4.5087197191898137e-07, + "loss": 0.4305, + "step": 5754 + }, + { + "epoch": 0.87, + "grad_norm": 8.546124107484307, + "learning_rate": 4.498588838626017e-07, + "loss": 0.4615, + "step": 5755 + }, + { + "epoch": 0.87, + "grad_norm": 8.409475452055824, + "learning_rate": 4.488468816505914e-07, + "loss": 0.4435, + "step": 5756 + }, + { + "epoch": 0.87, + "grad_norm": 7.700102155429978, + "learning_rate": 4.4783596552445463e-07, + "loss": 0.3715, + "step": 5757 + }, + { + "epoch": 0.87, + "grad_norm": 31.044687684877545, + "learning_rate": 4.468261357254339e-07, + "loss": 0.3872, + "step": 5758 + }, + { + "epoch": 0.87, + "grad_norm": 5.734995082863173, + "learning_rate": 4.458173924945175e-07, + "loss": 0.4475, + "step": 5759 + }, + { + "epoch": 0.87, + "grad_norm": 15.902255381996751, + "learning_rate": 4.4480973607242907e-07, + "loss": 0.429, + "step": 5760 + }, + { + "epoch": 0.87, + "grad_norm": 1.1850681851243217, + "learning_rate": 4.4380316669963787e-07, + "loss": 0.5049, + "step": 5761 + }, + { + "epoch": 0.87, + "grad_norm": 11.563051576657616, + "learning_rate": 4.4279768461635066e-07, + "loss": 0.3898, + "step": 5762 + }, + { + "epoch": 0.87, + "grad_norm": 7.028345858062199, + "learning_rate": 4.417932900625144e-07, + "loss": 0.3766, + "step": 5763 + }, + { + "epoch": 0.87, + "grad_norm": 24.969022287012418, + "learning_rate": 4.4078998327781787e-07, + "loss": 0.4169, + "step": 5764 + }, + { + "epoch": 0.87, + "grad_norm": 9.348139426378493, + "learning_rate": 4.397877645016896e-07, + "loss": 0.4384, + "step": 5765 + }, + { + "epoch": 0.87, + "grad_norm": 15.008014953805748, + "learning_rate": 4.387866339732999e-07, + "loss": 0.4192, + "step": 5766 + }, + { + "epoch": 0.87, + "grad_norm": 7.782074008351293, + "learning_rate": 4.377865919315577e-07, + "loss": 0.4543, + "step": 5767 + }, + { + "epoch": 0.87, + "grad_norm": 12.992674157927052, + "learning_rate": 4.367876386151115e-07, + "loss": 0.4539, + "step": 5768 + }, + { + "epoch": 0.87, + "grad_norm": 15.700736873013351, + "learning_rate": 4.3578977426235346e-07, + "loss": 0.399, + "step": 5769 + }, + { + "epoch": 0.87, + "grad_norm": 5.565182926446994, + "learning_rate": 4.3479299911141195e-07, + "loss": 0.3982, + "step": 5770 + }, + { + "epoch": 0.87, + "grad_norm": 14.067938724301634, + "learning_rate": 4.337973134001588e-07, + "loss": 0.5046, + "step": 5771 + }, + { + "epoch": 0.87, + "grad_norm": 7.255399802036756, + "learning_rate": 4.328027173662025e-07, + "loss": 0.4484, + "step": 5772 + }, + { + "epoch": 0.87, + "grad_norm": 6.167122074006211, + "learning_rate": 4.318092112468941e-07, + "loss": 0.4761, + "step": 5773 + }, + { + "epoch": 0.87, + "grad_norm": 8.019541635289777, + "learning_rate": 4.308167952793235e-07, + "loss": 0.5185, + "step": 5774 + }, + { + "epoch": 0.87, + "grad_norm": 7.127206797273236, + "learning_rate": 4.29825469700319e-07, + "loss": 0.4355, + "step": 5775 + }, + { + "epoch": 0.87, + "grad_norm": 8.192695657851663, + "learning_rate": 4.2883523474645307e-07, + "loss": 0.4329, + "step": 5776 + }, + { + "epoch": 0.87, + "grad_norm": 25.03825766016349, + "learning_rate": 4.278460906540333e-07, + "loss": 0.3992, + "step": 5777 + }, + { + "epoch": 0.87, + "grad_norm": 12.467845012877225, + "learning_rate": 4.2685803765910807e-07, + "loss": 0.4476, + "step": 5778 + }, + { + "epoch": 0.87, + "grad_norm": 8.656219505642666, + "learning_rate": 4.2587107599746825e-07, + "loss": 0.4481, + "step": 5779 + }, + { + "epoch": 0.87, + "grad_norm": 7.57832767715027, + "learning_rate": 4.248852059046399e-07, + "loss": 0.4296, + "step": 5780 + }, + { + "epoch": 0.87, + "grad_norm": 6.059078139845612, + "learning_rate": 4.239004276158926e-07, + "loss": 0.4153, + "step": 5781 + }, + { + "epoch": 0.87, + "grad_norm": 6.0779925181147565, + "learning_rate": 4.229167413662327e-07, + "loss": 0.4088, + "step": 5782 + }, + { + "epoch": 0.87, + "grad_norm": 4.622199452157603, + "learning_rate": 4.2193414739040593e-07, + "loss": 0.3877, + "step": 5783 + }, + { + "epoch": 0.87, + "grad_norm": 6.694962440302269, + "learning_rate": 4.209526459229002e-07, + "loss": 0.3568, + "step": 5784 + }, + { + "epoch": 0.87, + "grad_norm": 8.895018591157557, + "learning_rate": 4.199722371979398e-07, + "loss": 0.4204, + "step": 5785 + }, + { + "epoch": 0.87, + "grad_norm": 5.850017061556132, + "learning_rate": 4.1899292144948923e-07, + "loss": 0.3747, + "step": 5786 + }, + { + "epoch": 0.87, + "grad_norm": 10.219309258328249, + "learning_rate": 4.1801469891125156e-07, + "loss": 0.447, + "step": 5787 + }, + { + "epoch": 0.87, + "grad_norm": 8.101422102388584, + "learning_rate": 4.17037569816669e-07, + "loss": 0.4256, + "step": 5788 + }, + { + "epoch": 0.87, + "grad_norm": 6.687046241994253, + "learning_rate": 4.160615343989255e-07, + "loss": 0.4552, + "step": 5789 + }, + { + "epoch": 0.87, + "grad_norm": 7.025507936986526, + "learning_rate": 4.150865928909409e-07, + "loss": 0.4493, + "step": 5790 + }, + { + "epoch": 0.87, + "grad_norm": 7.3397577390930815, + "learning_rate": 4.141127455253729e-07, + "loss": 0.4176, + "step": 5791 + }, + { + "epoch": 0.87, + "grad_norm": 7.497353435800689, + "learning_rate": 4.1313999253462356e-07, + "loss": 0.5068, + "step": 5792 + }, + { + "epoch": 0.87, + "grad_norm": 16.3698569289533, + "learning_rate": 4.1216833415082815e-07, + "loss": 0.4403, + "step": 5793 + }, + { + "epoch": 0.87, + "grad_norm": 8.28682631783735, + "learning_rate": 4.1119777060586397e-07, + "loss": 0.4417, + "step": 5794 + }, + { + "epoch": 0.87, + "grad_norm": 11.377348347937348, + "learning_rate": 4.1022830213134577e-07, + "loss": 0.417, + "step": 5795 + }, + { + "epoch": 0.87, + "grad_norm": 6.180122450446617, + "learning_rate": 4.092599289586263e-07, + "loss": 0.4661, + "step": 5796 + }, + { + "epoch": 0.87, + "grad_norm": 17.800636002196796, + "learning_rate": 4.082926513187996e-07, + "loss": 0.3614, + "step": 5797 + }, + { + "epoch": 0.87, + "grad_norm": 1.0663976759232696, + "learning_rate": 4.0732646944269593e-07, + "loss": 0.5574, + "step": 5798 + }, + { + "epoch": 0.87, + "grad_norm": 8.779870943763425, + "learning_rate": 4.063613835608837e-07, + "loss": 0.4356, + "step": 5799 + }, + { + "epoch": 0.87, + "grad_norm": 23.986355881970134, + "learning_rate": 4.053973939036715e-07, + "loss": 0.4697, + "step": 5800 + }, + { + "epoch": 0.87, + "grad_norm": 5.972932561980954, + "learning_rate": 4.044345007011047e-07, + "loss": 0.3772, + "step": 5801 + }, + { + "epoch": 0.88, + "grad_norm": 9.461362181638089, + "learning_rate": 4.0347270418296945e-07, + "loss": 0.416, + "step": 5802 + }, + { + "epoch": 0.88, + "grad_norm": 7.183477579521882, + "learning_rate": 4.025120045787867e-07, + "loss": 0.3976, + "step": 5803 + }, + { + "epoch": 0.88, + "grad_norm": 8.308461113007686, + "learning_rate": 4.0155240211781966e-07, + "loss": 0.4567, + "step": 5804 + }, + { + "epoch": 0.88, + "grad_norm": 10.092301587722812, + "learning_rate": 4.0059389702906627e-07, + "loss": 0.4107, + "step": 5805 + }, + { + "epoch": 0.88, + "grad_norm": 7.865731334096125, + "learning_rate": 3.99636489541263e-07, + "loss": 0.3818, + "step": 5806 + }, + { + "epoch": 0.88, + "grad_norm": 7.29541235815272, + "learning_rate": 3.986801798828871e-07, + "loss": 0.4302, + "step": 5807 + }, + { + "epoch": 0.88, + "grad_norm": 26.220178910674, + "learning_rate": 3.9772496828215214e-07, + "loss": 0.4538, + "step": 5808 + }, + { + "epoch": 0.88, + "grad_norm": 5.8420321802545665, + "learning_rate": 3.9677085496700796e-07, + "loss": 0.4201, + "step": 5809 + }, + { + "epoch": 0.88, + "grad_norm": 9.53346726518796, + "learning_rate": 3.958178401651447e-07, + "loss": 0.4284, + "step": 5810 + }, + { + "epoch": 0.88, + "grad_norm": 7.320379191639871, + "learning_rate": 3.9486592410398925e-07, + "loss": 0.4187, + "step": 5811 + }, + { + "epoch": 0.88, + "grad_norm": 19.72833264662245, + "learning_rate": 3.9391510701070725e-07, + "loss": 0.4125, + "step": 5812 + }, + { + "epoch": 0.88, + "grad_norm": 8.502274491886736, + "learning_rate": 3.9296538911219984e-07, + "loss": 0.4858, + "step": 5813 + }, + { + "epoch": 0.88, + "grad_norm": 14.742654346493456, + "learning_rate": 3.9201677063510967e-07, + "loss": 0.4259, + "step": 5814 + }, + { + "epoch": 0.88, + "grad_norm": 39.58727536396428, + "learning_rate": 3.91069251805814e-07, + "loss": 0.4317, + "step": 5815 + }, + { + "epoch": 0.88, + "grad_norm": 9.328524105195852, + "learning_rate": 3.9012283285042696e-07, + "loss": 0.4619, + "step": 5816 + }, + { + "epoch": 0.88, + "grad_norm": 10.450612251860516, + "learning_rate": 3.8917751399480396e-07, + "loss": 0.4735, + "step": 5817 + }, + { + "epoch": 0.88, + "grad_norm": 16.320421472311793, + "learning_rate": 3.882332954645335e-07, + "loss": 0.4579, + "step": 5818 + }, + { + "epoch": 0.88, + "grad_norm": 5.970528836564768, + "learning_rate": 3.872901774849458e-07, + "loss": 0.3822, + "step": 5819 + }, + { + "epoch": 0.88, + "grad_norm": 9.942055003037975, + "learning_rate": 3.863481602811048e-07, + "loss": 0.4, + "step": 5820 + }, + { + "epoch": 0.88, + "grad_norm": 1.2167118776089723, + "learning_rate": 3.854072440778134e-07, + "loss": 0.5192, + "step": 5821 + }, + { + "epoch": 0.88, + "grad_norm": 7.889310368873597, + "learning_rate": 3.8446742909961197e-07, + "loss": 0.4161, + "step": 5822 + }, + { + "epoch": 0.88, + "grad_norm": 11.383990877006225, + "learning_rate": 3.8352871557077674e-07, + "loss": 0.4948, + "step": 5823 + }, + { + "epoch": 0.88, + "grad_norm": 1.2002827312773676, + "learning_rate": 3.8259110371532347e-07, + "loss": 0.5312, + "step": 5824 + }, + { + "epoch": 0.88, + "grad_norm": 12.2864149092543, + "learning_rate": 3.8165459375700263e-07, + "loss": 0.4097, + "step": 5825 + }, + { + "epoch": 0.88, + "grad_norm": 1.2586840835496809, + "learning_rate": 3.807191859193021e-07, + "loss": 0.5256, + "step": 5826 + }, + { + "epoch": 0.88, + "grad_norm": 8.218857571189341, + "learning_rate": 3.797848804254489e-07, + "loss": 0.371, + "step": 5827 + }, + { + "epoch": 0.88, + "grad_norm": 7.718419251602715, + "learning_rate": 3.788516774984041e-07, + "loss": 0.4883, + "step": 5828 + }, + { + "epoch": 0.88, + "grad_norm": 13.247463609512966, + "learning_rate": 3.7791957736086794e-07, + "loss": 0.4283, + "step": 5829 + }, + { + "epoch": 0.88, + "grad_norm": 10.450029385520013, + "learning_rate": 3.769885802352768e-07, + "loss": 0.408, + "step": 5830 + }, + { + "epoch": 0.88, + "grad_norm": 15.62022109198703, + "learning_rate": 3.760586863438015e-07, + "loss": 0.5181, + "step": 5831 + }, + { + "epoch": 0.88, + "grad_norm": 6.420797661763059, + "learning_rate": 3.751298959083538e-07, + "loss": 0.412, + "step": 5832 + }, + { + "epoch": 0.88, + "grad_norm": 1.478791512088389, + "learning_rate": 3.742022091505798e-07, + "loss": 0.5118, + "step": 5833 + }, + { + "epoch": 0.88, + "grad_norm": 5.649552586048902, + "learning_rate": 3.7327562629186133e-07, + "loss": 0.3923, + "step": 5834 + }, + { + "epoch": 0.88, + "grad_norm": 9.157657199957873, + "learning_rate": 3.723501475533187e-07, + "loss": 0.4114, + "step": 5835 + }, + { + "epoch": 0.88, + "grad_norm": 7.028315649281295, + "learning_rate": 3.714257731558063e-07, + "loss": 0.3404, + "step": 5836 + }, + { + "epoch": 0.88, + "grad_norm": 5.778623088036855, + "learning_rate": 3.705025033199189e-07, + "loss": 0.3864, + "step": 5837 + }, + { + "epoch": 0.88, + "grad_norm": 7.520110896763575, + "learning_rate": 3.695803382659835e-07, + "loss": 0.4551, + "step": 5838 + }, + { + "epoch": 0.88, + "grad_norm": 7.23156189014286, + "learning_rate": 3.6865927821406687e-07, + "loss": 0.3906, + "step": 5839 + }, + { + "epoch": 0.88, + "grad_norm": 7.40454729061486, + "learning_rate": 3.677393233839693e-07, + "loss": 0.3828, + "step": 5840 + }, + { + "epoch": 0.88, + "grad_norm": 10.267521077205595, + "learning_rate": 3.668204739952286e-07, + "loss": 0.4079, + "step": 5841 + }, + { + "epoch": 0.88, + "grad_norm": 7.741427543154004, + "learning_rate": 3.6590273026711975e-07, + "loss": 0.4377, + "step": 5842 + }, + { + "epoch": 0.88, + "grad_norm": 17.163739083812075, + "learning_rate": 3.649860924186521e-07, + "loss": 0.4158, + "step": 5843 + }, + { + "epoch": 0.88, + "grad_norm": 10.04906459931671, + "learning_rate": 3.640705606685718e-07, + "loss": 0.5255, + "step": 5844 + }, + { + "epoch": 0.88, + "grad_norm": 6.795166239455593, + "learning_rate": 3.6315613523536185e-07, + "loss": 0.5055, + "step": 5845 + }, + { + "epoch": 0.88, + "grad_norm": 7.795142062949072, + "learning_rate": 3.622428163372382e-07, + "loss": 0.461, + "step": 5846 + }, + { + "epoch": 0.88, + "grad_norm": 41.59665340527445, + "learning_rate": 3.6133060419215825e-07, + "loss": 0.4074, + "step": 5847 + }, + { + "epoch": 0.88, + "grad_norm": 42.333353518364206, + "learning_rate": 3.6041949901780895e-07, + "loss": 0.4329, + "step": 5848 + }, + { + "epoch": 0.88, + "grad_norm": 6.7609524991011565, + "learning_rate": 3.5950950103161906e-07, + "loss": 0.4373, + "step": 5849 + }, + { + "epoch": 0.88, + "grad_norm": 12.048411841711832, + "learning_rate": 3.5860061045074836e-07, + "loss": 0.4571, + "step": 5850 + }, + { + "epoch": 0.88, + "grad_norm": 13.386935904670116, + "learning_rate": 3.5769282749209375e-07, + "loss": 0.4024, + "step": 5851 + }, + { + "epoch": 0.88, + "grad_norm": 6.21681420818885, + "learning_rate": 3.5678615237229043e-07, + "loss": 0.4239, + "step": 5852 + }, + { + "epoch": 0.88, + "grad_norm": 9.144338260682893, + "learning_rate": 3.558805853077052e-07, + "loss": 0.5091, + "step": 5853 + }, + { + "epoch": 0.88, + "grad_norm": 18.794005908617386, + "learning_rate": 3.5497612651444355e-07, + "loss": 0.4535, + "step": 5854 + }, + { + "epoch": 0.88, + "grad_norm": 7.13700831213853, + "learning_rate": 3.540727762083451e-07, + "loss": 0.3585, + "step": 5855 + }, + { + "epoch": 0.88, + "grad_norm": 9.497479375444295, + "learning_rate": 3.531705346049846e-07, + "loss": 0.3891, + "step": 5856 + }, + { + "epoch": 0.88, + "grad_norm": 7.241630777726071, + "learning_rate": 3.522694019196732e-07, + "loss": 0.4747, + "step": 5857 + }, + { + "epoch": 0.88, + "grad_norm": 8.436203674512253, + "learning_rate": 3.5136937836745546e-07, + "loss": 0.3904, + "step": 5858 + }, + { + "epoch": 0.88, + "grad_norm": 10.43799922507736, + "learning_rate": 3.504704641631146e-07, + "loss": 0.4385, + "step": 5859 + }, + { + "epoch": 0.88, + "grad_norm": 22.965251743508155, + "learning_rate": 3.495726595211668e-07, + "loss": 0.437, + "step": 5860 + }, + { + "epoch": 0.88, + "grad_norm": 8.362678855286216, + "learning_rate": 3.4867596465586295e-07, + "loss": 0.4571, + "step": 5861 + }, + { + "epoch": 0.88, + "grad_norm": 12.359486268755676, + "learning_rate": 3.477803797811913e-07, + "loss": 0.3913, + "step": 5862 + }, + { + "epoch": 0.88, + "grad_norm": 6.048281823987576, + "learning_rate": 3.4688590511087304e-07, + "loss": 0.3852, + "step": 5863 + }, + { + "epoch": 0.88, + "grad_norm": 11.87739033110838, + "learning_rate": 3.459925408583664e-07, + "loss": 0.4177, + "step": 5864 + }, + { + "epoch": 0.88, + "grad_norm": 12.665808506976699, + "learning_rate": 3.451002872368625e-07, + "loss": 0.4198, + "step": 5865 + }, + { + "epoch": 0.88, + "grad_norm": 6.8026966861875335, + "learning_rate": 3.442091444592888e-07, + "loss": 0.3369, + "step": 5866 + }, + { + "epoch": 0.88, + "grad_norm": 5.023375766446867, + "learning_rate": 3.433191127383079e-07, + "loss": 0.3314, + "step": 5867 + }, + { + "epoch": 0.89, + "grad_norm": 22.176753973495885, + "learning_rate": 3.42430192286316e-07, + "loss": 0.4332, + "step": 5868 + }, + { + "epoch": 0.89, + "grad_norm": 5.801643402467804, + "learning_rate": 3.4154238331544575e-07, + "loss": 0.4571, + "step": 5869 + }, + { + "epoch": 0.89, + "grad_norm": 9.011212173278045, + "learning_rate": 3.4065568603756263e-07, + "loss": 0.365, + "step": 5870 + }, + { + "epoch": 0.89, + "grad_norm": 6.633075045903737, + "learning_rate": 3.397701006642673e-07, + "loss": 0.4836, + "step": 5871 + }, + { + "epoch": 0.89, + "grad_norm": 21.004804027129715, + "learning_rate": 3.388856274068975e-07, + "loss": 0.3793, + "step": 5872 + }, + { + "epoch": 0.89, + "grad_norm": 8.469106425141895, + "learning_rate": 3.3800226647652146e-07, + "loss": 0.4306, + "step": 5873 + }, + { + "epoch": 0.89, + "grad_norm": 29.074676700270786, + "learning_rate": 3.371200180839462e-07, + "loss": 0.4601, + "step": 5874 + }, + { + "epoch": 0.89, + "grad_norm": 9.521713636465842, + "learning_rate": 3.3623888243971113e-07, + "loss": 0.475, + "step": 5875 + }, + { + "epoch": 0.89, + "grad_norm": 11.557612339071632, + "learning_rate": 3.35358859754088e-07, + "loss": 0.3847, + "step": 5876 + }, + { + "epoch": 0.89, + "grad_norm": 5.665560747020367, + "learning_rate": 3.344799502370877e-07, + "loss": 0.4702, + "step": 5877 + }, + { + "epoch": 0.89, + "grad_norm": 13.80683927724409, + "learning_rate": 3.3360215409845133e-07, + "loss": 0.4031, + "step": 5878 + }, + { + "epoch": 0.89, + "grad_norm": 10.587454133285425, + "learning_rate": 3.327254715476569e-07, + "loss": 0.4477, + "step": 5879 + }, + { + "epoch": 0.89, + "grad_norm": 10.211162935467746, + "learning_rate": 3.3184990279391536e-07, + "loss": 0.4184, + "step": 5880 + }, + { + "epoch": 0.89, + "grad_norm": 8.108456532153015, + "learning_rate": 3.309754480461713e-07, + "loss": 0.4231, + "step": 5881 + }, + { + "epoch": 0.89, + "grad_norm": 7.923728186089213, + "learning_rate": 3.3010210751310555e-07, + "loss": 0.4775, + "step": 5882 + }, + { + "epoch": 0.89, + "grad_norm": 7.789054789791504, + "learning_rate": 3.2922988140313086e-07, + "loss": 0.4642, + "step": 5883 + }, + { + "epoch": 0.89, + "grad_norm": 8.086120285412205, + "learning_rate": 3.283587699243962e-07, + "loss": 0.4471, + "step": 5884 + }, + { + "epoch": 0.89, + "grad_norm": 7.684113000506246, + "learning_rate": 3.2748877328478314e-07, + "loss": 0.4535, + "step": 5885 + }, + { + "epoch": 0.89, + "grad_norm": 7.373516612576686, + "learning_rate": 3.266198916919061e-07, + "loss": 0.4123, + "step": 5886 + }, + { + "epoch": 0.89, + "grad_norm": 7.157874962369352, + "learning_rate": 3.2575212535311596e-07, + "loss": 0.4352, + "step": 5887 + }, + { + "epoch": 0.89, + "grad_norm": 7.07206604021634, + "learning_rate": 3.248854744754953e-07, + "loss": 0.4601, + "step": 5888 + }, + { + "epoch": 0.89, + "grad_norm": 10.371191084921188, + "learning_rate": 3.2401993926586326e-07, + "loss": 0.3942, + "step": 5889 + }, + { + "epoch": 0.89, + "grad_norm": 15.629838693153198, + "learning_rate": 3.2315551993076955e-07, + "loss": 0.4213, + "step": 5890 + }, + { + "epoch": 0.89, + "grad_norm": 11.473874375715154, + "learning_rate": 3.222922166764991e-07, + "loss": 0.5146, + "step": 5891 + }, + { + "epoch": 0.89, + "grad_norm": 1.3413898055516917, + "learning_rate": 3.2143002970907e-07, + "loss": 0.5721, + "step": 5892 + }, + { + "epoch": 0.89, + "grad_norm": 8.48045174682581, + "learning_rate": 3.2056895923423426e-07, + "loss": 0.4209, + "step": 5893 + }, + { + "epoch": 0.89, + "grad_norm": 13.150911596863523, + "learning_rate": 3.197090054574786e-07, + "loss": 0.3993, + "step": 5894 + }, + { + "epoch": 0.89, + "grad_norm": 7.562272341104438, + "learning_rate": 3.1885016858402175e-07, + "loss": 0.4022, + "step": 5895 + }, + { + "epoch": 0.89, + "grad_norm": 9.168984802124054, + "learning_rate": 3.179924488188146e-07, + "loss": 0.4049, + "step": 5896 + }, + { + "epoch": 0.89, + "grad_norm": 11.696890917737706, + "learning_rate": 3.1713584636654584e-07, + "loss": 0.4386, + "step": 5897 + }, + { + "epoch": 0.89, + "grad_norm": 13.105419591031598, + "learning_rate": 3.1628036143163344e-07, + "loss": 0.4056, + "step": 5898 + }, + { + "epoch": 0.89, + "grad_norm": 11.736275011397376, + "learning_rate": 3.154259942182292e-07, + "loss": 0.3967, + "step": 5899 + }, + { + "epoch": 0.89, + "grad_norm": 5.9574506291647955, + "learning_rate": 3.1457274493022105e-07, + "loss": 0.3852, + "step": 5900 + }, + { + "epoch": 0.89, + "grad_norm": 8.479543251059091, + "learning_rate": 3.1372061377122565e-07, + "loss": 0.4248, + "step": 5901 + }, + { + "epoch": 0.89, + "grad_norm": 7.270029517196845, + "learning_rate": 3.128696009445986e-07, + "loss": 0.437, + "step": 5902 + }, + { + "epoch": 0.89, + "grad_norm": 8.739981247843087, + "learning_rate": 3.120197066534231e-07, + "loss": 0.4753, + "step": 5903 + }, + { + "epoch": 0.89, + "grad_norm": 7.580745683221518, + "learning_rate": 3.1117093110051787e-07, + "loss": 0.3802, + "step": 5904 + }, + { + "epoch": 0.89, + "grad_norm": 8.76151752955906, + "learning_rate": 3.103232744884349e-07, + "loss": 0.3253, + "step": 5905 + }, + { + "epoch": 0.89, + "grad_norm": 6.660374946957707, + "learning_rate": 3.094767370194579e-07, + "loss": 0.4542, + "step": 5906 + }, + { + "epoch": 0.89, + "grad_norm": 8.436438158758204, + "learning_rate": 3.0863131889560583e-07, + "loss": 0.5159, + "step": 5907 + }, + { + "epoch": 0.89, + "grad_norm": 9.838535856965322, + "learning_rate": 3.077870203186284e-07, + "loss": 0.4172, + "step": 5908 + }, + { + "epoch": 0.89, + "grad_norm": 7.708910380159401, + "learning_rate": 3.069438414900078e-07, + "loss": 0.4152, + "step": 5909 + }, + { + "epoch": 0.89, + "grad_norm": 1.0947214880228184, + "learning_rate": 3.0610178261096134e-07, + "loss": 0.5209, + "step": 5910 + }, + { + "epoch": 0.89, + "grad_norm": 6.310474423995987, + "learning_rate": 3.0526084388243613e-07, + "loss": 0.3925, + "step": 5911 + }, + { + "epoch": 0.89, + "grad_norm": 6.892581751437814, + "learning_rate": 3.0442102550511553e-07, + "loss": 0.3509, + "step": 5912 + }, + { + "epoch": 0.89, + "grad_norm": 5.836463588081054, + "learning_rate": 3.0358232767941253e-07, + "loss": 0.4146, + "step": 5913 + }, + { + "epoch": 0.89, + "grad_norm": 1.26926936529437, + "learning_rate": 3.0274475060547423e-07, + "loss": 0.5146, + "step": 5914 + }, + { + "epoch": 0.89, + "grad_norm": 10.693172961761073, + "learning_rate": 3.019082944831786e-07, + "loss": 0.365, + "step": 5915 + }, + { + "epoch": 0.89, + "grad_norm": 6.3409964378446535, + "learning_rate": 3.0107295951213753e-07, + "loss": 0.4116, + "step": 5916 + }, + { + "epoch": 0.89, + "grad_norm": 25.805125235166113, + "learning_rate": 3.0023874589169667e-07, + "loss": 0.5167, + "step": 5917 + }, + { + "epoch": 0.89, + "grad_norm": 6.493040299453499, + "learning_rate": 2.994056538209311e-07, + "loss": 0.4228, + "step": 5918 + }, + { + "epoch": 0.89, + "grad_norm": 4.006246393576023, + "learning_rate": 2.985736834986497e-07, + "loss": 0.4168, + "step": 5919 + }, + { + "epoch": 0.89, + "grad_norm": 5.208133491756821, + "learning_rate": 2.977428351233941e-07, + "loss": 0.4235, + "step": 5920 + }, + { + "epoch": 0.89, + "grad_norm": 7.43967447959776, + "learning_rate": 2.969131088934374e-07, + "loss": 0.4695, + "step": 5921 + }, + { + "epoch": 0.89, + "grad_norm": 4.946178976018677, + "learning_rate": 2.9608450500678566e-07, + "loss": 0.4563, + "step": 5922 + }, + { + "epoch": 0.89, + "grad_norm": 18.403417902225954, + "learning_rate": 2.9525702366117683e-07, + "loss": 0.4507, + "step": 5923 + }, + { + "epoch": 0.89, + "grad_norm": 7.459395665680696, + "learning_rate": 2.9443066505407956e-07, + "loss": 0.4302, + "step": 5924 + }, + { + "epoch": 0.89, + "grad_norm": 7.093963549298736, + "learning_rate": 2.936054293826973e-07, + "loss": 0.4015, + "step": 5925 + }, + { + "epoch": 0.89, + "grad_norm": 5.937180722602683, + "learning_rate": 2.927813168439636e-07, + "loss": 0.4148, + "step": 5926 + }, + { + "epoch": 0.89, + "grad_norm": 37.43256228148086, + "learning_rate": 2.9195832763454393e-07, + "loss": 0.42, + "step": 5927 + }, + { + "epoch": 0.89, + "grad_norm": 8.166699629278286, + "learning_rate": 2.9113646195083723e-07, + "loss": 0.3557, + "step": 5928 + }, + { + "epoch": 0.89, + "grad_norm": 5.963191051335659, + "learning_rate": 2.903157199889711e-07, + "loss": 0.3823, + "step": 5929 + }, + { + "epoch": 0.89, + "grad_norm": 4.885475335171903, + "learning_rate": 2.8949610194481e-07, + "loss": 0.4291, + "step": 5930 + }, + { + "epoch": 0.89, + "grad_norm": 37.574632057038095, + "learning_rate": 2.886776080139447e-07, + "loss": 0.4378, + "step": 5931 + }, + { + "epoch": 0.89, + "grad_norm": 5.581847989615499, + "learning_rate": 2.8786023839170287e-07, + "loss": 0.461, + "step": 5932 + }, + { + "epoch": 0.89, + "grad_norm": 8.111300579536097, + "learning_rate": 2.8704399327314005e-07, + "loss": 0.3309, + "step": 5933 + }, + { + "epoch": 0.9, + "grad_norm": 11.923693285473334, + "learning_rate": 2.8622887285304437e-07, + "loss": 0.4058, + "step": 5934 + }, + { + "epoch": 0.9, + "grad_norm": 7.8306779466319245, + "learning_rate": 2.8541487732593687e-07, + "loss": 0.4386, + "step": 5935 + }, + { + "epoch": 0.9, + "grad_norm": 9.658779694308569, + "learning_rate": 2.8460200688606885e-07, + "loss": 0.4602, + "step": 5936 + }, + { + "epoch": 0.9, + "grad_norm": 31.434862952894317, + "learning_rate": 2.8379026172742297e-07, + "loss": 0.3659, + "step": 5937 + }, + { + "epoch": 0.9, + "grad_norm": 9.751107030673094, + "learning_rate": 2.8297964204371477e-07, + "loss": 0.4335, + "step": 5938 + }, + { + "epoch": 0.9, + "grad_norm": 9.931064161192655, + "learning_rate": 2.8217014802838894e-07, + "loss": 0.4217, + "step": 5939 + }, + { + "epoch": 0.9, + "grad_norm": 11.830491024998155, + "learning_rate": 2.813617798746243e-07, + "loss": 0.4012, + "step": 5940 + }, + { + "epoch": 0.9, + "grad_norm": 7.358509556948288, + "learning_rate": 2.805545377753288e-07, + "loss": 0.4796, + "step": 5941 + }, + { + "epoch": 0.9, + "grad_norm": 6.6338949307507855, + "learning_rate": 2.7974842192314277e-07, + "loss": 0.4229, + "step": 5942 + }, + { + "epoch": 0.9, + "grad_norm": 6.914309459169457, + "learning_rate": 2.7894343251043787e-07, + "loss": 0.4117, + "step": 5943 + }, + { + "epoch": 0.9, + "grad_norm": 11.015595362723658, + "learning_rate": 2.7813956972931543e-07, + "loss": 0.4404, + "step": 5944 + }, + { + "epoch": 0.9, + "grad_norm": 4.901879425882384, + "learning_rate": 2.7733683377161035e-07, + "loss": 0.4406, + "step": 5945 + }, + { + "epoch": 0.9, + "grad_norm": 9.042872878868316, + "learning_rate": 2.7653522482888606e-07, + "loss": 0.3736, + "step": 5946 + }, + { + "epoch": 0.9, + "grad_norm": 8.731108451343736, + "learning_rate": 2.75734743092439e-07, + "loss": 0.4247, + "step": 5947 + }, + { + "epoch": 0.9, + "grad_norm": 11.784134408659243, + "learning_rate": 2.749353887532963e-07, + "loss": 0.4717, + "step": 5948 + }, + { + "epoch": 0.9, + "grad_norm": 12.855547547684075, + "learning_rate": 2.7413716200221485e-07, + "loss": 0.4025, + "step": 5949 + }, + { + "epoch": 0.9, + "grad_norm": 7.690992590756652, + "learning_rate": 2.7334006302968395e-07, + "loss": 0.4157, + "step": 5950 + }, + { + "epoch": 0.9, + "grad_norm": 11.447659208624662, + "learning_rate": 2.725440920259215e-07, + "loss": 0.4379, + "step": 5951 + }, + { + "epoch": 0.9, + "grad_norm": 5.657507798853742, + "learning_rate": 2.7174924918087987e-07, + "loss": 0.4562, + "step": 5952 + }, + { + "epoch": 0.9, + "grad_norm": 41.18739517789081, + "learning_rate": 2.709555346842391e-07, + "loss": 0.3168, + "step": 5953 + }, + { + "epoch": 0.9, + "grad_norm": 9.825166919859997, + "learning_rate": 2.701629487254104e-07, + "loss": 0.449, + "step": 5954 + }, + { + "epoch": 0.9, + "grad_norm": 4.82649782666973, + "learning_rate": 2.6937149149353803e-07, + "loss": 0.4019, + "step": 5955 + }, + { + "epoch": 0.9, + "grad_norm": 11.066950538402935, + "learning_rate": 2.685811631774926e-07, + "loss": 0.4304, + "step": 5956 + }, + { + "epoch": 0.9, + "grad_norm": 5.938553033151722, + "learning_rate": 2.677919639658805e-07, + "loss": 0.482, + "step": 5957 + }, + { + "epoch": 0.9, + "grad_norm": 50.26138556187903, + "learning_rate": 2.670038940470349e-07, + "loss": 0.4394, + "step": 5958 + }, + { + "epoch": 0.9, + "grad_norm": 6.925414078694652, + "learning_rate": 2.662169536090192e-07, + "loss": 0.3938, + "step": 5959 + }, + { + "epoch": 0.9, + "grad_norm": 6.700971743360954, + "learning_rate": 2.654311428396311e-07, + "loss": 0.3995, + "step": 5960 + }, + { + "epoch": 0.9, + "grad_norm": 56.33366616847569, + "learning_rate": 2.6464646192639496e-07, + "loss": 0.4386, + "step": 5961 + }, + { + "epoch": 0.9, + "grad_norm": 10.825047504067705, + "learning_rate": 2.6386291105656656e-07, + "loss": 0.455, + "step": 5962 + }, + { + "epoch": 0.9, + "grad_norm": 9.370317165418127, + "learning_rate": 2.6308049041713247e-07, + "loss": 0.4483, + "step": 5963 + }, + { + "epoch": 0.9, + "grad_norm": 11.018362355876949, + "learning_rate": 2.6229920019480934e-07, + "loss": 0.466, + "step": 5964 + }, + { + "epoch": 0.9, + "grad_norm": 7.472306304143079, + "learning_rate": 2.615190405760443e-07, + "loss": 0.4138, + "step": 5965 + }, + { + "epoch": 0.9, + "grad_norm": 7.048097489540707, + "learning_rate": 2.607400117470138e-07, + "loss": 0.458, + "step": 5966 + }, + { + "epoch": 0.9, + "grad_norm": 6.6609356250245915, + "learning_rate": 2.599621138936265e-07, + "loss": 0.4238, + "step": 5967 + }, + { + "epoch": 0.9, + "grad_norm": 8.057106471833475, + "learning_rate": 2.5918534720151823e-07, + "loss": 0.4579, + "step": 5968 + }, + { + "epoch": 0.9, + "grad_norm": 8.213143784548258, + "learning_rate": 2.584097118560569e-07, + "loss": 0.448, + "step": 5969 + }, + { + "epoch": 0.9, + "grad_norm": 8.323593317303844, + "learning_rate": 2.5763520804234e-07, + "loss": 0.4208, + "step": 5970 + }, + { + "epoch": 0.9, + "grad_norm": 1.2344997344719222, + "learning_rate": 2.5686183594519467e-07, + "loss": 0.5261, + "step": 5971 + }, + { + "epoch": 0.9, + "grad_norm": 6.740790511037284, + "learning_rate": 2.560895957491788e-07, + "loss": 0.5076, + "step": 5972 + }, + { + "epoch": 0.9, + "grad_norm": 8.430463530071608, + "learning_rate": 2.5531848763857894e-07, + "loss": 0.4736, + "step": 5973 + }, + { + "epoch": 0.9, + "grad_norm": 9.23844208113187, + "learning_rate": 2.545485117974111e-07, + "loss": 0.3933, + "step": 5974 + }, + { + "epoch": 0.9, + "grad_norm": 1.0811573669767824, + "learning_rate": 2.537796684094246e-07, + "loss": 0.5218, + "step": 5975 + }, + { + "epoch": 0.9, + "grad_norm": 10.58761368366624, + "learning_rate": 2.530119576580936e-07, + "loss": 0.4835, + "step": 5976 + }, + { + "epoch": 0.9, + "grad_norm": 17.97062323608271, + "learning_rate": 2.522453797266261e-07, + "loss": 0.4003, + "step": 5977 + }, + { + "epoch": 0.9, + "grad_norm": 56.48831233462823, + "learning_rate": 2.514799347979574e-07, + "loss": 0.4126, + "step": 5978 + }, + { + "epoch": 0.9, + "grad_norm": 5.660000643558496, + "learning_rate": 2.507156230547525e-07, + "loss": 0.4777, + "step": 5979 + }, + { + "epoch": 0.9, + "grad_norm": 6.445899400591786, + "learning_rate": 2.4995244467940773e-07, + "loss": 0.3921, + "step": 5980 + }, + { + "epoch": 0.9, + "grad_norm": 9.31067687194062, + "learning_rate": 2.4919039985404626e-07, + "loss": 0.4439, + "step": 5981 + }, + { + "epoch": 0.9, + "grad_norm": 11.727360803821629, + "learning_rate": 2.4842948876052373e-07, + "loss": 0.4434, + "step": 5982 + }, + { + "epoch": 0.9, + "grad_norm": 5.528255476822902, + "learning_rate": 2.476697115804233e-07, + "loss": 0.4788, + "step": 5983 + }, + { + "epoch": 0.9, + "grad_norm": 7.56690615952462, + "learning_rate": 2.4691106849505766e-07, + "loss": 0.4451, + "step": 5984 + }, + { + "epoch": 0.9, + "grad_norm": 7.351468954746938, + "learning_rate": 2.4615355968546915e-07, + "loss": 0.4576, + "step": 5985 + }, + { + "epoch": 0.9, + "grad_norm": 6.588075521537686, + "learning_rate": 2.4539718533242885e-07, + "loss": 0.4148, + "step": 5986 + }, + { + "epoch": 0.9, + "grad_norm": 6.2692570735456306, + "learning_rate": 2.446419456164395e-07, + "loss": 0.3312, + "step": 5987 + }, + { + "epoch": 0.9, + "grad_norm": 16.862239544064373, + "learning_rate": 2.4388784071772976e-07, + "loss": 0.3892, + "step": 5988 + }, + { + "epoch": 0.9, + "grad_norm": 11.536035206226808, + "learning_rate": 2.431348708162595e-07, + "loss": 0.3907, + "step": 5989 + }, + { + "epoch": 0.9, + "grad_norm": 7.9818460449923005, + "learning_rate": 2.4238303609171734e-07, + "loss": 0.4744, + "step": 5990 + }, + { + "epoch": 0.9, + "grad_norm": 12.325482472388074, + "learning_rate": 2.4163233672352017e-07, + "loss": 0.3997, + "step": 5991 + }, + { + "epoch": 0.9, + "grad_norm": 1.084774400080377, + "learning_rate": 2.408827728908164e-07, + "loss": 0.5107, + "step": 5992 + }, + { + "epoch": 0.9, + "grad_norm": 1.1040052633469488, + "learning_rate": 2.4013434477248023e-07, + "loss": 0.5045, + "step": 5993 + }, + { + "epoch": 0.9, + "grad_norm": 5.79610948152564, + "learning_rate": 2.393870525471159e-07, + "loss": 0.3961, + "step": 5994 + }, + { + "epoch": 0.9, + "grad_norm": 6.868330017388978, + "learning_rate": 2.386408963930592e-07, + "loss": 0.4462, + "step": 5995 + }, + { + "epoch": 0.9, + "grad_norm": 10.92076172156508, + "learning_rate": 2.3789587648837043e-07, + "loss": 0.4445, + "step": 5996 + }, + { + "epoch": 0.9, + "grad_norm": 7.742327209160975, + "learning_rate": 2.3715199301084235e-07, + "loss": 0.4055, + "step": 5997 + }, + { + "epoch": 0.9, + "grad_norm": 11.544016624514466, + "learning_rate": 2.364092461379941e-07, + "loss": 0.3791, + "step": 5998 + }, + { + "epoch": 0.9, + "grad_norm": 8.799585469594696, + "learning_rate": 2.35667636047075e-07, + "loss": 0.3983, + "step": 5999 + }, + { + "epoch": 0.9, + "grad_norm": 8.601040485219471, + "learning_rate": 2.3492716291506235e-07, + "loss": 0.4707, + "step": 6000 + }, + { + "epoch": 0.91, + "grad_norm": 6.679971576664796, + "learning_rate": 2.341878269186626e-07, + "loss": 0.3836, + "step": 6001 + }, + { + "epoch": 0.91, + "grad_norm": 11.555158103521904, + "learning_rate": 2.3344962823431183e-07, + "loss": 0.4243, + "step": 6002 + }, + { + "epoch": 0.91, + "grad_norm": 6.369498920543641, + "learning_rate": 2.327125670381719e-07, + "loss": 0.4431, + "step": 6003 + }, + { + "epoch": 0.91, + "grad_norm": 20.738723382480593, + "learning_rate": 2.3197664350613547e-07, + "loss": 0.4553, + "step": 6004 + }, + { + "epoch": 0.91, + "grad_norm": 7.900448518895032, + "learning_rate": 2.3124185781382368e-07, + "loss": 0.5062, + "step": 6005 + }, + { + "epoch": 0.91, + "grad_norm": 4.671027022612743, + "learning_rate": 2.3050821013658465e-07, + "loss": 0.3903, + "step": 6006 + }, + { + "epoch": 0.91, + "grad_norm": 1.248884464187572, + "learning_rate": 2.297757006494966e-07, + "loss": 0.5548, + "step": 6007 + }, + { + "epoch": 0.91, + "grad_norm": 13.29356887073713, + "learning_rate": 2.2904432952736533e-07, + "loss": 0.4211, + "step": 6008 + }, + { + "epoch": 0.91, + "grad_norm": 12.532186972980012, + "learning_rate": 2.2831409694472395e-07, + "loss": 0.4355, + "step": 6009 + }, + { + "epoch": 0.91, + "grad_norm": 10.235767644856566, + "learning_rate": 2.2758500307583642e-07, + "loss": 0.4012, + "step": 6010 + }, + { + "epoch": 0.91, + "grad_norm": 15.737942138769418, + "learning_rate": 2.26857048094693e-07, + "loss": 0.4185, + "step": 6011 + }, + { + "epoch": 0.91, + "grad_norm": 8.60557061687433, + "learning_rate": 2.261302321750114e-07, + "loss": 0.4227, + "step": 6012 + }, + { + "epoch": 0.91, + "grad_norm": 13.56688523077517, + "learning_rate": 2.254045554902412e-07, + "loss": 0.3822, + "step": 6013 + }, + { + "epoch": 0.91, + "grad_norm": 5.90680573226255, + "learning_rate": 2.2468001821355502e-07, + "loss": 0.4469, + "step": 6014 + }, + { + "epoch": 0.91, + "grad_norm": 4.152151691376813, + "learning_rate": 2.239566205178584e-07, + "loss": 0.4141, + "step": 6015 + }, + { + "epoch": 0.91, + "grad_norm": 8.016338476327478, + "learning_rate": 2.2323436257578212e-07, + "loss": 0.4269, + "step": 6016 + }, + { + "epoch": 0.91, + "grad_norm": 1.0486243164892097, + "learning_rate": 2.2251324455968447e-07, + "loss": 0.501, + "step": 6017 + }, + { + "epoch": 0.91, + "grad_norm": 8.301773411730899, + "learning_rate": 2.21793266641655e-07, + "loss": 0.4132, + "step": 6018 + }, + { + "epoch": 0.91, + "grad_norm": 7.4305781111696705, + "learning_rate": 2.210744289935074e-07, + "loss": 0.3648, + "step": 6019 + }, + { + "epoch": 0.91, + "grad_norm": 10.780243809139604, + "learning_rate": 2.2035673178678553e-07, + "loss": 0.4576, + "step": 6020 + }, + { + "epoch": 0.91, + "grad_norm": 1.1923403148339042, + "learning_rate": 2.196401751927596e-07, + "loss": 0.474, + "step": 6021 + }, + { + "epoch": 0.91, + "grad_norm": 10.061521025676669, + "learning_rate": 2.1892475938242895e-07, + "loss": 0.4223, + "step": 6022 + }, + { + "epoch": 0.91, + "grad_norm": 9.55886429224066, + "learning_rate": 2.1821048452652095e-07, + "loss": 0.3507, + "step": 6023 + }, + { + "epoch": 0.91, + "grad_norm": 12.418535918653141, + "learning_rate": 2.1749735079548861e-07, + "loss": 0.3805, + "step": 6024 + }, + { + "epoch": 0.91, + "grad_norm": 6.200482062165804, + "learning_rate": 2.1678535835951532e-07, + "loss": 0.3832, + "step": 6025 + }, + { + "epoch": 0.91, + "grad_norm": 9.357742183138216, + "learning_rate": 2.160745073885101e-07, + "loss": 0.4424, + "step": 6026 + }, + { + "epoch": 0.91, + "grad_norm": 9.233555603130945, + "learning_rate": 2.1536479805210952e-07, + "loss": 0.4291, + "step": 6027 + }, + { + "epoch": 0.91, + "grad_norm": 7.1710118690525, + "learning_rate": 2.1465623051967975e-07, + "loss": 0.4057, + "step": 6028 + }, + { + "epoch": 0.91, + "grad_norm": 6.9083133829225805, + "learning_rate": 2.1394880496031223e-07, + "loss": 0.411, + "step": 6029 + }, + { + "epoch": 0.91, + "grad_norm": 9.75403303958984, + "learning_rate": 2.1324252154282743e-07, + "loss": 0.4844, + "step": 6030 + }, + { + "epoch": 0.91, + "grad_norm": 10.624183726125615, + "learning_rate": 2.1253738043577221e-07, + "loss": 0.4423, + "step": 6031 + }, + { + "epoch": 0.91, + "grad_norm": 8.664724592601209, + "learning_rate": 2.1183338180742142e-07, + "loss": 0.4139, + "step": 6032 + }, + { + "epoch": 0.91, + "grad_norm": 1.0565802086343434, + "learning_rate": 2.1113052582577675e-07, + "loss": 0.4605, + "step": 6033 + }, + { + "epoch": 0.91, + "grad_norm": 7.592858971807619, + "learning_rate": 2.104288126585674e-07, + "loss": 0.4075, + "step": 6034 + }, + { + "epoch": 0.91, + "grad_norm": 7.096021036684525, + "learning_rate": 2.0972824247325109e-07, + "loss": 0.442, + "step": 6035 + }, + { + "epoch": 0.91, + "grad_norm": 9.629129948431016, + "learning_rate": 2.090288154370107e-07, + "loss": 0.48, + "step": 6036 + }, + { + "epoch": 0.91, + "grad_norm": 13.036012427022493, + "learning_rate": 2.0833053171675722e-07, + "loss": 0.4638, + "step": 6037 + }, + { + "epoch": 0.91, + "grad_norm": 13.541993179349438, + "learning_rate": 2.0763339147912954e-07, + "loss": 0.482, + "step": 6038 + }, + { + "epoch": 0.91, + "grad_norm": 33.39607907012994, + "learning_rate": 2.0693739489049235e-07, + "loss": 0.3405, + "step": 6039 + }, + { + "epoch": 0.91, + "grad_norm": 10.787505409632686, + "learning_rate": 2.0624254211693894e-07, + "loss": 0.4341, + "step": 6040 + }, + { + "epoch": 0.91, + "grad_norm": 6.451348763693033, + "learning_rate": 2.0554883332428777e-07, + "loss": 0.4305, + "step": 6041 + }, + { + "epoch": 0.91, + "grad_norm": 53.18736987286233, + "learning_rate": 2.048562686780864e-07, + "loss": 0.4924, + "step": 6042 + }, + { + "epoch": 0.91, + "grad_norm": 10.357649108103432, + "learning_rate": 2.041648483436076e-07, + "loss": 0.4606, + "step": 6043 + }, + { + "epoch": 0.91, + "grad_norm": 6.532069854911818, + "learning_rate": 2.034745724858511e-07, + "loss": 0.3895, + "step": 6044 + }, + { + "epoch": 0.91, + "grad_norm": 12.587868492103368, + "learning_rate": 2.0278544126954514e-07, + "loss": 0.3911, + "step": 6045 + }, + { + "epoch": 0.91, + "grad_norm": 7.234987106807465, + "learning_rate": 2.0209745485914368e-07, + "loss": 0.4138, + "step": 6046 + }, + { + "epoch": 0.91, + "grad_norm": 12.342673172628077, + "learning_rate": 2.014106134188265e-07, + "loss": 0.4961, + "step": 6047 + }, + { + "epoch": 0.91, + "grad_norm": 8.242794987818895, + "learning_rate": 2.0072491711250308e-07, + "loss": 0.3942, + "step": 6048 + }, + { + "epoch": 0.91, + "grad_norm": 6.426404424257709, + "learning_rate": 2.0004036610380583e-07, + "loss": 0.4529, + "step": 6049 + }, + { + "epoch": 0.91, + "grad_norm": 7.489751749543051, + "learning_rate": 1.9935696055609798e-07, + "loss": 0.4268, + "step": 6050 + }, + { + "epoch": 0.91, + "grad_norm": 17.66782386662816, + "learning_rate": 1.986747006324652e-07, + "loss": 0.4749, + "step": 6051 + }, + { + "epoch": 0.91, + "grad_norm": 4.56879124382153, + "learning_rate": 1.9799358649572275e-07, + "loss": 0.405, + "step": 6052 + }, + { + "epoch": 0.91, + "grad_norm": 5.065688135132426, + "learning_rate": 1.9731361830841177e-07, + "loss": 0.4141, + "step": 6053 + }, + { + "epoch": 0.91, + "grad_norm": 16.602856883407917, + "learning_rate": 1.966347962327997e-07, + "loss": 0.4122, + "step": 6054 + }, + { + "epoch": 0.91, + "grad_norm": 9.831499194964778, + "learning_rate": 1.9595712043087968e-07, + "loss": 0.4748, + "step": 6055 + }, + { + "epoch": 0.91, + "grad_norm": 5.848797752722055, + "learning_rate": 1.9528059106437237e-07, + "loss": 0.5357, + "step": 6056 + }, + { + "epoch": 0.91, + "grad_norm": 9.19568554864068, + "learning_rate": 1.946052082947242e-07, + "loss": 0.4482, + "step": 6057 + }, + { + "epoch": 0.91, + "grad_norm": 7.93631392675379, + "learning_rate": 1.9393097228310954e-07, + "loss": 0.4474, + "step": 6058 + }, + { + "epoch": 0.91, + "grad_norm": 6.260907125757506, + "learning_rate": 1.932578831904258e-07, + "loss": 0.4447, + "step": 6059 + }, + { + "epoch": 0.91, + "grad_norm": 5.256014806639852, + "learning_rate": 1.9258594117730123e-07, + "loss": 0.4063, + "step": 6060 + }, + { + "epoch": 0.91, + "grad_norm": 7.163849213436904, + "learning_rate": 1.9191514640408692e-07, + "loss": 0.4479, + "step": 6061 + }, + { + "epoch": 0.91, + "grad_norm": 10.329715900589596, + "learning_rate": 1.9124549903085987e-07, + "loss": 0.4495, + "step": 6062 + }, + { + "epoch": 0.91, + "grad_norm": 1.034254586824161, + "learning_rate": 1.9057699921742613e-07, + "loss": 0.5243, + "step": 6063 + }, + { + "epoch": 0.91, + "grad_norm": 5.7182537672694425, + "learning_rate": 1.8990964712331582e-07, + "loss": 0.3729, + "step": 6064 + }, + { + "epoch": 0.91, + "grad_norm": 10.581959634756734, + "learning_rate": 1.8924344290778608e-07, + "loss": 0.4436, + "step": 6065 + }, + { + "epoch": 0.91, + "grad_norm": 7.208452850194081, + "learning_rate": 1.8857838672981855e-07, + "loss": 0.4557, + "step": 6066 + }, + { + "epoch": 0.92, + "grad_norm": 9.828221397572074, + "learning_rate": 1.8791447874812296e-07, + "loss": 0.4238, + "step": 6067 + }, + { + "epoch": 0.92, + "grad_norm": 5.523347033792497, + "learning_rate": 1.872517191211337e-07, + "loss": 0.4285, + "step": 6068 + }, + { + "epoch": 0.92, + "grad_norm": 9.313057016353419, + "learning_rate": 1.865901080070115e-07, + "loss": 0.4722, + "step": 6069 + }, + { + "epoch": 0.92, + "grad_norm": 9.13967800304817, + "learning_rate": 1.8592964556364334e-07, + "loss": 0.4482, + "step": 6070 + }, + { + "epoch": 0.92, + "grad_norm": 6.2306709715323105, + "learning_rate": 1.8527033194864207e-07, + "loss": 0.4854, + "step": 6071 + }, + { + "epoch": 0.92, + "grad_norm": 9.59877598463886, + "learning_rate": 1.8461216731934517e-07, + "loss": 0.3817, + "step": 6072 + }, + { + "epoch": 0.92, + "grad_norm": 6.07994446266774, + "learning_rate": 1.839551518328181e-07, + "loss": 0.4427, + "step": 6073 + }, + { + "epoch": 0.92, + "grad_norm": 6.2372882074171825, + "learning_rate": 1.8329928564584932e-07, + "loss": 0.396, + "step": 6074 + }, + { + "epoch": 0.92, + "grad_norm": 16.64621995554326, + "learning_rate": 1.8264456891495586e-07, + "loss": 0.4128, + "step": 6075 + }, + { + "epoch": 0.92, + "grad_norm": 5.193948012152696, + "learning_rate": 1.8199100179637885e-07, + "loss": 0.4485, + "step": 6076 + }, + { + "epoch": 0.92, + "grad_norm": 8.934371184769104, + "learning_rate": 1.813385844460852e-07, + "loss": 0.399, + "step": 6077 + }, + { + "epoch": 0.92, + "grad_norm": 14.718352630750164, + "learning_rate": 1.806873170197676e-07, + "loss": 0.4267, + "step": 6078 + }, + { + "epoch": 0.92, + "grad_norm": 5.869044575771374, + "learning_rate": 1.8003719967284395e-07, + "loss": 0.439, + "step": 6079 + }, + { + "epoch": 0.92, + "grad_norm": 7.700500732923179, + "learning_rate": 1.7938823256045845e-07, + "loss": 0.3987, + "step": 6080 + }, + { + "epoch": 0.92, + "grad_norm": 10.481911746281208, + "learning_rate": 1.7874041583748114e-07, + "loss": 0.4864, + "step": 6081 + }, + { + "epoch": 0.92, + "grad_norm": 6.687644605406585, + "learning_rate": 1.78093749658505e-07, + "loss": 0.409, + "step": 6082 + }, + { + "epoch": 0.92, + "grad_norm": 6.299199419817891, + "learning_rate": 1.7744823417785163e-07, + "loss": 0.4652, + "step": 6083 + }, + { + "epoch": 0.92, + "grad_norm": 5.50805089776302, + "learning_rate": 1.7680386954956607e-07, + "loss": 0.3713, + "step": 6084 + }, + { + "epoch": 0.92, + "grad_norm": 13.252272735615387, + "learning_rate": 1.7616065592742038e-07, + "loss": 0.3684, + "step": 6085 + }, + { + "epoch": 0.92, + "grad_norm": 16.49320463551489, + "learning_rate": 1.7551859346490952e-07, + "loss": 0.4802, + "step": 6086 + }, + { + "epoch": 0.92, + "grad_norm": 6.032024264991484, + "learning_rate": 1.7487768231525592e-07, + "loss": 0.3817, + "step": 6087 + }, + { + "epoch": 0.92, + "grad_norm": 7.209413377151795, + "learning_rate": 1.7423792263140616e-07, + "loss": 0.4104, + "step": 6088 + }, + { + "epoch": 0.92, + "grad_norm": 7.913212388741603, + "learning_rate": 1.735993145660325e-07, + "loss": 0.4581, + "step": 6089 + }, + { + "epoch": 0.92, + "grad_norm": 14.43621099527534, + "learning_rate": 1.7296185827153144e-07, + "loss": 0.4174, + "step": 6090 + }, + { + "epoch": 0.92, + "grad_norm": 9.672026295219572, + "learning_rate": 1.7232555390002682e-07, + "loss": 0.4634, + "step": 6091 + }, + { + "epoch": 0.92, + "grad_norm": 18.816607288368626, + "learning_rate": 1.7169040160336436e-07, + "loss": 0.4519, + "step": 6092 + }, + { + "epoch": 0.92, + "grad_norm": 6.33786473699606, + "learning_rate": 1.7105640153311787e-07, + "loss": 0.4667, + "step": 6093 + }, + { + "epoch": 0.92, + "grad_norm": 6.753321000508387, + "learning_rate": 1.7042355384058407e-07, + "loss": 0.4346, + "step": 6094 + }, + { + "epoch": 0.92, + "grad_norm": 9.774343435345806, + "learning_rate": 1.697918586767866e-07, + "loss": 0.376, + "step": 6095 + }, + { + "epoch": 0.92, + "grad_norm": 5.52829607848003, + "learning_rate": 1.6916131619247267e-07, + "loss": 0.3897, + "step": 6096 + }, + { + "epoch": 0.92, + "grad_norm": 10.986911068852063, + "learning_rate": 1.6853192653811356e-07, + "loss": 0.4449, + "step": 6097 + }, + { + "epoch": 0.92, + "grad_norm": 6.982571705142476, + "learning_rate": 1.679036898639086e-07, + "loss": 0.4668, + "step": 6098 + }, + { + "epoch": 0.92, + "grad_norm": 6.189999698332444, + "learning_rate": 1.6727660631977894e-07, + "loss": 0.4533, + "step": 6099 + }, + { + "epoch": 0.92, + "grad_norm": 14.46312422642193, + "learning_rate": 1.666506760553721e-07, + "loss": 0.4804, + "step": 6100 + }, + { + "epoch": 0.92, + "grad_norm": 7.950630066121511, + "learning_rate": 1.6602589922005917e-07, + "loss": 0.4365, + "step": 6101 + }, + { + "epoch": 0.92, + "grad_norm": 6.690284345503232, + "learning_rate": 1.6540227596293644e-07, + "loss": 0.3894, + "step": 6102 + }, + { + "epoch": 0.92, + "grad_norm": 8.292282705426501, + "learning_rate": 1.6477980643282654e-07, + "loss": 0.4882, + "step": 6103 + }, + { + "epoch": 0.92, + "grad_norm": 9.871789321425036, + "learning_rate": 1.6415849077827395e-07, + "loss": 0.3604, + "step": 6104 + }, + { + "epoch": 0.92, + "grad_norm": 6.211371475177253, + "learning_rate": 1.6353832914755065e-07, + "loss": 0.4463, + "step": 6105 + }, + { + "epoch": 0.92, + "grad_norm": 7.146457691565257, + "learning_rate": 1.62919321688651e-07, + "loss": 0.4122, + "step": 6106 + }, + { + "epoch": 0.92, + "grad_norm": 7.035170646792051, + "learning_rate": 1.6230146854929462e-07, + "loss": 0.4458, + "step": 6107 + }, + { + "epoch": 0.92, + "grad_norm": 10.164358744721879, + "learning_rate": 1.6168476987692693e-07, + "loss": 0.4221, + "step": 6108 + }, + { + "epoch": 0.92, + "grad_norm": 10.403206890944526, + "learning_rate": 1.6106922581871464e-07, + "loss": 0.4844, + "step": 6109 + }, + { + "epoch": 0.92, + "grad_norm": 8.964761433693864, + "learning_rate": 1.6045483652155358e-07, + "loss": 0.4696, + "step": 6110 + }, + { + "epoch": 0.92, + "grad_norm": 7.911398358921779, + "learning_rate": 1.598416021320598e-07, + "loss": 0.4484, + "step": 6111 + }, + { + "epoch": 0.92, + "grad_norm": 8.843012016254029, + "learning_rate": 1.5922952279657566e-07, + "loss": 0.4043, + "step": 6112 + }, + { + "epoch": 0.92, + "grad_norm": 18.483307704338824, + "learning_rate": 1.5861859866116824e-07, + "loss": 0.3827, + "step": 6113 + }, + { + "epoch": 0.92, + "grad_norm": 4.836460207677469, + "learning_rate": 1.5800882987162648e-07, + "loss": 0.4279, + "step": 6114 + }, + { + "epoch": 0.92, + "grad_norm": 8.709478875809834, + "learning_rate": 1.5740021657346728e-07, + "loss": 0.379, + "step": 6115 + }, + { + "epoch": 0.92, + "grad_norm": 6.683578411386008, + "learning_rate": 1.5679275891192948e-07, + "loss": 0.4608, + "step": 6116 + }, + { + "epoch": 0.92, + "grad_norm": 6.532536639875415, + "learning_rate": 1.5618645703197598e-07, + "loss": 0.4154, + "step": 6117 + }, + { + "epoch": 0.92, + "grad_norm": 10.56188073742697, + "learning_rate": 1.5558131107829556e-07, + "loss": 0.3965, + "step": 6118 + }, + { + "epoch": 0.92, + "grad_norm": 6.838740835058254, + "learning_rate": 1.5497732119529874e-07, + "loss": 0.4639, + "step": 6119 + }, + { + "epoch": 0.92, + "grad_norm": 9.579710017900108, + "learning_rate": 1.5437448752712303e-07, + "loss": 0.3847, + "step": 6120 + }, + { + "epoch": 0.92, + "grad_norm": 8.369471342909991, + "learning_rate": 1.537728102176278e-07, + "loss": 0.416, + "step": 6121 + }, + { + "epoch": 0.92, + "grad_norm": 1.24045950028533, + "learning_rate": 1.5317228941039597e-07, + "loss": 0.5346, + "step": 6122 + }, + { + "epoch": 0.92, + "grad_norm": 5.410840526854266, + "learning_rate": 1.5257292524873791e-07, + "loss": 0.389, + "step": 6123 + }, + { + "epoch": 0.92, + "grad_norm": 10.143972820106212, + "learning_rate": 1.519747178756842e-07, + "loss": 0.4761, + "step": 6124 + }, + { + "epoch": 0.92, + "grad_norm": 10.940472454062137, + "learning_rate": 1.5137766743399062e-07, + "loss": 0.457, + "step": 6125 + }, + { + "epoch": 0.92, + "grad_norm": 5.283784516214541, + "learning_rate": 1.5078177406613825e-07, + "loss": 0.4376, + "step": 6126 + }, + { + "epoch": 0.92, + "grad_norm": 7.065278075248987, + "learning_rate": 1.5018703791432997e-07, + "loss": 0.4197, + "step": 6127 + }, + { + "epoch": 0.92, + "grad_norm": 9.239892905682847, + "learning_rate": 1.495934591204945e-07, + "loss": 0.4385, + "step": 6128 + }, + { + "epoch": 0.92, + "grad_norm": 6.4470178701795655, + "learning_rate": 1.4900103782628183e-07, + "loss": 0.4671, + "step": 6129 + }, + { + "epoch": 0.92, + "grad_norm": 8.18589751807776, + "learning_rate": 1.4840977417306836e-07, + "loss": 0.4816, + "step": 6130 + }, + { + "epoch": 0.92, + "grad_norm": 8.324755842896137, + "learning_rate": 1.4781966830195282e-07, + "loss": 0.4315, + "step": 6131 + }, + { + "epoch": 0.92, + "grad_norm": 19.09558742726461, + "learning_rate": 1.4723072035375708e-07, + "loss": 0.432, + "step": 6132 + }, + { + "epoch": 0.93, + "grad_norm": 6.726508621554789, + "learning_rate": 1.4664293046902923e-07, + "loss": 0.4485, + "step": 6133 + }, + { + "epoch": 0.93, + "grad_norm": 6.663620356403694, + "learning_rate": 1.4605629878803818e-07, + "loss": 0.4551, + "step": 6134 + }, + { + "epoch": 0.93, + "grad_norm": 6.934062349993554, + "learning_rate": 1.454708254507775e-07, + "loss": 0.4236, + "step": 6135 + }, + { + "epoch": 0.93, + "grad_norm": 7.838564535227337, + "learning_rate": 1.448865105969649e-07, + "loss": 0.4215, + "step": 6136 + }, + { + "epoch": 0.93, + "grad_norm": 15.019324743734105, + "learning_rate": 1.4430335436603992e-07, + "loss": 0.3952, + "step": 6137 + }, + { + "epoch": 0.93, + "grad_norm": 24.834864355586774, + "learning_rate": 1.437213568971685e-07, + "loss": 0.4209, + "step": 6138 + }, + { + "epoch": 0.93, + "grad_norm": 7.774246720594802, + "learning_rate": 1.431405183292367e-07, + "loss": 0.3794, + "step": 6139 + }, + { + "epoch": 0.93, + "grad_norm": 14.353543307368222, + "learning_rate": 1.4256083880085647e-07, + "loss": 0.4665, + "step": 6140 + }, + { + "epoch": 0.93, + "grad_norm": 7.527545171894781, + "learning_rate": 1.4198231845036326e-07, + "loss": 0.4026, + "step": 6141 + }, + { + "epoch": 0.93, + "grad_norm": 6.064523420757355, + "learning_rate": 1.4140495741581273e-07, + "loss": 0.4306, + "step": 6142 + }, + { + "epoch": 0.93, + "grad_norm": 32.57352240275346, + "learning_rate": 1.4082875583498857e-07, + "loss": 0.4544, + "step": 6143 + }, + { + "epoch": 0.93, + "grad_norm": 23.6531853769556, + "learning_rate": 1.402537138453941e-07, + "loss": 0.4804, + "step": 6144 + }, + { + "epoch": 0.93, + "grad_norm": 4.710499275511275, + "learning_rate": 1.396798315842568e-07, + "loss": 0.4208, + "step": 6145 + }, + { + "epoch": 0.93, + "grad_norm": 8.536440172262472, + "learning_rate": 1.391071091885293e-07, + "loss": 0.4149, + "step": 6146 + }, + { + "epoch": 0.93, + "grad_norm": 8.432204853019314, + "learning_rate": 1.3853554679488455e-07, + "loss": 0.4924, + "step": 6147 + }, + { + "epoch": 0.93, + "grad_norm": 8.580591845126987, + "learning_rate": 1.3796514453972066e-07, + "loss": 0.4878, + "step": 6148 + }, + { + "epoch": 0.93, + "grad_norm": 9.605951389950999, + "learning_rate": 1.373959025591576e-07, + "loss": 0.4007, + "step": 6149 + }, + { + "epoch": 0.93, + "grad_norm": 9.098101817067565, + "learning_rate": 1.3682782098903947e-07, + "loss": 0.4188, + "step": 6150 + }, + { + "epoch": 0.93, + "grad_norm": 13.017511020270387, + "learning_rate": 1.362608999649334e-07, + "loss": 0.404, + "step": 6151 + }, + { + "epoch": 0.93, + "grad_norm": 20.75751617456195, + "learning_rate": 1.35695139622129e-07, + "loss": 0.4719, + "step": 6152 + }, + { + "epoch": 0.93, + "grad_norm": 5.311549792026235, + "learning_rate": 1.351305400956393e-07, + "loss": 0.4027, + "step": 6153 + }, + { + "epoch": 0.93, + "grad_norm": 7.415796134660333, + "learning_rate": 1.3456710152019992e-07, + "loss": 0.4715, + "step": 6154 + }, + { + "epoch": 0.93, + "grad_norm": 5.6957421292133725, + "learning_rate": 1.3400482403026937e-07, + "loss": 0.3633, + "step": 6155 + }, + { + "epoch": 0.93, + "grad_norm": 6.150408492424024, + "learning_rate": 1.3344370776003034e-07, + "loss": 0.4589, + "step": 6156 + }, + { + "epoch": 0.93, + "grad_norm": 8.674866785251966, + "learning_rate": 1.3288375284338683e-07, + "loss": 0.4759, + "step": 6157 + }, + { + "epoch": 0.93, + "grad_norm": 5.450143856609761, + "learning_rate": 1.323249594139664e-07, + "loss": 0.3857, + "step": 6158 + }, + { + "epoch": 0.93, + "grad_norm": 9.174677013537165, + "learning_rate": 1.3176732760511956e-07, + "loss": 0.4495, + "step": 6159 + }, + { + "epoch": 0.93, + "grad_norm": 7.675806888912525, + "learning_rate": 1.3121085754991937e-07, + "loss": 0.4605, + "step": 6160 + }, + { + "epoch": 0.93, + "grad_norm": 6.776563307604608, + "learning_rate": 1.3065554938116176e-07, + "loss": 0.4629, + "step": 6161 + }, + { + "epoch": 0.93, + "grad_norm": 8.008889242790026, + "learning_rate": 1.301014032313641e-07, + "loss": 0.3937, + "step": 6162 + }, + { + "epoch": 0.93, + "grad_norm": 8.537559925888587, + "learning_rate": 1.295484192327695e-07, + "loss": 0.4519, + "step": 6163 + }, + { + "epoch": 0.93, + "grad_norm": 7.781368990509396, + "learning_rate": 1.2899659751734128e-07, + "loss": 0.4172, + "step": 6164 + }, + { + "epoch": 0.93, + "grad_norm": 8.548193029786226, + "learning_rate": 1.2844593821676465e-07, + "loss": 0.3655, + "step": 6165 + }, + { + "epoch": 0.93, + "grad_norm": 13.855774342748715, + "learning_rate": 1.2789644146245117e-07, + "loss": 0.3743, + "step": 6166 + }, + { + "epoch": 0.93, + "grad_norm": 7.718233595876965, + "learning_rate": 1.2734810738553038e-07, + "loss": 0.4506, + "step": 6167 + }, + { + "epoch": 0.93, + "grad_norm": 5.308908109534446, + "learning_rate": 1.2680093611685818e-07, + "loss": 0.4041, + "step": 6168 + }, + { + "epoch": 0.93, + "grad_norm": 8.888002872018777, + "learning_rate": 1.2625492778701065e-07, + "loss": 0.435, + "step": 6169 + }, + { + "epoch": 0.93, + "grad_norm": 11.316311473231387, + "learning_rate": 1.2571008252628747e-07, + "loss": 0.3485, + "step": 6170 + }, + { + "epoch": 0.93, + "grad_norm": 14.294616480703894, + "learning_rate": 1.2516640046470964e-07, + "loss": 0.4262, + "step": 6171 + }, + { + "epoch": 0.93, + "grad_norm": 8.907786672349816, + "learning_rate": 1.246238817320211e-07, + "loss": 0.4633, + "step": 6172 + }, + { + "epoch": 0.93, + "grad_norm": 6.442265314643649, + "learning_rate": 1.2408252645769003e-07, + "loss": 0.4426, + "step": 6173 + }, + { + "epoch": 0.93, + "grad_norm": 10.74199060534077, + "learning_rate": 1.2354233477090361e-07, + "loss": 0.4579, + "step": 6174 + }, + { + "epoch": 0.93, + "grad_norm": 8.905669051502944, + "learning_rate": 1.230033068005737e-07, + "loss": 0.5288, + "step": 6175 + }, + { + "epoch": 0.93, + "grad_norm": 9.652861926247441, + "learning_rate": 1.2246544267533357e-07, + "loss": 0.4272, + "step": 6176 + }, + { + "epoch": 0.93, + "grad_norm": 16.146985015771364, + "learning_rate": 1.219287425235388e-07, + "loss": 0.432, + "step": 6177 + }, + { + "epoch": 0.93, + "grad_norm": 4.919366802008169, + "learning_rate": 1.213932064732687e-07, + "loss": 0.3562, + "step": 6178 + }, + { + "epoch": 0.93, + "grad_norm": 12.793186117279193, + "learning_rate": 1.2085883465232206e-07, + "loss": 0.4123, + "step": 6179 + }, + { + "epoch": 0.93, + "grad_norm": 7.738207152620466, + "learning_rate": 1.2032562718822082e-07, + "loss": 0.4254, + "step": 6180 + }, + { + "epoch": 0.93, + "grad_norm": 9.301695417673379, + "learning_rate": 1.1979358420821096e-07, + "loss": 0.4468, + "step": 6181 + }, + { + "epoch": 0.93, + "grad_norm": 12.15240676581641, + "learning_rate": 1.1926270583925815e-07, + "loss": 0.4315, + "step": 6182 + }, + { + "epoch": 0.93, + "grad_norm": 7.853076754472145, + "learning_rate": 1.1873299220805102e-07, + "loss": 0.481, + "step": 6183 + }, + { + "epoch": 0.93, + "grad_norm": 7.878955791984587, + "learning_rate": 1.1820444344100069e-07, + "loss": 0.4651, + "step": 6184 + }, + { + "epoch": 0.93, + "grad_norm": 1.1265777992631434, + "learning_rate": 1.1767705966423959e-07, + "loss": 0.532, + "step": 6185 + }, + { + "epoch": 0.93, + "grad_norm": 36.0739734141118, + "learning_rate": 1.1715084100362262e-07, + "loss": 0.4248, + "step": 6186 + }, + { + "epoch": 0.93, + "grad_norm": 9.798952760578612, + "learning_rate": 1.1662578758472542e-07, + "loss": 0.4315, + "step": 6187 + }, + { + "epoch": 0.93, + "grad_norm": 15.66556221203124, + "learning_rate": 1.1610189953284834e-07, + "loss": 0.3742, + "step": 6188 + }, + { + "epoch": 0.93, + "grad_norm": 6.7103720766085315, + "learning_rate": 1.1557917697301136e-07, + "loss": 0.4285, + "step": 6189 + }, + { + "epoch": 0.93, + "grad_norm": 15.82778755315649, + "learning_rate": 1.1505762002995579e-07, + "loss": 0.4252, + "step": 6190 + }, + { + "epoch": 0.93, + "grad_norm": 11.551355240602343, + "learning_rate": 1.1453722882814656e-07, + "loss": 0.351, + "step": 6191 + }, + { + "epoch": 0.93, + "grad_norm": 5.995026845216769, + "learning_rate": 1.1401800349176984e-07, + "loss": 0.4264, + "step": 6192 + }, + { + "epoch": 0.93, + "grad_norm": 5.803779554303081, + "learning_rate": 1.1349994414473375e-07, + "loss": 0.4216, + "step": 6193 + }, + { + "epoch": 0.93, + "grad_norm": 4.91791611313793, + "learning_rate": 1.1298305091066664e-07, + "loss": 0.3705, + "step": 6194 + }, + { + "epoch": 0.93, + "grad_norm": 8.134301460968024, + "learning_rate": 1.1246732391292092e-07, + "loss": 0.4105, + "step": 6195 + }, + { + "epoch": 0.93, + "grad_norm": 9.430889220347723, + "learning_rate": 1.1195276327456927e-07, + "loss": 0.3953, + "step": 6196 + }, + { + "epoch": 0.93, + "grad_norm": 5.766552114911704, + "learning_rate": 1.1143936911840513e-07, + "loss": 0.3973, + "step": 6197 + }, + { + "epoch": 0.93, + "grad_norm": 10.354382267756021, + "learning_rate": 1.1092714156694606e-07, + "loss": 0.4345, + "step": 6198 + }, + { + "epoch": 0.93, + "grad_norm": 9.89816523297762, + "learning_rate": 1.1041608074242982e-07, + "loss": 0.4089, + "step": 6199 + }, + { + "epoch": 0.94, + "grad_norm": 9.31942206584094, + "learning_rate": 1.0990618676681553e-07, + "loss": 0.3869, + "step": 6200 + }, + { + "epoch": 0.94, + "grad_norm": 8.00723360202763, + "learning_rate": 1.0939745976178361e-07, + "loss": 0.4864, + "step": 6201 + }, + { + "epoch": 0.94, + "grad_norm": 8.55530122855174, + "learning_rate": 1.0888989984873754e-07, + "loss": 0.3974, + "step": 6202 + }, + { + "epoch": 0.94, + "grad_norm": 26.402458639949128, + "learning_rate": 1.0838350714880042e-07, + "loss": 0.4416, + "step": 6203 + }, + { + "epoch": 0.94, + "grad_norm": 6.835441528544204, + "learning_rate": 1.0787828178281834e-07, + "loss": 0.4563, + "step": 6204 + }, + { + "epoch": 0.94, + "grad_norm": 1.0950243012630387, + "learning_rate": 1.0737422387135765e-07, + "loss": 0.4646, + "step": 6205 + }, + { + "epoch": 0.94, + "grad_norm": 1.2311540739899822, + "learning_rate": 1.0687133353470713e-07, + "loss": 0.5451, + "step": 6206 + }, + { + "epoch": 0.94, + "grad_norm": 5.4556181216107085, + "learning_rate": 1.0636961089287522e-07, + "loss": 0.4073, + "step": 6207 + }, + { + "epoch": 0.94, + "grad_norm": 9.72157523524437, + "learning_rate": 1.058690560655945e-07, + "loss": 0.4027, + "step": 6208 + }, + { + "epoch": 0.94, + "grad_norm": 14.102804485824553, + "learning_rate": 1.0536966917231606e-07, + "loss": 0.4182, + "step": 6209 + }, + { + "epoch": 0.94, + "grad_norm": 6.096964757122657, + "learning_rate": 1.048714503322129e-07, + "loss": 0.3686, + "step": 6210 + }, + { + "epoch": 0.94, + "grad_norm": 6.325144212009119, + "learning_rate": 1.0437439966418161e-07, + "loss": 0.4216, + "step": 6211 + }, + { + "epoch": 0.94, + "grad_norm": 6.601879498296856, + "learning_rate": 1.0387851728683729e-07, + "loss": 0.4356, + "step": 6212 + }, + { + "epoch": 0.94, + "grad_norm": 6.988576442202231, + "learning_rate": 1.0338380331851694e-07, + "loss": 0.3926, + "step": 6213 + }, + { + "epoch": 0.94, + "grad_norm": 5.032648911924355, + "learning_rate": 1.0289025787727947e-07, + "loss": 0.329, + "step": 6214 + }, + { + "epoch": 0.94, + "grad_norm": 9.183799346618919, + "learning_rate": 1.0239788108090343e-07, + "loss": 0.4528, + "step": 6215 + }, + { + "epoch": 0.94, + "grad_norm": 13.753660575879948, + "learning_rate": 1.0190667304689094e-07, + "loss": 0.4737, + "step": 6216 + }, + { + "epoch": 0.94, + "grad_norm": 19.59175075494714, + "learning_rate": 1.014166338924627e-07, + "loss": 0.3934, + "step": 6217 + }, + { + "epoch": 0.94, + "grad_norm": 5.35005613758522, + "learning_rate": 1.0092776373456181e-07, + "loss": 0.3916, + "step": 6218 + }, + { + "epoch": 0.94, + "grad_norm": 8.560156293011428, + "learning_rate": 1.0044006268985218e-07, + "loss": 0.4236, + "step": 6219 + }, + { + "epoch": 0.94, + "grad_norm": 6.892760656111012, + "learning_rate": 9.995353087471848e-08, + "loss": 0.4406, + "step": 6220 + }, + { + "epoch": 0.94, + "grad_norm": 8.197428676206108, + "learning_rate": 9.946816840526674e-08, + "loss": 0.462, + "step": 6221 + }, + { + "epoch": 0.94, + "grad_norm": 18.138688334081312, + "learning_rate": 9.898397539732318e-08, + "loss": 0.4115, + "step": 6222 + }, + { + "epoch": 0.94, + "grad_norm": 7.734147240297599, + "learning_rate": 9.850095196643649e-08, + "loss": 0.4495, + "step": 6223 + }, + { + "epoch": 0.94, + "grad_norm": 8.21041995832657, + "learning_rate": 9.801909822787448e-08, + "loss": 0.4348, + "step": 6224 + }, + { + "epoch": 0.94, + "grad_norm": 1.3672457935183118, + "learning_rate": 9.753841429662626e-08, + "loss": 0.4983, + "step": 6225 + }, + { + "epoch": 0.94, + "grad_norm": 4.75535753004519, + "learning_rate": 9.705890028740284e-08, + "loss": 0.4493, + "step": 6226 + }, + { + "epoch": 0.94, + "grad_norm": 7.2818425810044145, + "learning_rate": 9.658055631463547e-08, + "loss": 0.4811, + "step": 6227 + }, + { + "epoch": 0.94, + "grad_norm": 6.5004438332178065, + "learning_rate": 9.610338249247509e-08, + "loss": 0.4306, + "step": 6228 + }, + { + "epoch": 0.94, + "grad_norm": 10.856745748908933, + "learning_rate": 9.562737893479556e-08, + "loss": 0.4241, + "step": 6229 + }, + { + "epoch": 0.94, + "grad_norm": 6.1205381727042685, + "learning_rate": 9.515254575518829e-08, + "loss": 0.3558, + "step": 6230 + }, + { + "epoch": 0.94, + "grad_norm": 8.80451632731278, + "learning_rate": 9.467888306696982e-08, + "loss": 0.4796, + "step": 6231 + }, + { + "epoch": 0.94, + "grad_norm": 6.8637371868357695, + "learning_rate": 9.420639098317252e-08, + "loss": 0.4235, + "step": 6232 + }, + { + "epoch": 0.94, + "grad_norm": 15.059627688805241, + "learning_rate": 9.373506961655343e-08, + "loss": 0.4188, + "step": 6233 + }, + { + "epoch": 0.94, + "grad_norm": 1.17776818776651, + "learning_rate": 9.326491907958812e-08, + "loss": 0.509, + "step": 6234 + }, + { + "epoch": 0.94, + "grad_norm": 4.416916361852539, + "learning_rate": 9.279593948447296e-08, + "loss": 0.4201, + "step": 6235 + }, + { + "epoch": 0.94, + "grad_norm": 14.180444136034877, + "learning_rate": 9.232813094312509e-08, + "loss": 0.4084, + "step": 6236 + }, + { + "epoch": 0.94, + "grad_norm": 5.308438770242068, + "learning_rate": 9.186149356718244e-08, + "loss": 0.4343, + "step": 6237 + }, + { + "epoch": 0.94, + "grad_norm": 11.439157840899536, + "learning_rate": 9.139602746800258e-08, + "loss": 0.3929, + "step": 6238 + }, + { + "epoch": 0.94, + "grad_norm": 5.222587676307345, + "learning_rate": 9.0931732756665e-08, + "loss": 0.376, + "step": 6239 + }, + { + "epoch": 0.94, + "grad_norm": 5.45966418695035, + "learning_rate": 9.046860954396886e-08, + "loss": 0.4553, + "step": 6240 + }, + { + "epoch": 0.94, + "grad_norm": 9.453528185798511, + "learning_rate": 9.00066579404335e-08, + "loss": 0.4835, + "step": 6241 + }, + { + "epoch": 0.94, + "grad_norm": 13.380526233604195, + "learning_rate": 8.954587805629856e-08, + "loss": 0.4695, + "step": 6242 + }, + { + "epoch": 0.94, + "grad_norm": 5.544152201976444, + "learning_rate": 8.908627000152492e-08, + "loss": 0.4822, + "step": 6243 + }, + { + "epoch": 0.94, + "grad_norm": 1.1088655596431665, + "learning_rate": 8.862783388579265e-08, + "loss": 0.506, + "step": 6244 + }, + { + "epoch": 0.94, + "grad_norm": 5.951002118295332, + "learning_rate": 8.817056981850369e-08, + "loss": 0.4107, + "step": 6245 + }, + { + "epoch": 0.94, + "grad_norm": 6.224758466188311, + "learning_rate": 8.771447790877908e-08, + "loss": 0.4776, + "step": 6246 + }, + { + "epoch": 0.94, + "grad_norm": 10.984385284862904, + "learning_rate": 8.725955826546006e-08, + "loss": 0.4134, + "step": 6247 + }, + { + "epoch": 0.94, + "grad_norm": 9.255270867898895, + "learning_rate": 8.680581099710873e-08, + "loss": 0.46, + "step": 6248 + }, + { + "epoch": 0.94, + "grad_norm": 6.900324054356084, + "learning_rate": 8.635323621200786e-08, + "loss": 0.4158, + "step": 6249 + }, + { + "epoch": 0.94, + "grad_norm": 10.9917552263978, + "learning_rate": 8.590183401815833e-08, + "loss": 0.4184, + "step": 6250 + }, + { + "epoch": 0.94, + "grad_norm": 5.765921449419827, + "learning_rate": 8.54516045232845e-08, + "loss": 0.4116, + "step": 6251 + }, + { + "epoch": 0.94, + "grad_norm": 8.4262308851782, + "learning_rate": 8.500254783482765e-08, + "loss": 0.4079, + "step": 6252 + }, + { + "epoch": 0.94, + "grad_norm": 8.755059067253178, + "learning_rate": 8.45546640599515e-08, + "loss": 0.38, + "step": 6253 + }, + { + "epoch": 0.94, + "grad_norm": 5.210292720432568, + "learning_rate": 8.410795330553778e-08, + "loss": 0.4571, + "step": 6254 + }, + { + "epoch": 0.94, + "grad_norm": 8.651356874701834, + "learning_rate": 8.366241567818956e-08, + "loss": 0.3776, + "step": 6255 + }, + { + "epoch": 0.94, + "grad_norm": 12.314024890424601, + "learning_rate": 8.321805128423122e-08, + "loss": 0.4299, + "step": 6256 + }, + { + "epoch": 0.94, + "grad_norm": 7.821411932237205, + "learning_rate": 8.277486022970404e-08, + "loss": 0.4901, + "step": 6257 + }, + { + "epoch": 0.94, + "grad_norm": 6.457702541839718, + "learning_rate": 8.233284262037178e-08, + "loss": 0.4552, + "step": 6258 + }, + { + "epoch": 0.94, + "grad_norm": 6.333146713571961, + "learning_rate": 8.189199856171725e-08, + "loss": 0.456, + "step": 6259 + }, + { + "epoch": 0.94, + "grad_norm": 1.3214667530385984, + "learning_rate": 8.145232815894355e-08, + "loss": 0.5036, + "step": 6260 + }, + { + "epoch": 0.94, + "grad_norm": 7.47030316604993, + "learning_rate": 8.10138315169734e-08, + "loss": 0.4781, + "step": 6261 + }, + { + "epoch": 0.94, + "grad_norm": 7.017416910082287, + "learning_rate": 8.057650874044976e-08, + "loss": 0.4112, + "step": 6262 + }, + { + "epoch": 0.94, + "grad_norm": 4.568797617582446, + "learning_rate": 8.01403599337347e-08, + "loss": 0.3967, + "step": 6263 + }, + { + "epoch": 0.94, + "grad_norm": 6.56097533380947, + "learning_rate": 7.970538520091108e-08, + "loss": 0.3991, + "step": 6264 + }, + { + "epoch": 0.94, + "grad_norm": 15.603113132324735, + "learning_rate": 7.927158464578032e-08, + "loss": 0.4866, + "step": 6265 + }, + { + "epoch": 0.95, + "grad_norm": 4.325758803233794, + "learning_rate": 7.883895837186573e-08, + "loss": 0.4307, + "step": 6266 + }, + { + "epoch": 0.95, + "grad_norm": 11.429880250929648, + "learning_rate": 7.840750648240857e-08, + "loss": 0.3861, + "step": 6267 + }, + { + "epoch": 0.95, + "grad_norm": 10.947449269433012, + "learning_rate": 7.797722908036987e-08, + "loss": 0.4308, + "step": 6268 + }, + { + "epoch": 0.95, + "grad_norm": 5.00072491795342, + "learning_rate": 7.75481262684319e-08, + "loss": 0.375, + "step": 6269 + }, + { + "epoch": 0.95, + "grad_norm": 10.121494868597713, + "learning_rate": 7.71201981489944e-08, + "loss": 0.4152, + "step": 6270 + }, + { + "epoch": 0.95, + "grad_norm": 12.508415058362617, + "learning_rate": 7.669344482417907e-08, + "loss": 0.4033, + "step": 6271 + }, + { + "epoch": 0.95, + "grad_norm": 7.011429860813396, + "learning_rate": 7.62678663958255e-08, + "loss": 0.4854, + "step": 6272 + }, + { + "epoch": 0.95, + "grad_norm": 17.447920375317437, + "learning_rate": 7.584346296549417e-08, + "loss": 0.4521, + "step": 6273 + }, + { + "epoch": 0.95, + "grad_norm": 11.407042500360957, + "learning_rate": 7.542023463446457e-08, + "loss": 0.4909, + "step": 6274 + }, + { + "epoch": 0.95, + "grad_norm": 5.649233769420017, + "learning_rate": 7.499818150373539e-08, + "loss": 0.4889, + "step": 6275 + }, + { + "epoch": 0.95, + "grad_norm": 8.00110288029275, + "learning_rate": 7.45773036740255e-08, + "loss": 0.4806, + "step": 6276 + }, + { + "epoch": 0.95, + "grad_norm": 10.32964137723021, + "learning_rate": 7.415760124577287e-08, + "loss": 0.4439, + "step": 6277 + }, + { + "epoch": 0.95, + "grad_norm": 5.758530169061703, + "learning_rate": 7.373907431913519e-08, + "loss": 0.4388, + "step": 6278 + }, + { + "epoch": 0.95, + "grad_norm": 9.923175238872517, + "learning_rate": 7.332172299399032e-08, + "loss": 0.4391, + "step": 6279 + }, + { + "epoch": 0.95, + "grad_norm": 6.721110009292348, + "learning_rate": 7.290554736993305e-08, + "loss": 0.4425, + "step": 6280 + }, + { + "epoch": 0.95, + "grad_norm": 6.504487975282788, + "learning_rate": 7.24905475462817e-08, + "loss": 0.4039, + "step": 6281 + }, + { + "epoch": 0.95, + "grad_norm": 7.116429704100573, + "learning_rate": 7.207672362207041e-08, + "loss": 0.4873, + "step": 6282 + }, + { + "epoch": 0.95, + "grad_norm": 7.5708142677286085, + "learning_rate": 7.166407569605404e-08, + "loss": 0.4147, + "step": 6283 + }, + { + "epoch": 0.95, + "grad_norm": 10.465201351311464, + "learning_rate": 7.125260386670718e-08, + "loss": 0.49, + "step": 6284 + }, + { + "epoch": 0.95, + "grad_norm": 8.969689070164696, + "learning_rate": 7.084230823222294e-08, + "loss": 0.4904, + "step": 6285 + }, + { + "epoch": 0.95, + "grad_norm": 4.687710035906269, + "learning_rate": 7.043318889051409e-08, + "loss": 0.4666, + "step": 6286 + }, + { + "epoch": 0.95, + "grad_norm": 10.26145084273816, + "learning_rate": 7.002524593921367e-08, + "loss": 0.428, + "step": 6287 + }, + { + "epoch": 0.95, + "grad_norm": 4.2681979010817015, + "learning_rate": 6.961847947567213e-08, + "loss": 0.3484, + "step": 6288 + }, + { + "epoch": 0.95, + "grad_norm": 10.769069016244952, + "learning_rate": 6.921288959696015e-08, + "loss": 0.4604, + "step": 6289 + }, + { + "epoch": 0.95, + "grad_norm": 1.2406731506893431, + "learning_rate": 6.880847639986809e-08, + "loss": 0.4679, + "step": 6290 + }, + { + "epoch": 0.95, + "grad_norm": 11.985781528201327, + "learning_rate": 6.840523998090431e-08, + "loss": 0.3882, + "step": 6291 + }, + { + "epoch": 0.95, + "grad_norm": 11.29599953459447, + "learning_rate": 6.800318043629794e-08, + "loss": 0.4848, + "step": 6292 + }, + { + "epoch": 0.95, + "grad_norm": 6.839041523855355, + "learning_rate": 6.760229786199556e-08, + "loss": 0.4722, + "step": 6293 + }, + { + "epoch": 0.95, + "grad_norm": 6.99172585405479, + "learning_rate": 6.7202592353664e-08, + "loss": 0.393, + "step": 6294 + }, + { + "epoch": 0.95, + "grad_norm": 9.314006183245606, + "learning_rate": 6.68040640066886e-08, + "loss": 0.4432, + "step": 6295 + }, + { + "epoch": 0.95, + "grad_norm": 10.01781299250835, + "learning_rate": 6.64067129161744e-08, + "loss": 0.4178, + "step": 6296 + }, + { + "epoch": 0.95, + "grad_norm": 6.394195086486468, + "learning_rate": 6.601053917694556e-08, + "loss": 0.4301, + "step": 6297 + }, + { + "epoch": 0.95, + "grad_norm": 4.826779852745231, + "learning_rate": 6.561554288354421e-08, + "loss": 0.407, + "step": 6298 + }, + { + "epoch": 0.95, + "grad_norm": 10.431016798834973, + "learning_rate": 6.522172413023165e-08, + "loss": 0.405, + "step": 6299 + }, + { + "epoch": 0.95, + "grad_norm": 1.4023904753945833, + "learning_rate": 6.482908301098934e-08, + "loss": 0.5305, + "step": 6300 + }, + { + "epoch": 0.95, + "grad_norm": 6.634721821716822, + "learning_rate": 6.443761961951789e-08, + "loss": 0.3993, + "step": 6301 + }, + { + "epoch": 0.95, + "grad_norm": 11.663410809290141, + "learning_rate": 6.404733404923424e-08, + "loss": 0.4235, + "step": 6302 + }, + { + "epoch": 0.95, + "grad_norm": 1.3037939125186047, + "learning_rate": 6.365822639327724e-08, + "loss": 0.5162, + "step": 6303 + }, + { + "epoch": 0.95, + "grad_norm": 8.2548184881288, + "learning_rate": 6.327029674450314e-08, + "loss": 0.4164, + "step": 6304 + }, + { + "epoch": 0.95, + "grad_norm": 1.0718293017262177, + "learning_rate": 6.288354519548679e-08, + "loss": 0.4989, + "step": 6305 + }, + { + "epoch": 0.95, + "grad_norm": 7.918522250962681, + "learning_rate": 6.24979718385238e-08, + "loss": 0.3669, + "step": 6306 + }, + { + "epoch": 0.95, + "grad_norm": 9.3224170557502, + "learning_rate": 6.211357676562613e-08, + "loss": 0.5019, + "step": 6307 + }, + { + "epoch": 0.95, + "grad_norm": 7.091087989153223, + "learning_rate": 6.173036006852595e-08, + "loss": 0.3646, + "step": 6308 + }, + { + "epoch": 0.95, + "grad_norm": 7.599578542821639, + "learning_rate": 6.134832183867456e-08, + "loss": 0.4109, + "step": 6309 + }, + { + "epoch": 0.95, + "grad_norm": 7.406878349537093, + "learning_rate": 6.09674621672407e-08, + "loss": 0.4126, + "step": 6310 + }, + { + "epoch": 0.95, + "grad_norm": 7.02480753055882, + "learning_rate": 6.058778114511333e-08, + "loss": 0.4367, + "step": 6311 + }, + { + "epoch": 0.95, + "grad_norm": 10.02759280320257, + "learning_rate": 6.02092788628983e-08, + "loss": 0.4175, + "step": 6312 + }, + { + "epoch": 0.95, + "grad_norm": 7.260191815563298, + "learning_rate": 5.983195541092224e-08, + "loss": 0.4395, + "step": 6313 + }, + { + "epoch": 0.95, + "grad_norm": 13.92889875208836, + "learning_rate": 5.945581087922925e-08, + "loss": 0.4362, + "step": 6314 + }, + { + "epoch": 0.95, + "grad_norm": 7.624150660621997, + "learning_rate": 5.9080845357581965e-08, + "loss": 0.4116, + "step": 6315 + }, + { + "epoch": 0.95, + "grad_norm": 29.043009347251978, + "learning_rate": 5.870705893546324e-08, + "loss": 0.394, + "step": 6316 + }, + { + "epoch": 0.95, + "grad_norm": 8.652851887471437, + "learning_rate": 5.8334451702072304e-08, + "loss": 0.4503, + "step": 6317 + }, + { + "epoch": 0.95, + "grad_norm": 22.906452452120956, + "learning_rate": 5.796302374632801e-08, + "loss": 0.4604, + "step": 6318 + }, + { + "epoch": 0.95, + "grad_norm": 12.886603936450323, + "learning_rate": 5.759277515686834e-08, + "loss": 0.4265, + "step": 6319 + }, + { + "epoch": 0.95, + "grad_norm": 10.728404051456792, + "learning_rate": 5.722370602204874e-08, + "loss": 0.4077, + "step": 6320 + }, + { + "epoch": 0.95, + "grad_norm": 5.99216077297953, + "learning_rate": 5.685581642994431e-08, + "loss": 0.3951, + "step": 6321 + }, + { + "epoch": 0.95, + "grad_norm": 6.542490385279293, + "learning_rate": 5.648910646834815e-08, + "loss": 0.436, + "step": 6322 + }, + { + "epoch": 0.95, + "grad_norm": 5.633001852234321, + "learning_rate": 5.612357622477138e-08, + "loss": 0.4052, + "step": 6323 + }, + { + "epoch": 0.95, + "grad_norm": 8.7343659676861, + "learning_rate": 5.5759225786444216e-08, + "loss": 0.4007, + "step": 6324 + }, + { + "epoch": 0.95, + "grad_norm": 12.059719197207686, + "learning_rate": 5.5396055240314883e-08, + "loss": 0.4538, + "step": 6325 + }, + { + "epoch": 0.95, + "grad_norm": 7.032819691692465, + "learning_rate": 5.503406467305128e-08, + "loss": 0.3745, + "step": 6326 + }, + { + "epoch": 0.95, + "grad_norm": 11.469563669440324, + "learning_rate": 5.4673254171037635e-08, + "loss": 0.408, + "step": 6327 + }, + { + "epoch": 0.95, + "grad_norm": 19.925974111131847, + "learning_rate": 5.4313623820377305e-08, + "loss": 0.4222, + "step": 6328 + }, + { + "epoch": 0.95, + "grad_norm": 15.275587544538409, + "learning_rate": 5.3955173706894405e-08, + "loss": 0.4476, + "step": 6329 + }, + { + "epoch": 0.95, + "grad_norm": 10.238281336076064, + "learning_rate": 5.359790391612718e-08, + "loss": 0.3873, + "step": 6330 + }, + { + "epoch": 0.95, + "grad_norm": 6.246258780544184, + "learning_rate": 5.324181453333521e-08, + "loss": 0.3904, + "step": 6331 + }, + { + "epoch": 0.96, + "grad_norm": 4.499224801930144, + "learning_rate": 5.288690564349608e-08, + "loss": 0.3831, + "step": 6332 + }, + { + "epoch": 0.96, + "grad_norm": 5.596704315212283, + "learning_rate": 5.2533177331304806e-08, + "loss": 0.4389, + "step": 6333 + }, + { + "epoch": 0.96, + "grad_norm": 7.980345955490508, + "learning_rate": 5.218062968117499e-08, + "loss": 0.4185, + "step": 6334 + }, + { + "epoch": 0.96, + "grad_norm": 9.27548363822673, + "learning_rate": 5.182926277723821e-08, + "loss": 0.4503, + "step": 6335 + }, + { + "epoch": 0.96, + "grad_norm": 7.070633468285622, + "learning_rate": 5.147907670334462e-08, + "loss": 0.4852, + "step": 6336 + }, + { + "epoch": 0.96, + "grad_norm": 6.317264199415102, + "learning_rate": 5.113007154306293e-08, + "loss": 0.4181, + "step": 6337 + }, + { + "epoch": 0.96, + "grad_norm": 8.664993413885735, + "learning_rate": 5.078224737967874e-08, + "loss": 0.4323, + "step": 6338 + }, + { + "epoch": 0.96, + "grad_norm": 6.795743005034303, + "learning_rate": 5.043560429619787e-08, + "loss": 0.4317, + "step": 6339 + }, + { + "epoch": 0.96, + "grad_norm": 6.874772361412588, + "learning_rate": 5.009014237534249e-08, + "loss": 0.5446, + "step": 6340 + }, + { + "epoch": 0.96, + "grad_norm": 9.37146880660566, + "learning_rate": 4.974586169955386e-08, + "loss": 0.3951, + "step": 6341 + }, + { + "epoch": 0.96, + "grad_norm": 7.1652422013938795, + "learning_rate": 4.940276235099073e-08, + "loss": 0.4307, + "step": 6342 + }, + { + "epoch": 0.96, + "grad_norm": 10.362350513948233, + "learning_rate": 4.906084441152981e-08, + "loss": 0.396, + "step": 6343 + }, + { + "epoch": 0.96, + "grad_norm": 6.835658017923685, + "learning_rate": 4.872010796276749e-08, + "loss": 0.4128, + "step": 6344 + }, + { + "epoch": 0.96, + "grad_norm": 6.81766170852999, + "learning_rate": 4.838055308601652e-08, + "loss": 0.4167, + "step": 6345 + }, + { + "epoch": 0.96, + "grad_norm": 1.1282704089583009, + "learning_rate": 4.804217986230763e-08, + "loss": 0.4789, + "step": 6346 + }, + { + "epoch": 0.96, + "grad_norm": 17.77406959076672, + "learning_rate": 4.7704988372390126e-08, + "loss": 0.4522, + "step": 6347 + }, + { + "epoch": 0.96, + "grad_norm": 5.778476244250957, + "learning_rate": 4.7368978696731846e-08, + "loss": 0.5139, + "step": 6348 + }, + { + "epoch": 0.96, + "grad_norm": 6.663569072693145, + "learning_rate": 4.703415091551755e-08, + "loss": 0.442, + "step": 6349 + }, + { + "epoch": 0.96, + "grad_norm": 6.313865315046639, + "learning_rate": 4.670050510865165e-08, + "loss": 0.4034, + "step": 6350 + }, + { + "epoch": 0.96, + "grad_norm": 14.883221809590651, + "learning_rate": 4.6368041355753234e-08, + "loss": 0.3708, + "step": 6351 + }, + { + "epoch": 0.96, + "grad_norm": 16.184157583432206, + "learning_rate": 4.6036759736163286e-08, + "loss": 0.4693, + "step": 6352 + }, + { + "epoch": 0.96, + "grad_norm": 11.538429776588618, + "learning_rate": 4.570666032893689e-08, + "loss": 0.3846, + "step": 6353 + }, + { + "epoch": 0.96, + "grad_norm": 5.740438768383062, + "learning_rate": 4.537774321285105e-08, + "loss": 0.4036, + "step": 6354 + }, + { + "epoch": 0.96, + "grad_norm": 11.094305721374937, + "learning_rate": 4.505000846639685e-08, + "loss": 0.3677, + "step": 6355 + }, + { + "epoch": 0.96, + "grad_norm": 7.689872612810646, + "learning_rate": 4.4723456167785616e-08, + "loss": 0.437, + "step": 6356 + }, + { + "epoch": 0.96, + "grad_norm": 7.2137954511819125, + "learning_rate": 4.4398086394945026e-08, + "loss": 0.4495, + "step": 6357 + }, + { + "epoch": 0.96, + "grad_norm": 11.510387041526407, + "learning_rate": 4.407389922552185e-08, + "loss": 0.3892, + "step": 6358 + }, + { + "epoch": 0.96, + "grad_norm": 6.357229057087444, + "learning_rate": 4.375089473687921e-08, + "loss": 0.4452, + "step": 6359 + }, + { + "epoch": 0.96, + "grad_norm": 8.765084299397444, + "learning_rate": 4.3429073006099886e-08, + "loss": 0.4471, + "step": 6360 + }, + { + "epoch": 0.96, + "grad_norm": 7.950980556609874, + "learning_rate": 4.310843410998244e-08, + "loss": 0.408, + "step": 6361 + }, + { + "epoch": 0.96, + "grad_norm": 1.1516825670888677, + "learning_rate": 4.278897812504457e-08, + "loss": 0.4804, + "step": 6362 + }, + { + "epoch": 0.96, + "grad_norm": 8.353312920986031, + "learning_rate": 4.247070512752083e-08, + "loss": 0.4179, + "step": 6363 + }, + { + "epoch": 0.96, + "grad_norm": 7.1134559667998545, + "learning_rate": 4.215361519336436e-08, + "loss": 0.4269, + "step": 6364 + }, + { + "epoch": 0.96, + "grad_norm": 10.993329363549481, + "learning_rate": 4.183770839824464e-08, + "loss": 0.4467, + "step": 6365 + }, + { + "epoch": 0.96, + "grad_norm": 1.0782704789222695, + "learning_rate": 4.1522984817549685e-08, + "loss": 0.5193, + "step": 6366 + }, + { + "epoch": 0.96, + "grad_norm": 5.941580645300886, + "learning_rate": 4.120944452638609e-08, + "loss": 0.4131, + "step": 6367 + }, + { + "epoch": 0.96, + "grad_norm": 30.35268772377776, + "learning_rate": 4.089708759957567e-08, + "loss": 0.3911, + "step": 6368 + }, + { + "epoch": 0.96, + "grad_norm": 7.811053553892459, + "learning_rate": 4.058591411165991e-08, + "loss": 0.3856, + "step": 6369 + }, + { + "epoch": 0.96, + "grad_norm": 9.938666508186083, + "learning_rate": 4.027592413689663e-08, + "loss": 0.4051, + "step": 6370 + }, + { + "epoch": 0.96, + "grad_norm": 6.736984749197012, + "learning_rate": 3.996711774926221e-08, + "loss": 0.412, + "step": 6371 + }, + { + "epoch": 0.96, + "grad_norm": 7.947262093428573, + "learning_rate": 3.9659495022450476e-08, + "loss": 0.3723, + "step": 6372 + }, + { + "epoch": 0.96, + "grad_norm": 8.598192862602188, + "learning_rate": 3.93530560298716e-08, + "loss": 0.4488, + "step": 6373 + }, + { + "epoch": 0.96, + "grad_norm": 6.75004632959922, + "learning_rate": 3.9047800844654316e-08, + "loss": 0.4838, + "step": 6374 + }, + { + "epoch": 0.96, + "grad_norm": 6.935638982323909, + "learning_rate": 3.8743729539645335e-08, + "loss": 0.4391, + "step": 6375 + }, + { + "epoch": 0.96, + "grad_norm": 9.282839817522767, + "learning_rate": 3.8440842187406624e-08, + "loss": 0.4274, + "step": 6376 + }, + { + "epoch": 0.96, + "grad_norm": 11.811873216177023, + "learning_rate": 3.813913886022091e-08, + "loss": 0.3425, + "step": 6377 + }, + { + "epoch": 0.96, + "grad_norm": 17.406691558001874, + "learning_rate": 3.7838619630085614e-08, + "loss": 0.4302, + "step": 6378 + }, + { + "epoch": 0.96, + "grad_norm": 6.44707879629809, + "learning_rate": 3.7539284568716695e-08, + "loss": 0.4069, + "step": 6379 + }, + { + "epoch": 0.96, + "grad_norm": 53.576056872120446, + "learning_rate": 3.7241133747547566e-08, + "loss": 0.4504, + "step": 6380 + }, + { + "epoch": 0.96, + "grad_norm": 7.456705337740436, + "learning_rate": 3.694416723772853e-08, + "loss": 0.4545, + "step": 6381 + }, + { + "epoch": 0.96, + "grad_norm": 7.590957538849181, + "learning_rate": 3.6648385110127914e-08, + "loss": 0.4294, + "step": 6382 + }, + { + "epoch": 0.96, + "grad_norm": 7.380673246140858, + "learning_rate": 3.635378743533036e-08, + "loss": 0.3266, + "step": 6383 + }, + { + "epoch": 0.96, + "grad_norm": 4.869573716862622, + "learning_rate": 3.6060374283639623e-08, + "loss": 0.4105, + "step": 6384 + }, + { + "epoch": 0.96, + "grad_norm": 19.764516354673134, + "learning_rate": 3.5768145725075254e-08, + "loss": 0.431, + "step": 6385 + }, + { + "epoch": 0.96, + "grad_norm": 7.6059966910546, + "learning_rate": 3.547710182937425e-08, + "loss": 0.5175, + "step": 6386 + }, + { + "epoch": 0.96, + "grad_norm": 7.706697493953652, + "learning_rate": 3.518724266599216e-08, + "loss": 0.4397, + "step": 6387 + }, + { + "epoch": 0.96, + "grad_norm": 7.270126018500545, + "learning_rate": 3.4898568304099765e-08, + "loss": 0.4052, + "step": 6388 + }, + { + "epoch": 0.96, + "grad_norm": 1.0861909327135764, + "learning_rate": 3.461107881258696e-08, + "loss": 0.5001, + "step": 6389 + }, + { + "epoch": 0.96, + "grad_norm": 11.83188469126037, + "learning_rate": 3.4324774260060534e-08, + "loss": 0.3851, + "step": 6390 + }, + { + "epoch": 0.96, + "grad_norm": 9.051978433052861, + "learning_rate": 3.403965471484305e-08, + "loss": 0.4744, + "step": 6391 + }, + { + "epoch": 0.96, + "grad_norm": 12.165182441355029, + "learning_rate": 3.3755720244975644e-08, + "loss": 0.4219, + "step": 6392 + }, + { + "epoch": 0.96, + "grad_norm": 13.1365429976562, + "learning_rate": 3.347297091821688e-08, + "loss": 0.4167, + "step": 6393 + }, + { + "epoch": 0.96, + "grad_norm": 10.189605261849392, + "learning_rate": 3.3191406802041693e-08, + "loss": 0.415, + "step": 6394 + }, + { + "epoch": 0.96, + "grad_norm": 8.411579948771681, + "learning_rate": 3.2911027963642426e-08, + "loss": 0.4383, + "step": 6395 + }, + { + "epoch": 0.96, + "grad_norm": 7.541251057207983, + "learning_rate": 3.2631834469928344e-08, + "loss": 0.4211, + "step": 6396 + }, + { + "epoch": 0.96, + "grad_norm": 8.877095413449062, + "learning_rate": 3.235382638752671e-08, + "loss": 0.4256, + "step": 6397 + }, + { + "epoch": 0.97, + "grad_norm": 13.256902337495772, + "learning_rate": 3.2077003782780556e-08, + "loss": 0.4473, + "step": 6398 + }, + { + "epoch": 0.97, + "grad_norm": 12.187651781279879, + "learning_rate": 3.180136672175149e-08, + "loss": 0.4015, + "step": 6399 + }, + { + "epoch": 0.97, + "grad_norm": 7.3717444832438055, + "learning_rate": 3.152691527021745e-08, + "loss": 0.4136, + "step": 6400 + }, + { + "epoch": 0.97, + "grad_norm": 1.1265465572278124, + "learning_rate": 3.125364949367271e-08, + "loss": 0.4865, + "step": 6401 + }, + { + "epoch": 0.97, + "grad_norm": 1.2124475643556294, + "learning_rate": 3.098156945732955e-08, + "loss": 0.4712, + "step": 6402 + }, + { + "epoch": 0.97, + "grad_norm": 7.83264450950352, + "learning_rate": 3.0710675226117704e-08, + "loss": 0.3976, + "step": 6403 + }, + { + "epoch": 0.97, + "grad_norm": 8.575853149319501, + "learning_rate": 3.044096686468323e-08, + "loss": 0.4171, + "step": 6404 + }, + { + "epoch": 0.97, + "grad_norm": 7.290732658188067, + "learning_rate": 3.0172444437387984e-08, + "loss": 0.426, + "step": 6405 + }, + { + "epoch": 0.97, + "grad_norm": 6.87231698407366, + "learning_rate": 2.990510800831348e-08, + "loss": 0.4058, + "step": 6406 + }, + { + "epoch": 0.97, + "grad_norm": 12.767768156757501, + "learning_rate": 2.963895764125646e-08, + "loss": 0.4227, + "step": 6407 + }, + { + "epoch": 0.97, + "grad_norm": 5.571654312087443, + "learning_rate": 2.9373993399730015e-08, + "loss": 0.3978, + "step": 6408 + }, + { + "epoch": 0.97, + "grad_norm": 9.900859353435743, + "learning_rate": 2.9110215346966896e-08, + "loss": 0.3694, + "step": 6409 + }, + { + "epoch": 0.97, + "grad_norm": 6.116910089222625, + "learning_rate": 2.8847623545913972e-08, + "loss": 0.421, + "step": 6410 + }, + { + "epoch": 0.97, + "grad_norm": 15.75342509575384, + "learning_rate": 2.858621805923556e-08, + "loss": 0.4465, + "step": 6411 + }, + { + "epoch": 0.97, + "grad_norm": 9.897233728633616, + "learning_rate": 2.8325998949314536e-08, + "loss": 0.3729, + "step": 6412 + }, + { + "epoch": 0.97, + "grad_norm": 8.73804443388177, + "learning_rate": 2.8066966278248452e-08, + "loss": 0.391, + "step": 6413 + }, + { + "epoch": 0.97, + "grad_norm": 9.894319376587845, + "learning_rate": 2.7809120107853972e-08, + "loss": 0.4669, + "step": 6414 + }, + { + "epoch": 0.97, + "grad_norm": 5.472225806128964, + "learning_rate": 2.7552460499662425e-08, + "loss": 0.4556, + "step": 6415 + }, + { + "epoch": 0.97, + "grad_norm": 16.35753675542616, + "learning_rate": 2.7296987514923712e-08, + "loss": 0.4654, + "step": 6416 + }, + { + "epoch": 0.97, + "grad_norm": 12.960255954754041, + "learning_rate": 2.704270121460295e-08, + "loss": 0.4158, + "step": 6417 + }, + { + "epoch": 0.97, + "grad_norm": 5.7894061718514624, + "learning_rate": 2.6789601659383825e-08, + "loss": 0.3923, + "step": 6418 + }, + { + "epoch": 0.97, + "grad_norm": 13.719924428557155, + "learning_rate": 2.653768890966524e-08, + "loss": 0.4477, + "step": 6419 + }, + { + "epoch": 0.97, + "grad_norm": 6.900415858421535, + "learning_rate": 2.6286963025564107e-08, + "loss": 0.3456, + "step": 6420 + }, + { + "epoch": 0.97, + "grad_norm": 21.06817031022724, + "learning_rate": 2.603742406691312e-08, + "loss": 0.4639, + "step": 6421 + }, + { + "epoch": 0.97, + "grad_norm": 16.21772980058928, + "learning_rate": 2.578907209326298e-08, + "loss": 0.3669, + "step": 6422 + }, + { + "epoch": 0.97, + "grad_norm": 7.880611174455782, + "learning_rate": 2.5541907163879608e-08, + "loss": 0.3786, + "step": 6423 + }, + { + "epoch": 0.97, + "grad_norm": 9.68457715615109, + "learning_rate": 2.5295929337746385e-08, + "loss": 0.4214, + "step": 6424 + }, + { + "epoch": 0.97, + "grad_norm": 5.645144525956304, + "learning_rate": 2.505113867356357e-08, + "loss": 0.3484, + "step": 6425 + }, + { + "epoch": 0.97, + "grad_norm": 6.147909665755614, + "learning_rate": 2.4807535229748327e-08, + "loss": 0.4256, + "step": 6426 + }, + { + "epoch": 0.97, + "grad_norm": 8.524605171572896, + "learning_rate": 2.4565119064433595e-08, + "loss": 0.5083, + "step": 6427 + }, + { + "epoch": 0.97, + "grad_norm": 4.612651645571024, + "learning_rate": 2.4323890235469215e-08, + "loss": 0.4247, + "step": 6428 + }, + { + "epoch": 0.97, + "grad_norm": 9.193682687598692, + "learning_rate": 2.4083848800423027e-08, + "loss": 0.4329, + "step": 6429 + }, + { + "epoch": 0.97, + "grad_norm": 9.813235195391314, + "learning_rate": 2.3844994816577538e-08, + "loss": 0.4039, + "step": 6430 + }, + { + "epoch": 0.97, + "grad_norm": 6.969331476256643, + "learning_rate": 2.3607328340932713e-08, + "loss": 0.4301, + "step": 6431 + }, + { + "epoch": 0.97, + "grad_norm": 5.189955892024086, + "learning_rate": 2.3370849430205956e-08, + "loss": 0.4373, + "step": 6432 + }, + { + "epoch": 0.97, + "grad_norm": 10.793033310378792, + "learning_rate": 2.3135558140829906e-08, + "loss": 0.4075, + "step": 6433 + }, + { + "epoch": 0.97, + "grad_norm": 7.8542222453754995, + "learning_rate": 2.290145452895465e-08, + "loss": 0.3903, + "step": 6434 + }, + { + "epoch": 0.97, + "grad_norm": 13.904417722870559, + "learning_rate": 2.266853865044716e-08, + "loss": 0.442, + "step": 6435 + }, + { + "epoch": 0.97, + "grad_norm": 5.861851640662365, + "learning_rate": 2.2436810560889088e-08, + "loss": 0.4072, + "step": 6436 + }, + { + "epoch": 0.97, + "grad_norm": 7.510979629817108, + "learning_rate": 2.2206270315581203e-08, + "loss": 0.4119, + "step": 6437 + }, + { + "epoch": 0.97, + "grad_norm": 10.119670279977187, + "learning_rate": 2.1976917969538934e-08, + "loss": 0.4636, + "step": 6438 + }, + { + "epoch": 0.97, + "grad_norm": 7.1923880321675515, + "learning_rate": 2.174875357749462e-08, + "loss": 0.4003, + "step": 6439 + }, + { + "epoch": 0.97, + "grad_norm": 4.84732399017021, + "learning_rate": 2.1521777193898032e-08, + "loss": 0.456, + "step": 6440 + }, + { + "epoch": 0.97, + "grad_norm": 13.829982068583558, + "learning_rate": 2.129598887291473e-08, + "loss": 0.4334, + "step": 6441 + }, + { + "epoch": 0.97, + "grad_norm": 11.99644636017591, + "learning_rate": 2.1071388668426063e-08, + "loss": 0.4352, + "step": 6442 + }, + { + "epoch": 0.97, + "grad_norm": 7.436932653803079, + "learning_rate": 2.084797663403082e-08, + "loss": 0.3544, + "step": 6443 + }, + { + "epoch": 0.97, + "grad_norm": 6.0265222549424315, + "learning_rate": 2.0625752823044686e-08, + "loss": 0.4098, + "step": 6444 + }, + { + "epoch": 0.97, + "grad_norm": 5.533815621059066, + "learning_rate": 2.0404717288498577e-08, + "loss": 0.4227, + "step": 6445 + }, + { + "epoch": 0.97, + "grad_norm": 7.676943038409958, + "learning_rate": 2.018487008313974e-08, + "loss": 0.381, + "step": 6446 + }, + { + "epoch": 0.97, + "grad_norm": 5.973367476038849, + "learning_rate": 1.9966211259433433e-08, + "loss": 0.4027, + "step": 6447 + }, + { + "epoch": 0.97, + "grad_norm": 6.031779613165817, + "learning_rate": 1.974874086955958e-08, + "loss": 0.4119, + "step": 6448 + }, + { + "epoch": 0.97, + "grad_norm": 1.328670306126444, + "learning_rate": 1.953245896541611e-08, + "loss": 0.526, + "step": 6449 + }, + { + "epoch": 0.97, + "grad_norm": 7.590351684617457, + "learning_rate": 1.9317365598615635e-08, + "loss": 0.4596, + "step": 6450 + }, + { + "epoch": 0.97, + "grad_norm": 6.563792384853496, + "learning_rate": 1.9103460820488195e-08, + "loss": 0.4054, + "step": 6451 + }, + { + "epoch": 0.97, + "grad_norm": 6.425370254450405, + "learning_rate": 1.8890744682080743e-08, + "loss": 0.4289, + "step": 6452 + }, + { + "epoch": 0.97, + "grad_norm": 5.296060980983347, + "learning_rate": 1.8679217234154335e-08, + "loss": 0.3997, + "step": 6453 + }, + { + "epoch": 0.97, + "grad_norm": 5.9644398418069136, + "learning_rate": 1.846887852718915e-08, + "loss": 0.4395, + "step": 6454 + }, + { + "epoch": 0.97, + "grad_norm": 5.971959801762859, + "learning_rate": 1.825972861137948e-08, + "loss": 0.4254, + "step": 6455 + }, + { + "epoch": 0.97, + "grad_norm": 8.982906798503222, + "learning_rate": 1.8051767536637066e-08, + "loss": 0.3822, + "step": 6456 + }, + { + "epoch": 0.97, + "grad_norm": 10.471977421220158, + "learning_rate": 1.7844995352589988e-08, + "loss": 0.5026, + "step": 6457 + }, + { + "epoch": 0.97, + "grad_norm": 7.7127969029382575, + "learning_rate": 1.763941210858211e-08, + "loss": 0.4338, + "step": 6458 + }, + { + "epoch": 0.97, + "grad_norm": 9.80231238149647, + "learning_rate": 1.7435017853673632e-08, + "loss": 0.3877, + "step": 6459 + }, + { + "epoch": 0.97, + "grad_norm": 6.180318034663948, + "learning_rate": 1.7231812636641644e-08, + "loss": 0.4328, + "step": 6460 + }, + { + "epoch": 0.97, + "grad_norm": 5.83264658347131, + "learning_rate": 1.7029796505977912e-08, + "loss": 0.4705, + "step": 6461 + }, + { + "epoch": 0.97, + "grad_norm": 7.284943448992248, + "learning_rate": 1.682896950989277e-08, + "loss": 0.4559, + "step": 6462 + }, + { + "epoch": 0.97, + "grad_norm": 6.31368289069024, + "learning_rate": 1.6629331696310646e-08, + "loss": 0.4537, + "step": 6463 + }, + { + "epoch": 0.97, + "grad_norm": 8.517304645963296, + "learning_rate": 1.6430883112872887e-08, + "loss": 0.4226, + "step": 6464 + }, + { + "epoch": 0.98, + "grad_norm": 10.36662082306044, + "learning_rate": 1.623362380693827e-08, + "loss": 0.4989, + "step": 6465 + }, + { + "epoch": 0.98, + "grad_norm": 7.240004835862762, + "learning_rate": 1.6037553825579145e-08, + "loss": 0.4172, + "step": 6466 + }, + { + "epoch": 0.98, + "grad_norm": 9.60713598166183, + "learning_rate": 1.5842673215587524e-08, + "loss": 0.4081, + "step": 6467 + }, + { + "epoch": 0.98, + "grad_norm": 16.947889877563853, + "learning_rate": 1.564898202346843e-08, + "loss": 0.4346, + "step": 6468 + }, + { + "epoch": 0.98, + "grad_norm": 27.907913229975527, + "learning_rate": 1.5456480295443776e-08, + "loss": 0.4075, + "step": 6469 + }, + { + "epoch": 0.98, + "grad_norm": 11.434089112970549, + "learning_rate": 1.5265168077453485e-08, + "loss": 0.3341, + "step": 6470 + }, + { + "epoch": 0.98, + "grad_norm": 6.98913586305444, + "learning_rate": 1.507504541515159e-08, + "loss": 0.4662, + "step": 6471 + }, + { + "epoch": 0.98, + "grad_norm": 7.395349280237933, + "learning_rate": 1.4886112353908467e-08, + "loss": 0.4337, + "step": 6472 + }, + { + "epoch": 0.98, + "grad_norm": 8.922925738733252, + "learning_rate": 1.4698368938811935e-08, + "loss": 0.3577, + "step": 6473 + }, + { + "epoch": 0.98, + "grad_norm": 8.582957335555795, + "learning_rate": 1.4511815214664493e-08, + "loss": 0.3827, + "step": 6474 + }, + { + "epoch": 0.98, + "grad_norm": 6.362638682698845, + "learning_rate": 1.4326451225985527e-08, + "loss": 0.4074, + "step": 6475 + }, + { + "epoch": 0.98, + "grad_norm": 6.472823235569069, + "learning_rate": 1.4142277017009653e-08, + "loss": 0.4173, + "step": 6476 + }, + { + "epoch": 0.98, + "grad_norm": 7.531352106977007, + "learning_rate": 1.3959292631688937e-08, + "loss": 0.372, + "step": 6477 + }, + { + "epoch": 0.98, + "grad_norm": 4.9699405184098495, + "learning_rate": 1.3777498113690114e-08, + "loss": 0.4276, + "step": 6478 + }, + { + "epoch": 0.98, + "grad_norm": 9.988353369551623, + "learning_rate": 1.3596893506396813e-08, + "loss": 0.4201, + "step": 6479 + }, + { + "epoch": 0.98, + "grad_norm": 11.52132734798953, + "learning_rate": 1.341747885290845e-08, + "loss": 0.4028, + "step": 6480 + }, + { + "epoch": 0.98, + "grad_norm": 8.005767209905986, + "learning_rate": 1.3239254196040775e-08, + "loss": 0.3939, + "step": 6481 + }, + { + "epoch": 0.98, + "grad_norm": 27.225755104526392, + "learning_rate": 1.3062219578324765e-08, + "loss": 0.4322, + "step": 6482 + }, + { + "epoch": 0.98, + "grad_norm": 7.125091154746191, + "learning_rate": 1.2886375042008292e-08, + "loss": 0.4799, + "step": 6483 + }, + { + "epoch": 0.98, + "grad_norm": 32.370190311616646, + "learning_rate": 1.2711720629055014e-08, + "loss": 0.5071, + "step": 6484 + }, + { + "epoch": 0.98, + "grad_norm": 10.030081262550622, + "learning_rate": 1.253825638114381e-08, + "loss": 0.4175, + "step": 6485 + }, + { + "epoch": 0.98, + "grad_norm": 10.618309708074467, + "learning_rate": 1.2365982339670456e-08, + "loss": 0.4048, + "step": 6486 + }, + { + "epoch": 0.98, + "grad_norm": 5.404533079230864, + "learning_rate": 1.2194898545745958e-08, + "loss": 0.4034, + "step": 6487 + }, + { + "epoch": 0.98, + "grad_norm": 11.198495791155826, + "learning_rate": 1.2025005040198768e-08, + "loss": 0.48, + "step": 6488 + }, + { + "epoch": 0.98, + "grad_norm": 9.748227644663272, + "learning_rate": 1.1856301863570895e-08, + "loss": 0.4119, + "step": 6489 + }, + { + "epoch": 0.98, + "grad_norm": 6.334942972879035, + "learning_rate": 1.1688789056122918e-08, + "loss": 0.3999, + "step": 6490 + }, + { + "epoch": 0.98, + "grad_norm": 6.231604751158212, + "learning_rate": 1.1522466657828968e-08, + "loss": 0.4366, + "step": 6491 + }, + { + "epoch": 0.98, + "grad_norm": 5.260880021913545, + "learning_rate": 1.135733470838063e-08, + "loss": 0.4054, + "step": 6492 + }, + { + "epoch": 0.98, + "grad_norm": 23.534543982647225, + "learning_rate": 1.1193393247185268e-08, + "loss": 0.3619, + "step": 6493 + }, + { + "epoch": 0.98, + "grad_norm": 6.631052704548437, + "learning_rate": 1.1030642313364926e-08, + "loss": 0.4331, + "step": 6494 + }, + { + "epoch": 0.98, + "grad_norm": 6.3703449951391065, + "learning_rate": 1.0869081945759086e-08, + "loss": 0.4452, + "step": 6495 + }, + { + "epoch": 0.98, + "grad_norm": 5.7736360582718165, + "learning_rate": 1.070871218292302e-08, + "loss": 0.3481, + "step": 6496 + }, + { + "epoch": 0.98, + "grad_norm": 11.95300179592741, + "learning_rate": 1.0549533063126117e-08, + "loss": 0.5196, + "step": 6497 + }, + { + "epoch": 0.98, + "grad_norm": 9.830245134714541, + "learning_rate": 1.0391544624355764e-08, + "loss": 0.4423, + "step": 6498 + }, + { + "epoch": 0.98, + "grad_norm": 7.061023454513648, + "learning_rate": 1.0234746904313475e-08, + "loss": 0.4119, + "step": 6499 + }, + { + "epoch": 0.98, + "grad_norm": 7.385398856404234, + "learning_rate": 1.0079139940418204e-08, + "loss": 0.4342, + "step": 6500 + }, + { + "epoch": 0.98, + "grad_norm": 10.509958243540073, + "learning_rate": 9.924723769803025e-09, + "loss": 0.4132, + "step": 6501 + }, + { + "epoch": 0.98, + "grad_norm": 5.835439704650566, + "learning_rate": 9.771498429319015e-09, + "loss": 0.4379, + "step": 6502 + }, + { + "epoch": 0.98, + "grad_norm": 7.23291163722602, + "learning_rate": 9.619463955530817e-09, + "loss": 0.4798, + "step": 6503 + }, + { + "epoch": 0.98, + "grad_norm": 7.479682096238707, + "learning_rate": 9.468620384720517e-09, + "loss": 0.4282, + "step": 6504 + }, + { + "epoch": 0.98, + "grad_norm": 8.836813033634854, + "learning_rate": 9.318967752884878e-09, + "loss": 0.427, + "step": 6505 + }, + { + "epoch": 0.98, + "grad_norm": 9.46167019601088, + "learning_rate": 9.170506095736997e-09, + "loss": 0.3369, + "step": 6506 + }, + { + "epoch": 0.98, + "grad_norm": 1.130726300039751, + "learning_rate": 9.023235448706313e-09, + "loss": 0.534, + "step": 6507 + }, + { + "epoch": 0.98, + "grad_norm": 1.3674810779102473, + "learning_rate": 8.877155846936935e-09, + "loss": 0.5374, + "step": 6508 + }, + { + "epoch": 0.98, + "grad_norm": 8.813987012740299, + "learning_rate": 8.732267325289313e-09, + "loss": 0.4373, + "step": 6509 + }, + { + "epoch": 0.98, + "grad_norm": 7.064414994709687, + "learning_rate": 8.588569918339119e-09, + "loss": 0.4477, + "step": 6510 + }, + { + "epoch": 0.98, + "grad_norm": 9.540154813893036, + "learning_rate": 8.446063660379478e-09, + "loss": 0.46, + "step": 6511 + }, + { + "epoch": 0.98, + "grad_norm": 6.74478979995469, + "learning_rate": 8.304748585417077e-09, + "loss": 0.4481, + "step": 6512 + }, + { + "epoch": 0.98, + "grad_norm": 11.72029709553076, + "learning_rate": 8.164624727175497e-09, + "loss": 0.4275, + "step": 6513 + }, + { + "epoch": 0.98, + "grad_norm": 8.644498929976521, + "learning_rate": 8.025692119094098e-09, + "loss": 0.383, + "step": 6514 + }, + { + "epoch": 0.98, + "grad_norm": 13.074901320376275, + "learning_rate": 7.88795079432747e-09, + "loss": 0.4662, + "step": 6515 + }, + { + "epoch": 0.98, + "grad_norm": 8.666675314853856, + "learning_rate": 7.751400785746543e-09, + "loss": 0.4556, + "step": 6516 + }, + { + "epoch": 0.98, + "grad_norm": 5.840353480782144, + "learning_rate": 7.616042125936918e-09, + "loss": 0.3813, + "step": 6517 + }, + { + "epoch": 0.98, + "grad_norm": 6.339618661586898, + "learning_rate": 7.481874847201087e-09, + "loss": 0.471, + "step": 6518 + }, + { + "epoch": 0.98, + "grad_norm": 5.546323179783108, + "learning_rate": 7.3488989815567735e-09, + "loss": 0.3861, + "step": 6519 + }, + { + "epoch": 0.98, + "grad_norm": 5.962552033136589, + "learning_rate": 7.217114560736926e-09, + "loss": 0.4295, + "step": 6520 + }, + { + "epoch": 0.98, + "grad_norm": 6.465100018527397, + "learning_rate": 7.0865216161902785e-09, + "loss": 0.4431, + "step": 6521 + }, + { + "epoch": 0.98, + "grad_norm": 6.280162373325864, + "learning_rate": 6.9571201790818995e-09, + "loss": 0.4637, + "step": 6522 + }, + { + "epoch": 0.98, + "grad_norm": 5.439019209192012, + "learning_rate": 6.828910280292644e-09, + "loss": 0.3516, + "step": 6523 + }, + { + "epoch": 0.98, + "grad_norm": 6.612471242078843, + "learning_rate": 6.701891950417483e-09, + "loss": 0.4384, + "step": 6524 + }, + { + "epoch": 0.98, + "grad_norm": 16.100636376468643, + "learning_rate": 6.57606521976828e-09, + "loss": 0.3679, + "step": 6525 + }, + { + "epoch": 0.98, + "grad_norm": 6.748531413414321, + "learning_rate": 6.4514301183726814e-09, + "loss": 0.4333, + "step": 6526 + }, + { + "epoch": 0.98, + "grad_norm": 12.888410363356453, + "learning_rate": 6.327986675973563e-09, + "loss": 0.4349, + "step": 6527 + }, + { + "epoch": 0.98, + "grad_norm": 6.808863334609339, + "learning_rate": 6.2057349220295826e-09, + "loss": 0.3965, + "step": 6528 + }, + { + "epoch": 0.98, + "grad_norm": 7.2689411029315725, + "learning_rate": 6.084674885714071e-09, + "loss": 0.4758, + "step": 6529 + }, + { + "epoch": 0.98, + "grad_norm": 6.448588047583308, + "learning_rate": 5.9648065959172494e-09, + "loss": 0.4242, + "step": 6530 + }, + { + "epoch": 0.99, + "grad_norm": 6.5780481416273915, + "learning_rate": 5.846130081244017e-09, + "loss": 0.3675, + "step": 6531 + }, + { + "epoch": 0.99, + "grad_norm": 7.587618761015889, + "learning_rate": 5.728645370016162e-09, + "loss": 0.3922, + "step": 6532 + }, + { + "epoch": 0.99, + "grad_norm": 7.455604512657786, + "learning_rate": 5.612352490270146e-09, + "loss": 0.4353, + "step": 6533 + }, + { + "epoch": 0.99, + "grad_norm": 6.4866405693172, + "learning_rate": 5.497251469757103e-09, + "loss": 0.3575, + "step": 6534 + }, + { + "epoch": 0.99, + "grad_norm": 11.315193835655208, + "learning_rate": 5.383342335945063e-09, + "loss": 0.3951, + "step": 6535 + }, + { + "epoch": 0.99, + "grad_norm": 8.324848658753268, + "learning_rate": 5.270625116017836e-09, + "loss": 0.4027, + "step": 6536 + }, + { + "epoch": 0.99, + "grad_norm": 13.213150254317688, + "learning_rate": 5.159099836873904e-09, + "loss": 0.4901, + "step": 6537 + }, + { + "epoch": 0.99, + "grad_norm": 11.83011277915953, + "learning_rate": 5.048766525127535e-09, + "loss": 0.3583, + "step": 6538 + }, + { + "epoch": 0.99, + "grad_norm": 11.540333420472034, + "learning_rate": 4.939625207108778e-09, + "loss": 0.4371, + "step": 6539 + }, + { + "epoch": 0.99, + "grad_norm": 1.124167799218047, + "learning_rate": 4.831675908862909e-09, + "loss": 0.5524, + "step": 6540 + }, + { + "epoch": 0.99, + "grad_norm": 6.156708144595947, + "learning_rate": 4.724918656150989e-09, + "loss": 0.4206, + "step": 6541 + }, + { + "epoch": 0.99, + "grad_norm": 4.6059051023093405, + "learning_rate": 4.619353474449307e-09, + "loss": 0.3906, + "step": 6542 + }, + { + "epoch": 0.99, + "grad_norm": 6.144270153131836, + "learning_rate": 4.514980388950485e-09, + "loss": 0.4635, + "step": 6543 + }, + { + "epoch": 0.99, + "grad_norm": 8.639924959058924, + "learning_rate": 4.4117994245618245e-09, + "loss": 0.4325, + "step": 6544 + }, + { + "epoch": 0.99, + "grad_norm": 38.88023063435594, + "learning_rate": 4.309810605906406e-09, + "loss": 0.372, + "step": 6545 + }, + { + "epoch": 0.99, + "grad_norm": 6.249254992165918, + "learning_rate": 4.209013957322539e-09, + "loss": 0.3909, + "step": 6546 + }, + { + "epoch": 0.99, + "grad_norm": 18.528362047653776, + "learning_rate": 4.109409502864314e-09, + "loss": 0.4541, + "step": 6547 + }, + { + "epoch": 0.99, + "grad_norm": 13.014625042343413, + "learning_rate": 4.0109972663021635e-09, + "loss": 0.4147, + "step": 6548 + }, + { + "epoch": 0.99, + "grad_norm": 4.254836553459079, + "learning_rate": 3.913777271120078e-09, + "loss": 0.4276, + "step": 6549 + }, + { + "epoch": 0.99, + "grad_norm": 7.532058709482877, + "learning_rate": 3.8177495405189446e-09, + "loss": 0.3945, + "step": 6550 + }, + { + "epoch": 0.99, + "grad_norm": 6.771941482763166, + "learning_rate": 3.7229140974148757e-09, + "loss": 0.3926, + "step": 6551 + }, + { + "epoch": 0.99, + "grad_norm": 11.156158299037292, + "learning_rate": 3.629270964439213e-09, + "loss": 0.4161, + "step": 6552 + }, + { + "epoch": 0.99, + "grad_norm": 5.222151091147984, + "learning_rate": 3.536820163939636e-09, + "loss": 0.44, + "step": 6553 + }, + { + "epoch": 0.99, + "grad_norm": 5.58184317440906, + "learning_rate": 3.445561717977941e-09, + "loss": 0.4105, + "step": 6554 + }, + { + "epoch": 0.99, + "grad_norm": 1.2496824915060523, + "learning_rate": 3.355495648331708e-09, + "loss": 0.5197, + "step": 6555 + }, + { + "epoch": 0.99, + "grad_norm": 8.906055352633679, + "learning_rate": 3.266621976494855e-09, + "loss": 0.4662, + "step": 6556 + }, + { + "epoch": 0.99, + "grad_norm": 7.821292519469058, + "learning_rate": 3.1789407236759717e-09, + "loss": 0.4664, + "step": 6557 + }, + { + "epoch": 0.99, + "grad_norm": 6.503331485089174, + "learning_rate": 3.092451910799432e-09, + "loss": 0.4512, + "step": 6558 + }, + { + "epoch": 0.99, + "grad_norm": 6.748408859264843, + "learning_rate": 3.0071555585048374e-09, + "loss": 0.4857, + "step": 6559 + }, + { + "epoch": 0.99, + "grad_norm": 11.606029133978572, + "learning_rate": 2.923051687147016e-09, + "loss": 0.4215, + "step": 6560 + }, + { + "epoch": 0.99, + "grad_norm": 6.149931012582169, + "learning_rate": 2.8401403167971354e-09, + "loss": 0.4684, + "step": 6561 + }, + { + "epoch": 0.99, + "grad_norm": 8.638336343633409, + "learning_rate": 2.7584214672404798e-09, + "loss": 0.4807, + "step": 6562 + }, + { + "epoch": 0.99, + "grad_norm": 6.243235657287498, + "learning_rate": 2.6778951579786717e-09, + "loss": 0.3751, + "step": 6563 + }, + { + "epoch": 0.99, + "grad_norm": 7.697764161737031, + "learning_rate": 2.5985614082280063e-09, + "loss": 0.4407, + "step": 6564 + }, + { + "epoch": 0.99, + "grad_norm": 4.8456726226448765, + "learning_rate": 2.5204202369216723e-09, + "loss": 0.4288, + "step": 6565 + }, + { + "epoch": 0.99, + "grad_norm": 6.727840986219048, + "learning_rate": 2.4434716627064204e-09, + "loss": 0.3953, + "step": 6566 + }, + { + "epoch": 0.99, + "grad_norm": 5.195060705796738, + "learning_rate": 2.3677157039453393e-09, + "loss": 0.4806, + "step": 6567 + }, + { + "epoch": 0.99, + "grad_norm": 6.34204319057369, + "learning_rate": 2.2931523787173005e-09, + "loss": 0.4438, + "step": 6568 + }, + { + "epoch": 0.99, + "grad_norm": 8.539791407107073, + "learning_rate": 2.219781704815849e-09, + "loss": 0.415, + "step": 6569 + }, + { + "epoch": 0.99, + "grad_norm": 8.58090501874538, + "learning_rate": 2.147603699749756e-09, + "loss": 0.442, + "step": 6570 + }, + { + "epoch": 0.99, + "grad_norm": 38.58295245731931, + "learning_rate": 2.076618380744133e-09, + "loss": 0.3881, + "step": 6571 + }, + { + "epoch": 0.99, + "grad_norm": 0.9625456204477675, + "learning_rate": 2.006825764738762e-09, + "loss": 0.4412, + "step": 6572 + }, + { + "epoch": 0.99, + "grad_norm": 39.65751533306592, + "learning_rate": 1.9382258683880994e-09, + "loss": 0.3999, + "step": 6573 + }, + { + "epoch": 0.99, + "grad_norm": 14.39197389725519, + "learning_rate": 1.8708187080640485e-09, + "loss": 0.4232, + "step": 6574 + }, + { + "epoch": 0.99, + "grad_norm": 8.179578887633754, + "learning_rate": 1.8046042998520752e-09, + "loss": 0.3887, + "step": 6575 + }, + { + "epoch": 0.99, + "grad_norm": 6.451879978689478, + "learning_rate": 1.7395826595534293e-09, + "loss": 0.3454, + "step": 6576 + }, + { + "epoch": 0.99, + "grad_norm": 7.372670869397812, + "learning_rate": 1.6757538026851427e-09, + "loss": 0.3853, + "step": 6577 + }, + { + "epoch": 0.99, + "grad_norm": 13.200350153763086, + "learning_rate": 1.6131177444789204e-09, + "loss": 0.4, + "step": 6578 + }, + { + "epoch": 0.99, + "grad_norm": 7.603486397424962, + "learning_rate": 1.551674499882805e-09, + "loss": 0.4786, + "step": 6579 + }, + { + "epoch": 0.99, + "grad_norm": 9.893169855004569, + "learning_rate": 1.4914240835589566e-09, + "loss": 0.4106, + "step": 6580 + }, + { + "epoch": 0.99, + "grad_norm": 5.533020205604693, + "learning_rate": 1.4323665098858742e-09, + "loss": 0.3893, + "step": 6581 + }, + { + "epoch": 0.99, + "grad_norm": 11.986702272762939, + "learning_rate": 1.3745017929572835e-09, + "loss": 0.428, + "step": 6582 + }, + { + "epoch": 0.99, + "grad_norm": 9.18985005789812, + "learning_rate": 1.3178299465810284e-09, + "loss": 0.415, + "step": 6583 + }, + { + "epoch": 0.99, + "grad_norm": 7.2391142689865315, + "learning_rate": 1.2623509842824012e-09, + "loss": 0.3544, + "step": 6584 + }, + { + "epoch": 0.99, + "grad_norm": 10.990832735871294, + "learning_rate": 1.208064919300811e-09, + "loss": 0.4187, + "step": 6585 + }, + { + "epoch": 0.99, + "grad_norm": 25.063464070699546, + "learning_rate": 1.1549717645903401e-09, + "loss": 0.44, + "step": 6586 + }, + { + "epoch": 0.99, + "grad_norm": 25.693869849188705, + "learning_rate": 1.1030715328208542e-09, + "loss": 0.5064, + "step": 6587 + }, + { + "epoch": 0.99, + "grad_norm": 12.066246357515496, + "learning_rate": 1.052364236379111e-09, + "loss": 0.4221, + "step": 6588 + }, + { + "epoch": 0.99, + "grad_norm": 8.175332876522486, + "learning_rate": 1.0028498873648762e-09, + "loss": 0.4247, + "step": 6589 + }, + { + "epoch": 0.99, + "grad_norm": 11.305405270864528, + "learning_rate": 9.545284975948087e-10, + "loss": 0.3669, + "step": 6590 + }, + { + "epoch": 0.99, + "grad_norm": 10.32658780767732, + "learning_rate": 9.074000786002401e-10, + "loss": 0.4048, + "step": 6591 + }, + { + "epoch": 0.99, + "grad_norm": 9.559429392659263, + "learning_rate": 8.614646416271743e-10, + "loss": 0.4335, + "step": 6592 + }, + { + "epoch": 0.99, + "grad_norm": 7.719723370195331, + "learning_rate": 8.167221976385087e-10, + "loss": 0.4058, + "step": 6593 + }, + { + "epoch": 0.99, + "grad_norm": 6.581774500446054, + "learning_rate": 7.731727573112579e-10, + "loss": 0.4005, + "step": 6594 + }, + { + "epoch": 0.99, + "grad_norm": 5.905245832103491, + "learning_rate": 7.308163310376648e-10, + "loss": 0.4101, + "step": 6595 + }, + { + "epoch": 0.99, + "grad_norm": 8.825437010211374, + "learning_rate": 6.896529289263099e-10, + "loss": 0.4374, + "step": 6596 + }, + { + "epoch": 1.0, + "grad_norm": 21.966968092284404, + "learning_rate": 6.496825607998913e-10, + "loss": 0.4726, + "step": 6597 + }, + { + "epoch": 1.0, + "grad_norm": 10.83335582114488, + "learning_rate": 6.109052361974455e-10, + "loss": 0.412, + "step": 6598 + }, + { + "epoch": 1.0, + "grad_norm": 11.801321703736502, + "learning_rate": 5.733209643721261e-10, + "loss": 0.4545, + "step": 6599 + }, + { + "epoch": 1.0, + "grad_norm": 18.632137042897106, + "learning_rate": 5.369297542934249e-10, + "loss": 0.4216, + "step": 6600 + }, + { + "epoch": 1.0, + "grad_norm": 17.15857827560764, + "learning_rate": 5.017316146460616e-10, + "loss": 0.415, + "step": 6601 + }, + { + "epoch": 1.0, + "grad_norm": 1.0137337311868537, + "learning_rate": 4.677265538288733e-10, + "loss": 0.5045, + "step": 6602 + }, + { + "epoch": 1.0, + "grad_norm": 69.79505030196971, + "learning_rate": 4.3491457995759043e-10, + "loss": 0.4233, + "step": 6603 + }, + { + "epoch": 1.0, + "grad_norm": 5.510704024596302, + "learning_rate": 4.032957008620608e-10, + "loss": 0.4161, + "step": 6604 + }, + { + "epoch": 1.0, + "grad_norm": 8.869329637398582, + "learning_rate": 3.7286992408791525e-10, + "loss": 0.4433, + "step": 6605 + }, + { + "epoch": 1.0, + "grad_norm": 5.833557035854727, + "learning_rate": 3.436372568954571e-10, + "loss": 0.428, + "step": 6606 + }, + { + "epoch": 1.0, + "grad_norm": 6.40474502917948, + "learning_rate": 3.1559770626188314e-10, + "loss": 0.4433, + "step": 6607 + }, + { + "epoch": 1.0, + "grad_norm": 5.947195508435455, + "learning_rate": 2.887512788773972e-10, + "loss": 0.4798, + "step": 6608 + }, + { + "epoch": 1.0, + "grad_norm": 26.713151892568415, + "learning_rate": 2.6309798114965144e-10, + "loss": 0.3868, + "step": 6609 + }, + { + "epoch": 1.0, + "grad_norm": 19.001760220794395, + "learning_rate": 2.386378191998606e-10, + "loss": 0.4535, + "step": 6610 + }, + { + "epoch": 1.0, + "grad_norm": 7.791105649715406, + "learning_rate": 2.1537079886502222e-10, + "loss": 0.4605, + "step": 6611 + }, + { + "epoch": 1.0, + "grad_norm": 19.269514717272305, + "learning_rate": 1.9329692569791668e-10, + "loss": 0.5126, + "step": 6612 + }, + { + "epoch": 1.0, + "grad_norm": 5.205303360602723, + "learning_rate": 1.7241620496655232e-10, + "loss": 0.4116, + "step": 6613 + }, + { + "epoch": 1.0, + "grad_norm": 7.960901078804037, + "learning_rate": 1.527286416530549e-10, + "loss": 0.4354, + "step": 6614 + }, + { + "epoch": 1.0, + "grad_norm": 7.194733853937215, + "learning_rate": 1.342342404564434e-10, + "loss": 0.434, + "step": 6615 + }, + { + "epoch": 1.0, + "grad_norm": 5.488440618503094, + "learning_rate": 1.1693300578985435e-10, + "loss": 0.3522, + "step": 6616 + }, + { + "epoch": 1.0, + "grad_norm": 10.740956668666344, + "learning_rate": 1.0082494178220715e-10, + "loss": 0.4596, + "step": 6617 + }, + { + "epoch": 1.0, + "grad_norm": 18.339751169153274, + "learning_rate": 8.591005227764904e-11, + "loss": 0.4394, + "step": 6618 + }, + { + "epoch": 1.0, + "grad_norm": 6.073532136462251, + "learning_rate": 7.218834083499992e-11, + "loss": 0.4025, + "step": 6619 + }, + { + "epoch": 1.0, + "grad_norm": 33.15054376801335, + "learning_rate": 5.965981072941774e-11, + "loss": 0.3995, + "step": 6620 + }, + { + "epoch": 1.0, + "grad_norm": 6.6902728097162525, + "learning_rate": 4.832446494962284e-11, + "loss": 0.4442, + "step": 6621 + }, + { + "epoch": 1.0, + "grad_norm": 7.682580276493064, + "learning_rate": 3.818230620233898e-11, + "loss": 0.4302, + "step": 6622 + }, + { + "epoch": 1.0, + "grad_norm": 5.521696077434697, + "learning_rate": 2.9233336906187016e-11, + "loss": 0.4311, + "step": 6623 + }, + { + "epoch": 1.0, + "grad_norm": 7.832883945114147, + "learning_rate": 2.1477559197791153e-11, + "loss": 0.4352, + "step": 6624 + }, + { + "epoch": 1.0, + "grad_norm": 8.817620300061387, + "learning_rate": 1.491497492789318e-11, + "loss": 0.5094, + "step": 6625 + }, + { + "epoch": 1.0, + "grad_norm": 7.369565709244112, + "learning_rate": 9.545585662462665e-12, + "loss": 0.454, + "step": 6626 + }, + { + "epoch": 1.0, + "grad_norm": 7.242508760294443, + "learning_rate": 5.369392682696983e-12, + "loss": 0.3994, + "step": 6627 + }, + { + "epoch": 1.0, + "grad_norm": 6.977820450694954, + "learning_rate": 2.3863969850213e-12, + "loss": 0.3659, + "step": 6628 + }, + { + "epoch": 1.0, + "grad_norm": 4.541882542541116, + "learning_rate": 5.965992816436838e-13, + "loss": 0.4509, + "step": 6629 + }, + { + "epoch": 1.0, + "grad_norm": 4.2167060447028115, + "learning_rate": 0.0, + "loss": 0.5483, + "step": 6630 + }, + { + "epoch": 1.0, + "step": 6630, + "total_flos": 3531656507129856.0, + "train_loss": 0.47476094288254217, + "train_runtime": 79275.3724, + "train_samples_per_second": 10.704, + "train_steps_per_second": 0.084 + } + ], + "logging_steps": 1.0, + "max_steps": 6630, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "total_flos": 3531656507129856.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}