|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.99328, |
|
"eval_steps": 400, |
|
"global_step": 1170, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00512, |
|
"grad_norm": 0.033956460654735565, |
|
"learning_rate": 0.001998289136013687, |
|
"loss": 1.5175, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01024, |
|
"grad_norm": 0.11115246266126633, |
|
"learning_rate": 0.0019948674080410606, |
|
"loss": 1.5456, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01536, |
|
"grad_norm": 0.31134533882141113, |
|
"learning_rate": 0.001991445680068435, |
|
"loss": 1.4942, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02048, |
|
"grad_norm": 0.314473956823349, |
|
"learning_rate": 0.0019880239520958082, |
|
"loss": 1.4622, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 0.3232758641242981, |
|
"learning_rate": 0.001984602224123182, |
|
"loss": 1.4351, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03072, |
|
"grad_norm": 0.30670273303985596, |
|
"learning_rate": 0.0019811804961505563, |
|
"loss": 1.3924, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03584, |
|
"grad_norm": 0.2764911651611328, |
|
"learning_rate": 0.0019777587681779297, |
|
"loss": 1.3622, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.04096, |
|
"grad_norm": 0.24471326172351837, |
|
"learning_rate": 0.001974337040205304, |
|
"loss": 1.386, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04608, |
|
"grad_norm": 0.21196012198925018, |
|
"learning_rate": 0.0019709153122326774, |
|
"loss": 1.3682, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 0.23733310401439667, |
|
"learning_rate": 0.0019674935842600516, |
|
"loss": 1.3512, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05632, |
|
"grad_norm": 0.16206419467926025, |
|
"learning_rate": 0.001964071856287425, |
|
"loss": 1.3428, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06144, |
|
"grad_norm": 0.2223929464817047, |
|
"learning_rate": 0.001960650128314799, |
|
"loss": 1.3706, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06656, |
|
"grad_norm": 0.1612645536661148, |
|
"learning_rate": 0.0019572284003421727, |
|
"loss": 1.3775, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07168, |
|
"grad_norm": 0.16616274416446686, |
|
"learning_rate": 0.0019538066723695465, |
|
"loss": 1.3572, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 0.15922699868679047, |
|
"learning_rate": 0.0019503849443969204, |
|
"loss": 1.3482, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08192, |
|
"grad_norm": 0.14939665794372559, |
|
"learning_rate": 0.0019469632164242944, |
|
"loss": 1.3312, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08704, |
|
"grad_norm": 0.16849732398986816, |
|
"learning_rate": 0.0019435414884516682, |
|
"loss": 1.3394, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09216, |
|
"grad_norm": 0.1453033685684204, |
|
"learning_rate": 0.0019401197604790419, |
|
"loss": 1.2957, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09728, |
|
"grad_norm": 0.14633750915527344, |
|
"learning_rate": 0.001936698032506416, |
|
"loss": 1.334, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 0.12038926780223846, |
|
"learning_rate": 0.0019332763045337895, |
|
"loss": 1.3257, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10752, |
|
"grad_norm": 0.1144009605050087, |
|
"learning_rate": 0.0019298545765611636, |
|
"loss": 1.337, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11264, |
|
"grad_norm": 0.09070790559053421, |
|
"learning_rate": 0.0019264328485885372, |
|
"loss": 1.2969, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11776, |
|
"grad_norm": 0.09154277294874191, |
|
"learning_rate": 0.001923011120615911, |
|
"loss": 1.3233, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.12288, |
|
"grad_norm": 0.09742377698421478, |
|
"learning_rate": 0.0019195893926432848, |
|
"loss": 1.3272, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.10923709720373154, |
|
"learning_rate": 0.0019161676646706587, |
|
"loss": 1.3353, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13312, |
|
"grad_norm": 0.10012141615152359, |
|
"learning_rate": 0.0019127459366980327, |
|
"loss": 1.3011, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.13824, |
|
"grad_norm": 0.10850805044174194, |
|
"learning_rate": 0.0019093242087254063, |
|
"loss": 1.3372, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.14336, |
|
"grad_norm": 0.11083640158176422, |
|
"learning_rate": 0.0019059024807527804, |
|
"loss": 1.3098, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.14848, |
|
"grad_norm": 0.11255177110433578, |
|
"learning_rate": 0.001902480752780154, |
|
"loss": 1.3223, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 0.09206261485815048, |
|
"learning_rate": 0.0018990590248075278, |
|
"loss": 1.2899, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15872, |
|
"grad_norm": 0.09589900076389313, |
|
"learning_rate": 0.0018956372968349016, |
|
"loss": 1.3026, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.16384, |
|
"grad_norm": 0.0895056203007698, |
|
"learning_rate": 0.0018922155688622755, |
|
"loss": 1.2996, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.16896, |
|
"grad_norm": 0.11926258355379105, |
|
"learning_rate": 0.0018887938408896493, |
|
"loss": 1.2952, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.17408, |
|
"grad_norm": 0.11748767644166946, |
|
"learning_rate": 0.0018853721129170231, |
|
"loss": 1.3085, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 0.09052393585443497, |
|
"learning_rate": 0.001881950384944397, |
|
"loss": 1.3111, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.18432, |
|
"grad_norm": 0.11488136649131775, |
|
"learning_rate": 0.0018785286569717708, |
|
"loss": 1.2884, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.18944, |
|
"grad_norm": 0.10231968015432358, |
|
"learning_rate": 0.0018751069289991446, |
|
"loss": 1.2991, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.19456, |
|
"grad_norm": 0.17033444344997406, |
|
"learning_rate": 0.0018716852010265185, |
|
"loss": 1.3188, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.19968, |
|
"grad_norm": 0.1066645011305809, |
|
"learning_rate": 0.0018682634730538923, |
|
"loss": 1.2946, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 0.1261938363313675, |
|
"learning_rate": 0.0018648417450812661, |
|
"loss": 1.316, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.20992, |
|
"grad_norm": 0.09089711308479309, |
|
"learning_rate": 0.00186142001710864, |
|
"loss": 1.2927, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.21504, |
|
"grad_norm": 0.16600407660007477, |
|
"learning_rate": 0.0018579982891360136, |
|
"loss": 1.2806, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.22016, |
|
"grad_norm": 0.10440811514854431, |
|
"learning_rate": 0.0018545765611633876, |
|
"loss": 1.2928, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.22528, |
|
"grad_norm": 0.11303785443305969, |
|
"learning_rate": 0.0018511548331907612, |
|
"loss": 1.2999, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2304, |
|
"grad_norm": 0.1060706302523613, |
|
"learning_rate": 0.0018477331052181353, |
|
"loss": 1.2887, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.23552, |
|
"grad_norm": 0.11111288517713547, |
|
"learning_rate": 0.001844311377245509, |
|
"loss": 1.2935, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.24064, |
|
"grad_norm": 0.10950679332017899, |
|
"learning_rate": 0.001840889649272883, |
|
"loss": 1.2865, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.24576, |
|
"grad_norm": 0.09789486229419708, |
|
"learning_rate": 0.0018374679213002568, |
|
"loss": 1.3088, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.25088, |
|
"grad_norm": 0.10912415385246277, |
|
"learning_rate": 0.0018340461933276304, |
|
"loss": 1.2811, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.11448093503713608, |
|
"learning_rate": 0.0018306244653550044, |
|
"loss": 1.2707, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26112, |
|
"grad_norm": 0.10610745847225189, |
|
"learning_rate": 0.001827202737382378, |
|
"loss": 1.2994, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.26624, |
|
"grad_norm": 0.10192134976387024, |
|
"learning_rate": 0.001823781009409752, |
|
"loss": 1.2807, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.27136, |
|
"grad_norm": 0.10114825516939163, |
|
"learning_rate": 0.0018203592814371257, |
|
"loss": 1.2582, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.27648, |
|
"grad_norm": 0.09611225128173828, |
|
"learning_rate": 0.0018169375534644997, |
|
"loss": 1.2951, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2816, |
|
"grad_norm": 0.08788544684648514, |
|
"learning_rate": 0.0018135158254918733, |
|
"loss": 1.2869, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.28672, |
|
"grad_norm": 0.10186023265123367, |
|
"learning_rate": 0.0018100940975192472, |
|
"loss": 1.2916, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.29184, |
|
"grad_norm": 0.11426091939210892, |
|
"learning_rate": 0.0018066723695466212, |
|
"loss": 1.2705, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.29696, |
|
"grad_norm": 0.0911969244480133, |
|
"learning_rate": 0.0018032506415739948, |
|
"loss": 1.2826, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.30208, |
|
"grad_norm": 0.08570262044668198, |
|
"learning_rate": 0.0017998289136013689, |
|
"loss": 1.2783, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 0.10600815713405609, |
|
"learning_rate": 0.0017964071856287425, |
|
"loss": 1.2633, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.31232, |
|
"grad_norm": 0.10680408775806427, |
|
"learning_rate": 0.0017929854576561163, |
|
"loss": 1.2751, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.31744, |
|
"grad_norm": 0.10642975568771362, |
|
"learning_rate": 0.0017895637296834902, |
|
"loss": 1.2776, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.32256, |
|
"grad_norm": 0.09877141565084457, |
|
"learning_rate": 0.001786142001710864, |
|
"loss": 1.2868, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.32768, |
|
"grad_norm": 0.12077789753675461, |
|
"learning_rate": 0.0017827202737382378, |
|
"loss": 1.2825, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3328, |
|
"grad_norm": 0.1019749641418457, |
|
"learning_rate": 0.0017792985457656116, |
|
"loss": 1.2689, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.33792, |
|
"grad_norm": 0.08969077467918396, |
|
"learning_rate": 0.0017758768177929857, |
|
"loss": 1.2795, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.34304, |
|
"grad_norm": 0.10050085186958313, |
|
"learning_rate": 0.0017724550898203593, |
|
"loss": 1.2748, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.34816, |
|
"grad_norm": 0.08829426020383835, |
|
"learning_rate": 0.0017690333618477331, |
|
"loss": 1.2577, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.35328, |
|
"grad_norm": 0.09129626303911209, |
|
"learning_rate": 0.001765611633875107, |
|
"loss": 1.2703, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3584, |
|
"grad_norm": 0.10055152326822281, |
|
"learning_rate": 0.0017621899059024808, |
|
"loss": 1.2618, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.36352, |
|
"grad_norm": 0.09721378982067108, |
|
"learning_rate": 0.0017587681779298546, |
|
"loss": 1.2809, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.36864, |
|
"grad_norm": 0.09917334467172623, |
|
"learning_rate": 0.0017553464499572285, |
|
"loss": 1.2628, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.37376, |
|
"grad_norm": 0.1129026785492897, |
|
"learning_rate": 0.0017519247219846023, |
|
"loss": 1.2684, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.37888, |
|
"grad_norm": 0.08873754739761353, |
|
"learning_rate": 0.0017485029940119761, |
|
"loss": 1.2726, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.10734206438064575, |
|
"learning_rate": 0.0017450812660393497, |
|
"loss": 1.2545, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.38912, |
|
"grad_norm": 0.09691416472196579, |
|
"learning_rate": 0.0017416595380667238, |
|
"loss": 1.2511, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.39424, |
|
"grad_norm": 0.10880187153816223, |
|
"learning_rate": 0.0017382378100940976, |
|
"loss": 1.2656, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.39936, |
|
"grad_norm": 0.10392981767654419, |
|
"learning_rate": 0.0017348160821214714, |
|
"loss": 1.2755, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.40448, |
|
"grad_norm": 0.08561566472053528, |
|
"learning_rate": 0.0017313943541488453, |
|
"loss": 1.2956, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4096, |
|
"grad_norm": 0.09822013229131699, |
|
"learning_rate": 0.0017279726261762189, |
|
"loss": 1.2887, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.41472, |
|
"grad_norm": 0.09553670883178711, |
|
"learning_rate": 0.001724550898203593, |
|
"loss": 1.2743, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.41984, |
|
"grad_norm": 0.10794595628976822, |
|
"learning_rate": 0.0017211291702309665, |
|
"loss": 1.2603, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.42496, |
|
"grad_norm": 0.09373841434717178, |
|
"learning_rate": 0.0017177074422583406, |
|
"loss": 1.2441, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.43008, |
|
"grad_norm": 0.10550329089164734, |
|
"learning_rate": 0.0017142857142857142, |
|
"loss": 1.2714, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4352, |
|
"grad_norm": 0.08271865546703339, |
|
"learning_rate": 0.0017108639863130882, |
|
"loss": 1.2671, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.44032, |
|
"grad_norm": 0.09635920822620392, |
|
"learning_rate": 0.001707442258340462, |
|
"loss": 1.2861, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.44544, |
|
"grad_norm": 0.0995635837316513, |
|
"learning_rate": 0.0017040205303678357, |
|
"loss": 1.2803, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.45056, |
|
"grad_norm": 0.10621396452188492, |
|
"learning_rate": 0.0017005988023952097, |
|
"loss": 1.2685, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.45568, |
|
"grad_norm": 0.10926424711942673, |
|
"learning_rate": 0.0016971770744225833, |
|
"loss": 1.2551, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4608, |
|
"grad_norm": 0.09723444283008575, |
|
"learning_rate": 0.0016937553464499574, |
|
"loss": 1.2547, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.46592, |
|
"grad_norm": 0.09107507020235062, |
|
"learning_rate": 0.001690333618477331, |
|
"loss": 1.2664, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.47104, |
|
"grad_norm": 0.08052769303321838, |
|
"learning_rate": 0.001686911890504705, |
|
"loss": 1.2645, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.47616, |
|
"grad_norm": 0.0871344730257988, |
|
"learning_rate": 0.0016834901625320787, |
|
"loss": 1.2694, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.48128, |
|
"grad_norm": 0.11148510873317719, |
|
"learning_rate": 0.0016800684345594525, |
|
"loss": 1.2697, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4864, |
|
"grad_norm": 0.08355887234210968, |
|
"learning_rate": 0.0016766467065868263, |
|
"loss": 1.2458, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.49152, |
|
"grad_norm": 0.08468321710824966, |
|
"learning_rate": 0.0016732249786142002, |
|
"loss": 1.252, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.49664, |
|
"grad_norm": 0.08812184631824493, |
|
"learning_rate": 0.0016698032506415742, |
|
"loss": 1.2746, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.50176, |
|
"grad_norm": 0.09015596657991409, |
|
"learning_rate": 0.0016663815226689478, |
|
"loss": 1.2659, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.50688, |
|
"grad_norm": 0.08708484470844269, |
|
"learning_rate": 0.0016629597946963216, |
|
"loss": 1.2727, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.09585189074277878, |
|
"learning_rate": 0.0016595380667236955, |
|
"loss": 1.2702, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.51712, |
|
"grad_norm": 0.08958299458026886, |
|
"learning_rate": 0.0016561163387510693, |
|
"loss": 1.2605, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.52224, |
|
"grad_norm": 0.0867680162191391, |
|
"learning_rate": 0.0016526946107784431, |
|
"loss": 1.2561, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.52736, |
|
"grad_norm": 0.08700387924909592, |
|
"learning_rate": 0.001649272882805817, |
|
"loss": 1.2481, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.53248, |
|
"grad_norm": 0.08566949516534805, |
|
"learning_rate": 0.0016458511548331908, |
|
"loss": 1.255, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5376, |
|
"grad_norm": 0.08990107476711273, |
|
"learning_rate": 0.0016424294268605646, |
|
"loss": 1.2558, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.54272, |
|
"grad_norm": 0.08288553357124329, |
|
"learning_rate": 0.0016390076988879385, |
|
"loss": 1.2507, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.54784, |
|
"grad_norm": 0.08027470856904984, |
|
"learning_rate": 0.0016355859709153123, |
|
"loss": 1.278, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.55296, |
|
"grad_norm": 0.09287162125110626, |
|
"learning_rate": 0.0016321642429426861, |
|
"loss": 1.2626, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.55808, |
|
"grad_norm": 0.09153173863887787, |
|
"learning_rate": 0.00162874251497006, |
|
"loss": 1.2643, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5632, |
|
"grad_norm": 0.10922811180353165, |
|
"learning_rate": 0.0016253207869974338, |
|
"loss": 1.2755, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.56832, |
|
"grad_norm": 0.1041250005364418, |
|
"learning_rate": 0.0016218990590248074, |
|
"loss": 1.2493, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.57344, |
|
"grad_norm": 0.08747130632400513, |
|
"learning_rate": 0.0016184773310521814, |
|
"loss": 1.2491, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.57856, |
|
"grad_norm": 0.10124468803405762, |
|
"learning_rate": 0.001615055603079555, |
|
"loss": 1.2387, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.58368, |
|
"grad_norm": 0.09022431075572968, |
|
"learning_rate": 0.001611633875106929, |
|
"loss": 1.2521, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5888, |
|
"grad_norm": 0.09623134136199951, |
|
"learning_rate": 0.0016082121471343027, |
|
"loss": 1.2701, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.59392, |
|
"grad_norm": 0.09208202362060547, |
|
"learning_rate": 0.0016047904191616768, |
|
"loss": 1.2413, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.59904, |
|
"grad_norm": 0.10192185640335083, |
|
"learning_rate": 0.0016013686911890506, |
|
"loss": 1.278, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.60416, |
|
"grad_norm": 0.11536680907011032, |
|
"learning_rate": 0.0015979469632164242, |
|
"loss": 1.2729, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.60928, |
|
"grad_norm": 0.11593331396579742, |
|
"learning_rate": 0.0015945252352437982, |
|
"loss": 1.2772, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6144, |
|
"grad_norm": 0.10352569818496704, |
|
"learning_rate": 0.0015911035072711719, |
|
"loss": 1.2587, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.61952, |
|
"grad_norm": 0.11213277280330658, |
|
"learning_rate": 0.001587681779298546, |
|
"loss": 1.2626, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.62464, |
|
"grad_norm": 0.11924043297767639, |
|
"learning_rate": 0.0015842600513259195, |
|
"loss": 1.2826, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.62976, |
|
"grad_norm": 0.09039822220802307, |
|
"learning_rate": 0.0015808383233532936, |
|
"loss": 1.2777, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.63488, |
|
"grad_norm": 0.09418819099664688, |
|
"learning_rate": 0.0015774165953806672, |
|
"loss": 1.2765, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.09826017916202545, |
|
"learning_rate": 0.001573994867408041, |
|
"loss": 1.2474, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.64512, |
|
"grad_norm": 0.09216541796922684, |
|
"learning_rate": 0.001570573139435415, |
|
"loss": 1.243, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.65024, |
|
"grad_norm": 0.09861636161804199, |
|
"learning_rate": 0.0015671514114627887, |
|
"loss": 1.239, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.65536, |
|
"grad_norm": 0.1033303365111351, |
|
"learning_rate": 0.0015637296834901627, |
|
"loss": 1.2571, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.66048, |
|
"grad_norm": 0.10304012894630432, |
|
"learning_rate": 0.0015603079555175363, |
|
"loss": 1.2556, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6656, |
|
"grad_norm": 0.0865844339132309, |
|
"learning_rate": 0.0015568862275449104, |
|
"loss": 1.2475, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.67072, |
|
"grad_norm": 0.10008803755044937, |
|
"learning_rate": 0.001553464499572284, |
|
"loss": 1.2795, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.67584, |
|
"grad_norm": 0.09249156713485718, |
|
"learning_rate": 0.0015500427715996578, |
|
"loss": 1.263, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.68096, |
|
"grad_norm": 0.09253022074699402, |
|
"learning_rate": 0.0015466210436270317, |
|
"loss": 1.2901, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.68608, |
|
"grad_norm": 0.09615321457386017, |
|
"learning_rate": 0.0015431993156544055, |
|
"loss": 1.2643, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6912, |
|
"grad_norm": 0.13639943301677704, |
|
"learning_rate": 0.0015397775876817793, |
|
"loss": 1.2586, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.69632, |
|
"grad_norm": 0.10662351548671722, |
|
"learning_rate": 0.0015363558597091531, |
|
"loss": 1.2414, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.70144, |
|
"grad_norm": 0.08936125040054321, |
|
"learning_rate": 0.001532934131736527, |
|
"loss": 1.2645, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.70656, |
|
"grad_norm": 0.09400724619626999, |
|
"learning_rate": 0.0015295124037639008, |
|
"loss": 1.2274, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.71168, |
|
"grad_norm": 0.10700514912605286, |
|
"learning_rate": 0.0015260906757912746, |
|
"loss": 1.2564, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7168, |
|
"grad_norm": 0.09849894791841507, |
|
"learning_rate": 0.0015226689478186485, |
|
"loss": 1.2658, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.72192, |
|
"grad_norm": 0.10522880405187607, |
|
"learning_rate": 0.0015192472198460223, |
|
"loss": 1.2775, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.72704, |
|
"grad_norm": 0.09173361957073212, |
|
"learning_rate": 0.0015158254918733961, |
|
"loss": 1.2738, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.73216, |
|
"grad_norm": 0.09410373121500015, |
|
"learning_rate": 0.00151240376390077, |
|
"loss": 1.2483, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.73728, |
|
"grad_norm": 0.08925613760948181, |
|
"learning_rate": 0.0015089820359281436, |
|
"loss": 1.2554, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7424, |
|
"grad_norm": 0.09131377190351486, |
|
"learning_rate": 0.0015055603079555176, |
|
"loss": 1.2496, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.74752, |
|
"grad_norm": 0.10802093148231506, |
|
"learning_rate": 0.0015021385799828914, |
|
"loss": 1.2675, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.75264, |
|
"grad_norm": 0.09466376155614853, |
|
"learning_rate": 0.0014987168520102653, |
|
"loss": 1.2482, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.75776, |
|
"grad_norm": 0.10151738673448563, |
|
"learning_rate": 0.001495295124037639, |
|
"loss": 1.2676, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.76288, |
|
"grad_norm": 0.08664025366306305, |
|
"learning_rate": 0.0014918733960650127, |
|
"loss": 1.2496, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.08441973477602005, |
|
"learning_rate": 0.0014884516680923868, |
|
"loss": 1.254, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.77312, |
|
"grad_norm": 0.08028802275657654, |
|
"learning_rate": 0.0014850299401197604, |
|
"loss": 1.2718, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.77824, |
|
"grad_norm": 0.08772825449705124, |
|
"learning_rate": 0.0014816082121471344, |
|
"loss": 1.2624, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.78336, |
|
"grad_norm": 0.08271320164203644, |
|
"learning_rate": 0.001478186484174508, |
|
"loss": 1.2467, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.78848, |
|
"grad_norm": 0.08346061408519745, |
|
"learning_rate": 0.001474764756201882, |
|
"loss": 1.2782, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7936, |
|
"grad_norm": 0.09925299137830734, |
|
"learning_rate": 0.0014713430282292557, |
|
"loss": 1.2489, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.79872, |
|
"grad_norm": 0.08743379265069962, |
|
"learning_rate": 0.0014679213002566295, |
|
"loss": 1.257, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.80384, |
|
"grad_norm": 0.08556243032217026, |
|
"learning_rate": 0.0014644995722840036, |
|
"loss": 1.2507, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.80896, |
|
"grad_norm": 0.0923091247677803, |
|
"learning_rate": 0.0014610778443113772, |
|
"loss": 1.2551, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.81408, |
|
"grad_norm": 0.0928301066160202, |
|
"learning_rate": 0.0014576561163387512, |
|
"loss": 1.2309, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8192, |
|
"grad_norm": 0.08794920146465302, |
|
"learning_rate": 0.0014542343883661248, |
|
"loss": 1.2457, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.82432, |
|
"grad_norm": 0.10327792912721634, |
|
"learning_rate": 0.001450812660393499, |
|
"loss": 1.2786, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.82944, |
|
"grad_norm": 0.12187407165765762, |
|
"learning_rate": 0.0014473909324208725, |
|
"loss": 1.2814, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.83456, |
|
"grad_norm": 0.11819777637720108, |
|
"learning_rate": 0.0014439692044482463, |
|
"loss": 1.2499, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.83968, |
|
"grad_norm": 0.09041640162467957, |
|
"learning_rate": 0.0014405474764756202, |
|
"loss": 1.2693, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8448, |
|
"grad_norm": 0.09638890624046326, |
|
"learning_rate": 0.001437125748502994, |
|
"loss": 1.2514, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.84992, |
|
"grad_norm": 0.12534624338150024, |
|
"learning_rate": 0.001433704020530368, |
|
"loss": 1.2407, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.85504, |
|
"grad_norm": 0.09927276521921158, |
|
"learning_rate": 0.0014302822925577417, |
|
"loss": 1.2749, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.86016, |
|
"grad_norm": 0.0886382907629013, |
|
"learning_rate": 0.0014268605645851157, |
|
"loss": 1.2478, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.86528, |
|
"grad_norm": 0.09105540066957474, |
|
"learning_rate": 0.0014234388366124893, |
|
"loss": 1.2538, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8704, |
|
"grad_norm": 0.08825322240591049, |
|
"learning_rate": 0.0014200171086398631, |
|
"loss": 1.2679, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.87552, |
|
"grad_norm": 0.08396722376346588, |
|
"learning_rate": 0.001416595380667237, |
|
"loss": 1.2091, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.88064, |
|
"grad_norm": 0.08241663128137589, |
|
"learning_rate": 0.0014131736526946108, |
|
"loss": 1.2401, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.88576, |
|
"grad_norm": 0.11285565793514252, |
|
"learning_rate": 0.0014097519247219846, |
|
"loss": 1.2461, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.89088, |
|
"grad_norm": 0.09898606687784195, |
|
"learning_rate": 0.0014063301967493585, |
|
"loss": 1.2587, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.09175729751586914, |
|
"learning_rate": 0.001402908468776732, |
|
"loss": 1.2318, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.90112, |
|
"grad_norm": 0.08974505960941315, |
|
"learning_rate": 0.0013994867408041061, |
|
"loss": 1.265, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.90624, |
|
"grad_norm": 0.10069482773542404, |
|
"learning_rate": 0.00139606501283148, |
|
"loss": 1.2276, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.91136, |
|
"grad_norm": 0.09355876594781876, |
|
"learning_rate": 0.0013926432848588538, |
|
"loss": 1.2474, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.91648, |
|
"grad_norm": 0.09747931361198425, |
|
"learning_rate": 0.0013892215568862276, |
|
"loss": 1.2677, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.9216, |
|
"grad_norm": 0.09114759415388107, |
|
"learning_rate": 0.0013857998289136014, |
|
"loss": 1.2442, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.92672, |
|
"grad_norm": 0.09683123230934143, |
|
"learning_rate": 0.0013823781009409753, |
|
"loss": 1.2328, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.93184, |
|
"grad_norm": 0.08525967597961426, |
|
"learning_rate": 0.0013789563729683489, |
|
"loss": 1.2301, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.93696, |
|
"grad_norm": 0.09861680120229721, |
|
"learning_rate": 0.001375534644995723, |
|
"loss": 1.2604, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.94208, |
|
"grad_norm": 0.08435367792844772, |
|
"learning_rate": 0.0013721129170230965, |
|
"loss": 1.2605, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9472, |
|
"grad_norm": 0.08719425648450851, |
|
"learning_rate": 0.0013686911890504706, |
|
"loss": 1.2412, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.95232, |
|
"grad_norm": 0.10853152722120285, |
|
"learning_rate": 0.0013652694610778444, |
|
"loss": 1.2503, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.95744, |
|
"grad_norm": 0.08797234296798706, |
|
"learning_rate": 0.001361847733105218, |
|
"loss": 1.2323, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.96256, |
|
"grad_norm": 0.09422844648361206, |
|
"learning_rate": 0.001358426005132592, |
|
"loss": 1.2444, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.96768, |
|
"grad_norm": 0.09000077098608017, |
|
"learning_rate": 0.0013550042771599657, |
|
"loss": 1.2573, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9728, |
|
"grad_norm": 0.09097360074520111, |
|
"learning_rate": 0.0013515825491873397, |
|
"loss": 1.2463, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.97792, |
|
"grad_norm": 0.08720215409994125, |
|
"learning_rate": 0.0013481608212147134, |
|
"loss": 1.2719, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.98304, |
|
"grad_norm": 0.09287154674530029, |
|
"learning_rate": 0.0013447390932420874, |
|
"loss": 1.2387, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.98816, |
|
"grad_norm": 0.08979474008083344, |
|
"learning_rate": 0.001341317365269461, |
|
"loss": 1.2353, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.99328, |
|
"grad_norm": 0.0891214981675148, |
|
"learning_rate": 0.0013378956372968348, |
|
"loss": 1.2441, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9984, |
|
"grad_norm": 0.09961092472076416, |
|
"learning_rate": 0.0013344739093242087, |
|
"loss": 1.2407, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.00256, |
|
"grad_norm": 0.09316655993461609, |
|
"learning_rate": 0.0013310521813515825, |
|
"loss": 1.2589, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.00768, |
|
"grad_norm": 0.6676607131958008, |
|
"learning_rate": 0.0013276304533789566, |
|
"loss": 1.3337, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.0128, |
|
"grad_norm": 0.24017909169197083, |
|
"learning_rate": 0.0013242087254063302, |
|
"loss": 1.2628, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.01792, |
|
"grad_norm": 0.17012369632720947, |
|
"learning_rate": 0.0013207869974337042, |
|
"loss": 1.2607, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.02304, |
|
"grad_norm": 0.18585637211799622, |
|
"learning_rate": 0.0013173652694610778, |
|
"loss": 1.2502, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.02304, |
|
"eval_loss": 1.2686372995376587, |
|
"eval_runtime": 279.6676, |
|
"eval_samples_per_second": 8.939, |
|
"eval_steps_per_second": 1.119, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.02816, |
|
"grad_norm": 0.16060733795166016, |
|
"learning_rate": 0.0013139435414884517, |
|
"loss": 1.2418, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.03328, |
|
"grad_norm": 0.13407620787620544, |
|
"learning_rate": 0.0013105218135158255, |
|
"loss": 1.2749, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.0384, |
|
"grad_norm": 0.12042540311813354, |
|
"learning_rate": 0.0013071000855431993, |
|
"loss": 1.2465, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.04352, |
|
"grad_norm": 0.13832047581672668, |
|
"learning_rate": 0.0013036783575705731, |
|
"loss": 1.2535, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.04864, |
|
"grad_norm": 0.11310556530952454, |
|
"learning_rate": 0.001300256629597947, |
|
"loss": 1.2352, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.05376, |
|
"grad_norm": 0.13382209837436676, |
|
"learning_rate": 0.001296834901625321, |
|
"loss": 1.2507, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.05888, |
|
"grad_norm": 0.13580721616744995, |
|
"learning_rate": 0.0012934131736526946, |
|
"loss": 1.2401, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.064, |
|
"grad_norm": 0.11162128299474716, |
|
"learning_rate": 0.0012899914456800685, |
|
"loss": 1.2534, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.06912, |
|
"grad_norm": 0.14186108112335205, |
|
"learning_rate": 0.0012865697177074423, |
|
"loss": 1.248, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.07424, |
|
"grad_norm": 0.13840018212795258, |
|
"learning_rate": 0.0012831479897348161, |
|
"loss": 1.2462, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.07936, |
|
"grad_norm": 0.11488790065050125, |
|
"learning_rate": 0.00127972626176219, |
|
"loss": 1.241, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.08448, |
|
"grad_norm": 0.1290818750858307, |
|
"learning_rate": 0.0012763045337895638, |
|
"loss": 1.2275, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.0896, |
|
"grad_norm": 0.10360855609178543, |
|
"learning_rate": 0.0012728828058169374, |
|
"loss": 1.2378, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.09472, |
|
"grad_norm": 0.12253882735967636, |
|
"learning_rate": 0.0012694610778443114, |
|
"loss": 1.2562, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.09984, |
|
"grad_norm": 0.12016449123620987, |
|
"learning_rate": 0.001266039349871685, |
|
"loss": 1.2742, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.10496, |
|
"grad_norm": 0.12187926471233368, |
|
"learning_rate": 0.0012626176218990591, |
|
"loss": 1.2202, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.11008, |
|
"grad_norm": 0.11081688851118088, |
|
"learning_rate": 0.001259195893926433, |
|
"loss": 1.2474, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.1152, |
|
"grad_norm": 0.09387561678886414, |
|
"learning_rate": 0.0012557741659538068, |
|
"loss": 1.2566, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.12032, |
|
"grad_norm": 0.11739682406187057, |
|
"learning_rate": 0.0012523524379811806, |
|
"loss": 1.2421, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.12544, |
|
"grad_norm": 0.11595962196588516, |
|
"learning_rate": 0.0012489307100085542, |
|
"loss": 1.2525, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.13056, |
|
"grad_norm": 0.10769950598478317, |
|
"learning_rate": 0.0012455089820359283, |
|
"loss": 1.2501, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.13568, |
|
"grad_norm": 0.11145374178886414, |
|
"learning_rate": 0.0012420872540633019, |
|
"loss": 1.2649, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.1408, |
|
"grad_norm": 0.09793559461832047, |
|
"learning_rate": 0.001238665526090676, |
|
"loss": 1.244, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.14592, |
|
"grad_norm": 0.16483676433563232, |
|
"learning_rate": 0.0012352437981180495, |
|
"loss": 1.2298, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.15104, |
|
"grad_norm": 0.13711702823638916, |
|
"learning_rate": 0.0012318220701454234, |
|
"loss": 1.2279, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.15616, |
|
"grad_norm": 0.10064009577035904, |
|
"learning_rate": 0.0012284003421727974, |
|
"loss": 1.2381, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.16128, |
|
"grad_norm": 0.09517936408519745, |
|
"learning_rate": 0.001224978614200171, |
|
"loss": 1.231, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.1663999999999999, |
|
"grad_norm": 0.111509308218956, |
|
"learning_rate": 0.001221556886227545, |
|
"loss": 1.25, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.1715200000000001, |
|
"grad_norm": 0.09270152449607849, |
|
"learning_rate": 0.0012181351582549187, |
|
"loss": 1.2478, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.17664, |
|
"grad_norm": 0.10461369901895523, |
|
"learning_rate": 0.0012147134302822927, |
|
"loss": 1.2536, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.18176, |
|
"grad_norm": 0.10189452022314072, |
|
"learning_rate": 0.0012112917023096663, |
|
"loss": 1.2177, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.18688, |
|
"grad_norm": 0.08983030170202255, |
|
"learning_rate": 0.0012078699743370402, |
|
"loss": 1.2117, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"grad_norm": 0.09105629473924637, |
|
"learning_rate": 0.001204448246364414, |
|
"loss": 1.2302, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.19712, |
|
"grad_norm": 0.11691851168870926, |
|
"learning_rate": 0.0012010265183917878, |
|
"loss": 1.2101, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.20224, |
|
"grad_norm": 0.08926935493946075, |
|
"learning_rate": 0.0011976047904191617, |
|
"loss": 1.2479, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.20736, |
|
"grad_norm": 0.11435071378946304, |
|
"learning_rate": 0.0011941830624465355, |
|
"loss": 1.2593, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.21248, |
|
"grad_norm": 0.10086748749017715, |
|
"learning_rate": 0.0011907613344739095, |
|
"loss": 1.2413, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.2176, |
|
"grad_norm": 0.098397396504879, |
|
"learning_rate": 0.0011873396065012832, |
|
"loss": 1.228, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.22272, |
|
"grad_norm": 0.08532971143722534, |
|
"learning_rate": 0.001183917878528657, |
|
"loss": 1.2309, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.22784, |
|
"grad_norm": 0.10852818191051483, |
|
"learning_rate": 0.0011804961505560308, |
|
"loss": 1.232, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.23296, |
|
"grad_norm": 0.09808767586946487, |
|
"learning_rate": 0.0011770744225834046, |
|
"loss": 1.2415, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.23808, |
|
"grad_norm": 0.11177875101566315, |
|
"learning_rate": 0.0011736526946107785, |
|
"loss": 1.2592, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.2432, |
|
"grad_norm": 0.1047763004899025, |
|
"learning_rate": 0.0011702309666381523, |
|
"loss": 1.2487, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.24832, |
|
"grad_norm": 0.12227226048707962, |
|
"learning_rate": 0.001166809238665526, |
|
"loss": 1.2481, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.2534399999999999, |
|
"grad_norm": 0.11121272295713425, |
|
"learning_rate": 0.0011633875106929, |
|
"loss": 1.2369, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.2585600000000001, |
|
"grad_norm": 0.1087367981672287, |
|
"learning_rate": 0.0011599657827202738, |
|
"loss": 1.2378, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.26368, |
|
"grad_norm": 0.09499981254339218, |
|
"learning_rate": 0.0011565440547476476, |
|
"loss": 1.2327, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.2688, |
|
"grad_norm": 0.12346815317869186, |
|
"learning_rate": 0.0011531223267750215, |
|
"loss": 1.2538, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.27392, |
|
"grad_norm": 0.10443610697984695, |
|
"learning_rate": 0.0011497005988023953, |
|
"loss": 1.2343, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.27904, |
|
"grad_norm": 0.1339293271303177, |
|
"learning_rate": 0.0011462788708297691, |
|
"loss": 1.2198, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.28416, |
|
"grad_norm": 0.08476725220680237, |
|
"learning_rate": 0.0011428571428571427, |
|
"loss": 1.236, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.28928, |
|
"grad_norm": 0.12042795866727829, |
|
"learning_rate": 0.0011394354148845168, |
|
"loss": 1.2357, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.2944, |
|
"grad_norm": 0.08857988566160202, |
|
"learning_rate": 0.0011360136869118904, |
|
"loss": 1.2476, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.29952, |
|
"grad_norm": 0.1092582419514656, |
|
"learning_rate": 0.0011325919589392644, |
|
"loss": 1.2445, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.30464, |
|
"grad_norm": 0.0912066176533699, |
|
"learning_rate": 0.001129170230966638, |
|
"loss": 1.2556, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.30976, |
|
"grad_norm": 0.12163588404655457, |
|
"learning_rate": 0.001125748502994012, |
|
"loss": 1.2427, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.31488, |
|
"grad_norm": 0.0835074707865715, |
|
"learning_rate": 0.001122326775021386, |
|
"loss": 1.2546, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.1106950119137764, |
|
"learning_rate": 0.0011189050470487595, |
|
"loss": 1.2168, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.32512, |
|
"grad_norm": 0.09452968090772629, |
|
"learning_rate": 0.0011154833190761336, |
|
"loss": 1.2314, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.3302399999999999, |
|
"grad_norm": 0.11672431975603104, |
|
"learning_rate": 0.0011120615911035072, |
|
"loss": 1.2455, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.33536, |
|
"grad_norm": 0.10683607310056686, |
|
"learning_rate": 0.0011086398631308812, |
|
"loss": 1.2474, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.34048, |
|
"grad_norm": 0.08974877744913101, |
|
"learning_rate": 0.0011052181351582549, |
|
"loss": 1.2226, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.3456000000000001, |
|
"grad_norm": 0.10645844787359238, |
|
"learning_rate": 0.0011017964071856287, |
|
"loss": 1.2545, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.35072, |
|
"grad_norm": 0.08754228055477142, |
|
"learning_rate": 0.0010983746792130025, |
|
"loss": 1.217, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.35584, |
|
"grad_norm": 0.10209974646568298, |
|
"learning_rate": 0.0010949529512403763, |
|
"loss": 1.2399, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.36096, |
|
"grad_norm": 0.09669913351535797, |
|
"learning_rate": 0.0010915312232677504, |
|
"loss": 1.2081, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.36608, |
|
"grad_norm": 0.10272342711687088, |
|
"learning_rate": 0.001088109495295124, |
|
"loss": 1.2635, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.3712, |
|
"grad_norm": 0.0976710096001625, |
|
"learning_rate": 0.001084687767322498, |
|
"loss": 1.2347, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.37632, |
|
"grad_norm": 0.09784968197345734, |
|
"learning_rate": 0.0010812660393498717, |
|
"loss": 1.2522, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.38144, |
|
"grad_norm": 0.09353113174438477, |
|
"learning_rate": 0.0010778443113772455, |
|
"loss": 1.2326, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.38656, |
|
"grad_norm": 0.10906370729207993, |
|
"learning_rate": 0.0010744225834046193, |
|
"loss": 1.2375, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.39168, |
|
"grad_norm": 0.11907199025154114, |
|
"learning_rate": 0.0010710008554319932, |
|
"loss": 1.2185, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.3968, |
|
"grad_norm": 0.10644809901714325, |
|
"learning_rate": 0.001067579127459367, |
|
"loss": 1.2216, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.40192, |
|
"grad_norm": 0.10000847280025482, |
|
"learning_rate": 0.0010641573994867408, |
|
"loss": 1.2471, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.40704, |
|
"grad_norm": 0.11204187572002411, |
|
"learning_rate": 0.0010607356715141146, |
|
"loss": 1.2693, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.41216, |
|
"grad_norm": 0.0837775245308876, |
|
"learning_rate": 0.0010573139435414885, |
|
"loss": 1.2444, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.4172799999999999, |
|
"grad_norm": 0.09714753180742264, |
|
"learning_rate": 0.0010538922155688623, |
|
"loss": 1.2525, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.4224, |
|
"grad_norm": 0.08763246238231659, |
|
"learning_rate": 0.0010504704875962361, |
|
"loss": 1.2678, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.42752, |
|
"grad_norm": 0.12144036591053009, |
|
"learning_rate": 0.00104704875962361, |
|
"loss": 1.2285, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.4326400000000001, |
|
"grad_norm": 0.11393667757511139, |
|
"learning_rate": 0.0010436270316509838, |
|
"loss": 1.253, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.43776, |
|
"grad_norm": 0.0934453159570694, |
|
"learning_rate": 0.0010402053036783576, |
|
"loss": 1.2388, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.44288, |
|
"grad_norm": 0.1040380522608757, |
|
"learning_rate": 0.0010367835757057312, |
|
"loss": 1.2446, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 0.09509964287281036, |
|
"learning_rate": 0.0010333618477331053, |
|
"loss": 1.2328, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.45312, |
|
"grad_norm": 0.09978800266981125, |
|
"learning_rate": 0.001029940119760479, |
|
"loss": 1.2385, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.45824, |
|
"grad_norm": 0.08431090414524078, |
|
"learning_rate": 0.001026518391787853, |
|
"loss": 1.2389, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.46336, |
|
"grad_norm": 0.09447863698005676, |
|
"learning_rate": 0.0010230966638152268, |
|
"loss": 1.2301, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.46848, |
|
"grad_norm": 0.09212321043014526, |
|
"learning_rate": 0.0010196749358426006, |
|
"loss": 1.2457, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.4736, |
|
"grad_norm": 0.09489751607179642, |
|
"learning_rate": 0.0010162532078699744, |
|
"loss": 1.2229, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.47872, |
|
"grad_norm": 0.08897145092487335, |
|
"learning_rate": 0.001012831479897348, |
|
"loss": 1.2242, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.48384, |
|
"grad_norm": 0.09199076145887375, |
|
"learning_rate": 0.001009409751924722, |
|
"loss": 1.251, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.48896, |
|
"grad_norm": 0.1008097380399704, |
|
"learning_rate": 0.0010059880239520957, |
|
"loss": 1.2633, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.49408, |
|
"grad_norm": 0.0928090438246727, |
|
"learning_rate": 0.0010025662959794698, |
|
"loss": 1.238, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.4992, |
|
"grad_norm": 0.09936055541038513, |
|
"learning_rate": 0.0009991445680068436, |
|
"loss": 1.2502, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.5043199999999999, |
|
"grad_norm": 0.12435046583414078, |
|
"learning_rate": 0.0009957228400342174, |
|
"loss": 1.238, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.5094400000000001, |
|
"grad_norm": 0.09954190254211426, |
|
"learning_rate": 0.000992301112061591, |
|
"loss": 1.2403, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.51456, |
|
"grad_norm": 0.0886056125164032, |
|
"learning_rate": 0.0009888793840889649, |
|
"loss": 1.2413, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.5196800000000001, |
|
"grad_norm": 0.10016464442014694, |
|
"learning_rate": 0.0009854576561163387, |
|
"loss": 1.2272, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.5248, |
|
"grad_norm": 0.08891763538122177, |
|
"learning_rate": 0.0009820359281437125, |
|
"loss": 1.2198, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.52992, |
|
"grad_norm": 0.08042890578508377, |
|
"learning_rate": 0.0009786142001710863, |
|
"loss": 1.2386, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.53504, |
|
"grad_norm": 0.09712400287389755, |
|
"learning_rate": 0.0009751924721984602, |
|
"loss": 1.2418, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.54016, |
|
"grad_norm": 0.09671667218208313, |
|
"learning_rate": 0.0009717707442258341, |
|
"loss": 1.2493, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.54528, |
|
"grad_norm": 0.1355689913034439, |
|
"learning_rate": 0.000968349016253208, |
|
"loss": 1.2529, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.5504, |
|
"grad_norm": 0.08577972650527954, |
|
"learning_rate": 0.0009649272882805818, |
|
"loss": 1.2444, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.55552, |
|
"grad_norm": 0.08881525695323944, |
|
"learning_rate": 0.0009615055603079555, |
|
"loss": 1.2561, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.56064, |
|
"grad_norm": 0.08444136381149292, |
|
"learning_rate": 0.0009580838323353293, |
|
"loss": 1.2378, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.56576, |
|
"grad_norm": 0.08552881330251694, |
|
"learning_rate": 0.0009546621043627032, |
|
"loss": 1.2186, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.57088, |
|
"grad_norm": 0.08705168962478638, |
|
"learning_rate": 0.000951240376390077, |
|
"loss": 1.2451, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"grad_norm": 0.08805005997419357, |
|
"learning_rate": 0.0009478186484174508, |
|
"loss": 1.225, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.5811199999999999, |
|
"grad_norm": 0.10901911556720734, |
|
"learning_rate": 0.0009443969204448247, |
|
"loss": 1.2351, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.58624, |
|
"grad_norm": 0.08406229317188263, |
|
"learning_rate": 0.0009409751924721985, |
|
"loss": 1.2535, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.5913599999999999, |
|
"grad_norm": 0.10775440186262131, |
|
"learning_rate": 0.0009375534644995723, |
|
"loss": 1.2474, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.5964800000000001, |
|
"grad_norm": 0.09037076681852341, |
|
"learning_rate": 0.0009341317365269461, |
|
"loss": 1.2346, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.6016, |
|
"grad_norm": 0.10725659132003784, |
|
"learning_rate": 0.00093071000855432, |
|
"loss": 1.2156, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.6067200000000001, |
|
"grad_norm": 0.08220596611499786, |
|
"learning_rate": 0.0009272882805816938, |
|
"loss": 1.2335, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.61184, |
|
"grad_norm": 0.08765338361263275, |
|
"learning_rate": 0.0009238665526090676, |
|
"loss": 1.2311, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.61696, |
|
"grad_norm": 0.0885564312338829, |
|
"learning_rate": 0.0009204448246364415, |
|
"loss": 1.2381, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.62208, |
|
"grad_norm": 0.09842797368764877, |
|
"learning_rate": 0.0009170230966638152, |
|
"loss": 1.2362, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.6272, |
|
"grad_norm": 0.08304440975189209, |
|
"learning_rate": 0.000913601368691189, |
|
"loss": 1.2635, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.63232, |
|
"grad_norm": 0.0953385978937149, |
|
"learning_rate": 0.0009101796407185628, |
|
"loss": 1.2221, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.63744, |
|
"grad_norm": 0.09365107119083405, |
|
"learning_rate": 0.0009067579127459367, |
|
"loss": 1.2449, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.64256, |
|
"grad_norm": 0.09064996242523193, |
|
"learning_rate": 0.0009033361847733106, |
|
"loss": 1.2212, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.64768, |
|
"grad_norm": 0.08681759238243103, |
|
"learning_rate": 0.0008999144568006844, |
|
"loss": 1.2581, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.6528, |
|
"grad_norm": 0.08267663419246674, |
|
"learning_rate": 0.0008964927288280582, |
|
"loss": 1.2347, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.6579199999999998, |
|
"grad_norm": 0.08745119720697403, |
|
"learning_rate": 0.000893071000855432, |
|
"loss": 1.2289, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.66304, |
|
"grad_norm": 0.08867384493350983, |
|
"learning_rate": 0.0008896492728828058, |
|
"loss": 1.2598, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.6681599999999999, |
|
"grad_norm": 0.08820460736751556, |
|
"learning_rate": 0.0008862275449101797, |
|
"loss": 1.2458, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.67328, |
|
"grad_norm": 0.1035899817943573, |
|
"learning_rate": 0.0008828058169375535, |
|
"loss": 1.2346, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.6784, |
|
"grad_norm": 0.08878592401742935, |
|
"learning_rate": 0.0008793840889649273, |
|
"loss": 1.2332, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.6835200000000001, |
|
"grad_norm": 0.08506552875041962, |
|
"learning_rate": 0.0008759623609923011, |
|
"loss": 1.2392, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.68864, |
|
"grad_norm": 0.09227900952100754, |
|
"learning_rate": 0.0008725406330196749, |
|
"loss": 1.2552, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.6937600000000002, |
|
"grad_norm": 0.08019377291202545, |
|
"learning_rate": 0.0008691189050470488, |
|
"loss": 1.2208, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.69888, |
|
"grad_norm": 0.09494514763355255, |
|
"learning_rate": 0.0008656971770744226, |
|
"loss": 1.2367, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.704, |
|
"grad_norm": 0.09500183910131454, |
|
"learning_rate": 0.0008622754491017965, |
|
"loss": 1.1998, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.70912, |
|
"grad_norm": 0.08492112904787064, |
|
"learning_rate": 0.0008588537211291703, |
|
"loss": 1.2489, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.71424, |
|
"grad_norm": 0.0840214341878891, |
|
"learning_rate": 0.0008554319931565441, |
|
"loss": 1.2358, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.71936, |
|
"grad_norm": 0.0879180058836937, |
|
"learning_rate": 0.0008520102651839178, |
|
"loss": 1.2448, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.72448, |
|
"grad_norm": 0.08872208744287491, |
|
"learning_rate": 0.0008485885372112917, |
|
"loss": 1.2432, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.7296, |
|
"grad_norm": 0.09941036254167557, |
|
"learning_rate": 0.0008451668092386655, |
|
"loss": 1.2416, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.73472, |
|
"grad_norm": 0.08928696811199188, |
|
"learning_rate": 0.0008417450812660393, |
|
"loss": 1.2545, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.73984, |
|
"grad_norm": 0.08976240456104279, |
|
"learning_rate": 0.0008383233532934132, |
|
"loss": 1.2596, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.7449599999999998, |
|
"grad_norm": 0.08667703717947006, |
|
"learning_rate": 0.0008349016253207871, |
|
"loss": 1.2499, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.75008, |
|
"grad_norm": 0.08816345036029816, |
|
"learning_rate": 0.0008314798973481608, |
|
"loss": 1.2339, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.7551999999999999, |
|
"grad_norm": 0.08712169528007507, |
|
"learning_rate": 0.0008280581693755347, |
|
"loss": 1.2206, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.76032, |
|
"grad_norm": 0.08284337818622589, |
|
"learning_rate": 0.0008246364414029085, |
|
"loss": 1.2481, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.76544, |
|
"grad_norm": 0.08372201770544052, |
|
"learning_rate": 0.0008212147134302823, |
|
"loss": 1.2258, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.7705600000000001, |
|
"grad_norm": 0.08462055772542953, |
|
"learning_rate": 0.0008177929854576561, |
|
"loss": 1.255, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.77568, |
|
"grad_norm": 0.08806449919939041, |
|
"learning_rate": 0.00081437125748503, |
|
"loss": 1.2395, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.7808000000000002, |
|
"grad_norm": 0.08771070092916489, |
|
"learning_rate": 0.0008109495295124037, |
|
"loss": 1.2165, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.78592, |
|
"grad_norm": 0.10757436603307724, |
|
"learning_rate": 0.0008075278015397775, |
|
"loss": 1.2463, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.79104, |
|
"grad_norm": 0.10293210297822952, |
|
"learning_rate": 0.0008041060735671514, |
|
"loss": 1.2633, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.79616, |
|
"grad_norm": 0.08851849287748337, |
|
"learning_rate": 0.0008006843455945253, |
|
"loss": 1.2306, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.80128, |
|
"grad_norm": 0.08972053974866867, |
|
"learning_rate": 0.0007972626176218991, |
|
"loss": 1.2173, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.8064, |
|
"grad_norm": 0.12318170815706253, |
|
"learning_rate": 0.000793840889649273, |
|
"loss": 1.2528, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.81152, |
|
"grad_norm": 0.08965172618627548, |
|
"learning_rate": 0.0007904191616766468, |
|
"loss": 1.2102, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.81664, |
|
"grad_norm": 0.10421866178512573, |
|
"learning_rate": 0.0007869974337040205, |
|
"loss": 1.2465, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.82176, |
|
"grad_norm": 0.09522471576929092, |
|
"learning_rate": 0.0007835757057313943, |
|
"loss": 1.2386, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.82688, |
|
"grad_norm": 0.10268909484148026, |
|
"learning_rate": 0.0007801539777587682, |
|
"loss": 1.2592, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 0.10645327717065811, |
|
"learning_rate": 0.000776732249786142, |
|
"loss": 1.2109, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.83712, |
|
"grad_norm": 0.11298143118619919, |
|
"learning_rate": 0.0007733105218135158, |
|
"loss": 1.2527, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.8422399999999999, |
|
"grad_norm": 0.09252069890499115, |
|
"learning_rate": 0.0007698887938408897, |
|
"loss": 1.2461, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.8473600000000001, |
|
"grad_norm": 0.1118890643119812, |
|
"learning_rate": 0.0007664670658682635, |
|
"loss": 1.2334, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.85248, |
|
"grad_norm": 0.09874032437801361, |
|
"learning_rate": 0.0007630453378956373, |
|
"loss": 1.2281, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.8576000000000001, |
|
"grad_norm": 0.09806526452302933, |
|
"learning_rate": 0.0007596236099230111, |
|
"loss": 1.2403, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.86272, |
|
"grad_norm": 0.11208830773830414, |
|
"learning_rate": 0.000756201881950385, |
|
"loss": 1.2433, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.86784, |
|
"grad_norm": 0.09039215743541718, |
|
"learning_rate": 0.0007527801539777588, |
|
"loss": 1.2366, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.87296, |
|
"grad_norm": 0.08755876123905182, |
|
"learning_rate": 0.0007493584260051326, |
|
"loss": 1.2419, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.87808, |
|
"grad_norm": 0.0822838544845581, |
|
"learning_rate": 0.0007459366980325064, |
|
"loss": 1.2155, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.8832, |
|
"grad_norm": 0.08834468573331833, |
|
"learning_rate": 0.0007425149700598802, |
|
"loss": 1.2575, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.88832, |
|
"grad_norm": 0.09723412245512009, |
|
"learning_rate": 0.000739093242087254, |
|
"loss": 1.2137, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.89344, |
|
"grad_norm": 0.11283569037914276, |
|
"learning_rate": 0.0007356715141146278, |
|
"loss": 1.2589, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.89856, |
|
"grad_norm": 0.08855767548084259, |
|
"learning_rate": 0.0007322497861420018, |
|
"loss": 1.2095, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.90368, |
|
"grad_norm": 0.09914392232894897, |
|
"learning_rate": 0.0007288280581693756, |
|
"loss": 1.2272, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.9088, |
|
"grad_norm": 0.10668183118104935, |
|
"learning_rate": 0.0007254063301967494, |
|
"loss": 1.2555, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.91392, |
|
"grad_norm": 0.09255476295948029, |
|
"learning_rate": 0.0007219846022241232, |
|
"loss": 1.2248, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.9190399999999999, |
|
"grad_norm": 0.10384318232536316, |
|
"learning_rate": 0.000718562874251497, |
|
"loss": 1.2128, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.92416, |
|
"grad_norm": 0.09946981072425842, |
|
"learning_rate": 0.0007151411462788708, |
|
"loss": 1.2323, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.9292799999999999, |
|
"grad_norm": 0.10720199346542358, |
|
"learning_rate": 0.0007117194183062447, |
|
"loss": 1.2361, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.9344000000000001, |
|
"grad_norm": 0.10026301443576813, |
|
"learning_rate": 0.0007082976903336185, |
|
"loss": 1.2443, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.93952, |
|
"grad_norm": 0.08168992400169373, |
|
"learning_rate": 0.0007048759623609923, |
|
"loss": 1.2297, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 1.9446400000000001, |
|
"grad_norm": 0.11108248680830002, |
|
"learning_rate": 0.000701454234388366, |
|
"loss": 1.2359, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.94976, |
|
"grad_norm": 0.1133013665676117, |
|
"learning_rate": 0.00069803250641574, |
|
"loss": 1.2569, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 1.95488, |
|
"grad_norm": 0.0839882493019104, |
|
"learning_rate": 0.0006946107784431138, |
|
"loss": 1.231, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.09552154690027237, |
|
"learning_rate": 0.0006911890504704876, |
|
"loss": 1.2531, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 1.96512, |
|
"grad_norm": 0.09877890348434448, |
|
"learning_rate": 0.0006877673224978615, |
|
"loss": 1.2795, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.97024, |
|
"grad_norm": 0.08949697017669678, |
|
"learning_rate": 0.0006843455945252353, |
|
"loss": 1.2104, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.97536, |
|
"grad_norm": 0.09640631079673767, |
|
"learning_rate": 0.000680923866552609, |
|
"loss": 1.2552, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.98048, |
|
"grad_norm": 0.0900396853685379, |
|
"learning_rate": 0.0006775021385799828, |
|
"loss": 1.2282, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 1.9856, |
|
"grad_norm": 0.0989600196480751, |
|
"learning_rate": 0.0006740804106073567, |
|
"loss": 1.2366, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.99072, |
|
"grad_norm": 0.08420181274414062, |
|
"learning_rate": 0.0006706586826347305, |
|
"loss": 1.2309, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 1.9958399999999998, |
|
"grad_norm": 0.0978875532746315, |
|
"learning_rate": 0.0006672369546621043, |
|
"loss": 1.2241, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.10489444434642792, |
|
"learning_rate": 0.0006638152266894783, |
|
"loss": 1.2438, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 2.00512, |
|
"grad_norm": 0.08600255101919174, |
|
"learning_rate": 0.0006603934987168521, |
|
"loss": 1.2313, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 2.01024, |
|
"grad_norm": 0.08720952272415161, |
|
"learning_rate": 0.0006569717707442258, |
|
"loss": 1.2322, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 2.01536, |
|
"grad_norm": 0.08914855122566223, |
|
"learning_rate": 0.0006535500427715997, |
|
"loss": 1.2284, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 2.02048, |
|
"grad_norm": 0.08840183168649673, |
|
"learning_rate": 0.0006501283147989735, |
|
"loss": 1.2535, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.0256, |
|
"grad_norm": 0.08714258670806885, |
|
"learning_rate": 0.0006467065868263473, |
|
"loss": 1.2358, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 2.03072, |
|
"grad_norm": 0.08174372464418411, |
|
"learning_rate": 0.0006432848588537211, |
|
"loss": 1.2419, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 2.03584, |
|
"grad_norm": 0.08205553144216537, |
|
"learning_rate": 0.000639863130881095, |
|
"loss": 1.2222, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 2.04096, |
|
"grad_norm": 0.0794735699892044, |
|
"learning_rate": 0.0006364414029084687, |
|
"loss": 1.2092, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 2.04608, |
|
"grad_norm": 0.07929161936044693, |
|
"learning_rate": 0.0006330196749358425, |
|
"loss": 1.2066, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.04608, |
|
"eval_loss": 1.2471669912338257, |
|
"eval_runtime": 280.5686, |
|
"eval_samples_per_second": 8.91, |
|
"eval_steps_per_second": 1.116, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.0512, |
|
"grad_norm": 0.08398760110139847, |
|
"learning_rate": 0.0006295979469632165, |
|
"loss": 1.2302, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 2.05632, |
|
"grad_norm": 0.08723915368318558, |
|
"learning_rate": 0.0006261762189905903, |
|
"loss": 1.2264, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 2.06144, |
|
"grad_norm": 0.0817415714263916, |
|
"learning_rate": 0.0006227544910179641, |
|
"loss": 1.2301, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 2.06656, |
|
"grad_norm": 0.08779250085353851, |
|
"learning_rate": 0.000619332763045338, |
|
"loss": 1.2228, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 2.07168, |
|
"grad_norm": 0.09133391082286835, |
|
"learning_rate": 0.0006159110350727117, |
|
"loss": 1.2415, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.0768, |
|
"grad_norm": 0.08566722273826599, |
|
"learning_rate": 0.0006124893071000855, |
|
"loss": 1.2121, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 2.08192, |
|
"grad_norm": 0.08748096972703934, |
|
"learning_rate": 0.0006090675791274593, |
|
"loss": 1.2413, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 2.08704, |
|
"grad_norm": 0.0854722335934639, |
|
"learning_rate": 0.0006056458511548332, |
|
"loss": 1.2326, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 2.09216, |
|
"grad_norm": 0.08618238568305969, |
|
"learning_rate": 0.000602224123182207, |
|
"loss": 1.2216, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 2.09728, |
|
"grad_norm": 0.08122539520263672, |
|
"learning_rate": 0.0005988023952095808, |
|
"loss": 1.2048, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.1024, |
|
"grad_norm": 0.08748337626457214, |
|
"learning_rate": 0.0005953806672369548, |
|
"loss": 1.2377, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 2.10752, |
|
"grad_norm": 0.08610134571790695, |
|
"learning_rate": 0.0005919589392643285, |
|
"loss": 1.2312, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 2.11264, |
|
"grad_norm": 0.09176385402679443, |
|
"learning_rate": 0.0005885372112917023, |
|
"loss": 1.234, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 2.11776, |
|
"grad_norm": 0.08381321281194687, |
|
"learning_rate": 0.0005851154833190762, |
|
"loss": 1.2369, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 2.12288, |
|
"grad_norm": 0.08841554820537567, |
|
"learning_rate": 0.00058169375534645, |
|
"loss": 1.2283, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.128, |
|
"grad_norm": 0.09985855966806412, |
|
"learning_rate": 0.0005782720273738238, |
|
"loss": 1.2144, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 2.13312, |
|
"grad_norm": 0.09444481134414673, |
|
"learning_rate": 0.0005748502994011976, |
|
"loss": 1.2503, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 2.13824, |
|
"grad_norm": 0.08858395367860794, |
|
"learning_rate": 0.0005714285714285714, |
|
"loss": 1.2497, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 2.14336, |
|
"grad_norm": 0.08610956370830536, |
|
"learning_rate": 0.0005680068434559452, |
|
"loss": 1.2444, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 2.14848, |
|
"grad_norm": 0.08294233679771423, |
|
"learning_rate": 0.000564585115483319, |
|
"loss": 1.2388, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.1536, |
|
"grad_norm": 0.08838624507188797, |
|
"learning_rate": 0.000561163387510693, |
|
"loss": 1.2353, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 2.15872, |
|
"grad_norm": 0.07888966798782349, |
|
"learning_rate": 0.0005577416595380668, |
|
"loss": 1.2085, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 2.16384, |
|
"grad_norm": 0.08408137410879135, |
|
"learning_rate": 0.0005543199315654406, |
|
"loss": 1.1922, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 2.16896, |
|
"grad_norm": 0.08245803415775299, |
|
"learning_rate": 0.0005508982035928143, |
|
"loss": 1.2478, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 2.17408, |
|
"grad_norm": 0.07858633249998093, |
|
"learning_rate": 0.0005474764756201882, |
|
"loss": 1.2203, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.1792, |
|
"grad_norm": 0.1002994254231453, |
|
"learning_rate": 0.000544054747647562, |
|
"loss": 1.2295, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 2.18432, |
|
"grad_norm": 0.08837361633777618, |
|
"learning_rate": 0.0005406330196749358, |
|
"loss": 1.238, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 2.18944, |
|
"grad_norm": 0.09690374881029129, |
|
"learning_rate": 0.0005372112917023097, |
|
"loss": 1.2301, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 2.19456, |
|
"grad_norm": 0.1030053198337555, |
|
"learning_rate": 0.0005337895637296835, |
|
"loss": 1.2365, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 2.19968, |
|
"grad_norm": 0.08212369680404663, |
|
"learning_rate": 0.0005303678357570573, |
|
"loss": 1.2325, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.2048, |
|
"grad_norm": 0.10352316498756409, |
|
"learning_rate": 0.0005269461077844312, |
|
"loss": 1.2341, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 2.20992, |
|
"grad_norm": 0.08346536755561829, |
|
"learning_rate": 0.000523524379811805, |
|
"loss": 1.2237, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 2.21504, |
|
"grad_norm": 0.11156366020441055, |
|
"learning_rate": 0.0005201026518391788, |
|
"loss": 1.2325, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 2.22016, |
|
"grad_norm": 0.10335463285446167, |
|
"learning_rate": 0.0005166809238665526, |
|
"loss": 1.2258, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 2.22528, |
|
"grad_norm": 0.09020327776670456, |
|
"learning_rate": 0.0005132591958939265, |
|
"loss": 1.2305, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.2304, |
|
"grad_norm": 0.09839983284473419, |
|
"learning_rate": 0.0005098374679213003, |
|
"loss": 1.2529, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 2.23552, |
|
"grad_norm": 0.1528375744819641, |
|
"learning_rate": 0.000506415739948674, |
|
"loss": 1.2365, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 2.24064, |
|
"grad_norm": 0.08589835464954376, |
|
"learning_rate": 0.0005029940119760479, |
|
"loss": 1.2407, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 2.24576, |
|
"grad_norm": 0.09676992893218994, |
|
"learning_rate": 0.0004995722840034218, |
|
"loss": 1.2244, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 2.25088, |
|
"grad_norm": 0.09816568344831467, |
|
"learning_rate": 0.0004961505560307955, |
|
"loss": 1.2439, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.2560000000000002, |
|
"grad_norm": 0.09415734559297562, |
|
"learning_rate": 0.0004927288280581693, |
|
"loss": 1.2329, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 2.26112, |
|
"grad_norm": 0.0947481021285057, |
|
"learning_rate": 0.0004893071000855432, |
|
"loss": 1.2161, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 2.26624, |
|
"grad_norm": 0.09034735709428787, |
|
"learning_rate": 0.00048588537211291706, |
|
"loss": 1.25, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 2.27136, |
|
"grad_norm": 0.09715861082077026, |
|
"learning_rate": 0.0004824636441402909, |
|
"loss": 1.2536, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 2.27648, |
|
"grad_norm": 0.08793841302394867, |
|
"learning_rate": 0.00047904191616766467, |
|
"loss": 1.2425, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.2816, |
|
"grad_norm": 0.08474820852279663, |
|
"learning_rate": 0.0004756201881950385, |
|
"loss": 1.2409, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 2.28672, |
|
"grad_norm": 0.0954166129231453, |
|
"learning_rate": 0.0004721984602224123, |
|
"loss": 1.2292, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 2.29184, |
|
"grad_norm": 0.08861096948385239, |
|
"learning_rate": 0.00046877673224978616, |
|
"loss": 1.2509, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 2.29696, |
|
"grad_norm": 0.08726037293672562, |
|
"learning_rate": 0.00046535500427716, |
|
"loss": 1.2471, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 2.30208, |
|
"grad_norm": 0.10455521196126938, |
|
"learning_rate": 0.0004619332763045338, |
|
"loss": 1.2025, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.3072, |
|
"grad_norm": 0.09069357812404633, |
|
"learning_rate": 0.0004585115483319076, |
|
"loss": 1.2267, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 2.31232, |
|
"grad_norm": 0.09878482669591904, |
|
"learning_rate": 0.0004550898203592814, |
|
"loss": 1.2276, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 2.31744, |
|
"grad_norm": 0.1100214272737503, |
|
"learning_rate": 0.0004516680923866553, |
|
"loss": 1.2264, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 2.32256, |
|
"grad_norm": 0.09539143741130829, |
|
"learning_rate": 0.0004482463644140291, |
|
"loss": 1.2298, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 2.32768, |
|
"grad_norm": 0.0885070264339447, |
|
"learning_rate": 0.0004448246364414029, |
|
"loss": 1.2108, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.3327999999999998, |
|
"grad_norm": 0.08761118352413177, |
|
"learning_rate": 0.00044140290846877674, |
|
"loss": 1.2393, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 2.33792, |
|
"grad_norm": 0.09102723747491837, |
|
"learning_rate": 0.00043798118049615057, |
|
"loss": 1.2372, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 2.3430400000000002, |
|
"grad_norm": 0.08345375210046768, |
|
"learning_rate": 0.0004345594525235244, |
|
"loss": 1.2152, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 2.34816, |
|
"grad_norm": 0.0871756300330162, |
|
"learning_rate": 0.00043113772455089823, |
|
"loss": 1.2539, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 2.35328, |
|
"grad_norm": 0.08344704657793045, |
|
"learning_rate": 0.00042771599657827206, |
|
"loss": 1.2553, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.3584, |
|
"grad_norm": 0.09925805032253265, |
|
"learning_rate": 0.00042429426860564584, |
|
"loss": 1.2407, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 2.36352, |
|
"grad_norm": 0.08815500140190125, |
|
"learning_rate": 0.00042087254063301967, |
|
"loss": 1.2165, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 2.36864, |
|
"grad_norm": 0.08601918071508408, |
|
"learning_rate": 0.00041745081266039355, |
|
"loss": 1.2335, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 2.37376, |
|
"grad_norm": 0.08933749049901962, |
|
"learning_rate": 0.0004140290846877673, |
|
"loss": 1.2226, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 2.37888, |
|
"grad_norm": 0.09617882966995239, |
|
"learning_rate": 0.00041060735671514116, |
|
"loss": 1.2232, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.384, |
|
"grad_norm": 0.08219394832849503, |
|
"learning_rate": 0.000407185628742515, |
|
"loss": 1.2143, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 2.38912, |
|
"grad_norm": 0.08431462943553925, |
|
"learning_rate": 0.00040376390076988876, |
|
"loss": 1.2416, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 2.39424, |
|
"grad_norm": 0.09297817200422287, |
|
"learning_rate": 0.00040034217279726265, |
|
"loss": 1.2151, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 2.39936, |
|
"grad_norm": 0.08247403055429459, |
|
"learning_rate": 0.0003969204448246365, |
|
"loss": 1.2291, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 2.40448, |
|
"grad_norm": 0.09852425754070282, |
|
"learning_rate": 0.00039349871685201025, |
|
"loss": 1.2351, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.4096, |
|
"grad_norm": 0.08745532482862473, |
|
"learning_rate": 0.0003900769888793841, |
|
"loss": 1.2343, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 2.41472, |
|
"grad_norm": 0.0770939365029335, |
|
"learning_rate": 0.0003866552609067579, |
|
"loss": 1.229, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 2.4198399999999998, |
|
"grad_norm": 0.08143922686576843, |
|
"learning_rate": 0.00038323353293413174, |
|
"loss": 1.2168, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 2.42496, |
|
"grad_norm": 0.0852786973118782, |
|
"learning_rate": 0.0003798118049615056, |
|
"loss": 1.237, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 2.4300800000000002, |
|
"grad_norm": 0.09687593579292297, |
|
"learning_rate": 0.0003763900769888794, |
|
"loss": 1.2508, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.4352, |
|
"grad_norm": 0.09721924364566803, |
|
"learning_rate": 0.0003729683490162532, |
|
"loss": 1.2524, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 2.44032, |
|
"grad_norm": 0.09008084982633591, |
|
"learning_rate": 0.000369546621043627, |
|
"loss": 1.244, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 2.44544, |
|
"grad_norm": 0.0971183255314827, |
|
"learning_rate": 0.0003661248930710009, |
|
"loss": 1.2339, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 2.45056, |
|
"grad_norm": 0.09916377067565918, |
|
"learning_rate": 0.0003627031650983747, |
|
"loss": 1.2361, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 2.45568, |
|
"grad_norm": 0.08370574563741684, |
|
"learning_rate": 0.0003592814371257485, |
|
"loss": 1.2155, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.4608, |
|
"grad_norm": 0.08623719960451126, |
|
"learning_rate": 0.00035585970915312233, |
|
"loss": 1.2165, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 2.46592, |
|
"grad_norm": 0.07589856535196304, |
|
"learning_rate": 0.00035243798118049616, |
|
"loss": 1.2243, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 2.47104, |
|
"grad_norm": 0.07783632725477219, |
|
"learning_rate": 0.00034901625320787, |
|
"loss": 1.2296, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 2.47616, |
|
"grad_norm": 0.08424780517816544, |
|
"learning_rate": 0.0003455945252352438, |
|
"loss": 1.2312, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 2.48128, |
|
"grad_norm": 0.07945290952920914, |
|
"learning_rate": 0.00034217279726261765, |
|
"loss": 1.217, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.4864, |
|
"grad_norm": 0.0799039751291275, |
|
"learning_rate": 0.0003387510692899914, |
|
"loss": 1.2201, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 2.49152, |
|
"grad_norm": 0.09106255322694778, |
|
"learning_rate": 0.00033532934131736525, |
|
"loss": 1.2159, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 2.49664, |
|
"grad_norm": 0.08736411482095718, |
|
"learning_rate": 0.00033190761334473914, |
|
"loss": 1.2298, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 2.50176, |
|
"grad_norm": 0.08567455410957336, |
|
"learning_rate": 0.0003284858853721129, |
|
"loss": 1.221, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 2.5068799999999998, |
|
"grad_norm": 0.10240741819143295, |
|
"learning_rate": 0.00032506415739948674, |
|
"loss": 1.2296, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.512, |
|
"grad_norm": 0.0851578339934349, |
|
"learning_rate": 0.0003216424294268606, |
|
"loss": 1.233, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 2.5171200000000002, |
|
"grad_norm": 0.08598732203245163, |
|
"learning_rate": 0.00031822070145423435, |
|
"loss": 1.1937, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 2.52224, |
|
"grad_norm": 0.08547403663396835, |
|
"learning_rate": 0.00031479897348160823, |
|
"loss": 1.2158, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 2.52736, |
|
"grad_norm": 0.08678396046161652, |
|
"learning_rate": 0.00031137724550898206, |
|
"loss": 1.2113, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 2.53248, |
|
"grad_norm": 0.08527970314025879, |
|
"learning_rate": 0.00030795551753635584, |
|
"loss": 1.2508, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.5376, |
|
"grad_norm": 0.08052323758602142, |
|
"learning_rate": 0.00030453378956372967, |
|
"loss": 1.2324, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 2.54272, |
|
"grad_norm": 0.09333521127700806, |
|
"learning_rate": 0.0003011120615911035, |
|
"loss": 1.2381, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 2.54784, |
|
"grad_norm": 0.08944050967693329, |
|
"learning_rate": 0.0002976903336184774, |
|
"loss": 1.2417, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 2.55296, |
|
"grad_norm": 0.0959937795996666, |
|
"learning_rate": 0.00029426860564585116, |
|
"loss": 1.2346, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 2.55808, |
|
"grad_norm": 0.08061899244785309, |
|
"learning_rate": 0.000290846877673225, |
|
"loss": 1.2377, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.5632, |
|
"grad_norm": 0.08358662575483322, |
|
"learning_rate": 0.0002874251497005988, |
|
"loss": 1.2432, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 2.56832, |
|
"grad_norm": 0.08278947323560715, |
|
"learning_rate": 0.0002840034217279726, |
|
"loss": 1.222, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 2.5734399999999997, |
|
"grad_norm": 0.08546797931194305, |
|
"learning_rate": 0.0002805816937553465, |
|
"loss": 1.2379, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 2.57856, |
|
"grad_norm": 0.08106214553117752, |
|
"learning_rate": 0.0002771599657827203, |
|
"loss": 1.217, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 2.58368, |
|
"grad_norm": 0.08738037943840027, |
|
"learning_rate": 0.0002737382378100941, |
|
"loss": 1.2202, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.5888, |
|
"grad_norm": 0.07763461023569107, |
|
"learning_rate": 0.0002703165098374679, |
|
"loss": 1.2335, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 2.59392, |
|
"grad_norm": 0.09023375809192657, |
|
"learning_rate": 0.00026689478186484175, |
|
"loss": 1.2557, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 2.59904, |
|
"grad_norm": 0.08477311581373215, |
|
"learning_rate": 0.0002634730538922156, |
|
"loss": 1.23, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 2.6041600000000003, |
|
"grad_norm": 0.0802718847990036, |
|
"learning_rate": 0.0002600513259195894, |
|
"loss": 1.2502, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 2.60928, |
|
"grad_norm": 0.08197642862796783, |
|
"learning_rate": 0.00025662959794696324, |
|
"loss": 1.2117, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.6144, |
|
"grad_norm": 0.07977724820375443, |
|
"learning_rate": 0.000253207869974337, |
|
"loss": 1.2223, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 2.61952, |
|
"grad_norm": 0.084455206990242, |
|
"learning_rate": 0.0002497861420017109, |
|
"loss": 1.2296, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 2.62464, |
|
"grad_norm": 0.08407705277204514, |
|
"learning_rate": 0.00024636441402908467, |
|
"loss": 1.2466, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 2.62976, |
|
"grad_norm": 0.09497237205505371, |
|
"learning_rate": 0.00024294268605645853, |
|
"loss": 1.2204, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 2.63488, |
|
"grad_norm": 0.07946959137916565, |
|
"learning_rate": 0.00023952095808383233, |
|
"loss": 1.2293, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.08926168084144592, |
|
"learning_rate": 0.00023609923011120616, |
|
"loss": 1.2303, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 2.64512, |
|
"grad_norm": 0.08781218528747559, |
|
"learning_rate": 0.00023267750213858, |
|
"loss": 1.2215, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 2.65024, |
|
"grad_norm": 0.10436686873435974, |
|
"learning_rate": 0.0002292557741659538, |
|
"loss": 1.2446, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 2.65536, |
|
"grad_norm": 0.08655078709125519, |
|
"learning_rate": 0.00022583404619332765, |
|
"loss": 1.1889, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 2.6604799999999997, |
|
"grad_norm": 0.08525776863098145, |
|
"learning_rate": 0.00022241231822070146, |
|
"loss": 1.2246, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.6656, |
|
"grad_norm": 0.08515007048845291, |
|
"learning_rate": 0.00021899059024807529, |
|
"loss": 1.2083, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 2.67072, |
|
"grad_norm": 0.08326171338558197, |
|
"learning_rate": 0.00021556886227544912, |
|
"loss": 1.2463, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 2.67584, |
|
"grad_norm": 0.07543444633483887, |
|
"learning_rate": 0.00021214713430282292, |
|
"loss": 1.2229, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 2.68096, |
|
"grad_norm": 0.08731929957866669, |
|
"learning_rate": 0.00020872540633019678, |
|
"loss": 1.2431, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 2.68608, |
|
"grad_norm": 0.0846877247095108, |
|
"learning_rate": 0.00020530367835757058, |
|
"loss": 1.2276, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.6912000000000003, |
|
"grad_norm": 0.08184527605772018, |
|
"learning_rate": 0.00020188195038494438, |
|
"loss": 1.2151, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 2.69632, |
|
"grad_norm": 0.0830821543931961, |
|
"learning_rate": 0.00019846022241231824, |
|
"loss": 1.23, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 2.70144, |
|
"grad_norm": 0.07801831513643265, |
|
"learning_rate": 0.00019503849443969204, |
|
"loss": 1.2237, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 2.70656, |
|
"grad_norm": 0.07970487326383591, |
|
"learning_rate": 0.00019161676646706587, |
|
"loss": 1.2296, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.71168, |
|
"grad_norm": 0.08674521744251251, |
|
"learning_rate": 0.0001881950384944397, |
|
"loss": 1.2416, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.7168, |
|
"grad_norm": 0.0784011259675026, |
|
"learning_rate": 0.0001847733105218135, |
|
"loss": 1.2174, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 2.72192, |
|
"grad_norm": 0.08390358090400696, |
|
"learning_rate": 0.00018135158254918736, |
|
"loss": 1.2221, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 2.72704, |
|
"grad_norm": 0.08034460991621017, |
|
"learning_rate": 0.00017792985457656116, |
|
"loss": 1.2367, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 2.73216, |
|
"grad_norm": 0.0805404931306839, |
|
"learning_rate": 0.000174508126603935, |
|
"loss": 1.2228, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 2.73728, |
|
"grad_norm": 0.10428917407989502, |
|
"learning_rate": 0.00017108639863130882, |
|
"loss": 1.2331, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.7424, |
|
"grad_norm": 0.07880023866891861, |
|
"learning_rate": 0.00016766467065868263, |
|
"loss": 1.2151, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 2.7475199999999997, |
|
"grad_norm": 0.08330074697732925, |
|
"learning_rate": 0.00016424294268605646, |
|
"loss": 1.201, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 2.75264, |
|
"grad_norm": 0.08316068351268768, |
|
"learning_rate": 0.0001608212147134303, |
|
"loss": 1.2206, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 2.75776, |
|
"grad_norm": 0.08193733543157578, |
|
"learning_rate": 0.00015739948674080412, |
|
"loss": 1.2133, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 2.76288, |
|
"grad_norm": 0.0799107700586319, |
|
"learning_rate": 0.00015397775876817792, |
|
"loss": 1.2202, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.768, |
|
"grad_norm": 0.07523773610591888, |
|
"learning_rate": 0.00015055603079555175, |
|
"loss": 1.2088, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 2.77312, |
|
"grad_norm": 0.07782167941331863, |
|
"learning_rate": 0.00014713430282292558, |
|
"loss": 1.2191, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 2.7782400000000003, |
|
"grad_norm": 0.0814930647611618, |
|
"learning_rate": 0.0001437125748502994, |
|
"loss": 1.2096, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 2.78336, |
|
"grad_norm": 0.07619259506464005, |
|
"learning_rate": 0.00014029084687767324, |
|
"loss": 1.2297, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 2.78848, |
|
"grad_norm": 0.08152459561824799, |
|
"learning_rate": 0.00013686911890504704, |
|
"loss": 1.2281, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.7936, |
|
"grad_norm": 0.08513466268777847, |
|
"learning_rate": 0.00013344739093242087, |
|
"loss": 1.2183, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 2.79872, |
|
"grad_norm": 0.07648808509111404, |
|
"learning_rate": 0.0001300256629597947, |
|
"loss": 1.2171, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 2.80384, |
|
"grad_norm": 0.08608166873455048, |
|
"learning_rate": 0.0001266039349871685, |
|
"loss": 1.233, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 2.80896, |
|
"grad_norm": 0.08903096616268158, |
|
"learning_rate": 0.00012318220701454234, |
|
"loss": 1.2329, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 2.81408, |
|
"grad_norm": 0.07557443529367447, |
|
"learning_rate": 0.00011976047904191617, |
|
"loss": 1.2333, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.8192, |
|
"grad_norm": 0.0815119668841362, |
|
"learning_rate": 0.00011633875106929, |
|
"loss": 1.2353, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 2.82432, |
|
"grad_norm": 0.08905310928821564, |
|
"learning_rate": 0.00011291702309666383, |
|
"loss": 1.2255, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 2.82944, |
|
"grad_norm": 0.08789879828691483, |
|
"learning_rate": 0.00010949529512403764, |
|
"loss": 1.2139, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 2.8345599999999997, |
|
"grad_norm": 0.08430198580026627, |
|
"learning_rate": 0.00010607356715141146, |
|
"loss": 1.229, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 2.83968, |
|
"grad_norm": 0.07988926768302917, |
|
"learning_rate": 0.00010265183917878529, |
|
"loss": 1.2316, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.8448, |
|
"grad_norm": 0.08727908134460449, |
|
"learning_rate": 9.923011120615912e-05, |
|
"loss": 1.2237, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 2.84992, |
|
"grad_norm": 0.10279367119073868, |
|
"learning_rate": 9.580838323353294e-05, |
|
"loss": 1.2565, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 2.85504, |
|
"grad_norm": 0.08122528344392776, |
|
"learning_rate": 9.238665526090675e-05, |
|
"loss": 1.2353, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 2.86016, |
|
"grad_norm": 0.09036324173212051, |
|
"learning_rate": 8.896492728828058e-05, |
|
"loss": 1.2337, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 2.8652800000000003, |
|
"grad_norm": 0.07413888722658157, |
|
"learning_rate": 8.554319931565441e-05, |
|
"loss": 1.2367, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.8704, |
|
"grad_norm": 0.08744188398122787, |
|
"learning_rate": 8.212147134302823e-05, |
|
"loss": 1.234, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 2.87552, |
|
"grad_norm": 0.07657689601182938, |
|
"learning_rate": 7.869974337040206e-05, |
|
"loss": 1.2094, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 2.88064, |
|
"grad_norm": 0.09120559692382812, |
|
"learning_rate": 7.527801539777588e-05, |
|
"loss": 1.2473, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 2.88576, |
|
"grad_norm": 0.08199866861104965, |
|
"learning_rate": 7.18562874251497e-05, |
|
"loss": 1.2384, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 2.89088, |
|
"grad_norm": 0.07703917473554611, |
|
"learning_rate": 6.843455945252352e-05, |
|
"loss": 1.2197, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.896, |
|
"grad_norm": 0.08120223879814148, |
|
"learning_rate": 6.501283147989735e-05, |
|
"loss": 1.2343, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 2.90112, |
|
"grad_norm": 0.08173457533121109, |
|
"learning_rate": 6.159110350727117e-05, |
|
"loss": 1.2422, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 2.90624, |
|
"grad_norm": 0.08017323166131973, |
|
"learning_rate": 5.8169375534645e-05, |
|
"loss": 1.1985, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 2.91136, |
|
"grad_norm": 0.09028081595897675, |
|
"learning_rate": 5.474764756201882e-05, |
|
"loss": 1.1902, |
|
"step": 1138 |
|
}, |
|
{ |
|
"epoch": 2.91648, |
|
"grad_norm": 0.07673865556716919, |
|
"learning_rate": 5.1325919589392645e-05, |
|
"loss": 1.2292, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.9215999999999998, |
|
"grad_norm": 0.08590974658727646, |
|
"learning_rate": 4.790419161676647e-05, |
|
"loss": 1.2227, |
|
"step": 1142 |
|
}, |
|
{ |
|
"epoch": 2.92672, |
|
"grad_norm": 0.07928025722503662, |
|
"learning_rate": 4.448246364414029e-05, |
|
"loss": 1.2426, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 2.9318400000000002, |
|
"grad_norm": 0.07864856719970703, |
|
"learning_rate": 4.1060735671514114e-05, |
|
"loss": 1.2476, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 2.93696, |
|
"grad_norm": 0.08154473453760147, |
|
"learning_rate": 3.763900769888794e-05, |
|
"loss": 1.2081, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 2.94208, |
|
"grad_norm": 0.07814584672451019, |
|
"learning_rate": 3.421727972626176e-05, |
|
"loss": 1.2463, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.9472, |
|
"grad_norm": 0.07772421091794968, |
|
"learning_rate": 3.0795551753635584e-05, |
|
"loss": 1.2447, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 2.9523200000000003, |
|
"grad_norm": 0.08610265702009201, |
|
"learning_rate": 2.737382378100941e-05, |
|
"loss": 1.226, |
|
"step": 1154 |
|
}, |
|
{ |
|
"epoch": 2.95744, |
|
"grad_norm": 0.086619071662426, |
|
"learning_rate": 2.3952095808383234e-05, |
|
"loss": 1.2616, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 2.96256, |
|
"grad_norm": 0.07644681632518768, |
|
"learning_rate": 2.0530367835757057e-05, |
|
"loss": 1.2244, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 2.96768, |
|
"grad_norm": 0.08778993040323257, |
|
"learning_rate": 1.710863986313088e-05, |
|
"loss": 1.2259, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.9728, |
|
"grad_norm": 0.0812673419713974, |
|
"learning_rate": 1.3686911890504705e-05, |
|
"loss": 1.2081, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 2.97792, |
|
"grad_norm": 0.07923007756471634, |
|
"learning_rate": 1.0265183917878529e-05, |
|
"loss": 1.2297, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 2.98304, |
|
"grad_norm": 0.07516805827617645, |
|
"learning_rate": 6.843455945252353e-06, |
|
"loss": 1.2277, |
|
"step": 1166 |
|
}, |
|
{ |
|
"epoch": 2.98816, |
|
"grad_norm": 0.08043860644102097, |
|
"learning_rate": 3.4217279726261763e-06, |
|
"loss": 1.2087, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 2.99328, |
|
"grad_norm": 0.07992921024560928, |
|
"learning_rate": 0.0, |
|
"loss": 1.2479, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.99328, |
|
"step": 1170, |
|
"total_flos": 1.3204661001922806e+19, |
|
"train_loss": 1.2500826900840825, |
|
"train_runtime": 40705.2136, |
|
"train_samples_per_second": 7.37, |
|
"train_steps_per_second": 0.029 |
|
}, |
|
{ |
|
"epoch": 2.99328, |
|
"eval_loss": 1.2431308031082153, |
|
"eval_runtime": 280.041, |
|
"eval_samples_per_second": 8.927, |
|
"eval_steps_per_second": 1.118, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.99328, |
|
"eval_loss": 1.2251627445220947, |
|
"eval_runtime": 300.2276, |
|
"eval_samples_per_second": 8.327, |
|
"eval_steps_per_second": 1.043, |
|
"step": 1170 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 1170, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3204661001922806e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|