{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 203,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0049261083743842365,
"grad_norm": 28.522218704223633,
"learning_rate": 1e-05,
"loss": 1.8038,
"step": 1
},
{
"epoch": 0.009852216748768473,
"grad_norm": 10.439436912536621,
"learning_rate": 9.999401258983426e-06,
"loss": 1.7024,
"step": 2
},
{
"epoch": 0.014778325123152709,
"grad_norm": 10.16576099395752,
"learning_rate": 9.997605179330018e-06,
"loss": 1.6585,
"step": 3
},
{
"epoch": 0.019704433497536946,
"grad_norm": 8.24977970123291,
"learning_rate": 9.994612191194407e-06,
"loss": 1.5447,
"step": 4
},
{
"epoch": 0.024630541871921183,
"grad_norm": 4.327300071716309,
"learning_rate": 9.990423011386489e-06,
"loss": 1.4008,
"step": 5
},
{
"epoch": 0.029556650246305417,
"grad_norm": 3.9305260181427,
"learning_rate": 9.98503864319978e-06,
"loss": 1.4287,
"step": 6
},
{
"epoch": 0.034482758620689655,
"grad_norm": 2.2586233615875244,
"learning_rate": 9.978460376171113e-06,
"loss": 1.4288,
"step": 7
},
{
"epoch": 0.03940886699507389,
"grad_norm": 2.152981996536255,
"learning_rate": 9.970689785771798e-06,
"loss": 1.2331,
"step": 8
},
{
"epoch": 0.04433497536945813,
"grad_norm": 2.3123421669006348,
"learning_rate": 9.961728733030318e-06,
"loss": 1.5576,
"step": 9
},
{
"epoch": 0.04926108374384237,
"grad_norm": 2.1223628520965576,
"learning_rate": 9.951579364086603e-06,
"loss": 1.3364,
"step": 10
},
{
"epoch": 0.054187192118226604,
"grad_norm": 2.337981939315796,
"learning_rate": 9.940244109678043e-06,
"loss": 1.3432,
"step": 11
},
{
"epoch": 0.059113300492610835,
"grad_norm": 2.3937160968780518,
"learning_rate": 9.927725684557339e-06,
"loss": 1.3346,
"step": 12
},
{
"epoch": 0.06403940886699508,
"grad_norm": 1.533258080482483,
"learning_rate": 9.914027086842323e-06,
"loss": 1.2448,
"step": 13
},
{
"epoch": 0.06896551724137931,
"grad_norm": 1.9601627588272095,
"learning_rate": 9.899151597297923e-06,
"loss": 1.2093,
"step": 14
},
{
"epoch": 0.07389162561576355,
"grad_norm": 1.4371508359909058,
"learning_rate": 9.883102778550434e-06,
"loss": 1.0955,
"step": 15
},
{
"epoch": 0.07881773399014778,
"grad_norm": 1.5759714841842651,
"learning_rate": 9.865884474234275e-06,
"loss": 1.2615,
"step": 16
},
{
"epoch": 0.08374384236453201,
"grad_norm": 1.4584101438522339,
"learning_rate": 9.847500808071458e-06,
"loss": 1.2764,
"step": 17
},
{
"epoch": 0.08866995073891626,
"grad_norm": 1.432774305343628,
"learning_rate": 9.82795618288397e-06,
"loss": 1.11,
"step": 18
},
{
"epoch": 0.09359605911330049,
"grad_norm": 1.7812994718551636,
"learning_rate": 9.807255279539313e-06,
"loss": 1.2687,
"step": 19
},
{
"epoch": 0.09852216748768473,
"grad_norm": 1.3773082494735718,
"learning_rate": 9.78540305582945e-06,
"loss": 1.1375,
"step": 20
},
{
"epoch": 0.10344827586206896,
"grad_norm": 1.4377540349960327,
"learning_rate": 9.762404745283439e-06,
"loss": 1.1887,
"step": 21
},
{
"epoch": 0.10837438423645321,
"grad_norm": 1.2567236423492432,
"learning_rate": 9.738265855914014e-06,
"loss": 1.1226,
"step": 22
},
{
"epoch": 0.11330049261083744,
"grad_norm": 1.288097620010376,
"learning_rate": 9.712992168898436e-06,
"loss": 1.1442,
"step": 23
},
{
"epoch": 0.11822660098522167,
"grad_norm": 1.3083038330078125,
"learning_rate": 9.686589737193929e-06,
"loss": 1.1809,
"step": 24
},
{
"epoch": 0.12315270935960591,
"grad_norm": 1.0722836256027222,
"learning_rate": 9.659064884088017e-06,
"loss": 1.1327,
"step": 25
},
{
"epoch": 0.12807881773399016,
"grad_norm": 1.1409716606140137,
"learning_rate": 9.630424201684105e-06,
"loss": 1.0866,
"step": 26
},
{
"epoch": 0.1330049261083744,
"grad_norm": 1.1258468627929688,
"learning_rate": 9.600674549322716e-06,
"loss": 1.0847,
"step": 27
},
{
"epoch": 0.13793103448275862,
"grad_norm": 1.0608943700790405,
"learning_rate": 9.569823051938689e-06,
"loss": 0.9715,
"step": 28
},
{
"epoch": 0.14285714285714285,
"grad_norm": 1.1090885400772095,
"learning_rate": 9.537877098354787e-06,
"loss": 1.0492,
"step": 29
},
{
"epoch": 0.1477832512315271,
"grad_norm": 1.2303950786590576,
"learning_rate": 9.504844339512096e-06,
"loss": 0.995,
"step": 30
},
{
"epoch": 0.15270935960591134,
"grad_norm": 1.2325893640518188,
"learning_rate": 9.470732686637665e-06,
"loss": 1.1353,
"step": 31
},
{
"epoch": 0.15763546798029557,
"grad_norm": 1.0923973321914673,
"learning_rate": 9.435550309349776e-06,
"loss": 1.0256,
"step": 32
},
{
"epoch": 0.1625615763546798,
"grad_norm": 1.8741207122802734,
"learning_rate": 9.399305633701372e-06,
"loss": 1.117,
"step": 33
},
{
"epoch": 0.16748768472906403,
"grad_norm": 1.292672038078308,
"learning_rate": 9.36200734016203e-06,
"loss": 1.0424,
"step": 34
},
{
"epoch": 0.1724137931034483,
"grad_norm": 1.2905791997909546,
"learning_rate": 9.32366436153902e-06,
"loss": 1.1793,
"step": 35
},
{
"epoch": 0.17733990147783252,
"grad_norm": 1.1905455589294434,
"learning_rate": 9.284285880837947e-06,
"loss": 1.0032,
"step": 36
},
{
"epoch": 0.18226600985221675,
"grad_norm": 1.1533136367797852,
"learning_rate": 9.243881329063436e-06,
"loss": 1.0406,
"step": 37
},
{
"epoch": 0.18719211822660098,
"grad_norm": 1.2299302816390991,
"learning_rate": 9.202460382960449e-06,
"loss": 1.1085,
"step": 38
},
{
"epoch": 0.1921182266009852,
"grad_norm": 1.0995800495147705,
"learning_rate": 9.160032962696734e-06,
"loss": 1.0465,
"step": 39
},
{
"epoch": 0.19704433497536947,
"grad_norm": 1.2899202108383179,
"learning_rate": 9.116609229486992e-06,
"loss": 1.0072,
"step": 40
},
{
"epoch": 0.2019704433497537,
"grad_norm": 1.068886399269104,
"learning_rate": 9.072199583159285e-06,
"loss": 1.0853,
"step": 41
},
{
"epoch": 0.20689655172413793,
"grad_norm": 1.0160249471664429,
"learning_rate": 9.026814659664331e-06,
"loss": 0.9201,
"step": 42
},
{
"epoch": 0.21182266009852216,
"grad_norm": 0.980324387550354,
"learning_rate": 8.98046532852822e-06,
"loss": 0.9792,
"step": 43
},
{
"epoch": 0.21674876847290642,
"grad_norm": 1.0656648874282837,
"learning_rate": 8.93316269024921e-06,
"loss": 0.9549,
"step": 44
},
{
"epoch": 0.22167487684729065,
"grad_norm": 1.124436855316162,
"learning_rate": 8.88491807363919e-06,
"loss": 1.0474,
"step": 45
},
{
"epoch": 0.22660098522167488,
"grad_norm": 1.1231716871261597,
"learning_rate": 8.835743033110482e-06,
"loss": 0.9981,
"step": 46
},
{
"epoch": 0.2315270935960591,
"grad_norm": 1.0960031747817993,
"learning_rate": 8.78564934590859e-06,
"loss": 1.0547,
"step": 47
},
{
"epoch": 0.23645320197044334,
"grad_norm": 1.056442141532898,
"learning_rate": 8.734649009291586e-06,
"loss": 1.0868,
"step": 48
},
{
"epoch": 0.2413793103448276,
"grad_norm": 1.0149261951446533,
"learning_rate": 8.68275423765683e-06,
"loss": 0.9538,
"step": 49
},
{
"epoch": 0.24630541871921183,
"grad_norm": 0.9313431978225708,
"learning_rate": 8.629977459615655e-06,
"loss": 0.9597,
"step": 50
},
{
"epoch": 0.2512315270935961,
"grad_norm": 1.086411714553833,
"learning_rate": 8.576331315016753e-06,
"loss": 1.0181,
"step": 51
},
{
"epoch": 0.2561576354679803,
"grad_norm": 1.1177152395248413,
"learning_rate": 8.521828651918983e-06,
"loss": 1.0278,
"step": 52
},
{
"epoch": 0.26108374384236455,
"grad_norm": 0.9545988440513611,
"learning_rate": 8.46648252351431e-06,
"loss": 0.9892,
"step": 53
},
{
"epoch": 0.2660098522167488,
"grad_norm": 1.05325186252594,
"learning_rate": 8.41030618500161e-06,
"loss": 1.0133,
"step": 54
},
{
"epoch": 0.270935960591133,
"grad_norm": 1.0253876447677612,
"learning_rate": 8.353313090412093e-06,
"loss": 0.9538,
"step": 55
},
{
"epoch": 0.27586206896551724,
"grad_norm": 1.0110273361206055,
"learning_rate": 8.295516889387115e-06,
"loss": 0.8805,
"step": 56
},
{
"epoch": 0.28078817733990147,
"grad_norm": 1.0400066375732422,
"learning_rate": 8.23693142390914e-06,
"loss": 0.9632,
"step": 57
},
{
"epoch": 0.2857142857142857,
"grad_norm": 1.128901481628418,
"learning_rate": 8.177570724986627e-06,
"loss": 1.015,
"step": 58
},
{
"epoch": 0.29064039408866993,
"grad_norm": 1.1031105518341064,
"learning_rate": 8.117449009293668e-06,
"loss": 0.9957,
"step": 59
},
{
"epoch": 0.2955665024630542,
"grad_norm": 4.238386154174805,
"learning_rate": 8.05658067576513e-06,
"loss": 0.9085,
"step": 60
},
{
"epoch": 0.30049261083743845,
"grad_norm": 1.160597324371338,
"learning_rate": 7.99498030214817e-06,
"loss": 0.9809,
"step": 61
},
{
"epoch": 0.3054187192118227,
"grad_norm": 1.0774437189102173,
"learning_rate": 7.932662641510915e-06,
"loss": 0.99,
"step": 62
},
{
"epoch": 0.3103448275862069,
"grad_norm": 1.0282933712005615,
"learning_rate": 7.869642618709162e-06,
"loss": 0.9275,
"step": 63
},
{
"epoch": 0.31527093596059114,
"grad_norm": 1.0454133749008179,
"learning_rate": 7.805935326811913e-06,
"loss": 0.9071,
"step": 64
},
{
"epoch": 0.32019704433497537,
"grad_norm": 1.1418848037719727,
"learning_rate": 7.741556023486655e-06,
"loss": 0.9734,
"step": 65
},
{
"epoch": 0.3251231527093596,
"grad_norm": 1.0286744832992554,
"learning_rate": 7.676520127345198e-06,
"loss": 0.9934,
"step": 66
},
{
"epoch": 0.33004926108374383,
"grad_norm": 1.2144535779953003,
"learning_rate": 7.610843214250964e-06,
"loss": 0.9829,
"step": 67
},
{
"epoch": 0.33497536945812806,
"grad_norm": 1.4030691385269165,
"learning_rate": 7.5445410135886455e-06,
"loss": 0.9717,
"step": 68
},
{
"epoch": 0.3399014778325123,
"grad_norm": 1.0528010129928589,
"learning_rate": 7.477629404497048e-06,
"loss": 0.9649,
"step": 69
},
{
"epoch": 0.3448275862068966,
"grad_norm": 1.0271952152252197,
"learning_rate": 7.4101244120661105e-06,
"loss": 0.9185,
"step": 70
},
{
"epoch": 0.3497536945812808,
"grad_norm": 0.9849188327789307,
"learning_rate": 7.342042203498952e-06,
"loss": 0.9192,
"step": 71
},
{
"epoch": 0.35467980295566504,
"grad_norm": 1.0050177574157715,
"learning_rate": 7.273399084239878e-06,
"loss": 0.9326,
"step": 72
},
{
"epoch": 0.35960591133004927,
"grad_norm": 0.9628230929374695,
"learning_rate": 7.204211494069292e-06,
"loss": 0.884,
"step": 73
},
{
"epoch": 0.3645320197044335,
"grad_norm": 1.1869782209396362,
"learning_rate": 7.134496003166423e-06,
"loss": 0.966,
"step": 74
},
{
"epoch": 0.3694581280788177,
"grad_norm": 1.0189898014068604,
"learning_rate": 7.06426930814083e-06,
"loss": 0.8847,
"step": 75
},
{
"epoch": 0.37438423645320196,
"grad_norm": 1.5452977418899536,
"learning_rate": 6.993548228033618e-06,
"loss": 0.9902,
"step": 76
},
{
"epoch": 0.3793103448275862,
"grad_norm": 1.061618685722351,
"learning_rate": 6.922349700289348e-06,
"loss": 0.9273,
"step": 77
},
{
"epoch": 0.3842364532019704,
"grad_norm": 0.9350699186325073,
"learning_rate": 6.850690776699574e-06,
"loss": 0.8633,
"step": 78
},
{
"epoch": 0.3891625615763547,
"grad_norm": 0.944102942943573,
"learning_rate": 6.7785886193189936e-06,
"loss": 0.9348,
"step": 79
},
{
"epoch": 0.39408866995073893,
"grad_norm": 0.9876391291618347,
"learning_rate": 6.7060604963552125e-06,
"loss": 0.9354,
"step": 80
},
{
"epoch": 0.39901477832512317,
"grad_norm": 1.1221191883087158,
"learning_rate": 6.633123778033061e-06,
"loss": 0.9122,
"step": 81
},
{
"epoch": 0.4039408866995074,
"grad_norm": 1.171373724937439,
"learning_rate": 6.559795932434489e-06,
"loss": 0.9184,
"step": 82
},
{
"epoch": 0.4088669950738916,
"grad_norm": 1.0021214485168457,
"learning_rate": 6.486094521315022e-06,
"loss": 0.904,
"step": 83
},
{
"epoch": 0.41379310344827586,
"grad_norm": 1.0860635042190552,
"learning_rate": 6.412037195897786e-06,
"loss": 0.9216,
"step": 84
},
{
"epoch": 0.4187192118226601,
"grad_norm": 1.1491731405258179,
"learning_rate": 6.337641692646106e-06,
"loss": 0.9381,
"step": 85
},
{
"epoch": 0.4236453201970443,
"grad_norm": 1.0246098041534424,
"learning_rate": 6.262925829015675e-06,
"loss": 0.8873,
"step": 86
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.9979232549667358,
"learning_rate": 6.187907499187357e-06,
"loss": 0.955,
"step": 87
},
{
"epoch": 0.43349753694581283,
"grad_norm": 1.022977590560913,
"learning_rate": 6.112604669781572e-06,
"loss": 0.8763,
"step": 88
},
{
"epoch": 0.43842364532019706,
"grad_norm": 1.0037063360214233,
"learning_rate": 6.037035375555376e-06,
"loss": 0.9651,
"step": 89
},
{
"epoch": 0.4433497536945813,
"grad_norm": 1.0663460493087769,
"learning_rate": 5.961217715083185e-06,
"loss": 0.969,
"step": 90
},
{
"epoch": 0.4482758620689655,
"grad_norm": 0.9997808933258057,
"learning_rate": 5.885169846422242e-06,
"loss": 1.0117,
"step": 91
},
{
"epoch": 0.45320197044334976,
"grad_norm": 0.9787831902503967,
"learning_rate": 5.808909982763825e-06,
"loss": 0.7974,
"step": 92
},
{
"epoch": 0.458128078817734,
"grad_norm": 1.020983338356018,
"learning_rate": 5.732456388071247e-06,
"loss": 0.9561,
"step": 93
},
{
"epoch": 0.4630541871921182,
"grad_norm": 0.9058898091316223,
"learning_rate": 5.655827372705712e-06,
"loss": 0.9409,
"step": 94
},
{
"epoch": 0.46798029556650245,
"grad_norm": 1.0344359874725342,
"learning_rate": 5.579041289041045e-06,
"loss": 0.9722,
"step": 95
},
{
"epoch": 0.4729064039408867,
"grad_norm": 0.9576674103736877,
"learning_rate": 5.502116527068363e-06,
"loss": 0.8873,
"step": 96
},
{
"epoch": 0.47783251231527096,
"grad_norm": 1.2546874284744263,
"learning_rate": 5.425071509991737e-06,
"loss": 0.912,
"step": 97
},
{
"epoch": 0.4827586206896552,
"grad_norm": 0.9510334730148315,
"learning_rate": 5.347924689815906e-06,
"loss": 0.8627,
"step": 98
},
{
"epoch": 0.4876847290640394,
"grad_norm": 0.9422460198402405,
"learning_rate": 5.270694542927089e-06,
"loss": 0.9726,
"step": 99
},
{
"epoch": 0.49261083743842365,
"grad_norm": 0.9733043909072876,
"learning_rate": 5.193399565667945e-06,
"loss": 0.8824,
"step": 100
},
{
"epoch": 0.4975369458128079,
"grad_norm": 0.9182532429695129,
"learning_rate": 5.116058269907779e-06,
"loss": 0.8612,
"step": 101
},
{
"epoch": 0.5024630541871922,
"grad_norm": 0.8935402035713196,
"learning_rate": 5.038689178609011e-06,
"loss": 0.9394,
"step": 102
},
{
"epoch": 0.5073891625615764,
"grad_norm": 1.4738961458206177,
"learning_rate": 4.96131082139099e-06,
"loss": 0.9609,
"step": 103
},
{
"epoch": 0.5123152709359606,
"grad_norm": 1.0967806577682495,
"learning_rate": 4.883941730092222e-06,
"loss": 0.9497,
"step": 104
},
{
"epoch": 0.5172413793103449,
"grad_norm": 0.9221424460411072,
"learning_rate": 4.806600434332056e-06,
"loss": 0.891,
"step": 105
},
{
"epoch": 0.5221674876847291,
"grad_norm": 1.0894826650619507,
"learning_rate": 4.729305457072913e-06,
"loss": 0.8682,
"step": 106
},
{
"epoch": 0.5270935960591133,
"grad_norm": 0.9089194536209106,
"learning_rate": 4.6520753101840945e-06,
"loss": 0.8213,
"step": 107
},
{
"epoch": 0.5320197044334976,
"grad_norm": 0.8551648855209351,
"learning_rate": 4.574928490008264e-06,
"loss": 0.8802,
"step": 108
},
{
"epoch": 0.5369458128078818,
"grad_norm": 0.9886828064918518,
"learning_rate": 4.497883472931639e-06,
"loss": 0.9347,
"step": 109
},
{
"epoch": 0.541871921182266,
"grad_norm": 0.9351578950881958,
"learning_rate": 4.4209587109589565e-06,
"loss": 0.8173,
"step": 110
},
{
"epoch": 0.5467980295566502,
"grad_norm": 0.9019532203674316,
"learning_rate": 4.3441726272942895e-06,
"loss": 0.8473,
"step": 111
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.9672942757606506,
"learning_rate": 4.267543611928755e-06,
"loss": 0.918,
"step": 112
},
{
"epoch": 0.5566502463054187,
"grad_norm": 1.3270796537399292,
"learning_rate": 4.191090017236177e-06,
"loss": 0.9567,
"step": 113
},
{
"epoch": 0.5615763546798029,
"grad_norm": 0.980305552482605,
"learning_rate": 4.114830153577759e-06,
"loss": 0.863,
"step": 114
},
{
"epoch": 0.5665024630541872,
"grad_norm": 0.8682278394699097,
"learning_rate": 4.0387822849168165e-06,
"loss": 0.8437,
"step": 115
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.9460963010787964,
"learning_rate": 3.962964624444625e-06,
"loss": 0.899,
"step": 116
},
{
"epoch": 0.5763546798029556,
"grad_norm": 0.8857601881027222,
"learning_rate": 3.887395330218429e-06,
"loss": 0.849,
"step": 117
},
{
"epoch": 0.5812807881773399,
"grad_norm": 1.0073286294937134,
"learning_rate": 3.8120925008126457e-06,
"loss": 0.9561,
"step": 118
},
{
"epoch": 0.5862068965517241,
"grad_norm": 0.976075291633606,
"learning_rate": 3.7370741709843263e-06,
"loss": 0.8938,
"step": 119
},
{
"epoch": 0.5911330049261084,
"grad_norm": 0.9713646173477173,
"learning_rate": 3.662358307353897e-06,
"loss": 0.9119,
"step": 120
},
{
"epoch": 0.5960591133004927,
"grad_norm": 0.8663797974586487,
"learning_rate": 3.587962804102214e-06,
"loss": 0.8631,
"step": 121
},
{
"epoch": 0.6009852216748769,
"grad_norm": 0.8859656453132629,
"learning_rate": 3.5139054786849787e-06,
"loss": 0.8044,
"step": 122
},
{
"epoch": 0.6059113300492611,
"grad_norm": 1.091760277748108,
"learning_rate": 3.440204067565511e-06,
"loss": 0.9143,
"step": 123
},
{
"epoch": 0.6108374384236454,
"grad_norm": 0.982275128364563,
"learning_rate": 3.3668762219669393e-06,
"loss": 0.918,
"step": 124
},
{
"epoch": 0.6157635467980296,
"grad_norm": 0.8803215622901917,
"learning_rate": 3.293939503644788e-06,
"loss": 0.8426,
"step": 125
},
{
"epoch": 0.6206896551724138,
"grad_norm": 0.922527015209198,
"learning_rate": 3.2214113806810077e-06,
"loss": 0.8571,
"step": 126
},
{
"epoch": 0.625615763546798,
"grad_norm": 0.9506503343582153,
"learning_rate": 3.149309223300428e-06,
"loss": 0.8659,
"step": 127
},
{
"epoch": 0.6305418719211823,
"grad_norm": 1.0316869020462036,
"learning_rate": 3.0776502997106526e-06,
"loss": 0.9088,
"step": 128
},
{
"epoch": 0.6354679802955665,
"grad_norm": 1.9990965127944946,
"learning_rate": 3.0064517719663833e-06,
"loss": 0.8672,
"step": 129
},
{
"epoch": 0.6403940886699507,
"grad_norm": 0.928225040435791,
"learning_rate": 2.935730691859172e-06,
"loss": 0.8305,
"step": 130
},
{
"epoch": 0.645320197044335,
"grad_norm": 0.9361408948898315,
"learning_rate": 2.8655039968335774e-06,
"loss": 0.8462,
"step": 131
},
{
"epoch": 0.6502463054187192,
"grad_norm": 0.9435849189758301,
"learning_rate": 2.7957885059307097e-06,
"loss": 0.8756,
"step": 132
},
{
"epoch": 0.6551724137931034,
"grad_norm": 0.9171866178512573,
"learning_rate": 2.7266009157601226e-06,
"loss": 0.917,
"step": 133
},
{
"epoch": 0.6600985221674877,
"grad_norm": 0.911807656288147,
"learning_rate": 2.65795779650105e-06,
"loss": 0.8588,
"step": 134
},
{
"epoch": 0.6650246305418719,
"grad_norm": 1.8045060634613037,
"learning_rate": 2.589875587933892e-06,
"loss": 0.9057,
"step": 135
},
{
"epoch": 0.6699507389162561,
"grad_norm": 0.8906491994857788,
"learning_rate": 2.522370595502954e-06,
"loss": 0.8708,
"step": 136
},
{
"epoch": 0.6748768472906403,
"grad_norm": 0.9065340757369995,
"learning_rate": 2.4554589864113566e-06,
"loss": 0.8558,
"step": 137
},
{
"epoch": 0.6798029556650246,
"grad_norm": 1.116025686264038,
"learning_rate": 2.3891567857490373e-06,
"loss": 0.9355,
"step": 138
},
{
"epoch": 0.6847290640394089,
"grad_norm": 0.8539987802505493,
"learning_rate": 2.323479872654805e-06,
"loss": 0.7964,
"step": 139
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.8484991192817688,
"learning_rate": 2.2584439765133453e-06,
"loss": 0.808,
"step": 140
},
{
"epoch": 0.6945812807881774,
"grad_norm": 0.9017306566238403,
"learning_rate": 2.1940646731880887e-06,
"loss": 0.9113,
"step": 141
},
{
"epoch": 0.6995073891625616,
"grad_norm": 1.0858136415481567,
"learning_rate": 2.1303573812908383e-06,
"loss": 0.8572,
"step": 142
},
{
"epoch": 0.7044334975369458,
"grad_norm": 0.8687289953231812,
"learning_rate": 2.0673373584890847e-06,
"loss": 0.8145,
"step": 143
},
{
"epoch": 0.7093596059113301,
"grad_norm": 0.9045321345329285,
"learning_rate": 2.0050196978518323e-06,
"loss": 0.8543,
"step": 144
},
{
"epoch": 0.7142857142857143,
"grad_norm": 1.0116416215896606,
"learning_rate": 1.943419324234871e-06,
"loss": 0.8539,
"step": 145
},
{
"epoch": 0.7192118226600985,
"grad_norm": 0.9924123883247375,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.95,
"step": 146
},
{
"epoch": 0.7241379310344828,
"grad_norm": 0.8728001713752747,
"learning_rate": 1.8224292750133743e-06,
"loss": 0.9293,
"step": 147
},
{
"epoch": 0.729064039408867,
"grad_norm": 0.8748157620429993,
"learning_rate": 1.7630685760908623e-06,
"loss": 0.844,
"step": 148
},
{
"epoch": 0.7339901477832512,
"grad_norm": 0.857449471950531,
"learning_rate": 1.7044831106128867e-06,
"loss": 0.8433,
"step": 149
},
{
"epoch": 0.7389162561576355,
"grad_norm": 0.9839447140693665,
"learning_rate": 1.6466869095879079e-06,
"loss": 0.8528,
"step": 150
},
{
"epoch": 0.7438423645320197,
"grad_norm": 1.1705787181854248,
"learning_rate": 1.589693814998391e-06,
"loss": 0.9376,
"step": 151
},
{
"epoch": 0.7487684729064039,
"grad_norm": 1.311543345451355,
"learning_rate": 1.533517476485691e-06,
"loss": 0.7922,
"step": 152
},
{
"epoch": 0.7536945812807881,
"grad_norm": 0.8858788013458252,
"learning_rate": 1.4781713480810184e-06,
"loss": 0.8161,
"step": 153
},
{
"epoch": 0.7586206896551724,
"grad_norm": 0.8841625452041626,
"learning_rate": 1.4236686849832497e-06,
"loss": 0.8746,
"step": 154
},
{
"epoch": 0.7635467980295566,
"grad_norm": 1.0294075012207031,
"learning_rate": 1.370022540384347e-06,
"loss": 0.9812,
"step": 155
},
{
"epoch": 0.7684729064039408,
"grad_norm": 1.0972410440444946,
"learning_rate": 1.3172457623431706e-06,
"loss": 0.966,
"step": 156
},
{
"epoch": 0.7733990147783252,
"grad_norm": 0.884397566318512,
"learning_rate": 1.2653509907084171e-06,
"loss": 0.8526,
"step": 157
},
{
"epoch": 0.7783251231527094,
"grad_norm": 0.9068350791931152,
"learning_rate": 1.214350654091413e-06,
"loss": 0.9192,
"step": 158
},
{
"epoch": 0.7832512315270936,
"grad_norm": 1.0193994045257568,
"learning_rate": 1.1642569668895171e-06,
"loss": 0.8804,
"step": 159
},
{
"epoch": 0.7881773399014779,
"grad_norm": 0.8925516605377197,
"learning_rate": 1.1150819263608098e-06,
"loss": 0.8384,
"step": 160
},
{
"epoch": 0.7931034482758621,
"grad_norm": 1.0054816007614136,
"learning_rate": 1.0668373097507922e-06,
"loss": 0.8544,
"step": 161
},
{
"epoch": 0.7980295566502463,
"grad_norm": 0.8788971304893494,
"learning_rate": 1.0195346714717813e-06,
"loss": 0.8462,
"step": 162
},
{
"epoch": 0.8029556650246306,
"grad_norm": 0.8969894051551819,
"learning_rate": 9.731853403356705e-07,
"loss": 0.8614,
"step": 163
},
{
"epoch": 0.8078817733990148,
"grad_norm": 0.9033743739128113,
"learning_rate": 9.278004168407151e-07,
"loss": 0.7701,
"step": 164
},
{
"epoch": 0.812807881773399,
"grad_norm": 0.8596148490905762,
"learning_rate": 8.833907705130091e-07,
"loss": 0.8451,
"step": 165
},
{
"epoch": 0.8177339901477833,
"grad_norm": 0.9375607967376709,
"learning_rate": 8.399670373032665e-07,
"loss": 0.8357,
"step": 166
},
{
"epoch": 0.8226600985221675,
"grad_norm": 1.2474132776260376,
"learning_rate": 7.975396170395522e-07,
"loss": 0.8348,
"step": 167
},
{
"epoch": 0.8275862068965517,
"grad_norm": 1.0321135520935059,
"learning_rate": 7.561186709365653e-07,
"loss": 1.0182,
"step": 168
},
{
"epoch": 0.8325123152709359,
"grad_norm": 0.8872163891792297,
"learning_rate": 7.157141191620548e-07,
"loss": 0.8193,
"step": 169
},
{
"epoch": 0.8374384236453202,
"grad_norm": 1.0191763639450073,
"learning_rate": 6.763356384609809e-07,
"loss": 0.867,
"step": 170
},
{
"epoch": 0.8423645320197044,
"grad_norm": 1.7469654083251953,
"learning_rate": 6.379926598379727e-07,
"loss": 0.7807,
"step": 171
},
{
"epoch": 0.8472906403940886,
"grad_norm": 0.8473700284957886,
"learning_rate": 6.006943662986275e-07,
"loss": 0.7896,
"step": 172
},
{
"epoch": 0.8522167487684729,
"grad_norm": 0.8807177543640137,
"learning_rate": 5.644496906502233e-07,
"loss": 0.8352,
"step": 173
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.1127580404281616,
"learning_rate": 5.292673133623372e-07,
"loss": 0.931,
"step": 174
},
{
"epoch": 0.8620689655172413,
"grad_norm": 1.0402883291244507,
"learning_rate": 4.951556604879049e-07,
"loss": 0.954,
"step": 175
},
{
"epoch": 0.8669950738916257,
"grad_norm": 0.8743595480918884,
"learning_rate": 4.6212290164521554e-07,
"loss": 0.8078,
"step": 176
},
{
"epoch": 0.8719211822660099,
"grad_norm": 0.8114182353019714,
"learning_rate": 4.3017694806131163e-07,
"loss": 0.8326,
"step": 177
},
{
"epoch": 0.8768472906403941,
"grad_norm": 0.907577395439148,
"learning_rate": 3.9932545067728366e-07,
"loss": 0.9081,
"step": 178
},
{
"epoch": 0.8817733990147784,
"grad_norm": 0.8789636492729187,
"learning_rate": 3.695757983158954e-07,
"loss": 0.8478,
"step": 179
},
{
"epoch": 0.8866995073891626,
"grad_norm": 0.8163318634033203,
"learning_rate": 3.409351159119845e-07,
"loss": 0.7992,
"step": 180
},
{
"epoch": 0.8916256157635468,
"grad_norm": 0.9108045697212219,
"learning_rate": 3.134102628060698e-07,
"loss": 0.9684,
"step": 181
},
{
"epoch": 0.896551724137931,
"grad_norm": 0.9332152009010315,
"learning_rate": 2.8700783110156507e-07,
"loss": 0.8402,
"step": 182
},
{
"epoch": 0.9014778325123153,
"grad_norm": 0.9538592100143433,
"learning_rate": 2.617341440859883e-07,
"loss": 0.885,
"step": 183
},
{
"epoch": 0.9064039408866995,
"grad_norm": 0.9269714951515198,
"learning_rate": 2.3759525471656163e-07,
"loss": 0.8598,
"step": 184
},
{
"epoch": 0.9113300492610837,
"grad_norm": 1.0273958444595337,
"learning_rate": 2.1459694417055033e-07,
"loss": 0.9211,
"step": 185
},
{
"epoch": 0.916256157635468,
"grad_norm": 0.8281537890434265,
"learning_rate": 1.9274472046068805e-07,
"loss": 0.7494,
"step": 186
},
{
"epoch": 0.9211822660098522,
"grad_norm": 0.91705721616745,
"learning_rate": 1.7204381711603046e-07,
"loss": 0.8442,
"step": 187
},
{
"epoch": 0.9261083743842364,
"grad_norm": 0.9961190223693848,
"learning_rate": 1.524991919285429e-07,
"loss": 1.0154,
"step": 188
},
{
"epoch": 0.9310344827586207,
"grad_norm": 0.8181946277618408,
"learning_rate": 1.3411552576572562e-07,
"loss": 0.8561,
"step": 189
},
{
"epoch": 0.9359605911330049,
"grad_norm": 0.822562575340271,
"learning_rate": 1.1689722144956672e-07,
"loss": 0.7987,
"step": 190
},
{
"epoch": 0.9408866995073891,
"grad_norm": 0.9585839509963989,
"learning_rate": 1.008484027020773e-07,
"loss": 0.8168,
"step": 191
},
{
"epoch": 0.9458128078817734,
"grad_norm": 0.8027588129043579,
"learning_rate": 8.597291315767808e-08,
"loss": 0.779,
"step": 192
},
{
"epoch": 0.9507389162561576,
"grad_norm": 0.965819239616394,
"learning_rate": 7.227431544266194e-08,
"loss": 0.9602,
"step": 193
},
{
"epoch": 0.9556650246305419,
"grad_norm": 0.8224076628684998,
"learning_rate": 5.97558903219575e-08,
"loss": 0.8123,
"step": 194
},
{
"epoch": 0.9605911330049262,
"grad_norm": 0.9190309643745422,
"learning_rate": 4.842063591339763e-08,
"loss": 0.9018,
"step": 195
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.8280571699142456,
"learning_rate": 3.82712669696822e-08,
"loss": 0.8204,
"step": 196
},
{
"epoch": 0.9704433497536946,
"grad_norm": 0.8307924270629883,
"learning_rate": 2.9310214228202016e-08,
"loss": 0.8532,
"step": 197
},
{
"epoch": 0.9753694581280788,
"grad_norm": 0.9194669127464294,
"learning_rate": 2.153962382888841e-08,
"loss": 0.8382,
"step": 198
},
{
"epoch": 0.9802955665024631,
"grad_norm": 0.804408848285675,
"learning_rate": 1.496135680021993e-08,
"loss": 0.8196,
"step": 199
},
{
"epoch": 0.9852216748768473,
"grad_norm": 0.8451563715934753,
"learning_rate": 9.576988613511084e-09,
"loss": 0.8492,
"step": 200
},
{
"epoch": 0.9901477832512315,
"grad_norm": 0.8822647929191589,
"learning_rate": 5.387808805594752e-09,
"loss": 0.8916,
"step": 201
},
{
"epoch": 0.9950738916256158,
"grad_norm": 0.8675324320793152,
"learning_rate": 2.3948206699819787e-09,
"loss": 0.8024,
"step": 202
},
{
"epoch": 1.0,
"grad_norm": 0.8639441132545471,
"learning_rate": 5.987410165758656e-10,
"loss": 0.8228,
"step": 203
}
],
"logging_steps": 1.0,
"max_steps": 203,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4104222270160896e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}