|
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 10,
  "global_step": 154,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006493506493506494,
      "grad_norm": 5312.028745324425,
      "learning_rate": 0.0,
      "loss": 12.6644,
      "step": 1
    },
    {
      "epoch": 0.012987012987012988,
      "grad_norm": 3464.512608575103,
      "learning_rate": 1.25e-06,
      "loss": 13.386,
      "step": 2
    },
    {
      "epoch": 0.01948051948051948,
      "grad_norm": 31521.52866951872,
      "learning_rate": 2.5e-06,
      "loss": 12.124,
      "step": 3
    },
    {
      "epoch": 0.025974025974025976,
      "grad_norm": 12714.37327874382,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 9.26,
      "step": 4
    },
    {
      "epoch": 0.032467532467532464,
      "grad_norm": 2957.000475133963,
      "learning_rate": 5e-06,
      "loss": 7.0851,
      "step": 5
    },
    {
      "epoch": 0.03896103896103896,
      "grad_norm": 1856.4304411247954,
      "learning_rate": 6.25e-06,
      "loss": 4.3426,
      "step": 6
    },
    {
      "epoch": 0.045454545454545456,
      "grad_norm": 380.9114379987856,
      "learning_rate": 7.500000000000001e-06,
      "loss": 3.9004,
      "step": 7
    },
    {
      "epoch": 0.05194805194805195,
      "grad_norm": 249.7480254394976,
      "learning_rate": 8.750000000000001e-06,
      "loss": 4.2357,
      "step": 8
    },
    {
      "epoch": 0.05844155844155844,
      "grad_norm": 163.12129308225673,
      "learning_rate": 1e-05,
      "loss": 3.4659,
      "step": 9
    },
    {
      "epoch": 0.06493506493506493,
      "grad_norm": 306.28794651548174,
      "learning_rate": 1.125e-05,
      "loss": 3.2504,
      "step": 10
    },
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 458.0129175616477,
      "learning_rate": 1.25e-05,
      "loss": 6.6892,
      "step": 11
    },
    {
      "epoch": 0.07792207792207792,
      "grad_norm": 220.92021209863208,
      "learning_rate": 1.375e-05,
      "loss": 3.3174,
      "step": 12
    },
    {
      "epoch": 0.08441558441558442,
      "grad_norm": 140.9175310968349,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 3.8063,
      "step": 13
    },
    {
      "epoch": 0.09090909090909091,
      "grad_norm": 100.53464591827579,
      "learning_rate": 1.6250000000000002e-05,
      "loss": 2.7457,
      "step": 14
    },
    {
      "epoch": 0.09740259740259741,
      "grad_norm": 81.16333748640615,
      "learning_rate": 1.7500000000000002e-05,
      "loss": 2.7346,
      "step": 15
    },
    {
      "epoch": 0.1038961038961039,
      "grad_norm": 94.60838239841561,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 3.6905,
      "step": 16
    },
    {
      "epoch": 0.11038961038961038,
      "grad_norm": 96.27886786384646,
      "learning_rate": 2e-05,
      "loss": 2.7146,
      "step": 17
    },
    {
      "epoch": 0.11688311688311688,
      "grad_norm": 638.8030278028655,
      "learning_rate": 1.9997408848413494e-05,
      "loss": 2.9735,
      "step": 18
    },
    {
      "epoch": 0.12337662337662338,
      "grad_norm": 101.7239129398942,
      "learning_rate": 1.9989636736467278e-05,
      "loss": 3.0153,
      "step": 19
    },
    {
      "epoch": 0.12987012987012986,
      "grad_norm": 67.71433029957166,
      "learning_rate": 1.9976687691905394e-05,
      "loss": 2.0634,
      "step": 20
    },
    {
      "epoch": 0.13636363636363635,
      "grad_norm": 61.33799178204917,
      "learning_rate": 1.9958568425315316e-05,
      "loss": 1.7994,
      "step": 21
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 77.67967542573722,
      "learning_rate": 1.9935288326650314e-05,
      "loss": 1.8646,
      "step": 22
    },
    {
      "epoch": 0.14935064935064934,
      "grad_norm": 66.7541786942052,
      "learning_rate": 1.9906859460363307e-05,
      "loss": 1.6082,
      "step": 23
    },
    {
      "epoch": 0.15584415584415584,
      "grad_norm": 66.42183035318644,
      "learning_rate": 1.98732965591547e-05,
      "loss": 1.4584,
      "step": 24
    },
    {
      "epoch": 0.16233766233766234,
      "grad_norm": 59.277341273127604,
      "learning_rate": 1.9834617016337424e-05,
      "loss": 1.2276,
      "step": 25
    },
    {
      "epoch": 0.16883116883116883,
      "grad_norm": 60.09621841530829,
      "learning_rate": 1.979084087682323e-05,
      "loss": 1.1038,
      "step": 26
    },
    {
      "epoch": 0.17532467532467533,
      "grad_norm": 53.451286572243966,
      "learning_rate": 1.9741990826734793e-05,
      "loss": 0.9448,
      "step": 27
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 52.91840992849332,
      "learning_rate": 1.9688092181649065e-05,
      "loss": 0.8475,
      "step": 28
    },
    {
      "epoch": 0.18831168831168832,
      "grad_norm": 42.915320217692674,
      "learning_rate": 1.9629172873477995e-05,
      "loss": 0.6988,
      "step": 29
    },
    {
      "epoch": 0.19480519480519481,
      "grad_norm": 36.403813181510394,
      "learning_rate": 1.956526343599335e-05,
      "loss": 0.591,
      "step": 30
    },
    {
      "epoch": 0.2012987012987013,
      "grad_norm": 27.894365664450408,
      "learning_rate": 1.9496396989003195e-05,
      "loss": 0.504,
      "step": 31
    },
    {
      "epoch": 0.2077922077922078,
      "grad_norm": 33.58287541986749,
      "learning_rate": 1.9422609221188208e-05,
      "loss": 0.4665,
      "step": 32
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 33.5121042744897,
      "learning_rate": 1.9343938371606714e-05,
      "loss": 0.4967,
      "step": 33
    },
    {
      "epoch": 0.22077922077922077,
      "grad_norm": 27.982938599625648,
      "learning_rate": 1.9260425209878052e-05,
      "loss": 0.4135,
      "step": 34
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 51.93090738473159,
      "learning_rate": 1.917211301505453e-05,
      "loss": 0.5827,
      "step": 35
    },
    {
      "epoch": 0.23376623376623376,
      "grad_norm": 57.25489042342769,
      "learning_rate": 1.907904755319289e-05,
      "loss": 0.6328,
      "step": 36
    },
    {
      "epoch": 0.24025974025974026,
      "grad_norm": 11.875972697569978,
      "learning_rate": 1.8981277053636963e-05,
      "loss": 0.3192,
      "step": 37
    },
    {
      "epoch": 0.24675324675324675,
      "grad_norm": 27.883526944555015,
      "learning_rate": 1.8878852184023754e-05,
      "loss": 0.437,
      "step": 38
    },
    {
      "epoch": 0.2532467532467532,
      "grad_norm": 43.732501540378045,
      "learning_rate": 1.8771826024025944e-05,
      "loss": 0.7421,
      "step": 39
    },
    {
      "epoch": 0.2597402597402597,
      "grad_norm": 28.473023146202923,
      "learning_rate": 1.866025403784439e-05,
      "loss": 0.4279,
      "step": 40
    },
    {
      "epoch": 0.2662337662337662,
      "grad_norm": 13.106421492490322,
      "learning_rate": 1.8544194045464888e-05,
      "loss": 0.3088,
      "step": 41
    },
    {
      "epoch": 0.2727272727272727,
      "grad_norm": 5.104577671097258,
      "learning_rate": 1.8423706192694118e-05,
      "loss": 0.2931,
      "step": 42
    },
    {
      "epoch": 0.2792207792207792,
      "grad_norm": 52.64196123136193,
      "learning_rate": 1.8298852919990254e-05,
      "loss": 0.4912,
      "step": 43
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 31.65156603514557,
      "learning_rate": 1.816969893010442e-05,
      "loss": 0.3692,
      "step": 44
    },
    {
      "epoch": 0.2922077922077922,
      "grad_norm": 17.666169589586605,
      "learning_rate": 1.8036311154549783e-05,
      "loss": 0.2944,
      "step": 45
    },
    {
      "epoch": 0.2987012987012987,
      "grad_norm": 38.35439518358878,
      "learning_rate": 1.789875871891559e-05,
      "loss": 0.5252,
      "step": 46
    },
    {
      "epoch": 0.3051948051948052,
      "grad_norm": 39.96972433176561,
      "learning_rate": 1.77571129070442e-05,
      "loss": 0.5614,
      "step": 47
    },
    {
      "epoch": 0.3116883116883117,
      "grad_norm": 19.4739183485063,
      "learning_rate": 1.761144712408965e-05,
      "loss": 0.3101,
      "step": 48
    },
    {
      "epoch": 0.3181818181818182,
      "grad_norm": 2122.177114683787,
      "learning_rate": 1.7461836858476858e-05,
      "loss": 8.3024,
      "step": 49
    },
    {
      "epoch": 0.3246753246753247,
      "grad_norm": 53.72514322304979,
      "learning_rate": 1.730835964278124e-05,
      "loss": 0.5558,
      "step": 50
    },
    {
      "epoch": 0.33116883116883117,
      "grad_norm": 53.70022188889065,
      "learning_rate": 1.7151095013548996e-05,
      "loss": 0.5592,
      "step": 51
    },
    {
      "epoch": 0.33766233766233766,
      "grad_norm": 33.4569293851714,
      "learning_rate": 1.699012447007882e-05,
      "loss": 0.3671,
      "step": 52
    },
    {
      "epoch": 0.34415584415584416,
      "grad_norm": 17.137073429830238,
      "learning_rate": 1.6825531432186545e-05,
      "loss": 0.3092,
      "step": 53
    },
    {
      "epoch": 0.35064935064935066,
      "grad_norm": 22.60309242557496,
      "learning_rate": 1.6657401196974405e-05,
      "loss": 0.3291,
      "step": 54
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 8.833069765914521,
      "learning_rate": 1.648582089462756e-05,
      "loss": 0.2884,
      "step": 55
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 42.48152192562097,
      "learning_rate": 1.631087944326053e-05,
      "loss": 0.4151,
      "step": 56
    },
    {
      "epoch": 0.37012987012987014,
      "grad_norm": 51.1875826358593,
      "learning_rate": 1.6132667502837164e-05,
      "loss": 0.4914,
      "step": 57
    },
    {
      "epoch": 0.37662337662337664,
      "grad_norm": 29.57316461617743,
      "learning_rate": 1.59512774281879e-05,
      "loss": 0.3513,
      "step": 58
    },
    {
      "epoch": 0.38311688311688313,
      "grad_norm": 1.6690431301081146,
      "learning_rate": 1.5766803221148676e-05,
      "loss": 0.279,
      "step": 59
    },
    {
      "epoch": 0.38961038961038963,
      "grad_norm": 17.11791002555302,
      "learning_rate": 1.5579340481846338e-05,
      "loss": 0.3062,
      "step": 60
    },
    {
      "epoch": 0.3961038961038961,
      "grad_norm": 2.9548663461162916,
      "learning_rate": 1.538898635915576e-05,
      "loss": 0.2837,
      "step": 61
    },
    {
      "epoch": 0.4025974025974026,
      "grad_norm": 30.32462862806025,
      "learning_rate": 1.5195839500354337e-05,
      "loss": 0.3502,
      "step": 62
    },
    {
      "epoch": 0.4090909090909091,
      "grad_norm": 27.26334803927827,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.3433,
      "step": 63
    },
    {
      "epoch": 0.4155844155844156,
      "grad_norm": 25.006274002753365,
      "learning_rate": 1.4801569348059158e-05,
      "loss": 0.3237,
      "step": 64
    },
    {
      "epoch": 0.42207792207792205,
      "grad_norm": 9.0889223900474,
      "learning_rate": 1.4600650377311523e-05,
      "loss": 0.2881,
      "step": 65
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 17.44533004191656,
      "learning_rate": 1.4397347210059059e-05,
      "loss": 0.3027,
      "step": 66
    },
    {
      "epoch": 0.43506493506493504,
      "grad_norm": 19.97489878606359,
      "learning_rate": 1.4191765204166643e-05,
      "loss": 0.3013,
      "step": 67
    },
    {
      "epoch": 0.44155844155844154,
      "grad_norm": 16.933568982459366,
      "learning_rate": 1.3984010898462417e-05,
      "loss": 0.3018,
      "step": 68
    },
    {
      "epoch": 0.44805194805194803,
      "grad_norm": 29.645129941175437,
      "learning_rate": 1.3774191957526144e-05,
      "loss": 0.3391,
      "step": 69
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 9.80114973140799,
      "learning_rate": 1.356241711589417e-05,
      "loss": 0.2963,
      "step": 70
    },
    {
      "epoch": 0.461038961038961,
      "grad_norm": 4.338277899349995,
      "learning_rate": 1.3348796121709862e-05,
      "loss": 0.2732,
      "step": 71
    },
    {
      "epoch": 0.4675324675324675,
      "grad_norm": 22.540725434050554,
      "learning_rate": 1.3133439679848824e-05,
      "loss": 0.3253,
      "step": 72
    },
    {
      "epoch": 0.474025974025974,
      "grad_norm": 6.180079243988936,
      "learning_rate": 1.291645939454825e-05,
      "loss": 0.2838,
      "step": 73
    },
    {
      "epoch": 0.4805194805194805,
      "grad_norm": 9.944164489572458,
      "learning_rate": 1.2697967711570243e-05,
      "loss": 0.2912,
      "step": 74
    },
    {
      "epoch": 0.487012987012987,
      "grad_norm": 22.006231698484978,
      "learning_rate": 1.2478077859929e-05,
      "loss": 0.3109,
      "step": 75
    },
    {
      "epoch": 0.4935064935064935,
      "grad_norm": 0.9993376662904194,
      "learning_rate": 1.2256903793212107e-05,
      "loss": 0.2769,
      "step": 76
    },
    {
      "epoch": 0.5,
      "grad_norm": 6.736485919129092,
      "learning_rate": 1.2034560130526341e-05,
      "loss": 0.2776,
      "step": 77
    },
    {
      "epoch": 0.5064935064935064,
      "grad_norm": 1.5505988858577997,
      "learning_rate": 1.1811162097098559e-05,
      "loss": 0.259,
      "step": 78
    },
    {
      "epoch": 0.512987012987013,
      "grad_norm": 19.317012183505398,
      "learning_rate": 1.1586825464562515e-05,
      "loss": 0.3168,
      "step": 79
    },
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 24.98335565690831,
      "learning_rate": 1.1361666490962468e-05,
      "loss": 0.361,
      "step": 80
    },
    {
      "epoch": 0.525974025974026,
      "grad_norm": 15.391559490756684,
      "learning_rate": 1.113580186050475e-05,
      "loss": 0.3048,
      "step": 81
    },
    {
      "epoch": 0.5324675324675324,
      "grad_norm": 4.355627253302395,
      "learning_rate": 1.0909348623088472e-05,
      "loss": 0.28,
      "step": 82
    },
    {
      "epoch": 0.538961038961039,
      "grad_norm": 17.405555596408444,
      "learning_rate": 1.0682424133646712e-05,
      "loss": 0.2902,
      "step": 83
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 13.407085055385325,
      "learning_rate": 1.0455145991329639e-05,
      "loss": 0.2765,
      "step": 84
    },
    {
      "epoch": 0.551948051948052,
      "grad_norm": 2.6139015199788034,
      "learning_rate": 1.0227631978561057e-05,
      "loss": 0.2785,
      "step": 85
    },
    {
      "epoch": 0.5584415584415584,
      "grad_norm": 12.810695157873214,
      "learning_rate": 1e-05,
      "loss": 0.2845,
      "step": 86
    },
    {
      "epoch": 0.564935064935065,
      "grad_norm": 8.816846652405896,
      "learning_rate": 9.772368021438943e-06,
      "loss": 0.2827,
      "step": 87
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 21.911718407785227,
      "learning_rate": 9.544854008670366e-06,
      "loss": 0.3217,
      "step": 88
    },
    {
      "epoch": 0.577922077922078,
      "grad_norm": 12.550189539530098,
      "learning_rate": 9.317575866353293e-06,
      "loss": 0.292,
      "step": 89
    },
    {
      "epoch": 0.5844155844155844,
      "grad_norm": 15.167023999667723,
      "learning_rate": 9.090651376911532e-06,
      "loss": 0.3001,
      "step": 90
    },
    {
      "epoch": 0.5909090909090909,
      "grad_norm": 7.973065612261872,
      "learning_rate": 8.86419813949525e-06,
      "loss": 0.2886,
      "step": 91
    },
    {
      "epoch": 0.5974025974025974,
      "grad_norm": 16.15242808963043,
      "learning_rate": 8.638333509037537e-06,
      "loss": 0.299,
      "step": 92
    },
    {
      "epoch": 0.6038961038961039,
      "grad_norm": 18.179063200373662,
      "learning_rate": 8.413174535437486e-06,
      "loss": 0.3178,
      "step": 93
    },
    {
      "epoch": 0.6103896103896104,
      "grad_norm": 17.88073233317887,
      "learning_rate": 8.188837902901441e-06,
      "loss": 0.3026,
      "step": 94
    },
    {
      "epoch": 0.6168831168831169,
      "grad_norm": 17.894047190834744,
      "learning_rate": 7.965439869473664e-06,
      "loss": 0.3063,
      "step": 95
    },
    {
      "epoch": 0.6233766233766234,
      "grad_norm": 0.9912752221363975,
      "learning_rate": 7.743096206787894e-06,
      "loss": 0.2861,
      "step": 96
    },
    {
      "epoch": 0.6298701298701299,
      "grad_norm": 12.59974964084945,
      "learning_rate": 7.521922140071003e-06,
      "loss": 0.2873,
      "step": 97
    },
    {
      "epoch": 0.6363636363636364,
      "grad_norm": 5.247248714404641,
      "learning_rate": 7.3020322884297565e-06,
      "loss": 0.2772,
      "step": 98
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 15.713529017302493,
      "learning_rate": 7.0835406054517505e-06,
      "loss": 0.2954,
      "step": 99
    },
    {
      "epoch": 0.6493506493506493,
      "grad_norm": 14.670526789277048,
      "learning_rate": 6.866560320151179e-06,
      "loss": 0.2928,
      "step": 100
    },
    {
      "epoch": 0.6558441558441559,
      "grad_norm": 1.4479761532269853,
      "learning_rate": 6.651203878290139e-06,
      "loss": 0.2747,
      "step": 101
    },
    {
      "epoch": 0.6623376623376623,
      "grad_norm": 8.546550504285172,
      "learning_rate": 6.437582884105835e-06,
      "loss": 0.2796,
      "step": 102
    },
    {
      "epoch": 0.6688311688311688,
      "grad_norm": 5.205487389970545,
      "learning_rate": 6.225808042473857e-06,
      "loss": 0.2645,
      "step": 103
    },
    {
      "epoch": 0.6753246753246753,
      "grad_norm": 10.186260022708211,
      "learning_rate": 6.015989101537586e-06,
      "loss": 0.2813,
      "step": 104
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 8.851239912787552,
      "learning_rate": 5.8082347958333625e-06,
      "loss": 0.2801,
      "step": 105
    },
    {
      "epoch": 0.6883116883116883,
      "grad_norm": 0.47334586479886903,
      "learning_rate": 5.602652789940941e-06,
      "loss": 0.2814,
      "step": 106
    },
    {
      "epoch": 0.6948051948051948,
      "grad_norm": 10.137248782728967,
      "learning_rate": 5.399349622688479e-06,
      "loss": 0.2771,
      "step": 107
    },
    {
      "epoch": 0.7012987012987013,
      "grad_norm": 14.23493444448787,
      "learning_rate": 5.198430651940846e-06,
      "loss": 0.3031,
      "step": 108
    },
    {
      "epoch": 0.7077922077922078,
      "grad_norm": 25.77828835781949,
      "learning_rate": 5.000000000000003e-06,
      "loss": 0.3317,
      "step": 109
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 4.983254834642921,
      "learning_rate": 4.804160499645667e-06,
      "loss": 0.284,
      "step": 110
    },
    {
      "epoch": 0.7207792207792207,
      "grad_norm": 3.210921791733905,
      "learning_rate": 4.611013640844245e-06,
      "loss": 0.2656,
      "step": 111
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 12.882476468987955,
      "learning_rate": 4.420659518153667e-06,
      "loss": 0.2952,
      "step": 112
    },
    {
      "epoch": 0.7337662337662337,
      "grad_norm": 8.496092897870577,
      "learning_rate": 4.2331967788513295e-06,
      "loss": 0.2894,
      "step": 113
    },
    {
      "epoch": 0.7402597402597403,
      "grad_norm": 14.782117212880996,
      "learning_rate": 4.048722571812105e-06,
      "loss": 0.2943,
      "step": 114
    },
    {
      "epoch": 0.7467532467532467,
      "grad_norm": 8.221163363530726,
      "learning_rate": 3.867332497162836e-06,
      "loss": 0.2717,
      "step": 115
    },
    {
      "epoch": 0.7532467532467533,
      "grad_norm": 3.319249243011192,
      "learning_rate": 3.689120556739475e-06,
      "loss": 0.2675,
      "step": 116
    },
    {
      "epoch": 0.7597402597402597,
      "grad_norm": 5.8378264660336665,
      "learning_rate": 3.5141791053724405e-06,
      "loss": 0.2891,
      "step": 117
    },
    {
      "epoch": 0.7662337662337663,
      "grad_norm": 7.759679103047117,
      "learning_rate": 3.342598803025595e-06,
      "loss": 0.2742,
      "step": 118
    },
    {
      "epoch": 0.7727272727272727,
      "grad_norm": 3.391758679327134,
      "learning_rate": 3.174468567813461e-06,
      "loss": 0.2828,
      "step": 119
    },
    {
      "epoch": 0.7792207792207793,
      "grad_norm": 4.212349972665664,
      "learning_rate": 3.009875529921181e-06,
      "loss": 0.2702,
      "step": 120
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 16.260452235360823,
      "learning_rate": 2.8489049864510053e-06,
      "loss": 0.2843,
      "step": 121
    },
    {
      "epoch": 0.7922077922077922,
      "grad_norm": 12.65271768504945,
      "learning_rate": 2.691640357218759e-06,
      "loss": 0.2767,
      "step": 122
    },
    {
      "epoch": 0.7987012987012987,
      "grad_norm": 7.533838996842269,
      "learning_rate": 2.5381631415231455e-06,
      "loss": 0.2646,
      "step": 123
    },
    {
      "epoch": 0.8051948051948052,
      "grad_norm": 2.97864671142113,
      "learning_rate": 2.388552875910354e-06,
      "loss": 0.2922,
      "step": 124
    },
    {
      "epoch": 0.8116883116883117,
      "grad_norm": 0.3722989784490546,
      "learning_rate": 2.2428870929558012e-06,
      "loss": 0.2836,
      "step": 125
    },
    {
      "epoch": 0.8181818181818182,
      "grad_norm": 0.7184053225638433,
      "learning_rate": 2.101241281084416e-06,
      "loss": 0.2809,
      "step": 126
    },
    {
      "epoch": 0.8246753246753247,
      "grad_norm": 7.043657965270735,
      "learning_rate": 1.963688845450218e-06,
      "loss": 0.2745,
      "step": 127
    },
    {
      "epoch": 0.8311688311688312,
      "grad_norm": 10.025849594465186,
      "learning_rate": 1.8303010698955803e-06,
      "loss": 0.2933,
      "step": 128
    },
    {
      "epoch": 0.8376623376623377,
      "grad_norm": 3.4255662736765844,
      "learning_rate": 1.7011470800097496e-06,
      "loss": 0.279,
      "step": 129
    },
    {
      "epoch": 0.8441558441558441,
      "grad_norm": 9.520856580468687,
      "learning_rate": 1.5762938073058853e-06,
      "loss": 0.2927,
      "step": 130
    },
    {
      "epoch": 0.8506493506493507,
      "grad_norm": 4.373148084541858,
      "learning_rate": 1.4558059545351144e-06,
      "loss": 0.2723,
      "step": 131
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 5.775208392633576,
      "learning_rate": 1.339745962155613e-06,
      "loss": 0.2787,
      "step": 132
    },
    {
      "epoch": 0.8636363636363636,
      "grad_norm": 2.1662144441766635,
      "learning_rate": 1.2281739759740575e-06,
      "loss": 0.2775,
      "step": 133
    },
    {
      "epoch": 0.8701298701298701,
      "grad_norm": 4.505056971511744,
      "learning_rate": 1.121147815976248e-06,
      "loss": 0.2818,
      "step": 134
    },
    {
      "epoch": 0.8766233766233766,
      "grad_norm": 5.227972464338057,
      "learning_rate": 1.01872294636304e-06,
      "loss": 0.2704,
      "step": 135
    },
    {
      "epoch": 0.8831168831168831,
      "grad_norm": 1.8227797095460878,
      "learning_rate": 9.209524468071096e-07,
      "loss": 0.2765,
      "step": 136
    },
    {
      "epoch": 0.8896103896103896,
      "grad_norm": 8.904887866554748,
      "learning_rate": 8.278869849454718e-07,
      "loss": 0.2752,
      "step": 137
    },
    {
      "epoch": 0.8961038961038961,
      "grad_norm": 3.2675223808297766,
      "learning_rate": 7.395747901219474e-07,
      "loss": 0.2721,
      "step": 138
    },
    {
      "epoch": 0.9025974025974026,
      "grad_norm": 1.0782893352353118,
      "learning_rate": 6.560616283932897e-07,
      "loss": 0.2666,
      "step": 139
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 10.237618635319794,
      "learning_rate": 5.77390778811796e-07,
      "loss": 0.279,
      "step": 140
    },
    {
      "epoch": 0.9155844155844156,
      "grad_norm": 7.524756342899654,
      "learning_rate": 5.036030109968082e-07,
      "loss": 0.2798,
      "step": 141
    },
    {
      "epoch": 0.922077922077922,
      "grad_norm": 6.865627899284416,
      "learning_rate": 4.3473656400665256e-07,
      "loss": 0.2697,
      "step": 142
    },
    {
      "epoch": 0.9285714285714286,
      "grad_norm": 0.7216520448297371,
      "learning_rate": 3.708271265220087e-07,
      "loss": 0.2745,
      "step": 143
    },
    {
      "epoch": 0.935064935064935,
      "grad_norm": 1.7017798820527004,
      "learning_rate": 3.119078183509372e-07,
      "loss": 0.2682,
      "step": 144
    },
    {
      "epoch": 0.9415584415584416,
      "grad_norm": 8.223255165977816,
      "learning_rate": 2.5800917326521013e-07,
      "loss": 0.2781,
      "step": 145
    },
    {
      "epoch": 0.948051948051948,
      "grad_norm": 9.206762539208919,
      "learning_rate": 2.091591231767709e-07,
      "loss": 0.2708,
      "step": 146
    },
    {
      "epoch": 0.9545454545454546,
      "grad_norm": 4.056391637422524,
      "learning_rate": 1.6538298366257975e-07,
      "loss": 0.2666,
      "step": 147
    },
    {
      "epoch": 0.961038961038961,
      "grad_norm": 4.623116122211878,
      "learning_rate": 1.2670344084530384e-07,
      "loss": 0.2809,
      "step": 148
    },
    {
      "epoch": 0.9675324675324676,
      "grad_norm": 0.5191836726417989,
      "learning_rate": 9.314053963669245e-08,
      "loss": 0.2679,
      "step": 149
    },
    {
      "epoch": 0.974025974025974,
      "grad_norm": 3.9295463971723485,
      "learning_rate": 6.471167334968887e-08,
      "loss": 0.2691,
      "step": 150
    },
    {
      "epoch": 0.9805194805194806,
      "grad_norm": 1.6238761795878893,
      "learning_rate": 4.143157468468717e-08,
      "loss": 0.276,
      "step": 151
    },
    {
      "epoch": 0.987012987012987,
      "grad_norm": 8.527985399139007,
      "learning_rate": 2.3312308094607382e-08,
      "loss": 0.2796,
      "step": 152
    },
    {
      "epoch": 0.9935064935064936,
      "grad_norm": 10.628431594177156,
      "learning_rate": 1.0363263532724433e-08,
      "loss": 0.2645,
      "step": 153
    },
    {
      "epoch": 1.0,
      "grad_norm": 5.0009395147341795,
      "learning_rate": 2.591151586508467e-09,
      "loss": 0.2856,
      "step": 154
    },
    {
      "epoch": 1.0,
      "step": 154,
      "total_flos": 98781319004160.0,
      "train_loss": 1.0868141991751534,
      "train_runtime": 5317.4783,
      "train_samples_per_second": 1.852,
      "train_steps_per_second": 0.029
    }
  ],
  "logging_steps": 1,
  "max_steps": 154,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 98781319004160.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}