{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10, "global_step": 154, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006493506493506494, "grad_norm": 5312.028745324425, "learning_rate": 0.0, "loss": 12.6644, "step": 1 }, { "epoch": 0.012987012987012988, "grad_norm": 3464.512608575103, "learning_rate": 1.25e-06, "loss": 13.386, "step": 2 }, { "epoch": 0.01948051948051948, "grad_norm": 31521.52866951872, "learning_rate": 2.5e-06, "loss": 12.124, "step": 3 }, { "epoch": 0.025974025974025976, "grad_norm": 12714.37327874382, "learning_rate": 3.7500000000000005e-06, "loss": 9.26, "step": 4 }, { "epoch": 0.032467532467532464, "grad_norm": 2957.000475133963, "learning_rate": 5e-06, "loss": 7.0851, "step": 5 }, { "epoch": 0.03896103896103896, "grad_norm": 1856.4304411247954, "learning_rate": 6.25e-06, "loss": 4.3426, "step": 6 }, { "epoch": 0.045454545454545456, "grad_norm": 380.9114379987856, "learning_rate": 7.500000000000001e-06, "loss": 3.9004, "step": 7 }, { "epoch": 0.05194805194805195, "grad_norm": 249.7480254394976, "learning_rate": 8.750000000000001e-06, "loss": 4.2357, "step": 8 }, { "epoch": 0.05844155844155844, "grad_norm": 163.12129308225673, "learning_rate": 1e-05, "loss": 3.4659, "step": 9 }, { "epoch": 0.06493506493506493, "grad_norm": 306.28794651548174, "learning_rate": 1.125e-05, "loss": 3.2504, "step": 10 }, { "epoch": 0.07142857142857142, "grad_norm": 458.0129175616477, "learning_rate": 1.25e-05, "loss": 6.6892, "step": 11 }, { "epoch": 0.07792207792207792, "grad_norm": 220.92021209863208, "learning_rate": 1.375e-05, "loss": 3.3174, "step": 12 }, { "epoch": 0.08441558441558442, "grad_norm": 140.9175310968349, "learning_rate": 1.5000000000000002e-05, "loss": 3.8063, "step": 13 }, { "epoch": 0.09090909090909091, "grad_norm": 100.53464591827579, "learning_rate": 1.6250000000000002e-05, "loss": 2.7457, "step": 14 }, { "epoch": 0.09740259740259741, "grad_norm": 81.16333748640615, "learning_rate": 1.7500000000000002e-05, "loss": 2.7346, "step": 15 }, { "epoch": 0.1038961038961039, "grad_norm": 94.60838239841561, "learning_rate": 1.8750000000000002e-05, "loss": 3.6905, "step": 16 }, { "epoch": 0.11038961038961038, "grad_norm": 96.27886786384646, "learning_rate": 2e-05, "loss": 2.7146, "step": 17 }, { "epoch": 0.11688311688311688, "grad_norm": 638.8030278028655, "learning_rate": 1.9997408848413494e-05, "loss": 2.9735, "step": 18 }, { "epoch": 0.12337662337662338, "grad_norm": 101.7239129398942, "learning_rate": 1.9989636736467278e-05, "loss": 3.0153, "step": 19 }, { "epoch": 0.12987012987012986, "grad_norm": 67.71433029957166, "learning_rate": 1.9976687691905394e-05, "loss": 2.0634, "step": 20 }, { "epoch": 0.13636363636363635, "grad_norm": 61.33799178204917, "learning_rate": 1.9958568425315316e-05, "loss": 1.7994, "step": 21 }, { "epoch": 0.14285714285714285, "grad_norm": 77.67967542573722, "learning_rate": 1.9935288326650314e-05, "loss": 1.8646, "step": 22 }, { "epoch": 0.14935064935064934, "grad_norm": 66.7541786942052, "learning_rate": 1.9906859460363307e-05, "loss": 1.6082, "step": 23 }, { "epoch": 0.15584415584415584, "grad_norm": 66.42183035318644, "learning_rate": 1.98732965591547e-05, "loss": 1.4584, "step": 24 }, { "epoch": 0.16233766233766234, "grad_norm": 59.277341273127604, "learning_rate": 1.9834617016337424e-05, "loss": 1.2276, "step": 25 }, { "epoch": 0.16883116883116883, "grad_norm": 60.09621841530829, "learning_rate": 
1.979084087682323e-05, "loss": 1.1038, "step": 26 }, { "epoch": 0.17532467532467533, "grad_norm": 53.451286572243966, "learning_rate": 1.9741990826734793e-05, "loss": 0.9448, "step": 27 }, { "epoch": 0.18181818181818182, "grad_norm": 52.91840992849332, "learning_rate": 1.9688092181649065e-05, "loss": 0.8475, "step": 28 }, { "epoch": 0.18831168831168832, "grad_norm": 42.915320217692674, "learning_rate": 1.9629172873477995e-05, "loss": 0.6988, "step": 29 }, { "epoch": 0.19480519480519481, "grad_norm": 36.403813181510394, "learning_rate": 1.956526343599335e-05, "loss": 0.591, "step": 30 }, { "epoch": 0.2012987012987013, "grad_norm": 27.894365664450408, "learning_rate": 1.9496396989003195e-05, "loss": 0.504, "step": 31 }, { "epoch": 0.2077922077922078, "grad_norm": 33.58287541986749, "learning_rate": 1.9422609221188208e-05, "loss": 0.4665, "step": 32 }, { "epoch": 0.21428571428571427, "grad_norm": 33.5121042744897, "learning_rate": 1.9343938371606714e-05, "loss": 0.4967, "step": 33 }, { "epoch": 0.22077922077922077, "grad_norm": 27.982938599625648, "learning_rate": 1.9260425209878052e-05, "loss": 0.4135, "step": 34 }, { "epoch": 0.22727272727272727, "grad_norm": 51.93090738473159, "learning_rate": 1.917211301505453e-05, "loss": 0.5827, "step": 35 }, { "epoch": 0.23376623376623376, "grad_norm": 57.25489042342769, "learning_rate": 1.907904755319289e-05, "loss": 0.6328, "step": 36 }, { "epoch": 0.24025974025974026, "grad_norm": 11.875972697569978, "learning_rate": 1.8981277053636963e-05, "loss": 0.3192, "step": 37 }, { "epoch": 0.24675324675324675, "grad_norm": 27.883526944555015, "learning_rate": 1.8878852184023754e-05, "loss": 0.437, "step": 38 }, { "epoch": 0.2532467532467532, "grad_norm": 43.732501540378045, "learning_rate": 1.8771826024025944e-05, "loss": 0.7421, "step": 39 }, { "epoch": 0.2597402597402597, "grad_norm": 28.473023146202923, "learning_rate": 1.866025403784439e-05, "loss": 0.4279, "step": 40 }, { "epoch": 0.2662337662337662, "grad_norm": 13.106421492490322, "learning_rate": 1.8544194045464888e-05, "loss": 0.3088, "step": 41 }, { "epoch": 0.2727272727272727, "grad_norm": 5.104577671097258, "learning_rate": 1.8423706192694118e-05, "loss": 0.2931, "step": 42 }, { "epoch": 0.2792207792207792, "grad_norm": 52.64196123136193, "learning_rate": 1.8298852919990254e-05, "loss": 0.4912, "step": 43 }, { "epoch": 0.2857142857142857, "grad_norm": 31.65156603514557, "learning_rate": 1.816969893010442e-05, "loss": 0.3692, "step": 44 }, { "epoch": 0.2922077922077922, "grad_norm": 17.666169589586605, "learning_rate": 1.8036311154549783e-05, "loss": 0.2944, "step": 45 }, { "epoch": 0.2987012987012987, "grad_norm": 38.35439518358878, "learning_rate": 1.789875871891559e-05, "loss": 0.5252, "step": 46 }, { "epoch": 0.3051948051948052, "grad_norm": 39.96972433176561, "learning_rate": 1.77571129070442e-05, "loss": 0.5614, "step": 47 }, { "epoch": 0.3116883116883117, "grad_norm": 19.4739183485063, "learning_rate": 1.761144712408965e-05, "loss": 0.3101, "step": 48 }, { "epoch": 0.3181818181818182, "grad_norm": 2122.177114683787, "learning_rate": 1.7461836858476858e-05, "loss": 8.3024, "step": 49 }, { "epoch": 0.3246753246753247, "grad_norm": 53.72514322304979, "learning_rate": 1.730835964278124e-05, "loss": 0.5558, "step": 50 }, { "epoch": 0.33116883116883117, "grad_norm": 53.70022188889065, "learning_rate": 1.7151095013548996e-05, "loss": 0.5592, "step": 51 }, { "epoch": 0.33766233766233766, "grad_norm": 33.4569293851714, "learning_rate": 1.699012447007882e-05, "loss": 0.3671, "step": 52 }, { "epoch": 
0.34415584415584416, "grad_norm": 17.137073429830238, "learning_rate": 1.6825531432186545e-05, "loss": 0.3092, "step": 53 }, { "epoch": 0.35064935064935066, "grad_norm": 22.60309242557496, "learning_rate": 1.6657401196974405e-05, "loss": 0.3291, "step": 54 }, { "epoch": 0.35714285714285715, "grad_norm": 8.833069765914521, "learning_rate": 1.648582089462756e-05, "loss": 0.2884, "step": 55 }, { "epoch": 0.36363636363636365, "grad_norm": 42.48152192562097, "learning_rate": 1.631087944326053e-05, "loss": 0.4151, "step": 56 }, { "epoch": 0.37012987012987014, "grad_norm": 51.1875826358593, "learning_rate": 1.6132667502837164e-05, "loss": 0.4914, "step": 57 }, { "epoch": 0.37662337662337664, "grad_norm": 29.57316461617743, "learning_rate": 1.59512774281879e-05, "loss": 0.3513, "step": 58 }, { "epoch": 0.38311688311688313, "grad_norm": 1.6690431301081146, "learning_rate": 1.5766803221148676e-05, "loss": 0.279, "step": 59 }, { "epoch": 0.38961038961038963, "grad_norm": 17.11791002555302, "learning_rate": 1.5579340481846338e-05, "loss": 0.3062, "step": 60 }, { "epoch": 0.3961038961038961, "grad_norm": 2.9548663461162916, "learning_rate": 1.538898635915576e-05, "loss": 0.2837, "step": 61 }, { "epoch": 0.4025974025974026, "grad_norm": 30.32462862806025, "learning_rate": 1.5195839500354337e-05, "loss": 0.3502, "step": 62 }, { "epoch": 0.4090909090909091, "grad_norm": 27.26334803927827, "learning_rate": 1.5000000000000002e-05, "loss": 0.3433, "step": 63 }, { "epoch": 0.4155844155844156, "grad_norm": 25.006274002753365, "learning_rate": 1.4801569348059158e-05, "loss": 0.3237, "step": 64 }, { "epoch": 0.42207792207792205, "grad_norm": 9.0889223900474, "learning_rate": 1.4600650377311523e-05, "loss": 0.2881, "step": 65 }, { "epoch": 0.42857142857142855, "grad_norm": 17.44533004191656, "learning_rate": 1.4397347210059059e-05, "loss": 0.3027, "step": 66 }, { "epoch": 0.43506493506493504, "grad_norm": 19.97489878606359, "learning_rate": 1.4191765204166643e-05, "loss": 0.3013, "step": 67 }, { "epoch": 0.44155844155844154, "grad_norm": 16.933568982459366, "learning_rate": 1.3984010898462417e-05, "loss": 0.3018, "step": 68 }, { "epoch": 0.44805194805194803, "grad_norm": 29.645129941175437, "learning_rate": 1.3774191957526144e-05, "loss": 0.3391, "step": 69 }, { "epoch": 0.45454545454545453, "grad_norm": 9.80114973140799, "learning_rate": 1.356241711589417e-05, "loss": 0.2963, "step": 70 }, { "epoch": 0.461038961038961, "grad_norm": 4.338277899349995, "learning_rate": 1.3348796121709862e-05, "loss": 0.2732, "step": 71 }, { "epoch": 0.4675324675324675, "grad_norm": 22.540725434050554, "learning_rate": 1.3133439679848824e-05, "loss": 0.3253, "step": 72 }, { "epoch": 0.474025974025974, "grad_norm": 6.180079243988936, "learning_rate": 1.291645939454825e-05, "loss": 0.2838, "step": 73 }, { "epoch": 0.4805194805194805, "grad_norm": 9.944164489572458, "learning_rate": 1.2697967711570243e-05, "loss": 0.2912, "step": 74 }, { "epoch": 0.487012987012987, "grad_norm": 22.006231698484978, "learning_rate": 1.2478077859929e-05, "loss": 0.3109, "step": 75 }, { "epoch": 0.4935064935064935, "grad_norm": 0.9993376662904194, "learning_rate": 1.2256903793212107e-05, "loss": 0.2769, "step": 76 }, { "epoch": 0.5, "grad_norm": 6.736485919129092, "learning_rate": 1.2034560130526341e-05, "loss": 0.2776, "step": 77 }, { "epoch": 0.5064935064935064, "grad_norm": 1.5505988858577997, "learning_rate": 1.1811162097098559e-05, "loss": 0.259, "step": 78 }, { "epoch": 0.512987012987013, "grad_norm": 19.317012183505398, "learning_rate": 
1.1586825464562515e-05, "loss": 0.3168, "step": 79 }, { "epoch": 0.5194805194805194, "grad_norm": 24.98335565690831, "learning_rate": 1.1361666490962468e-05, "loss": 0.361, "step": 80 }, { "epoch": 0.525974025974026, "grad_norm": 15.391559490756684, "learning_rate": 1.113580186050475e-05, "loss": 0.3048, "step": 81 }, { "epoch": 0.5324675324675324, "grad_norm": 4.355627253302395, "learning_rate": 1.0909348623088472e-05, "loss": 0.28, "step": 82 }, { "epoch": 0.538961038961039, "grad_norm": 17.405555596408444, "learning_rate": 1.0682424133646712e-05, "loss": 0.2902, "step": 83 }, { "epoch": 0.5454545454545454, "grad_norm": 13.407085055385325, "learning_rate": 1.0455145991329639e-05, "loss": 0.2765, "step": 84 }, { "epoch": 0.551948051948052, "grad_norm": 2.6139015199788034, "learning_rate": 1.0227631978561057e-05, "loss": 0.2785, "step": 85 }, { "epoch": 0.5584415584415584, "grad_norm": 12.810695157873214, "learning_rate": 1e-05, "loss": 0.2845, "step": 86 }, { "epoch": 0.564935064935065, "grad_norm": 8.816846652405896, "learning_rate": 9.772368021438943e-06, "loss": 0.2827, "step": 87 }, { "epoch": 0.5714285714285714, "grad_norm": 21.911718407785227, "learning_rate": 9.544854008670366e-06, "loss": 0.3217, "step": 88 }, { "epoch": 0.577922077922078, "grad_norm": 12.550189539530098, "learning_rate": 9.317575866353293e-06, "loss": 0.292, "step": 89 }, { "epoch": 0.5844155844155844, "grad_norm": 15.167023999667723, "learning_rate": 9.090651376911532e-06, "loss": 0.3001, "step": 90 }, { "epoch": 0.5909090909090909, "grad_norm": 7.973065612261872, "learning_rate": 8.86419813949525e-06, "loss": 0.2886, "step": 91 }, { "epoch": 0.5974025974025974, "grad_norm": 16.15242808963043, "learning_rate": 8.638333509037537e-06, "loss": 0.299, "step": 92 }, { "epoch": 0.6038961038961039, "grad_norm": 18.179063200373662, "learning_rate": 8.413174535437486e-06, "loss": 0.3178, "step": 93 }, { "epoch": 0.6103896103896104, "grad_norm": 17.88073233317887, "learning_rate": 8.188837902901441e-06, "loss": 0.3026, "step": 94 }, { "epoch": 0.6168831168831169, "grad_norm": 17.894047190834744, "learning_rate": 7.965439869473664e-06, "loss": 0.3063, "step": 95 }, { "epoch": 0.6233766233766234, "grad_norm": 0.9912752221363975, "learning_rate": 7.743096206787894e-06, "loss": 0.2861, "step": 96 }, { "epoch": 0.6298701298701299, "grad_norm": 12.59974964084945, "learning_rate": 7.521922140071003e-06, "loss": 0.2873, "step": 97 }, { "epoch": 0.6363636363636364, "grad_norm": 5.247248714404641, "learning_rate": 7.3020322884297565e-06, "loss": 0.2772, "step": 98 }, { "epoch": 0.6428571428571429, "grad_norm": 15.713529017302493, "learning_rate": 7.0835406054517505e-06, "loss": 0.2954, "step": 99 }, { "epoch": 0.6493506493506493, "grad_norm": 14.670526789277048, "learning_rate": 6.866560320151179e-06, "loss": 0.2928, "step": 100 }, { "epoch": 0.6558441558441559, "grad_norm": 1.4479761532269853, "learning_rate": 6.651203878290139e-06, "loss": 0.2747, "step": 101 }, { "epoch": 0.6623376623376623, "grad_norm": 8.546550504285172, "learning_rate": 6.437582884105835e-06, "loss": 0.2796, "step": 102 }, { "epoch": 0.6688311688311688, "grad_norm": 5.205487389970545, "learning_rate": 6.225808042473857e-06, "loss": 0.2645, "step": 103 }, { "epoch": 0.6753246753246753, "grad_norm": 10.186260022708211, "learning_rate": 6.015989101537586e-06, "loss": 0.2813, "step": 104 }, { "epoch": 0.6818181818181818, "grad_norm": 8.851239912787552, "learning_rate": 5.8082347958333625e-06, "loss": 0.2801, "step": 105 }, { "epoch": 0.6883116883116883, 
"grad_norm": 0.47334586479886903, "learning_rate": 5.602652789940941e-06, "loss": 0.2814, "step": 106 }, { "epoch": 0.6948051948051948, "grad_norm": 10.137248782728967, "learning_rate": 5.399349622688479e-06, "loss": 0.2771, "step": 107 }, { "epoch": 0.7012987012987013, "grad_norm": 14.23493444448787, "learning_rate": 5.198430651940846e-06, "loss": 0.3031, "step": 108 }, { "epoch": 0.7077922077922078, "grad_norm": 25.77828835781949, "learning_rate": 5.000000000000003e-06, "loss": 0.3317, "step": 109 }, { "epoch": 0.7142857142857143, "grad_norm": 4.983254834642921, "learning_rate": 4.804160499645667e-06, "loss": 0.284, "step": 110 }, { "epoch": 0.7207792207792207, "grad_norm": 3.210921791733905, "learning_rate": 4.611013640844245e-06, "loss": 0.2656, "step": 111 }, { "epoch": 0.7272727272727273, "grad_norm": 12.882476468987955, "learning_rate": 4.420659518153667e-06, "loss": 0.2952, "step": 112 }, { "epoch": 0.7337662337662337, "grad_norm": 8.496092897870577, "learning_rate": 4.2331967788513295e-06, "loss": 0.2894, "step": 113 }, { "epoch": 0.7402597402597403, "grad_norm": 14.782117212880996, "learning_rate": 4.048722571812105e-06, "loss": 0.2943, "step": 114 }, { "epoch": 0.7467532467532467, "grad_norm": 8.221163363530726, "learning_rate": 3.867332497162836e-06, "loss": 0.2717, "step": 115 }, { "epoch": 0.7532467532467533, "grad_norm": 3.319249243011192, "learning_rate": 3.689120556739475e-06, "loss": 0.2675, "step": 116 }, { "epoch": 0.7597402597402597, "grad_norm": 5.8378264660336665, "learning_rate": 3.5141791053724405e-06, "loss": 0.2891, "step": 117 }, { "epoch": 0.7662337662337663, "grad_norm": 7.759679103047117, "learning_rate": 3.342598803025595e-06, "loss": 0.2742, "step": 118 }, { "epoch": 0.7727272727272727, "grad_norm": 3.391758679327134, "learning_rate": 3.174468567813461e-06, "loss": 0.2828, "step": 119 }, { "epoch": 0.7792207792207793, "grad_norm": 4.212349972665664, "learning_rate": 3.009875529921181e-06, "loss": 0.2702, "step": 120 }, { "epoch": 0.7857142857142857, "grad_norm": 16.260452235360823, "learning_rate": 2.8489049864510053e-06, "loss": 0.2843, "step": 121 }, { "epoch": 0.7922077922077922, "grad_norm": 12.65271768504945, "learning_rate": 2.691640357218759e-06, "loss": 0.2767, "step": 122 }, { "epoch": 0.7987012987012987, "grad_norm": 7.533838996842269, "learning_rate": 2.5381631415231455e-06, "loss": 0.2646, "step": 123 }, { "epoch": 0.8051948051948052, "grad_norm": 2.97864671142113, "learning_rate": 2.388552875910354e-06, "loss": 0.2922, "step": 124 }, { "epoch": 0.8116883116883117, "grad_norm": 0.3722989784490546, "learning_rate": 2.2428870929558012e-06, "loss": 0.2836, "step": 125 }, { "epoch": 0.8181818181818182, "grad_norm": 0.7184053225638433, "learning_rate": 2.101241281084416e-06, "loss": 0.2809, "step": 126 }, { "epoch": 0.8246753246753247, "grad_norm": 7.043657965270735, "learning_rate": 1.963688845450218e-06, "loss": 0.2745, "step": 127 }, { "epoch": 0.8311688311688312, "grad_norm": 10.025849594465186, "learning_rate": 1.8303010698955803e-06, "loss": 0.2933, "step": 128 }, { "epoch": 0.8376623376623377, "grad_norm": 3.4255662736765844, "learning_rate": 1.7011470800097496e-06, "loss": 0.279, "step": 129 }, { "epoch": 0.8441558441558441, "grad_norm": 9.520856580468687, "learning_rate": 1.5762938073058853e-06, "loss": 0.2927, "step": 130 }, { "epoch": 0.8506493506493507, "grad_norm": 4.373148084541858, "learning_rate": 1.4558059545351144e-06, "loss": 0.2723, "step": 131 }, { "epoch": 0.8571428571428571, "grad_norm": 5.775208392633576, "learning_rate": 
1.339745962155613e-06, "loss": 0.2787, "step": 132 }, { "epoch": 0.8636363636363636, "grad_norm": 2.1662144441766635, "learning_rate": 1.2281739759740575e-06, "loss": 0.2775, "step": 133 }, { "epoch": 0.8701298701298701, "grad_norm": 4.505056971511744, "learning_rate": 1.121147815976248e-06, "loss": 0.2818, "step": 134 }, { "epoch": 0.8766233766233766, "grad_norm": 5.227972464338057, "learning_rate": 1.01872294636304e-06, "loss": 0.2704, "step": 135 }, { "epoch": 0.8831168831168831, "grad_norm": 1.8227797095460878, "learning_rate": 9.209524468071096e-07, "loss": 0.2765, "step": 136 }, { "epoch": 0.8896103896103896, "grad_norm": 8.904887866554748, "learning_rate": 8.278869849454718e-07, "loss": 0.2752, "step": 137 }, { "epoch": 0.8961038961038961, "grad_norm": 3.2675223808297766, "learning_rate": 7.395747901219474e-07, "loss": 0.2721, "step": 138 }, { "epoch": 0.9025974025974026, "grad_norm": 1.0782893352353118, "learning_rate": 6.560616283932897e-07, "loss": 0.2666, "step": 139 }, { "epoch": 0.9090909090909091, "grad_norm": 10.237618635319794, "learning_rate": 5.77390778811796e-07, "loss": 0.279, "step": 140 }, { "epoch": 0.9155844155844156, "grad_norm": 7.524756342899654, "learning_rate": 5.036030109968082e-07, "loss": 0.2798, "step": 141 }, { "epoch": 0.922077922077922, "grad_norm": 6.865627899284416, "learning_rate": 4.3473656400665256e-07, "loss": 0.2697, "step": 142 }, { "epoch": 0.9285714285714286, "grad_norm": 0.7216520448297371, "learning_rate": 3.708271265220087e-07, "loss": 0.2745, "step": 143 }, { "epoch": 0.935064935064935, "grad_norm": 1.7017798820527004, "learning_rate": 3.119078183509372e-07, "loss": 0.2682, "step": 144 }, { "epoch": 0.9415584415584416, "grad_norm": 8.223255165977816, "learning_rate": 2.5800917326521013e-07, "loss": 0.2781, "step": 145 }, { "epoch": 0.948051948051948, "grad_norm": 9.206762539208919, "learning_rate": 2.091591231767709e-07, "loss": 0.2708, "step": 146 }, { "epoch": 0.9545454545454546, "grad_norm": 4.056391637422524, "learning_rate": 1.6538298366257975e-07, "loss": 0.2666, "step": 147 }, { "epoch": 0.961038961038961, "grad_norm": 4.623116122211878, "learning_rate": 1.2670344084530384e-07, "loss": 0.2809, "step": 148 }, { "epoch": 0.9675324675324676, "grad_norm": 0.5191836726417989, "learning_rate": 9.314053963669245e-08, "loss": 0.2679, "step": 149 }, { "epoch": 0.974025974025974, "grad_norm": 3.9295463971723485, "learning_rate": 6.471167334968887e-08, "loss": 0.2691, "step": 150 }, { "epoch": 0.9805194805194806, "grad_norm": 1.6238761795878893, "learning_rate": 4.143157468468717e-08, "loss": 0.276, "step": 151 }, { "epoch": 0.987012987012987, "grad_norm": 8.527985399139007, "learning_rate": 2.3312308094607382e-08, "loss": 0.2796, "step": 152 }, { "epoch": 0.9935064935064936, "grad_norm": 10.628431594177156, "learning_rate": 1.0363263532724433e-08, "loss": 0.2645, "step": 153 }, { "epoch": 1.0, "grad_norm": 5.0009395147341795, "learning_rate": 2.591151586508467e-09, "loss": 0.2856, "step": 154 }, { "epoch": 1.0, "step": 154, "total_flos": 98781319004160.0, "train_loss": 1.0868141991751534, "train_runtime": 5317.4783, "train_samples_per_second": 1.852, "train_steps_per_second": 0.029 } ], "logging_steps": 1, "max_steps": 154, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 98781319004160.0, "train_batch_size": 
2, "trial_name": null, "trial_params": null }