{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10, "global_step": 154, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006493506493506494, "grad_norm": 726.0834329999584, "learning_rate": 0.0, "loss": 14.3592, "step": 1 }, { "epoch": 0.012987012987012988, "grad_norm": 1354.174599535607, "learning_rate": 1.25e-06, "loss": 15.2173, "step": 2 }, { "epoch": 0.01948051948051948, "grad_norm": 926.1255534526953, "learning_rate": 2.5e-06, "loss": 14.1437, "step": 3 }, { "epoch": 0.025974025974025976, "grad_norm": 1026.398684088101, "learning_rate": 3.7500000000000005e-06, "loss": 11.8689, "step": 4 }, { "epoch": 0.032467532467532464, "grad_norm": 5701.298542750317, "learning_rate": 5e-06, "loss": 7.9204, "step": 5 }, { "epoch": 0.03896103896103896, "grad_norm": 815.0186534581939, "learning_rate": 6.25e-06, "loss": 5.7131, "step": 6 }, { "epoch": 0.045454545454545456, "grad_norm": 639.6025882710119, "learning_rate": 7.500000000000001e-06, "loss": 4.2718, "step": 7 }, { "epoch": 0.05194805194805195, "grad_norm": 593.0247847991094, "learning_rate": 8.750000000000001e-06, "loss": 4.0282, "step": 8 }, { "epoch": 0.05844155844155844, "grad_norm": 147.74015484894977, "learning_rate": 1e-05, "loss": 3.6317, "step": 9 }, { "epoch": 0.06493506493506493, "grad_norm": 214.08805924869526, "learning_rate": 1.125e-05, "loss": 4.1092, "step": 10 }, { "epoch": 0.06493506493506493, "eval_loss": 4.491167068481445, "eval_runtime": 313.3108, "eval_samples_per_second": 7.858, "eval_steps_per_second": 0.492, "step": 10 }, { "epoch": 0.07142857142857142, "grad_norm": 207.2140467913208, "learning_rate": 1.25e-05, "loss": 4.7431, "step": 11 }, { "epoch": 0.07792207792207792, "grad_norm": 367.6773808358454, "learning_rate": 1.375e-05, "loss": 3.802, "step": 12 }, { "epoch": 0.08441558441558442, "grad_norm": 76.96162731098995, "learning_rate": 1.5000000000000002e-05, "loss": 2.9634, "step": 13 }, { "epoch": 0.09090909090909091, "grad_norm": 112.88790355598223, "learning_rate": 1.6250000000000002e-05, "loss": 4.4968, "step": 14 }, { "epoch": 0.09740259740259741, "grad_norm": 71.63608033728643, "learning_rate": 1.7500000000000002e-05, "loss": 2.7575, "step": 15 }, { "epoch": 0.1038961038961039, "grad_norm": 63.23395532173562, "learning_rate": 1.8750000000000002e-05, "loss": 2.518, "step": 16 }, { "epoch": 0.11038961038961038, "grad_norm": 63.763972695361566, "learning_rate": 2e-05, "loss": 2.3895, "step": 17 }, { "epoch": 0.11688311688311688, "grad_norm": 65.26434525585104, "learning_rate": 1.9997408848413494e-05, "loss": 2.2629, "step": 18 }, { "epoch": 0.12337662337662338, "grad_norm": 87.01969531617279, "learning_rate": 1.9989636736467278e-05, "loss": 2.35, "step": 19 }, { "epoch": 0.12987012987012986, "grad_norm": 75.90266993301385, "learning_rate": 1.9976687691905394e-05, "loss": 2.1524, "step": 20 }, { "epoch": 0.12987012987012986, "eval_loss": 1.8664342164993286, "eval_runtime": 312.9861, "eval_samples_per_second": 7.866, "eval_steps_per_second": 0.492, "step": 20 }, { "epoch": 0.13636363636363635, "grad_norm": 69.56262783043563, "learning_rate": 1.9958568425315316e-05, "loss": 1.8148, "step": 21 }, { "epoch": 0.14285714285714285, "grad_norm": 90.677953953057, "learning_rate": 1.9935288326650314e-05, "loss": 2.1677, "step": 22 }, { "epoch": 0.14935064935064934, "grad_norm": 85.24754349133875, "learning_rate": 1.9906859460363307e-05, "loss": 1.8116, "step": 23 }, { "epoch": 0.15584415584415584, "grad_norm": 70.43495434468849, "learning_rate": 1.98732965591547e-05, "loss": 1.5267, "step": 24 }, { "epoch": 0.16233766233766234, "grad_norm": 74.50513213038015, "learning_rate": 1.9834617016337424e-05, "loss": 1.5871, "step": 25 }, { "epoch": 0.16883116883116883, "grad_norm": 66.73449283745359, "learning_rate": 1.979084087682323e-05, "loss": 1.19, "step": 26 }, { "epoch": 0.17532467532467533, "grad_norm": 65.898397773073, "learning_rate": 1.9741990826734793e-05, "loss": 1.0546, "step": 27 }, { "epoch": 0.18181818181818182, "grad_norm": 69.3358355227665, "learning_rate": 1.9688092181649065e-05, "loss": 1.0371, "step": 28 }, { "epoch": 0.18831168831168832, "grad_norm": 48.31640643122106, "learning_rate": 1.9629172873477995e-05, "loss": 0.7383, "step": 29 }, { "epoch": 0.19480519480519481, "grad_norm": 51.337154937080896, "learning_rate": 1.956526343599335e-05, "loss": 0.7796, "step": 30 }, { "epoch": 0.19480519480519481, "eval_loss": 0.6781104207038879, "eval_runtime": 312.8421, "eval_samples_per_second": 7.87, "eval_steps_per_second": 0.492, "step": 30 }, { "epoch": 0.2012987012987013, "grad_norm": 50.43983994185883, "learning_rate": 1.9496396989003195e-05, "loss": 0.7575, "step": 31 }, { "epoch": 0.2077922077922078, "grad_norm": 23.373734879853394, "learning_rate": 1.9422609221188208e-05, "loss": 0.4457, "step": 32 }, { "epoch": 0.21428571428571427, "grad_norm": 32.56292698364167, "learning_rate": 1.9343938371606714e-05, "loss": 0.4635, "step": 33 }, { "epoch": 0.22077922077922077, "grad_norm": 12.1673376304705, "learning_rate": 1.9260425209878052e-05, "loss": 0.3696, "step": 34 }, { "epoch": 0.22727272727272727, "grad_norm": 14.90801285506298, "learning_rate": 1.917211301505453e-05, "loss": 0.3534, "step": 35 }, { "epoch": 0.23376623376623376, "grad_norm": 15.512474973117753, "learning_rate": 1.907904755319289e-05, "loss": 0.3338, "step": 36 }, { "epoch": 0.24025974025974026, "grad_norm": 14.51683196387391, "learning_rate": 1.8981277053636963e-05, "loss": 0.3249, "step": 37 }, { "epoch": 0.24675324675324675, "grad_norm": 26.98693793169379, "learning_rate": 1.8878852184023754e-05, "loss": 0.3303, "step": 38 }, { "epoch": 0.2532467532467532, "grad_norm": 14.295603956126358, "learning_rate": 1.8771826024025944e-05, "loss": 0.314, "step": 39 }, { "epoch": 0.2597402597402597, "grad_norm": 22.594540566060722, "learning_rate": 1.866025403784439e-05, "loss": 0.3127, "step": 40 }, { "epoch": 0.2597402597402597, "eval_loss": 0.2871454656124115, "eval_runtime": 312.3537, "eval_samples_per_second": 7.882, "eval_steps_per_second": 0.493, "step": 40 }, { "epoch": 0.2662337662337662, "grad_norm": 1.6825613545148905, "learning_rate": 1.8544194045464888e-05, "loss": 0.2746, "step": 41 }, { "epoch": 0.2727272727272727, "grad_norm": 15.147773363541827, "learning_rate": 1.8423706192694118e-05, "loss": 0.3081, "step": 42 }, { "epoch": 0.2792207792207792, "grad_norm": 44.969304157615376, "learning_rate": 1.8298852919990254e-05, "loss": 0.4312, "step": 43 }, { "epoch": 0.2857142857142857, "grad_norm": 22.146658044000166, "learning_rate": 1.816969893010442e-05, "loss": 0.319, "step": 44 }, { "epoch": 0.2922077922077922, "grad_norm": 1.2517281512465708, "learning_rate": 1.8036311154549783e-05, "loss": 0.2725, "step": 45 }, { "epoch": 0.2987012987012987, "grad_norm": 16.533852306195453, "learning_rate": 1.789875871891559e-05, "loss": 0.3119, "step": 46 }, { "epoch": 0.3051948051948052, "grad_norm": 14.76027633819278, "learning_rate": 1.77571129070442e-05, "loss": 0.2963, "step": 47 }, { "epoch": 0.3116883116883117, "grad_norm": 16.944473272575436, "learning_rate": 1.761144712408965e-05, "loss": 0.2917, "step": 48 }, { "epoch": 0.3181818181818182, "grad_norm": 28.972359556136805, "learning_rate": 1.7461836858476858e-05, "loss": 0.3685, "step": 49 }, { "epoch": 0.3246753246753247, "grad_norm": 37.7457413258774, "learning_rate": 1.730835964278124e-05, "loss": 0.4223, "step": 50 }, { "epoch": 0.3246753246753247, "eval_loss": 0.27618616819381714, "eval_runtime": 312.9618, "eval_samples_per_second": 7.867, "eval_steps_per_second": 0.492, "step": 50 }, { "epoch": 0.33116883116883117, "grad_norm": 4.127570833592912, "learning_rate": 1.7151095013548996e-05, "loss": 0.2815, "step": 51 }, { "epoch": 0.33766233766233766, "grad_norm": 1.6278107406314142, "learning_rate": 1.699012447007882e-05, "loss": 0.2779, "step": 52 }, { "epoch": 0.34415584415584416, "grad_norm": 17.3511262256263, "learning_rate": 1.6825531432186545e-05, "loss": 0.3014, "step": 53 }, { "epoch": 0.35064935064935066, "grad_norm": 11.40703878136249, "learning_rate": 1.6657401196974405e-05, "loss": 0.2863, "step": 54 }, { "epoch": 0.35714285714285715, "grad_norm": 23.997342596287215, "learning_rate": 1.648582089462756e-05, "loss": 0.3328, "step": 55 }, { "epoch": 0.36363636363636365, "grad_norm": 14.564526744534577, "learning_rate": 1.631087944326053e-05, "loss": 0.2914, "step": 56 }, { "epoch": 0.37012987012987014, "grad_norm": 12.345788885897743, "learning_rate": 1.6132667502837164e-05, "loss": 0.274, "step": 57 }, { "epoch": 0.37662337662337664, "grad_norm": 0.7197316580035946, "learning_rate": 1.59512774281879e-05, "loss": 0.2805, "step": 58 }, { "epoch": 0.38311688311688313, "grad_norm": 14.427152900312965, "learning_rate": 1.5766803221148676e-05, "loss": 0.2961, "step": 59 }, { "epoch": 0.38961038961038963, "grad_norm": 12.190424170565226, "learning_rate": 1.5579340481846338e-05, "loss": 0.2854, "step": 60 }, { "epoch": 0.38961038961038963, "eval_loss": 0.2876938581466675, "eval_runtime": 313.6156, "eval_samples_per_second": 7.85, "eval_steps_per_second": 0.491, "step": 60 }, { "epoch": 0.3961038961038961, "grad_norm": 1.3306084204470197, "learning_rate": 1.538898635915576e-05, "loss": 0.2868, "step": 61 }, { "epoch": 0.4025974025974026, "grad_norm": 7.557011276582443, "learning_rate": 1.5195839500354337e-05, "loss": 0.2834, "step": 62 }, { "epoch": 0.4090909090909091, "grad_norm": 17.70221295910782, "learning_rate": 1.5000000000000002e-05, "loss": 0.3043, "step": 63 }, { "epoch": 0.4155844155844156, "grad_norm": 1.7289278134350183, "learning_rate": 1.4801569348059158e-05, "loss": 0.2768, "step": 64 }, { "epoch": 0.42207792207792205, "grad_norm": 22.763720539205544, "learning_rate": 1.4600650377311523e-05, "loss": 0.3204, "step": 65 }, { "epoch": 0.42857142857142855, "grad_norm": 22.538798018222526, "learning_rate": 1.4397347210059059e-05, "loss": 0.3214, "step": 66 }, { "epoch": 0.43506493506493504, "grad_norm": 10.793149523817009, "learning_rate": 1.4191765204166643e-05, "loss": 0.2784, "step": 67 }, { "epoch": 0.44155844155844154, "grad_norm": 5.981451928809483, "learning_rate": 1.3984010898462417e-05, "loss": 0.2822, "step": 68 }, { "epoch": 0.44805194805194803, "grad_norm": 6.718767069585849, "learning_rate": 1.3774191957526144e-05, "loss": 0.2797, "step": 69 }, { "epoch": 0.45454545454545453, "grad_norm": 9.613912854847623, "learning_rate": 1.356241711589417e-05, "loss": 0.2908, "step": 70 }, { "epoch": 0.45454545454545453, "eval_loss": 0.3327561318874359, "eval_runtime": 313.0203, "eval_samples_per_second": 7.865, "eval_steps_per_second": 0.492, "step": 70 }, { "epoch": 0.461038961038961, "grad_norm": 29.842336751352633, "learning_rate": 1.3348796121709862e-05, "loss": 0.3395, "step": 71 }, { "epoch": 0.4675324675324675, "grad_norm": 16.599922535835642, "learning_rate": 1.3133439679848824e-05, "loss": 0.3008, "step": 72 }, { "epoch": 0.474025974025974, "grad_norm": 7.600771238707423, "learning_rate": 1.291645939454825e-05, "loss": 0.2842, "step": 73 }, { "epoch": 0.4805194805194805, "grad_norm": 28.375472340075955, "learning_rate": 1.2697967711570243e-05, "loss": 0.3674, "step": 74 }, { "epoch": 0.487012987012987, "grad_norm": 28.153412546519636, "learning_rate": 1.2478077859929e-05, "loss": 0.3821, "step": 75 }, { "epoch": 0.4935064935064935, "grad_norm": 34.98227719889924, "learning_rate": 1.2256903793212107e-05, "loss": 0.4089, "step": 76 }, { "epoch": 0.5, "grad_norm": 6.234636803986055, "learning_rate": 1.2034560130526341e-05, "loss": 0.2794, "step": 77 }, { "epoch": 0.5064935064935064, "grad_norm": 46.061076550306694, "learning_rate": 1.1811162097098559e-05, "loss": 0.419, "step": 78 }, { "epoch": 0.512987012987013, "grad_norm": 48.7128914656825, "learning_rate": 1.1586825464562515e-05, "loss": 0.4955, "step": 79 }, { "epoch": 0.5194805194805194, "grad_norm": 42.16162352138822, "learning_rate": 1.1361666490962468e-05, "loss": 0.4468, "step": 80 }, { "epoch": 0.5194805194805194, "eval_loss": 0.38780251145362854, "eval_runtime": 312.1719, "eval_samples_per_second": 7.887, "eval_steps_per_second": 0.493, "step": 80 }, { "epoch": 0.525974025974026, "grad_norm": 30.562818276672758, "learning_rate": 1.113580186050475e-05, "loss": 0.3573, "step": 81 }, { "epoch": 0.5324675324675324, "grad_norm": 0.3251217256944746, "learning_rate": 1.0909348623088472e-05, "loss": 0.2794, "step": 82 }, { "epoch": 0.538961038961039, "grad_norm": 28.94634046803869, "learning_rate": 1.0682424133646712e-05, "loss": 0.3553, "step": 83 }, { "epoch": 0.5454545454545454, "grad_norm": 27.90134096849091, "learning_rate": 1.0455145991329639e-05, "loss": 0.3586, "step": 84 }, { "epoch": 0.551948051948052, "grad_norm": 22.29121645686305, "learning_rate": 1.0227631978561057e-05, "loss": 0.3257, "step": 85 }, { "epoch": 0.5584415584415584, "grad_norm": 11.445235345846717, "learning_rate": 1e-05, "loss": 0.283, "step": 86 }, { "epoch": 0.564935064935065, "grad_norm": 12.40156017559838, "learning_rate": 9.772368021438943e-06, "loss": 0.3033, "step": 87 }, { "epoch": 0.5714285714285714, "grad_norm": 37.28624320216419, "learning_rate": 9.544854008670366e-06, "loss": 0.3894, "step": 88 }, { "epoch": 0.577922077922078, "grad_norm": 18.091654165377797, "learning_rate": 9.317575866353293e-06, "loss": 0.3072, "step": 89 }, { "epoch": 0.5844155844155844, "grad_norm": 12.727116235793448, "learning_rate": 9.090651376911532e-06, "loss": 0.2962, "step": 90 }, { "epoch": 0.5844155844155844, "eval_loss": 0.2746981978416443, "eval_runtime": 312.8185, "eval_samples_per_second": 7.87, "eval_steps_per_second": 0.492, "step": 90 }, { "epoch": 0.5909090909090909, "grad_norm": 4.967544001463393, "learning_rate": 8.86419813949525e-06, "loss": 0.2833, "step": 91 }, { "epoch": 0.5974025974025974, "grad_norm": 16.69130565770402, "learning_rate": 8.638333509037537e-06, "loss": 0.3017, "step": 92 }, { "epoch": 0.6038961038961039, "grad_norm": 10.205301459301646, "learning_rate": 8.413174535437486e-06, "loss": 0.2945, "step": 93 }, { "epoch": 0.6103896103896104, "grad_norm": 0.9629305540099369, "learning_rate": 8.188837902901441e-06, "loss": 0.2752, "step": 94 }, { "epoch": 0.6168831168831169, "grad_norm": 5.173525467305663, "learning_rate": 7.965439869473664e-06, "loss": 0.2817, "step": 95 }, { "epoch": 0.6233766233766234, "grad_norm": 9.386886918591138, "learning_rate": 7.743096206787894e-06, "loss": 0.2847, "step": 96 }, { "epoch": 0.6298701298701299, "grad_norm": 17.724151458735076, "learning_rate": 7.521922140071003e-06, "loss": 0.299, "step": 97 }, { "epoch": 0.6363636363636364, "grad_norm": 8.720198729613006, "learning_rate": 7.3020322884297565e-06, "loss": 0.2774, "step": 98 }, { "epoch": 0.6428571428571429, "grad_norm": 8.406425632029876, "learning_rate": 7.0835406054517505e-06, "loss": 0.2811, "step": 99 }, { "epoch": 0.6493506493506493, "grad_norm": 3.066337187677972, "learning_rate": 6.866560320151179e-06, "loss": 0.2759, "step": 100 }, { "epoch": 0.6493506493506493, "eval_loss": 0.28351110219955444, "eval_runtime": 312.9908, "eval_samples_per_second": 7.866, "eval_steps_per_second": 0.492, "step": 100 }, { "epoch": 0.6558441558441559, "grad_norm": 15.176884918214549, "learning_rate": 6.651203878290139e-06, "loss": 0.2874, "step": 101 }, { "epoch": 0.6623376623376623, "grad_norm": 16.279890884456005, "learning_rate": 6.437582884105835e-06, "loss": 0.2956, "step": 102 }, { "epoch": 0.6688311688311688, "grad_norm": 4.714023910671981, "learning_rate": 6.225808042473857e-06, "loss": 0.2634, "step": 103 }, { "epoch": 0.6753246753246753, "grad_norm": 1.309120461516702, "learning_rate": 6.015989101537586e-06, "loss": 0.2773, "step": 104 }, { "epoch": 0.6818181818181818, "grad_norm": 11.045936072111946, "learning_rate": 5.8082347958333625e-06, "loss": 0.2824, "step": 105 }, { "epoch": 0.6883116883116883, "grad_norm": 16.18834779099887, "learning_rate": 5.602652789940941e-06, "loss": 0.3002, "step": 106 }, { "epoch": 0.6948051948051948, "grad_norm": 14.11890859017768, "learning_rate": 5.399349622688479e-06, "loss": 0.2863, "step": 107 }, { "epoch": 0.7012987012987013, "grad_norm": 7.617080769083321, "learning_rate": 5.198430651940846e-06, "loss": 0.2862, "step": 108 }, { "epoch": 0.7077922077922078, "grad_norm": 9.54938019527376, "learning_rate": 5.000000000000003e-06, "loss": 0.2927, "step": 109 }, { "epoch": 0.7142857142857143, "grad_norm": 20.71405636255643, "learning_rate": 4.804160499645667e-06, "loss": 0.3065, "step": 110 }, { "epoch": 0.7142857142857143, "eval_loss": 0.29005691409111023, "eval_runtime": 313.9643, "eval_samples_per_second": 7.842, "eval_steps_per_second": 0.491, "step": 110 }, { "epoch": 0.7207792207792207, "grad_norm": 19.655686505101173, "learning_rate": 4.611013640844245e-06, "loss": 0.3122, "step": 111 }, { "epoch": 0.7272727272727273, "grad_norm": 27.81134094714508, "learning_rate": 4.420659518153667e-06, "loss": 0.3481, "step": 112 }, { "epoch": 0.7337662337662337, "grad_norm": 13.151919474717086, "learning_rate": 4.2331967788513295e-06, "loss": 0.2965, "step": 113 }, { "epoch": 0.7402597402597403, "grad_norm": 12.727379197985805, "learning_rate": 4.048722571812105e-06, "loss": 0.2869, "step": 114 }, { "epoch": 0.7467532467532467, "grad_norm": 5.690625853165586, "learning_rate": 3.867332497162836e-06, "loss": 0.2744, "step": 115 }, { "epoch": 0.7532467532467533, "grad_norm": 15.977602330482075, "learning_rate": 3.689120556739475e-06, "loss": 0.2779, "step": 116 }, { "epoch": 0.7597402597402597, "grad_norm": 4.665182760538795, "learning_rate": 3.5141791053724405e-06, "loss": 0.2802, "step": 117 }, { "epoch": 0.7662337662337663, "grad_norm": 12.032346240955409, "learning_rate": 3.342598803025595e-06, "loss": 0.2736, "step": 118 }, { "epoch": 0.7727272727272727, "grad_norm": 1.0284849949386232, "learning_rate": 3.174468567813461e-06, "loss": 0.2792, "step": 119 }, { "epoch": 0.7792207792207793, "grad_norm": 9.684572155925773, "learning_rate": 3.009875529921181e-06, "loss": 0.2882, "step": 120 }, { "epoch": 0.7792207792207793, "eval_loss": 0.2734771966934204, "eval_runtime": 313.3823, "eval_samples_per_second": 7.856, "eval_steps_per_second": 0.491, "step": 120 }, { "epoch": 0.7857142857142857, "grad_norm": 21.667928299844853, "learning_rate": 2.8489049864510053e-06, "loss": 0.298, "step": 121 }, { "epoch": 0.7922077922077922, "grad_norm": 10.409256737126345, "learning_rate": 2.691640357218759e-06, "loss": 0.2668, "step": 122 }, { "epoch": 0.7987012987012987, "grad_norm": 4.130333792523025, "learning_rate": 2.5381631415231455e-06, "loss": 0.2694, "step": 123 }, { "epoch": 0.8051948051948052, "grad_norm": 5.4669486542991494, "learning_rate": 2.388552875910354e-06, "loss": 0.2879, "step": 124 }, { "epoch": 0.8116883116883117, "grad_norm": 2.6141478827601885, "learning_rate": 2.2428870929558012e-06, "loss": 0.2818, "step": 125 }, { "epoch": 0.8181818181818182, "grad_norm": 0.3834229507407078, "learning_rate": 2.101241281084416e-06, "loss": 0.284, "step": 126 }, { "epoch": 0.8246753246753247, "grad_norm": 5.302017805812908, "learning_rate": 1.963688845450218e-06, "loss": 0.2848, "step": 127 }, { "epoch": 0.8311688311688312, "grad_norm": 13.23430036873663, "learning_rate": 1.8303010698955803e-06, "loss": 0.2967, "step": 128 }, { "epoch": 0.8376623376623377, "grad_norm": 7.70370346833261, "learning_rate": 1.7011470800097496e-06, "loss": 0.2893, "step": 129 }, { "epoch": 0.8441558441558441, "grad_norm": 13.16371922100576, "learning_rate": 1.5762938073058853e-06, "loss": 0.2945, "step": 130 }, { "epoch": 0.8441558441558441, "eval_loss": 0.29197612404823303, "eval_runtime": 313.0571, "eval_samples_per_second": 7.864, "eval_steps_per_second": 0.492, "step": 130 }, { "epoch": 0.8506493506493507, "grad_norm": 8.525118019615128, "learning_rate": 1.4558059545351144e-06, "loss": 0.2827, "step": 131 }, { "epoch": 0.8571428571428571, "grad_norm": 8.9200482890423, "learning_rate": 1.339745962155613e-06, "loss": 0.2825, "step": 132 }, { "epoch": 0.8636363636363636, "grad_norm": 39.33548538833043, "learning_rate": 1.2281739759740575e-06, "loss": 0.3201, "step": 133 }, { "epoch": 0.8701298701298701, "grad_norm": 3.4926670793881556, "learning_rate": 1.121147815976248e-06, "loss": 0.283, "step": 134 }, { "epoch": 0.8766233766233766, "grad_norm": 5.747932762741362, "learning_rate": 1.01872294636304e-06, "loss": 0.2725, "step": 135 }, { "epoch": 0.8831168831168831, "grad_norm": 1.1086161252216624, "learning_rate": 9.209524468071096e-07, "loss": 0.2804, "step": 136 }, { "epoch": 0.8896103896103896, "grad_norm": 7.848619146903053, "learning_rate": 8.278869849454718e-07, "loss": 0.2724, "step": 137 }, { "epoch": 0.8961038961038961, "grad_norm": 5.522219422846495, "learning_rate": 7.395747901219474e-07, "loss": 0.2762, "step": 138 }, { "epoch": 0.9025974025974026, "grad_norm": 1.8057002109690163, "learning_rate": 6.560616283932897e-07, "loss": 0.272, "step": 139 }, { "epoch": 0.9090909090909091, "grad_norm": 11.912370593556862, "learning_rate": 5.77390778811796e-07, "loss": 0.2805, "step": 140 }, { "epoch": 0.9090909090909091, "eval_loss": 0.27342790365219116, "eval_runtime": 312.9689, "eval_samples_per_second": 7.867, "eval_steps_per_second": 0.492, "step": 140 }, { "epoch": 0.9155844155844156, "grad_norm": 7.228975883198848, "learning_rate": 5.036030109968082e-07, "loss": 0.2874, "step": 141 }, { "epoch": 0.922077922077922, "grad_norm": 6.9424955685050165, "learning_rate": 4.3473656400665256e-07, "loss": 0.2664, "step": 142 }, { "epoch": 0.9285714285714286, "grad_norm": 0.49152848735688254, "learning_rate": 3.708271265220087e-07, "loss": 0.2779, "step": 143 }, { "epoch": 0.935064935064935, "grad_norm": 1.8992578425605613, "learning_rate": 3.119078183509372e-07, "loss": 0.2711, "step": 144 }, { "epoch": 0.9415584415584416, "grad_norm": 8.02402217841754, "learning_rate": 2.5800917326521013e-07, "loss": 0.2785, "step": 145 }, { "epoch": 0.948051948051948, "grad_norm": 10.47081819262579, "learning_rate": 2.091591231767709e-07, "loss": 0.2669, "step": 146 }, { "epoch": 0.9545454545454546, "grad_norm": 4.420672218002455, "learning_rate": 1.6538298366257975e-07, "loss": 0.2719, "step": 147 }, { "epoch": 0.961038961038961, "grad_norm": 5.273851950008196, "learning_rate": 1.2670344084530384e-07, "loss": 0.2802, "step": 148 }, { "epoch": 0.9675324675324676, "grad_norm": 0.5952860316326587, "learning_rate": 9.314053963669245e-08, "loss": 0.2758, "step": 149 }, { "epoch": 0.974025974025974, "grad_norm": 4.345110535347966, "learning_rate": 6.471167334968887e-08, "loss": 0.2696, "step": 150 }, { "epoch": 0.974025974025974, "eval_loss": 0.2737749218940735, "eval_runtime": 312.8792, "eval_samples_per_second": 7.869, "eval_steps_per_second": 0.492, "step": 150 }, { "epoch": 0.9805194805194806, "grad_norm": 2.6965570408875545, "learning_rate": 4.143157468468717e-08, "loss": 0.2795, "step": 151 }, { "epoch": 0.987012987012987, "grad_norm": 7.447516529100159, "learning_rate": 2.3312308094607382e-08, "loss": 0.2878, "step": 152 }, { "epoch": 0.9935064935064936, "grad_norm": 11.167954853871736, "learning_rate": 1.0363263532724433e-08, "loss": 0.2612, "step": 153 }, { "epoch": 1.0, "grad_norm": 3.5261027782304932, "learning_rate": 2.591151586508467e-09, "loss": 0.2801, "step": 154 }, { "epoch": 1.0, "step": 154, "total_flos": 98781319004160.0, "train_loss": 1.0889477551757516, "train_runtime": 10011.5106, "train_samples_per_second": 0.984, "train_steps_per_second": 0.015 } ], "logging_steps": 1, "max_steps": 154, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 98781319004160.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }