{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 188, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02127659574468085, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 3.872, "step": 1 }, { "epoch": 0.0425531914893617, "grad_norm": 0.8125, "learning_rate": 4e-05, "loss": 3.9714, "step": 2 }, { "epoch": 0.06382978723404255, "grad_norm": 0.80859375, "learning_rate": 6e-05, "loss": 3.9502, "step": 3 }, { "epoch": 0.0851063829787234, "grad_norm": 0.9296875, "learning_rate": 8e-05, "loss": 4.0793, "step": 4 }, { "epoch": 0.10638297872340426, "grad_norm": 0.93359375, "learning_rate": 0.0001, "loss": 3.9545, "step": 5 }, { "epoch": 0.1276595744680851, "grad_norm": 0.66015625, "learning_rate": 0.00012, "loss": 3.803, "step": 6 }, { "epoch": 0.14893617021276595, "grad_norm": 1.2109375, "learning_rate": 0.00014, "loss": 3.5993, "step": 7 }, { "epoch": 0.1702127659574468, "grad_norm": 0.9609375, "learning_rate": 0.00016, "loss": 3.8622, "step": 8 }, { "epoch": 0.19148936170212766, "grad_norm": 0.640625, "learning_rate": 0.00018, "loss": 3.6551, "step": 9 }, { "epoch": 0.2127659574468085, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 3.6643, "step": 10 }, { "epoch": 0.23404255319148937, "grad_norm": 0.341796875, "learning_rate": 0.00019998442534369985, "loss": 3.7215, "step": 11 }, { "epoch": 0.2553191489361702, "grad_norm": 0.345703125, "learning_rate": 0.00019993770622619782, "loss": 3.6194, "step": 12 }, { "epoch": 0.2765957446808511, "grad_norm": 0.287109375, "learning_rate": 0.00019985985720017785, "loss": 3.7064, "step": 13 }, { "epoch": 0.2978723404255319, "grad_norm": 0.2314453125, "learning_rate": 0.00019975090251507638, "loss": 3.5715, "step": 14 }, { "epoch": 0.3191489361702128, "grad_norm": 0.1865234375, "learning_rate": 0.0001996108761095289, "loss": 3.5818, "step": 15 }, { "epoch": 0.3404255319148936, "grad_norm": 0.279296875, "learning_rate": 0.0001994398216007982, "loss": 3.5229, "step": 16 }, { "epoch": 0.3617021276595745, "grad_norm": 0.265625, "learning_rate": 0.0001992377922711879, "loss": 3.5894, "step": 17 }, { "epoch": 0.3829787234042553, "grad_norm": 0.279296875, "learning_rate": 0.00019900485105144543, "loss": 3.5212, "step": 18 }, { "epoch": 0.40425531914893614, "grad_norm": 0.2138671875, "learning_rate": 0.00019874107050115954, "loss": 3.5943, "step": 19 }, { "epoch": 0.425531914893617, "grad_norm": 0.173828125, "learning_rate": 0.00019844653278615833, "loss": 3.5288, "step": 20 }, { "epoch": 0.44680851063829785, "grad_norm": 0.2119140625, "learning_rate": 0.00019812132965291545, "loss": 3.6237, "step": 21 }, { "epoch": 0.46808510638297873, "grad_norm": 0.23828125, "learning_rate": 0.00019776556239997146, "loss": 3.5176, "step": 22 }, { "epoch": 0.48936170212765956, "grad_norm": 0.2060546875, "learning_rate": 0.00019737934184638006, "loss": 3.5751, "step": 23 }, { "epoch": 0.5106382978723404, "grad_norm": 0.1953125, "learning_rate": 0.00019696278829718883, "loss": 3.5265, "step": 24 }, { "epoch": 0.5319148936170213, "grad_norm": 0.2734375, "learning_rate": 0.00019651603150596495, "loss": 3.5635, "step": 25 }, { "epoch": 0.5531914893617021, "grad_norm": 0.1708984375, "learning_rate": 0.00019603921063437793, "loss": 3.543, "step": 26 }, { "epoch": 0.574468085106383, "grad_norm": 0.197265625, "learning_rate": 0.00019553247420885157, "loss": 3.573, "step": 27 }, { "epoch": 0.5957446808510638, "grad_norm": 0.22265625, "learning_rate": 0.0001949959800742991, "loss": 3.566, "step": 28 }, { "epoch": 0.6170212765957447, "grad_norm": 0.1748046875, "learning_rate": 0.00019442989534495557, "loss": 3.6022, "step": 29 }, { "epoch": 0.6382978723404256, "grad_norm": 0.232421875, "learning_rate": 0.00019383439635232294, "loss": 3.514, "step": 30 }, { "epoch": 0.6595744680851063, "grad_norm": 0.216796875, "learning_rate": 0.00019320966859024397, "loss": 3.5634, "step": 31 }, { "epoch": 0.6808510638297872, "grad_norm": 0.1962890625, "learning_rate": 0.00019255590665712214, "loss": 3.6299, "step": 32 }, { "epoch": 0.7021276595744681, "grad_norm": 0.28515625, "learning_rate": 0.0001918733141953056, "loss": 3.4891, "step": 33 }, { "epoch": 0.723404255319149, "grad_norm": 0.2265625, "learning_rate": 0.0001911621038276542, "loss": 3.6105, "step": 34 }, { "epoch": 0.7446808510638298, "grad_norm": 0.31640625, "learning_rate": 0.0001904224970913085, "loss": 3.5861, "step": 35 }, { "epoch": 0.7659574468085106, "grad_norm": 0.298828125, "learning_rate": 0.00018965472436868286, "loss": 3.5116, "step": 36 }, { "epoch": 0.7872340425531915, "grad_norm": 0.2890625, "learning_rate": 0.0001888590248157027, "loss": 3.4935, "step": 37 }, { "epoch": 0.8085106382978723, "grad_norm": 0.2158203125, "learning_rate": 0.00018803564628730915, "loss": 3.6211, "step": 38 }, { "epoch": 0.8297872340425532, "grad_norm": 0.2236328125, "learning_rate": 0.00018718484526025387, "loss": 3.634, "step": 39 }, { "epoch": 0.851063829787234, "grad_norm": 0.234375, "learning_rate": 0.00018630688675320842, "loss": 3.6452, "step": 40 }, { "epoch": 0.8723404255319149, "grad_norm": 0.201171875, "learning_rate": 0.00018540204424421263, "loss": 3.5727, "step": 41 }, { "epoch": 0.8936170212765957, "grad_norm": 0.2451171875, "learning_rate": 0.0001844705995854882, "loss": 3.5772, "step": 42 }, { "epoch": 0.9148936170212766, "grad_norm": 0.232421875, "learning_rate": 0.00018351284291564358, "loss": 3.6298, "step": 43 }, { "epoch": 0.9361702127659575, "grad_norm": 0.3046875, "learning_rate": 0.00018252907256929775, "loss": 3.6612, "step": 44 }, { "epoch": 0.9574468085106383, "grad_norm": 0.263671875, "learning_rate": 0.00018151959498415122, "loss": 3.6252, "step": 45 }, { "epoch": 0.9787234042553191, "grad_norm": 0.353515625, "learning_rate": 0.00018048472460553257, "loss": 3.6423, "step": 46 }, { "epoch": 1.0, "grad_norm": 0.62109375, "learning_rate": 0.0001794247837884511, "loss": 3.4633, "step": 47 }, { "epoch": 1.0212765957446808, "grad_norm": 1.0859375, "learning_rate": 0.00017834010269718526, "loss": 3.2725, "step": 48 }, { "epoch": 1.0425531914893618, "grad_norm": 0.57421875, "learning_rate": 0.0001772310192024389, "loss": 3.3142, "step": 49 }, { "epoch": 1.0638297872340425, "grad_norm": 0.6953125, "learning_rate": 0.0001760978787760968, "loss": 3.3171, "step": 50 }, { "epoch": 1.0851063829787233, "grad_norm": 1.0078125, "learning_rate": 0.0001749410343836125, "loss": 3.4249, "step": 51 }, { "epoch": 1.1063829787234043, "grad_norm": 0.28125, "learning_rate": 0.00017376084637406222, "loss": 3.37, "step": 52 }, { "epoch": 1.127659574468085, "grad_norm": 0.625, "learning_rate": 0.00017255768236789826, "loss": 3.3532, "step": 53 }, { "epoch": 1.148936170212766, "grad_norm": 0.71484375, "learning_rate": 0.00017133191714243805, "loss": 3.1741, "step": 54 }, { "epoch": 1.1702127659574468, "grad_norm": 0.259765625, "learning_rate": 0.00017008393251512332, "loss": 3.402, "step": 55 }, { "epoch": 1.1914893617021276, "grad_norm": 0.474609375, "learning_rate": 0.00016881411722458688, "loss": 3.2908, "step": 56 }, { "epoch": 1.2127659574468086, "grad_norm": 0.51171875, "learning_rate": 0.00016752286680956306, "loss": 3.3467, "step": 57 }, { "epoch": 1.2340425531914894, "grad_norm": 0.447265625, "learning_rate": 0.00016621058348568007, "loss": 3.3667, "step": 58 }, { "epoch": 1.2553191489361701, "grad_norm": 0.306640625, "learning_rate": 0.00016487767602017263, "loss": 3.308, "step": 59 }, { "epoch": 1.2765957446808511, "grad_norm": 0.6328125, "learning_rate": 0.00016352455960455387, "loss": 3.4156, "step": 60 }, { "epoch": 1.297872340425532, "grad_norm": 0.447265625, "learning_rate": 0.00016215165572528597, "loss": 3.3147, "step": 61 }, { "epoch": 1.3191489361702127, "grad_norm": 0.201171875, "learning_rate": 0.0001607593920324899, "loss": 3.3296, "step": 62 }, { "epoch": 1.3404255319148937, "grad_norm": 0.380859375, "learning_rate": 0.00015934820220673564, "loss": 3.2744, "step": 63 }, { "epoch": 1.3617021276595744, "grad_norm": 0.39453125, "learning_rate": 0.00015791852582395334, "loss": 3.3446, "step": 64 }, { "epoch": 1.3829787234042552, "grad_norm": 0.35546875, "learning_rate": 0.0001564708082185087, "loss": 3.2568, "step": 65 }, { "epoch": 1.4042553191489362, "grad_norm": 0.220703125, "learning_rate": 0.00015500550034448413, "loss": 3.319, "step": 66 }, { "epoch": 1.425531914893617, "grad_norm": 0.3203125, "learning_rate": 0.00015352305863520991, "loss": 3.2982, "step": 67 }, { "epoch": 1.4468085106382977, "grad_norm": 0.36328125, "learning_rate": 0.0001520239448610882, "loss": 3.3751, "step": 68 }, { "epoch": 1.4680851063829787, "grad_norm": 0.25, "learning_rate": 0.00015050862598575476, "loss": 3.293, "step": 69 }, { "epoch": 1.4893617021276595, "grad_norm": 0.1962890625, "learning_rate": 0.00014897757402062284, "loss": 3.3238, "step": 70 }, { "epoch": 1.5106382978723403, "grad_norm": 0.328125, "learning_rate": 0.00014743126587785522, "loss": 3.2718, "step": 71 }, { "epoch": 1.5319148936170213, "grad_norm": 0.267578125, "learning_rate": 0.00014587018322180905, "loss": 3.3136, "step": 72 }, { "epoch": 1.5531914893617023, "grad_norm": 0.232421875, "learning_rate": 0.00014429481231900083, "loss": 3.3282, "step": 73 }, { "epoch": 1.574468085106383, "grad_norm": 0.2578125, "learning_rate": 0.0001427056438866376, "loss": 3.3317, "step": 74 }, { "epoch": 1.5957446808510638, "grad_norm": 0.310546875, "learning_rate": 0.00014110317293976218, "loss": 3.3342, "step": 75 }, { "epoch": 1.6170212765957448, "grad_norm": 0.2451171875, "learning_rate": 0.00013948789863705912, "loss": 3.351, "step": 76 }, { "epoch": 1.6382978723404256, "grad_norm": 0.208984375, "learning_rate": 0.00013786032412537035, "loss": 3.3016, "step": 77 }, { "epoch": 1.6595744680851063, "grad_norm": 0.2412109375, "learning_rate": 0.00013622095638296826, "loss": 3.3357, "step": 78 }, { "epoch": 1.6808510638297873, "grad_norm": 0.26171875, "learning_rate": 0.00013457030606163562, "loss": 3.3777, "step": 79 }, { "epoch": 1.702127659574468, "grad_norm": 0.267578125, "learning_rate": 0.000132908887327601, "loss": 3.2683, "step": 80 }, { "epoch": 1.7234042553191489, "grad_norm": 0.263671875, "learning_rate": 0.00013123721770137944, "loss": 3.3866, "step": 81 }, { "epoch": 1.7446808510638299, "grad_norm": 0.2578125, "learning_rate": 0.00012955581789656843, "loss": 3.3582, "step": 82 }, { "epoch": 1.7659574468085106, "grad_norm": 0.279296875, "learning_rate": 0.0001278652116576492, "loss": 3.2956, "step": 83 }, { "epoch": 1.7872340425531914, "grad_norm": 0.2451171875, "learning_rate": 0.0001261659255968441, "loss": 3.258, "step": 84 }, { "epoch": 1.8085106382978724, "grad_norm": 0.24609375, "learning_rate": 0.00012445848903008003, "loss": 3.4091, "step": 85 }, { "epoch": 1.8297872340425532, "grad_norm": 0.2451171875, "learning_rate": 0.00012274343381211066, "loss": 3.4071, "step": 86 }, { "epoch": 1.851063829787234, "grad_norm": 0.251953125, "learning_rate": 0.00012102129417084714, "loss": 3.4251, "step": 87 }, { "epoch": 1.872340425531915, "grad_norm": 0.26953125, "learning_rate": 0.00011929260654094969, "loss": 3.3607, "step": 88 }, { "epoch": 1.8936170212765957, "grad_norm": 0.271484375, "learning_rate": 0.00011755790939673209, "loss": 3.3509, "step": 89 }, { "epoch": 1.9148936170212765, "grad_norm": 0.279296875, "learning_rate": 0.0001158177430844304, "loss": 3.4081, "step": 90 }, { "epoch": 1.9361702127659575, "grad_norm": 0.328125, "learning_rate": 0.00011407264965388906, "loss": 3.4399, "step": 91 }, { "epoch": 1.9574468085106385, "grad_norm": 0.30859375, "learning_rate": 0.00011232317268971585, "loss": 3.4059, "step": 92 }, { "epoch": 1.978723404255319, "grad_norm": 0.37890625, "learning_rate": 0.00011056985714195932, "loss": 3.4202, "step": 93 }, { "epoch": 2.0, "grad_norm": 0.84375, "learning_rate": 0.00010881324915636019, "loss": 3.0814, "step": 94 }, { "epoch": 2.021276595744681, "grad_norm": 0.44140625, "learning_rate": 0.0001070538959042311, "loss": 3.1079, "step": 95 }, { "epoch": 2.0425531914893615, "grad_norm": 0.625, "learning_rate": 0.00010529234541201631, "loss": 3.1756, "step": 96 }, { "epoch": 2.0638297872340425, "grad_norm": 0.51171875, "learning_rate": 0.00010352914639058526, "loss": 3.1467, "step": 97 }, { "epoch": 2.0851063829787235, "grad_norm": 0.330078125, "learning_rate": 0.00010176484806431288, "loss": 3.2387, "step": 98 }, { "epoch": 2.106382978723404, "grad_norm": 0.421875, "learning_rate": 0.0001, "loss": 3.2315, "step": 99 }, { "epoch": 2.127659574468085, "grad_norm": 0.41015625, "learning_rate": 9.823515193568715e-05, "loss": 3.2065, "step": 100 }, { "epoch": 2.148936170212766, "grad_norm": 0.291015625, "learning_rate": 9.647085360941476e-05, "loss": 3.0125, "step": 101 }, { "epoch": 2.1702127659574466, "grad_norm": 0.30078125, "learning_rate": 9.470765458798368e-05, "loss": 3.25, "step": 102 }, { "epoch": 2.1914893617021276, "grad_norm": 0.30859375, "learning_rate": 9.29461040957689e-05, "loss": 3.1501, "step": 103 }, { "epoch": 2.2127659574468086, "grad_norm": 0.3359375, "learning_rate": 9.118675084363986e-05, "loss": 3.1963, "step": 104 }, { "epoch": 2.2340425531914896, "grad_norm": 0.30078125, "learning_rate": 8.943014285804072e-05, "loss": 3.205, "step": 105 }, { "epoch": 2.25531914893617, "grad_norm": 0.26171875, "learning_rate": 8.767682731028415e-05, "loss": 3.1699, "step": 106 }, { "epoch": 2.276595744680851, "grad_norm": 0.318359375, "learning_rate": 8.592735034611097e-05, "loss": 3.2659, "step": 107 }, { "epoch": 2.297872340425532, "grad_norm": 0.302734375, "learning_rate": 8.418225691556962e-05, "loss": 3.1723, "step": 108 }, { "epoch": 2.3191489361702127, "grad_norm": 0.2578125, "learning_rate": 8.244209060326794e-05, "loss": 3.1985, "step": 109 }, { "epoch": 2.3404255319148937, "grad_norm": 0.279296875, "learning_rate": 8.070739345905032e-05, "loss": 3.1365, "step": 110 }, { "epoch": 2.3617021276595747, "grad_norm": 0.31640625, "learning_rate": 7.897870582915288e-05, "loss": 3.2121, "step": 111 }, { "epoch": 2.382978723404255, "grad_norm": 0.31640625, "learning_rate": 7.725656618788937e-05, "loss": 3.115, "step": 112 }, { "epoch": 2.404255319148936, "grad_norm": 0.296875, "learning_rate": 7.554151096992001e-05, "loss": 3.1752, "step": 113 }, { "epoch": 2.425531914893617, "grad_norm": 0.265625, "learning_rate": 7.383407440315596e-05, "loss": 3.1718, "step": 114 }, { "epoch": 2.4468085106382977, "grad_norm": 0.265625, "learning_rate": 7.213478834235079e-05, "loss": 3.2311, "step": 115 }, { "epoch": 2.4680851063829787, "grad_norm": 0.29296875, "learning_rate": 7.04441821034316e-05, "loss": 3.167, "step": 116 }, { "epoch": 2.4893617021276597, "grad_norm": 0.322265625, "learning_rate": 6.87627822986206e-05, "loss": 3.1866, "step": 117 }, { "epoch": 2.5106382978723403, "grad_norm": 0.33984375, "learning_rate": 6.7091112672399e-05, "loss": 3.1326, "step": 118 }, { "epoch": 2.5319148936170213, "grad_norm": 0.287109375, "learning_rate": 6.542969393836436e-05, "loss": 3.1708, "step": 119 }, { "epoch": 2.5531914893617023, "grad_norm": 0.28125, "learning_rate": 6.377904361703178e-05, "loss": 3.1963, "step": 120 }, { "epoch": 2.574468085106383, "grad_norm": 0.373046875, "learning_rate": 6.213967587462968e-05, "loss": 3.1986, "step": 121 }, { "epoch": 2.595744680851064, "grad_norm": 0.341796875, "learning_rate": 6.051210136294089e-05, "loss": 3.1945, "step": 122 }, { "epoch": 2.617021276595745, "grad_norm": 0.291015625, "learning_rate": 5.889682706023783e-05, "loss": 3.2107, "step": 123 }, { "epoch": 2.6382978723404253, "grad_norm": 0.314453125, "learning_rate": 5.729435611336239e-05, "loss": 3.1784, "step": 124 }, { "epoch": 2.6595744680851063, "grad_norm": 0.33203125, "learning_rate": 5.570518768099918e-05, "loss": 3.1984, "step": 125 }, { "epoch": 2.6808510638297873, "grad_norm": 0.310546875, "learning_rate": 5.4129816778190936e-05, "loss": 3.2371, "step": 126 }, { "epoch": 2.702127659574468, "grad_norm": 0.3046875, "learning_rate": 5.2568734122144756e-05, "loss": 3.1334, "step": 127 }, { "epoch": 2.723404255319149, "grad_norm": 0.318359375, "learning_rate": 5.102242597937717e-05, "loss": 3.255, "step": 128 }, { "epoch": 2.74468085106383, "grad_norm": 0.3203125, "learning_rate": 4.949137401424527e-05, "loss": 3.2194, "step": 129 }, { "epoch": 2.7659574468085104, "grad_norm": 0.3125, "learning_rate": 4.797605513891179e-05, "loss": 3.1648, "step": 130 }, { "epoch": 2.7872340425531914, "grad_norm": 0.3125, "learning_rate": 4.6476941364790074e-05, "loss": 3.1241, "step": 131 }, { "epoch": 2.8085106382978724, "grad_norm": 0.34375, "learning_rate": 4.4994499655515865e-05, "loss": 3.2732, "step": 132 }, { "epoch": 2.829787234042553, "grad_norm": 0.33984375, "learning_rate": 4.352919178149132e-05, "loss": 3.2714, "step": 133 }, { "epoch": 2.851063829787234, "grad_norm": 0.322265625, "learning_rate": 4.2081474176046646e-05, "loss": 3.2881, "step": 134 }, { "epoch": 2.872340425531915, "grad_norm": 0.322265625, "learning_rate": 4.0651797793264354e-05, "loss": 3.2259, "step": 135 }, { "epoch": 2.8936170212765955, "grad_norm": 0.361328125, "learning_rate": 3.924060796751012e-05, "loss": 3.2168, "step": 136 }, { "epoch": 2.9148936170212765, "grad_norm": 0.423828125, "learning_rate": 3.784834427471408e-05, "loss": 3.2721, "step": 137 }, { "epoch": 2.9361702127659575, "grad_norm": 0.35546875, "learning_rate": 3.647544039544615e-05, "loss": 3.2962, "step": 138 }, { "epoch": 2.9574468085106385, "grad_norm": 0.353515625, "learning_rate": 3.5122323979827395e-05, "loss": 3.2635, "step": 139 }, { "epoch": 2.978723404255319, "grad_norm": 0.37109375, "learning_rate": 3.378941651431996e-05, "loss": 3.2729, "step": 140 }, { "epoch": 3.0, "grad_norm": 0.76953125, "learning_rate": 3.2477133190436945e-05, "loss": 2.838, "step": 141 }, { "epoch": 3.021276595744681, "grad_norm": 0.345703125, "learning_rate": 3.118588277541312e-05, "loss": 3.0105, "step": 142 }, { "epoch": 3.0425531914893615, "grad_norm": 0.470703125, "learning_rate": 2.99160674848767e-05, "loss": 3.0744, "step": 143 }, { "epoch": 3.0638297872340425, "grad_norm": 0.51171875, "learning_rate": 2.8668082857562005e-05, "loss": 3.0395, "step": 144 }, { "epoch": 3.0851063829787235, "grad_norm": 0.369140625, "learning_rate": 2.7442317632101745e-05, "loss": 3.1344, "step": 145 }, { "epoch": 3.106382978723404, "grad_norm": 0.353515625, "learning_rate": 2.6239153625937784e-05, "loss": 3.1349, "step": 146 }, { "epoch": 3.127659574468085, "grad_norm": 0.3203125, "learning_rate": 2.5058965616387498e-05, "loss": 3.1193, "step": 147 }, { "epoch": 3.148936170212766, "grad_norm": 0.328125, "learning_rate": 2.390212122390323e-05, "loss": 2.9173, "step": 148 }, { "epoch": 3.1702127659574466, "grad_norm": 0.330078125, "learning_rate": 2.2768980797561124e-05, "loss": 3.1502, "step": 149 }, { "epoch": 3.1914893617021276, "grad_norm": 0.333984375, "learning_rate": 2.1659897302814747e-05, "loss": 3.065, "step": 150 }, { "epoch": 3.2127659574468086, "grad_norm": 0.3359375, "learning_rate": 2.0575216211548907e-05, "loss": 3.1079, "step": 151 }, { "epoch": 3.2340425531914896, "grad_norm": 0.416015625, "learning_rate": 1.9515275394467446e-05, "loss": 3.1094, "step": 152 }, { "epoch": 3.25531914893617, "grad_norm": 0.333984375, "learning_rate": 1.8480405015848824e-05, "loss": 3.0872, "step": 153 }, { "epoch": 3.276595744680851, "grad_norm": 0.33203125, "learning_rate": 1.7470927430702277e-05, "loss": 3.1752, "step": 154 }, { "epoch": 3.297872340425532, "grad_norm": 0.314453125, "learning_rate": 1.648715708435645e-05, "loss": 3.0836, "step": 155 }, { "epoch": 3.3191489361702127, "grad_norm": 0.296875, "learning_rate": 1.5529400414511806e-05, "loss": 3.1217, "step": 156 }, { "epoch": 3.3404255319148937, "grad_norm": 0.318359375, "learning_rate": 1.4597955755787373e-05, "loss": 3.0535, "step": 157 }, { "epoch": 3.3617021276595747, "grad_norm": 0.30859375, "learning_rate": 1.3693113246791589e-05, "loss": 3.1319, "step": 158 }, { "epoch": 3.382978723404255, "grad_norm": 0.326171875, "learning_rate": 1.2815154739746138e-05, "loss": 3.0385, "step": 159 }, { "epoch": 3.404255319148936, "grad_norm": 0.322265625, "learning_rate": 1.196435371269089e-05, "loss": 3.0892, "step": 160 }, { "epoch": 3.425531914893617, "grad_norm": 0.33203125, "learning_rate": 1.1140975184297331e-05, "loss": 3.1002, "step": 161 }, { "epoch": 3.4468085106382977, "grad_norm": 0.337890625, "learning_rate": 1.0345275631317163e-05, "loss": 3.1541, "step": 162 }, { "epoch": 3.4680851063829787, "grad_norm": 0.3203125, "learning_rate": 9.577502908691526e-06, "loss": 3.0974, "step": 163 }, { "epoch": 3.4893617021276597, "grad_norm": 0.3203125, "learning_rate": 8.837896172345827e-06, "loss": 3.1121, "step": 164 }, { "epoch": 3.5106382978723403, "grad_norm": 0.32421875, "learning_rate": 8.1266858046944e-06, "loss": 3.0568, "step": 165 }, { "epoch": 3.5319148936170213, "grad_norm": 0.3359375, "learning_rate": 7.4440933428779e-06, "loss": 3.0998, "step": 166 }, { "epoch": 3.5531914893617023, "grad_norm": 0.302734375, "learning_rate": 6.7903314097560454e-06, "loss": 3.1263, "step": 167 }, { "epoch": 3.574468085106383, "grad_norm": 0.326171875, "learning_rate": 6.165603647677054e-06, "loss": 3.1272, "step": 168 }, { "epoch": 3.595744680851064, "grad_norm": 0.345703125, "learning_rate": 5.570104655044428e-06, "loss": 3.1268, "step": 169 }, { "epoch": 3.617021276595745, "grad_norm": 0.32421875, "learning_rate": 5.00401992570092e-06, "loss": 3.1398, "step": 170 }, { "epoch": 3.6382978723404253, "grad_norm": 0.310546875, "learning_rate": 4.467525791148453e-06, "loss": 3.1151, "step": 171 }, { "epoch": 3.6595744680851063, "grad_norm": 0.322265625, "learning_rate": 3.960789365622075e-06, "loss": 3.1346, "step": 172 }, { "epoch": 3.6808510638297873, "grad_norm": 0.341796875, "learning_rate": 3.483968494035039e-06, "loss": 3.1692, "step": 173 }, { "epoch": 3.702127659574468, "grad_norm": 0.32421875, "learning_rate": 3.0372117028111825e-06, "loss": 3.0739, "step": 174 }, { "epoch": 3.723404255319149, "grad_norm": 0.33203125, "learning_rate": 2.6206581536199594e-06, "loss": 3.1945, "step": 175 }, { "epoch": 3.74468085106383, "grad_norm": 0.34375, "learning_rate": 2.2344376000285604e-06, "loss": 3.1558, "step": 176 }, { "epoch": 3.7659574468085104, "grad_norm": 0.330078125, "learning_rate": 1.8786703470845547e-06, "loss": 3.1134, "step": 177 }, { "epoch": 3.7872340425531914, "grad_norm": 0.3359375, "learning_rate": 1.553467213841664e-06, "loss": 3.0711, "step": 178 }, { "epoch": 3.8085106382978724, "grad_norm": 0.361328125, "learning_rate": 1.2589294988404888e-06, "loss": 3.2178, "step": 179 }, { "epoch": 3.829787234042553, "grad_norm": 0.392578125, "learning_rate": 9.951489485545695e-07, "loss": 3.2206, "step": 180 }, { "epoch": 3.851063829787234, "grad_norm": 0.3984375, "learning_rate": 7.622077288121033e-07, "loss": 3.24, "step": 181 }, { "epoch": 3.872340425531915, "grad_norm": 0.388671875, "learning_rate": 5.60178399201805e-07, "loss": 3.1806, "step": 182 }, { "epoch": 3.8936170212765955, "grad_norm": 0.373046875, "learning_rate": 3.8912389047108813e-07, "loss": 3.1704, "step": 183 }, { "epoch": 3.9148936170212765, "grad_norm": 0.380859375, "learning_rate": 2.490974849236216e-07, "loss": 3.2246, "step": 184 }, { "epoch": 3.9361702127659575, "grad_norm": 0.4296875, "learning_rate": 1.401427998221627e-07, "loss": 3.2522, "step": 185 }, { "epoch": 3.9574468085106385, "grad_norm": 0.400390625, "learning_rate": 6.229377380218005e-08, "loss": 3.2196, "step": 186 }, { "epoch": 3.978723404255319, "grad_norm": 0.408203125, "learning_rate": 1.5574656300143542e-08, "loss": 3.2318, "step": 187 }, { "epoch": 4.0, "grad_norm": 0.8125, "learning_rate": 0.0, "loss": 2.7607, "step": 188 } ], "logging_steps": 1, "max_steps": 188, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 24, "total_flos": 2.79753919561728e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }