{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 299, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033444816053511705, "grad_norm": 0.482421875, "learning_rate": 9.966555183946488e-06, "loss": 1.7841, "step": 1 }, { "epoch": 0.006688963210702341, "grad_norm": 0.486328125, "learning_rate": 9.933110367892978e-06, "loss": 1.711, "step": 2 }, { "epoch": 0.010033444816053512, "grad_norm": 0.462890625, "learning_rate": 9.899665551839465e-06, "loss": 1.6722, "step": 3 }, { "epoch": 0.013377926421404682, "grad_norm": 0.439453125, "learning_rate": 9.866220735785954e-06, "loss": 1.6429, "step": 4 }, { "epoch": 0.016722408026755852, "grad_norm": 0.4453125, "learning_rate": 9.832775919732442e-06, "loss": 1.6428, "step": 5 }, { "epoch": 0.020066889632107024, "grad_norm": 0.42578125, "learning_rate": 9.799331103678931e-06, "loss": 1.5967, "step": 6 }, { "epoch": 0.023411371237458192, "grad_norm": 0.40234375, "learning_rate": 9.765886287625419e-06, "loss": 1.6212, "step": 7 }, { "epoch": 0.026755852842809364, "grad_norm": 0.388671875, "learning_rate": 9.732441471571908e-06, "loss": 1.615, "step": 8 }, { "epoch": 0.030100334448160536, "grad_norm": 0.388671875, "learning_rate": 9.698996655518395e-06, "loss": 1.6172, "step": 9 }, { "epoch": 0.033444816053511704, "grad_norm": 0.3515625, "learning_rate": 9.665551839464884e-06, "loss": 1.5622, "step": 10 }, { "epoch": 0.03678929765886288, "grad_norm": 0.345703125, "learning_rate": 9.632107023411372e-06, "loss": 1.5931, "step": 11 }, { "epoch": 0.04013377926421405, "grad_norm": 0.361328125, "learning_rate": 9.598662207357861e-06, "loss": 1.6176, "step": 12 }, { "epoch": 0.043478260869565216, "grad_norm": 0.328125, "learning_rate": 9.565217391304349e-06, "loss": 1.5989, "step": 13 }, { "epoch": 0.046822742474916385, "grad_norm": 0.302734375, "learning_rate": 9.531772575250838e-06, "loss": 1.501, "step": 14 }, { "epoch": 0.05016722408026756, "grad_norm": 0.32421875, "learning_rate": 9.498327759197325e-06, "loss": 1.4805, "step": 15 }, { "epoch": 0.05351170568561873, "grad_norm": 0.296875, "learning_rate": 9.464882943143815e-06, "loss": 1.5987, "step": 16 }, { "epoch": 0.056856187290969896, "grad_norm": 0.259765625, "learning_rate": 9.431438127090302e-06, "loss": 1.5577, "step": 17 }, { "epoch": 0.06020066889632107, "grad_norm": 0.2890625, "learning_rate": 9.39799331103679e-06, "loss": 1.516, "step": 18 }, { "epoch": 0.06354515050167224, "grad_norm": 0.27734375, "learning_rate": 9.364548494983279e-06, "loss": 1.4679, "step": 19 }, { "epoch": 0.06688963210702341, "grad_norm": 0.26953125, "learning_rate": 9.331103678929766e-06, "loss": 1.4369, "step": 20 }, { "epoch": 0.07023411371237458, "grad_norm": 0.26953125, "learning_rate": 9.297658862876256e-06, "loss": 1.5734, "step": 21 }, { "epoch": 0.07357859531772576, "grad_norm": 0.26953125, "learning_rate": 9.264214046822743e-06, "loss": 1.3749, "step": 22 }, { "epoch": 0.07692307692307693, "grad_norm": 0.263671875, "learning_rate": 9.230769230769232e-06, "loss": 1.4977, "step": 23 }, { "epoch": 0.0802675585284281, "grad_norm": 0.2412109375, "learning_rate": 9.19732441471572e-06, "loss": 1.4185, "step": 24 }, { "epoch": 0.08361204013377926, "grad_norm": 0.28515625, "learning_rate": 9.163879598662207e-06, "loss": 1.4903, "step": 25 }, { "epoch": 0.08695652173913043, "grad_norm": 0.2490234375, "learning_rate": 9.130434782608697e-06, "loss": 1.4328, "step": 26 }, { "epoch": 0.0903010033444816, "grad_norm": 0.28125, "learning_rate": 9.096989966555184e-06, "loss": 1.4603, "step": 27 }, { "epoch": 0.09364548494983277, "grad_norm": 0.216796875, "learning_rate": 9.063545150501673e-06, "loss": 1.3327, "step": 28 }, { "epoch": 0.09698996655518395, "grad_norm": 0.25, "learning_rate": 9.03010033444816e-06, "loss": 1.4307, "step": 29 }, { "epoch": 0.10033444816053512, "grad_norm": 0.291015625, "learning_rate": 8.996655518394648e-06, "loss": 1.3776, "step": 30 }, { "epoch": 0.10367892976588629, "grad_norm": 0.2294921875, "learning_rate": 8.963210702341138e-06, "loss": 1.4161, "step": 31 }, { "epoch": 0.10702341137123746, "grad_norm": 0.2119140625, "learning_rate": 8.929765886287625e-06, "loss": 1.3789, "step": 32 }, { "epoch": 0.11036789297658862, "grad_norm": 0.2119140625, "learning_rate": 8.896321070234114e-06, "loss": 1.3506, "step": 33 }, { "epoch": 0.11371237458193979, "grad_norm": 0.21484375, "learning_rate": 8.862876254180602e-06, "loss": 1.3179, "step": 34 }, { "epoch": 0.11705685618729098, "grad_norm": 0.2734375, "learning_rate": 8.829431438127091e-06, "loss": 1.3589, "step": 35 }, { "epoch": 0.12040133779264214, "grad_norm": 0.2333984375, "learning_rate": 8.795986622073578e-06, "loss": 1.3118, "step": 36 }, { "epoch": 0.12374581939799331, "grad_norm": 0.20703125, "learning_rate": 8.762541806020068e-06, "loss": 1.3768, "step": 37 }, { "epoch": 0.12709030100334448, "grad_norm": 0.251953125, "learning_rate": 8.729096989966555e-06, "loss": 1.3581, "step": 38 }, { "epoch": 0.13043478260869565, "grad_norm": 0.2021484375, "learning_rate": 8.695652173913044e-06, "loss": 1.2116, "step": 39 }, { "epoch": 0.13377926421404682, "grad_norm": 0.2265625, "learning_rate": 8.662207357859532e-06, "loss": 1.3651, "step": 40 }, { "epoch": 0.13712374581939799, "grad_norm": 0.205078125, "learning_rate": 8.628762541806021e-06, "loss": 1.265, "step": 41 }, { "epoch": 0.14046822742474915, "grad_norm": 0.2041015625, "learning_rate": 8.595317725752509e-06, "loss": 1.2728, "step": 42 }, { "epoch": 0.14381270903010032, "grad_norm": 0.205078125, "learning_rate": 8.561872909698998e-06, "loss": 1.3688, "step": 43 }, { "epoch": 0.14715719063545152, "grad_norm": 0.2099609375, "learning_rate": 8.528428093645485e-06, "loss": 1.2869, "step": 44 }, { "epoch": 0.1505016722408027, "grad_norm": 0.2080078125, "learning_rate": 8.494983277591975e-06, "loss": 1.2908, "step": 45 }, { "epoch": 0.15384615384615385, "grad_norm": 0.2001953125, "learning_rate": 8.461538461538462e-06, "loss": 1.3163, "step": 46 }, { "epoch": 0.15719063545150502, "grad_norm": 0.1826171875, "learning_rate": 8.42809364548495e-06, "loss": 1.3155, "step": 47 }, { "epoch": 0.1605351170568562, "grad_norm": 0.1904296875, "learning_rate": 8.394648829431439e-06, "loss": 1.2229, "step": 48 }, { "epoch": 0.16387959866220736, "grad_norm": 0.16796875, "learning_rate": 8.361204013377926e-06, "loss": 1.2475, "step": 49 }, { "epoch": 0.16722408026755853, "grad_norm": 0.171875, "learning_rate": 8.327759197324416e-06, "loss": 1.2195, "step": 50 }, { "epoch": 0.1705685618729097, "grad_norm": 0.16796875, "learning_rate": 8.294314381270903e-06, "loss": 1.2595, "step": 51 }, { "epoch": 0.17391304347826086, "grad_norm": 0.2041015625, "learning_rate": 8.260869565217392e-06, "loss": 1.3781, "step": 52 }, { "epoch": 0.17725752508361203, "grad_norm": 0.1689453125, "learning_rate": 8.22742474916388e-06, "loss": 1.2302, "step": 53 }, { "epoch": 0.1806020066889632, "grad_norm": 0.1865234375, "learning_rate": 8.193979933110369e-06, "loss": 1.2334, "step": 54 }, { "epoch": 0.18394648829431437, "grad_norm": 0.173828125, "learning_rate": 8.160535117056857e-06, "loss": 1.2597, "step": 55 }, { "epoch": 0.18729096989966554, "grad_norm": 0.2080078125, "learning_rate": 8.127090301003346e-06, "loss": 1.2875, "step": 56 }, { "epoch": 0.19063545150501673, "grad_norm": 0.1826171875, "learning_rate": 8.093645484949833e-06, "loss": 1.1507, "step": 57 }, { "epoch": 0.1939799331103679, "grad_norm": 0.1728515625, "learning_rate": 8.060200668896322e-06, "loss": 1.1626, "step": 58 }, { "epoch": 0.19732441471571907, "grad_norm": 0.1806640625, "learning_rate": 8.02675585284281e-06, "loss": 1.1655, "step": 59 }, { "epoch": 0.20066889632107024, "grad_norm": 0.181640625, "learning_rate": 7.9933110367893e-06, "loss": 1.165, "step": 60 }, { "epoch": 0.2040133779264214, "grad_norm": 0.267578125, "learning_rate": 7.959866220735787e-06, "loss": 1.2272, "step": 61 }, { "epoch": 0.20735785953177258, "grad_norm": 0.154296875, "learning_rate": 7.926421404682276e-06, "loss": 1.1434, "step": 62 }, { "epoch": 0.21070234113712374, "grad_norm": 0.1650390625, "learning_rate": 7.892976588628763e-06, "loss": 1.2262, "step": 63 }, { "epoch": 0.2140468227424749, "grad_norm": 0.19140625, "learning_rate": 7.859531772575253e-06, "loss": 1.267, "step": 64 }, { "epoch": 0.21739130434782608, "grad_norm": 0.166015625, "learning_rate": 7.82608695652174e-06, "loss": 1.1357, "step": 65 }, { "epoch": 0.22073578595317725, "grad_norm": 0.18359375, "learning_rate": 7.792642140468228e-06, "loss": 1.2462, "step": 66 }, { "epoch": 0.22408026755852842, "grad_norm": 0.23046875, "learning_rate": 7.759197324414717e-06, "loss": 1.1735, "step": 67 }, { "epoch": 0.22742474916387959, "grad_norm": 0.203125, "learning_rate": 7.725752508361204e-06, "loss": 1.2543, "step": 68 }, { "epoch": 0.23076923076923078, "grad_norm": 0.1962890625, "learning_rate": 7.692307692307694e-06, "loss": 1.1491, "step": 69 }, { "epoch": 0.23411371237458195, "grad_norm": 0.162109375, "learning_rate": 7.658862876254181e-06, "loss": 1.1287, "step": 70 }, { "epoch": 0.23745819397993312, "grad_norm": 0.1689453125, "learning_rate": 7.62541806020067e-06, "loss": 1.1978, "step": 71 }, { "epoch": 0.2408026755852843, "grad_norm": 0.1884765625, "learning_rate": 7.591973244147159e-06, "loss": 1.2821, "step": 72 }, { "epoch": 0.24414715719063546, "grad_norm": 0.1591796875, "learning_rate": 7.558528428093647e-06, "loss": 1.151, "step": 73 }, { "epoch": 0.24749163879598662, "grad_norm": 0.1650390625, "learning_rate": 7.5250836120401346e-06, "loss": 1.1282, "step": 74 }, { "epoch": 0.2508361204013378, "grad_norm": 0.162109375, "learning_rate": 7.491638795986622e-06, "loss": 1.1938, "step": 75 }, { "epoch": 0.25418060200668896, "grad_norm": 0.15234375, "learning_rate": 7.4581939799331104e-06, "loss": 1.1531, "step": 76 }, { "epoch": 0.25752508361204013, "grad_norm": 0.1806640625, "learning_rate": 7.424749163879599e-06, "loss": 1.1669, "step": 77 }, { "epoch": 0.2608695652173913, "grad_norm": 0.166015625, "learning_rate": 7.391304347826087e-06, "loss": 1.2088, "step": 78 }, { "epoch": 0.26421404682274247, "grad_norm": 0.1572265625, "learning_rate": 7.3578595317725755e-06, "loss": 1.1693, "step": 79 }, { "epoch": 0.26755852842809363, "grad_norm": 0.1865234375, "learning_rate": 7.324414715719064e-06, "loss": 1.2381, "step": 80 }, { "epoch": 0.2709030100334448, "grad_norm": 0.1552734375, "learning_rate": 7.290969899665552e-06, "loss": 1.1103, "step": 81 }, { "epoch": 0.27424749163879597, "grad_norm": 0.1552734375, "learning_rate": 7.257525083612041e-06, "loss": 1.163, "step": 82 }, { "epoch": 0.27759197324414714, "grad_norm": 0.25390625, "learning_rate": 7.224080267558529e-06, "loss": 1.1752, "step": 83 }, { "epoch": 0.2809364548494983, "grad_norm": 0.158203125, "learning_rate": 7.190635451505017e-06, "loss": 1.1534, "step": 84 }, { "epoch": 0.2842809364548495, "grad_norm": 0.25, "learning_rate": 7.157190635451506e-06, "loss": 1.1847, "step": 85 }, { "epoch": 0.28762541806020064, "grad_norm": 0.1650390625, "learning_rate": 7.123745819397993e-06, "loss": 1.0745, "step": 86 }, { "epoch": 0.2909698996655518, "grad_norm": 0.1689453125, "learning_rate": 7.0903010033444816e-06, "loss": 1.1894, "step": 87 }, { "epoch": 0.29431438127090304, "grad_norm": 0.1669921875, "learning_rate": 7.05685618729097e-06, "loss": 1.2001, "step": 88 }, { "epoch": 0.2976588628762542, "grad_norm": 0.1669921875, "learning_rate": 7.023411371237458e-06, "loss": 1.2034, "step": 89 }, { "epoch": 0.3010033444816054, "grad_norm": 0.1962890625, "learning_rate": 6.989966555183947e-06, "loss": 1.0924, "step": 90 }, { "epoch": 0.30434782608695654, "grad_norm": 0.1640625, "learning_rate": 6.956521739130435e-06, "loss": 1.1236, "step": 91 }, { "epoch": 0.3076923076923077, "grad_norm": 0.1572265625, "learning_rate": 6.923076923076923e-06, "loss": 1.1549, "step": 92 }, { "epoch": 0.3110367892976589, "grad_norm": 0.158203125, "learning_rate": 6.889632107023412e-06, "loss": 1.1318, "step": 93 }, { "epoch": 0.31438127090301005, "grad_norm": 0.1513671875, "learning_rate": 6.8561872909699e-06, "loss": 1.162, "step": 94 }, { "epoch": 0.3177257525083612, "grad_norm": 0.482421875, "learning_rate": 6.8227424749163885e-06, "loss": 1.0802, "step": 95 }, { "epoch": 0.3210702341137124, "grad_norm": 0.181640625, "learning_rate": 6.789297658862877e-06, "loss": 1.1413, "step": 96 }, { "epoch": 0.32441471571906355, "grad_norm": 0.154296875, "learning_rate": 6.755852842809365e-06, "loss": 1.0861, "step": 97 }, { "epoch": 0.3277591973244147, "grad_norm": 0.1767578125, "learning_rate": 6.7224080267558536e-06, "loss": 1.1744, "step": 98 }, { "epoch": 0.3311036789297659, "grad_norm": 0.173828125, "learning_rate": 6.688963210702342e-06, "loss": 1.1311, "step": 99 }, { "epoch": 0.33444816053511706, "grad_norm": 0.1650390625, "learning_rate": 6.65551839464883e-06, "loss": 1.1272, "step": 100 }, { "epoch": 0.3377926421404682, "grad_norm": 0.1708984375, "learning_rate": 6.622073578595319e-06, "loss": 1.0945, "step": 101 }, { "epoch": 0.3411371237458194, "grad_norm": 0.17578125, "learning_rate": 6.588628762541807e-06, "loss": 1.1065, "step": 102 }, { "epoch": 0.34448160535117056, "grad_norm": 0.486328125, "learning_rate": 6.5551839464882945e-06, "loss": 1.0844, "step": 103 }, { "epoch": 0.34782608695652173, "grad_norm": 0.1884765625, "learning_rate": 6.521739130434783e-06, "loss": 1.0762, "step": 104 }, { "epoch": 0.3511705685618729, "grad_norm": 0.19921875, "learning_rate": 6.488294314381271e-06, "loss": 1.124, "step": 105 }, { "epoch": 0.35451505016722407, "grad_norm": 0.27734375, "learning_rate": 6.45484949832776e-06, "loss": 1.1916, "step": 106 }, { "epoch": 0.35785953177257523, "grad_norm": 0.216796875, "learning_rate": 6.421404682274248e-06, "loss": 1.0871, "step": 107 }, { "epoch": 0.3612040133779264, "grad_norm": 0.162109375, "learning_rate": 6.387959866220736e-06, "loss": 1.1086, "step": 108 }, { "epoch": 0.36454849498327757, "grad_norm": 0.177734375, "learning_rate": 6.354515050167225e-06, "loss": 1.1142, "step": 109 }, { "epoch": 0.36789297658862874, "grad_norm": 0.166015625, "learning_rate": 6.321070234113713e-06, "loss": 1.0967, "step": 110 }, { "epoch": 0.3712374581939799, "grad_norm": 0.162109375, "learning_rate": 6.287625418060201e-06, "loss": 1.1557, "step": 111 }, { "epoch": 0.3745819397993311, "grad_norm": 0.16015625, "learning_rate": 6.25418060200669e-06, "loss": 1.1473, "step": 112 }, { "epoch": 0.3779264214046823, "grad_norm": 0.1640625, "learning_rate": 6.220735785953178e-06, "loss": 1.0826, "step": 113 }, { "epoch": 0.38127090301003347, "grad_norm": 0.1708984375, "learning_rate": 6.1872909698996665e-06, "loss": 1.1239, "step": 114 }, { "epoch": 0.38461538461538464, "grad_norm": 0.166015625, "learning_rate": 6.153846153846155e-06, "loss": 1.1016, "step": 115 }, { "epoch": 0.3879598662207358, "grad_norm": 0.1513671875, "learning_rate": 6.120401337792643e-06, "loss": 1.0871, "step": 116 }, { "epoch": 0.391304347826087, "grad_norm": 0.2275390625, "learning_rate": 6.086956521739132e-06, "loss": 1.1044, "step": 117 }, { "epoch": 0.39464882943143814, "grad_norm": 0.1591796875, "learning_rate": 6.05351170568562e-06, "loss": 1.0628, "step": 118 }, { "epoch": 0.3979933110367893, "grad_norm": 0.1611328125, "learning_rate": 6.020066889632108e-06, "loss": 1.0777, "step": 119 }, { "epoch": 0.4013377926421405, "grad_norm": 0.2109375, "learning_rate": 5.986622073578597e-06, "loss": 1.1311, "step": 120 }, { "epoch": 0.40468227424749165, "grad_norm": 0.162109375, "learning_rate": 5.953177257525084e-06, "loss": 1.032, "step": 121 }, { "epoch": 0.4080267558528428, "grad_norm": 0.154296875, "learning_rate": 5.9197324414715726e-06, "loss": 1.0533, "step": 122 }, { "epoch": 0.411371237458194, "grad_norm": 0.212890625, "learning_rate": 5.886287625418061e-06, "loss": 1.0541, "step": 123 }, { "epoch": 0.41471571906354515, "grad_norm": 0.1796875, "learning_rate": 5.852842809364549e-06, "loss": 1.1329, "step": 124 }, { "epoch": 0.4180602006688963, "grad_norm": 0.1611328125, "learning_rate": 5.819397993311037e-06, "loss": 1.0873, "step": 125 }, { "epoch": 0.4214046822742475, "grad_norm": 0.234375, "learning_rate": 5.785953177257525e-06, "loss": 1.0586, "step": 126 }, { "epoch": 0.42474916387959866, "grad_norm": 0.2734375, "learning_rate": 5.7525083612040135e-06, "loss": 1.0843, "step": 127 }, { "epoch": 0.4280936454849498, "grad_norm": 0.189453125, "learning_rate": 5.719063545150502e-06, "loss": 1.1847, "step": 128 }, { "epoch": 0.431438127090301, "grad_norm": 0.1552734375, "learning_rate": 5.68561872909699e-06, "loss": 1.0489, "step": 129 }, { "epoch": 0.43478260869565216, "grad_norm": 0.169921875, "learning_rate": 5.652173913043479e-06, "loss": 0.9888, "step": 130 }, { "epoch": 0.43812709030100333, "grad_norm": 0.1552734375, "learning_rate": 5.618729096989967e-06, "loss": 1.0906, "step": 131 }, { "epoch": 0.4414715719063545, "grad_norm": 0.17578125, "learning_rate": 5.585284280936455e-06, "loss": 1.0365, "step": 132 }, { "epoch": 0.44481605351170567, "grad_norm": 0.2216796875, "learning_rate": 5.551839464882943e-06, "loss": 1.0511, "step": 133 }, { "epoch": 0.44816053511705684, "grad_norm": 0.15234375, "learning_rate": 5.518394648829431e-06, "loss": 1.0228, "step": 134 }, { "epoch": 0.451505016722408, "grad_norm": 0.1728515625, "learning_rate": 5.48494983277592e-06, "loss": 1.0415, "step": 135 }, { "epoch": 0.45484949832775917, "grad_norm": 0.166015625, "learning_rate": 5.451505016722408e-06, "loss": 1.1016, "step": 136 }, { "epoch": 0.45819397993311034, "grad_norm": 0.1962890625, "learning_rate": 5.418060200668896e-06, "loss": 1.15, "step": 137 }, { "epoch": 0.46153846153846156, "grad_norm": 0.23046875, "learning_rate": 5.384615384615385e-06, "loss": 1.0811, "step": 138 }, { "epoch": 0.46488294314381273, "grad_norm": 0.251953125, "learning_rate": 5.351170568561873e-06, "loss": 1.0708, "step": 139 }, { "epoch": 0.4682274247491639, "grad_norm": 0.1494140625, "learning_rate": 5.317725752508361e-06, "loss": 1.0356, "step": 140 }, { "epoch": 0.47157190635451507, "grad_norm": 0.2734375, "learning_rate": 5.28428093645485e-06, "loss": 1.072, "step": 141 }, { "epoch": 0.47491638795986624, "grad_norm": 0.19140625, "learning_rate": 5.250836120401338e-06, "loss": 1.0946, "step": 142 }, { "epoch": 0.4782608695652174, "grad_norm": 0.15625, "learning_rate": 5.2173913043478265e-06, "loss": 1.0906, "step": 143 }, { "epoch": 0.4816053511705686, "grad_norm": 0.15234375, "learning_rate": 5.183946488294315e-06, "loss": 1.0418, "step": 144 }, { "epoch": 0.48494983277591974, "grad_norm": 0.251953125, "learning_rate": 5.150501672240803e-06, "loss": 1.1278, "step": 145 }, { "epoch": 0.4882943143812709, "grad_norm": 0.1533203125, "learning_rate": 5.1170568561872916e-06, "loss": 1.0354, "step": 146 }, { "epoch": 0.4916387959866221, "grad_norm": 0.15625, "learning_rate": 5.08361204013378e-06, "loss": 1.0924, "step": 147 }, { "epoch": 0.49498327759197325, "grad_norm": 0.193359375, "learning_rate": 5.050167224080268e-06, "loss": 1.0608, "step": 148 }, { "epoch": 0.4983277591973244, "grad_norm": 0.19921875, "learning_rate": 5.016722408026757e-06, "loss": 1.0304, "step": 149 }, { "epoch": 0.5016722408026756, "grad_norm": 0.1630859375, "learning_rate": 4.983277591973244e-06, "loss": 1.0018, "step": 150 }, { "epoch": 0.5050167224080268, "grad_norm": 0.2099609375, "learning_rate": 4.9498327759197325e-06, "loss": 1.1473, "step": 151 }, { "epoch": 0.5083612040133779, "grad_norm": 0.1611328125, "learning_rate": 4.916387959866221e-06, "loss": 1.0457, "step": 152 }, { "epoch": 0.5117056856187291, "grad_norm": 0.216796875, "learning_rate": 4.882943143812709e-06, "loss": 1.0113, "step": 153 }, { "epoch": 0.5150501672240803, "grad_norm": 0.1640625, "learning_rate": 4.849498327759198e-06, "loss": 1.0603, "step": 154 }, { "epoch": 0.5183946488294314, "grad_norm": 0.2138671875, "learning_rate": 4.816053511705686e-06, "loss": 1.1318, "step": 155 }, { "epoch": 0.5217391304347826, "grad_norm": 0.1904296875, "learning_rate": 4.782608695652174e-06, "loss": 1.0252, "step": 156 }, { "epoch": 0.5250836120401338, "grad_norm": 0.166015625, "learning_rate": 4.749163879598663e-06, "loss": 1.0458, "step": 157 }, { "epoch": 0.5284280936454849, "grad_norm": 0.17578125, "learning_rate": 4.715719063545151e-06, "loss": 1.0657, "step": 158 }, { "epoch": 0.5317725752508361, "grad_norm": 0.26171875, "learning_rate": 4.6822742474916394e-06, "loss": 1.0542, "step": 159 }, { "epoch": 0.5351170568561873, "grad_norm": 0.1611328125, "learning_rate": 4.648829431438128e-06, "loss": 1.0848, "step": 160 }, { "epoch": 0.5384615384615384, "grad_norm": 0.177734375, "learning_rate": 4.615384615384616e-06, "loss": 1.0916, "step": 161 }, { "epoch": 0.5418060200668896, "grad_norm": 0.1591796875, "learning_rate": 4.581939799331104e-06, "loss": 1.0833, "step": 162 }, { "epoch": 0.5451505016722408, "grad_norm": 0.1669921875, "learning_rate": 4.548494983277592e-06, "loss": 1.032, "step": 163 }, { "epoch": 0.5484949832775919, "grad_norm": 0.2001953125, "learning_rate": 4.51505016722408e-06, "loss": 1.1677, "step": 164 }, { "epoch": 0.5518394648829431, "grad_norm": 0.1708984375, "learning_rate": 4.481605351170569e-06, "loss": 1.0482, "step": 165 }, { "epoch": 0.5551839464882943, "grad_norm": 0.1767578125, "learning_rate": 4.448160535117057e-06, "loss": 1.0428, "step": 166 }, { "epoch": 0.5585284280936454, "grad_norm": 0.1708984375, "learning_rate": 4.4147157190635455e-06, "loss": 1.0474, "step": 167 }, { "epoch": 0.5618729096989966, "grad_norm": 0.220703125, "learning_rate": 4.381270903010034e-06, "loss": 1.0976, "step": 168 }, { "epoch": 0.5652173913043478, "grad_norm": 0.2451171875, "learning_rate": 4.347826086956522e-06, "loss": 1.0057, "step": 169 }, { "epoch": 0.568561872909699, "grad_norm": 0.1689453125, "learning_rate": 4.3143812709030106e-06, "loss": 1.0783, "step": 170 }, { "epoch": 0.5719063545150501, "grad_norm": 0.255859375, "learning_rate": 4.280936454849499e-06, "loss": 1.0647, "step": 171 }, { "epoch": 0.5752508361204013, "grad_norm": 0.171875, "learning_rate": 4.247491638795987e-06, "loss": 1.0936, "step": 172 }, { "epoch": 0.5785953177257525, "grad_norm": 0.1845703125, "learning_rate": 4.214046822742475e-06, "loss": 1.0089, "step": 173 }, { "epoch": 0.5819397993311036, "grad_norm": 0.18359375, "learning_rate": 4.180602006688963e-06, "loss": 1.0269, "step": 174 }, { "epoch": 0.5852842809364549, "grad_norm": 0.1669921875, "learning_rate": 4.1471571906354515e-06, "loss": 1.0732, "step": 175 }, { "epoch": 0.5886287625418061, "grad_norm": 0.28515625, "learning_rate": 4.11371237458194e-06, "loss": 1.1193, "step": 176 }, { "epoch": 0.5919732441471572, "grad_norm": 0.259765625, "learning_rate": 4.080267558528428e-06, "loss": 1.0064, "step": 177 }, { "epoch": 0.5953177257525084, "grad_norm": 0.1708984375, "learning_rate": 4.046822742474917e-06, "loss": 1.1175, "step": 178 }, { "epoch": 0.5986622073578596, "grad_norm": 0.1591796875, "learning_rate": 4.013377926421405e-06, "loss": 1.0077, "step": 179 }, { "epoch": 0.6020066889632107, "grad_norm": 0.1787109375, "learning_rate": 3.979933110367893e-06, "loss": 1.1174, "step": 180 }, { "epoch": 0.6053511705685619, "grad_norm": 0.1708984375, "learning_rate": 3.946488294314382e-06, "loss": 1.0294, "step": 181 }, { "epoch": 0.6086956521739131, "grad_norm": 0.2421875, "learning_rate": 3.91304347826087e-06, "loss": 1.0338, "step": 182 }, { "epoch": 0.6120401337792643, "grad_norm": 0.228515625, "learning_rate": 3.8795986622073584e-06, "loss": 1.0204, "step": 183 }, { "epoch": 0.6153846153846154, "grad_norm": 0.1787109375, "learning_rate": 3.846153846153847e-06, "loss": 1.0674, "step": 184 }, { "epoch": 0.6187290969899666, "grad_norm": 0.1796875, "learning_rate": 3.812709030100335e-06, "loss": 1.0264, "step": 185 }, { "epoch": 0.6220735785953178, "grad_norm": 0.15625, "learning_rate": 3.7792642140468235e-06, "loss": 0.9849, "step": 186 }, { "epoch": 0.6254180602006689, "grad_norm": 0.16015625, "learning_rate": 3.745819397993311e-06, "loss": 1.0131, "step": 187 }, { "epoch": 0.6287625418060201, "grad_norm": 0.1728515625, "learning_rate": 3.7123745819397994e-06, "loss": 1.1024, "step": 188 }, { "epoch": 0.6321070234113713, "grad_norm": 0.2412109375, "learning_rate": 3.6789297658862878e-06, "loss": 1.065, "step": 189 }, { "epoch": 0.6354515050167224, "grad_norm": 0.2060546875, "learning_rate": 3.645484949832776e-06, "loss": 0.9887, "step": 190 }, { "epoch": 0.6387959866220736, "grad_norm": 0.181640625, "learning_rate": 3.6120401337792645e-06, "loss": 1.0407, "step": 191 }, { "epoch": 0.6421404682274248, "grad_norm": 0.1640625, "learning_rate": 3.578595317725753e-06, "loss": 1.1122, "step": 192 }, { "epoch": 0.6454849498327759, "grad_norm": 0.177734375, "learning_rate": 3.5451505016722408e-06, "loss": 1.0309, "step": 193 }, { "epoch": 0.6488294314381271, "grad_norm": 0.2236328125, "learning_rate": 3.511705685618729e-06, "loss": 1.0361, "step": 194 }, { "epoch": 0.6521739130434783, "grad_norm": 0.1630859375, "learning_rate": 3.4782608695652175e-06, "loss": 1.0263, "step": 195 }, { "epoch": 0.6555183946488294, "grad_norm": 0.1650390625, "learning_rate": 3.444816053511706e-06, "loss": 1.0864, "step": 196 }, { "epoch": 0.6588628762541806, "grad_norm": 0.197265625, "learning_rate": 3.4113712374581942e-06, "loss": 1.0333, "step": 197 }, { "epoch": 0.6622073578595318, "grad_norm": 0.181640625, "learning_rate": 3.3779264214046826e-06, "loss": 1.0323, "step": 198 }, { "epoch": 0.6655518394648829, "grad_norm": 0.171875, "learning_rate": 3.344481605351171e-06, "loss": 1.0545, "step": 199 }, { "epoch": 0.6688963210702341, "grad_norm": 0.224609375, "learning_rate": 3.3110367892976593e-06, "loss": 1.04, "step": 200 }, { "epoch": 0.6722408026755853, "grad_norm": 0.234375, "learning_rate": 3.2775919732441473e-06, "loss": 1.0387, "step": 201 }, { "epoch": 0.6755852842809364, "grad_norm": 0.21875, "learning_rate": 3.2441471571906356e-06, "loss": 1.0503, "step": 202 }, { "epoch": 0.6789297658862876, "grad_norm": 0.1787109375, "learning_rate": 3.210702341137124e-06, "loss": 1.0059, "step": 203 }, { "epoch": 0.6822742474916388, "grad_norm": 0.185546875, "learning_rate": 3.1772575250836123e-06, "loss": 1.0414, "step": 204 }, { "epoch": 0.68561872909699, "grad_norm": 0.177734375, "learning_rate": 3.1438127090301007e-06, "loss": 0.9804, "step": 205 }, { "epoch": 0.6889632107023411, "grad_norm": 0.1884765625, "learning_rate": 3.110367892976589e-06, "loss": 1.0297, "step": 206 }, { "epoch": 0.6923076923076923, "grad_norm": 0.220703125, "learning_rate": 3.0769230769230774e-06, "loss": 0.9484, "step": 207 }, { "epoch": 0.6956521739130435, "grad_norm": 0.240234375, "learning_rate": 3.043478260869566e-06, "loss": 1.0311, "step": 208 }, { "epoch": 0.6989966555183946, "grad_norm": 0.1640625, "learning_rate": 3.010033444816054e-06, "loss": 1.0158, "step": 209 }, { "epoch": 0.7023411371237458, "grad_norm": 0.330078125, "learning_rate": 2.976588628762542e-06, "loss": 1.0655, "step": 210 }, { "epoch": 0.705685618729097, "grad_norm": 0.18359375, "learning_rate": 2.9431438127090305e-06, "loss": 1.0675, "step": 211 }, { "epoch": 0.7090301003344481, "grad_norm": 0.265625, "learning_rate": 2.9096989966555184e-06, "loss": 1.016, "step": 212 }, { "epoch": 0.7123745819397993, "grad_norm": 0.1767578125, "learning_rate": 2.8762541806020068e-06, "loss": 1.0829, "step": 213 }, { "epoch": 0.7157190635451505, "grad_norm": 0.25, "learning_rate": 2.842809364548495e-06, "loss": 1.0976, "step": 214 }, { "epoch": 0.7190635451505016, "grad_norm": 0.1767578125, "learning_rate": 2.8093645484949835e-06, "loss": 0.9929, "step": 215 }, { "epoch": 0.7224080267558528, "grad_norm": 0.1640625, "learning_rate": 2.7759197324414714e-06, "loss": 1.0156, "step": 216 }, { "epoch": 0.725752508361204, "grad_norm": 0.1572265625, "learning_rate": 2.74247491638796e-06, "loss": 1.0602, "step": 217 }, { "epoch": 0.7290969899665551, "grad_norm": 0.1640625, "learning_rate": 2.709030100334448e-06, "loss": 1.0358, "step": 218 }, { "epoch": 0.7324414715719063, "grad_norm": 0.2470703125, "learning_rate": 2.6755852842809365e-06, "loss": 0.9859, "step": 219 }, { "epoch": 0.7357859531772575, "grad_norm": 0.1845703125, "learning_rate": 2.642140468227425e-06, "loss": 1.0011, "step": 220 }, { "epoch": 0.7391304347826086, "grad_norm": 0.1845703125, "learning_rate": 2.6086956521739132e-06, "loss": 1.0146, "step": 221 }, { "epoch": 0.7424749163879598, "grad_norm": 0.2158203125, "learning_rate": 2.5752508361204016e-06, "loss": 1.0372, "step": 222 }, { "epoch": 0.745819397993311, "grad_norm": 0.158203125, "learning_rate": 2.54180602006689e-06, "loss": 1.0236, "step": 223 }, { "epoch": 0.7491638795986622, "grad_norm": 0.16015625, "learning_rate": 2.5083612040133783e-06, "loss": 1.0288, "step": 224 }, { "epoch": 0.7525083612040134, "grad_norm": 0.162109375, "learning_rate": 2.4749163879598663e-06, "loss": 1.0349, "step": 225 }, { "epoch": 0.7558528428093646, "grad_norm": 0.1630859375, "learning_rate": 2.4414715719063546e-06, "loss": 1.013, "step": 226 }, { "epoch": 0.7591973244147158, "grad_norm": 0.189453125, "learning_rate": 2.408026755852843e-06, "loss": 0.9946, "step": 227 }, { "epoch": 0.7625418060200669, "grad_norm": 0.1845703125, "learning_rate": 2.3745819397993314e-06, "loss": 0.965, "step": 228 }, { "epoch": 0.7658862876254181, "grad_norm": 0.2080078125, "learning_rate": 2.3411371237458197e-06, "loss": 1.0677, "step": 229 }, { "epoch": 0.7692307692307693, "grad_norm": 0.15625, "learning_rate": 2.307692307692308e-06, "loss": 1.034, "step": 230 }, { "epoch": 0.7725752508361204, "grad_norm": 0.1923828125, "learning_rate": 2.274247491638796e-06, "loss": 0.9735, "step": 231 }, { "epoch": 0.7759197324414716, "grad_norm": 0.203125, "learning_rate": 2.2408026755852844e-06, "loss": 1.0102, "step": 232 }, { "epoch": 0.7792642140468228, "grad_norm": 0.1875, "learning_rate": 2.2073578595317727e-06, "loss": 1.0126, "step": 233 }, { "epoch": 0.782608695652174, "grad_norm": 0.162109375, "learning_rate": 2.173913043478261e-06, "loss": 1.0255, "step": 234 }, { "epoch": 0.7859531772575251, "grad_norm": 0.2109375, "learning_rate": 2.1404682274247495e-06, "loss": 1.0144, "step": 235 }, { "epoch": 0.7892976588628763, "grad_norm": 0.2099609375, "learning_rate": 2.1070234113712374e-06, "loss": 1.0297, "step": 236 }, { "epoch": 0.7926421404682275, "grad_norm": 0.30078125, "learning_rate": 2.0735785953177258e-06, "loss": 1.0951, "step": 237 }, { "epoch": 0.7959866220735786, "grad_norm": 0.25, "learning_rate": 2.040133779264214e-06, "loss": 0.9703, "step": 238 }, { "epoch": 0.7993311036789298, "grad_norm": 0.1669921875, "learning_rate": 2.0066889632107025e-06, "loss": 1.0409, "step": 239 }, { "epoch": 0.802675585284281, "grad_norm": 0.158203125, "learning_rate": 1.973244147157191e-06, "loss": 1.0338, "step": 240 }, { "epoch": 0.8060200668896321, "grad_norm": 0.314453125, "learning_rate": 1.9397993311036792e-06, "loss": 1.0852, "step": 241 }, { "epoch": 0.8093645484949833, "grad_norm": 0.1572265625, "learning_rate": 1.9063545150501676e-06, "loss": 0.9925, "step": 242 }, { "epoch": 0.8127090301003345, "grad_norm": 0.16015625, "learning_rate": 1.8729096989966555e-06, "loss": 0.9734, "step": 243 }, { "epoch": 0.8160535117056856, "grad_norm": 0.208984375, "learning_rate": 1.8394648829431439e-06, "loss": 1.0202, "step": 244 }, { "epoch": 0.8193979933110368, "grad_norm": 0.1923828125, "learning_rate": 1.8060200668896322e-06, "loss": 1.0136, "step": 245 }, { "epoch": 0.822742474916388, "grad_norm": 0.27734375, "learning_rate": 1.7725752508361204e-06, "loss": 0.9887, "step": 246 }, { "epoch": 0.8260869565217391, "grad_norm": 0.18359375, "learning_rate": 1.7391304347826088e-06, "loss": 1.0297, "step": 247 }, { "epoch": 0.8294314381270903, "grad_norm": 0.212890625, "learning_rate": 1.7056856187290971e-06, "loss": 1.0243, "step": 248 }, { "epoch": 0.8327759197324415, "grad_norm": 0.2099609375, "learning_rate": 1.6722408026755855e-06, "loss": 1.172, "step": 249 }, { "epoch": 0.8361204013377926, "grad_norm": 0.1669921875, "learning_rate": 1.6387959866220736e-06, "loss": 1.011, "step": 250 }, { "epoch": 0.8394648829431438, "grad_norm": 0.177734375, "learning_rate": 1.605351170568562e-06, "loss": 1.0305, "step": 251 }, { "epoch": 0.842809364548495, "grad_norm": 0.22265625, "learning_rate": 1.5719063545150504e-06, "loss": 1.0236, "step": 252 }, { "epoch": 0.8461538461538461, "grad_norm": 0.2001953125, "learning_rate": 1.5384615384615387e-06, "loss": 1.0147, "step": 253 }, { "epoch": 0.8494983277591973, "grad_norm": 0.1611328125, "learning_rate": 1.505016722408027e-06, "loss": 0.9457, "step": 254 }, { "epoch": 0.8528428093645485, "grad_norm": 0.173828125, "learning_rate": 1.4715719063545152e-06, "loss": 1.0117, "step": 255 }, { "epoch": 0.8561872909698997, "grad_norm": 0.15625, "learning_rate": 1.4381270903010034e-06, "loss": 1.0197, "step": 256 }, { "epoch": 0.8595317725752508, "grad_norm": 0.2734375, "learning_rate": 1.4046822742474917e-06, "loss": 0.9876, "step": 257 }, { "epoch": 0.862876254180602, "grad_norm": 0.20703125, "learning_rate": 1.37123745819398e-06, "loss": 1.0106, "step": 258 }, { "epoch": 0.8662207357859532, "grad_norm": 0.1865234375, "learning_rate": 1.3377926421404683e-06, "loss": 1.05, "step": 259 }, { "epoch": 0.8695652173913043, "grad_norm": 0.1650390625, "learning_rate": 1.3043478260869566e-06, "loss": 1.0156, "step": 260 }, { "epoch": 0.8729096989966555, "grad_norm": 0.16015625, "learning_rate": 1.270903010033445e-06, "loss": 0.9463, "step": 261 }, { "epoch": 0.8762541806020067, "grad_norm": 0.197265625, "learning_rate": 1.2374581939799331e-06, "loss": 1.0878, "step": 262 }, { "epoch": 0.8795986622073578, "grad_norm": 0.1904296875, "learning_rate": 1.2040133779264215e-06, "loss": 1.0257, "step": 263 }, { "epoch": 0.882943143812709, "grad_norm": 0.173828125, "learning_rate": 1.1705685618729099e-06, "loss": 1.0443, "step": 264 }, { "epoch": 0.8862876254180602, "grad_norm": 0.1611328125, "learning_rate": 1.137123745819398e-06, "loss": 1.0151, "step": 265 }, { "epoch": 0.8896321070234113, "grad_norm": 0.1640625, "learning_rate": 1.1036789297658864e-06, "loss": 0.9882, "step": 266 }, { "epoch": 0.8929765886287625, "grad_norm": 0.1904296875, "learning_rate": 1.0702341137123747e-06, "loss": 1.0506, "step": 267 }, { "epoch": 0.8963210702341137, "grad_norm": 0.2158203125, "learning_rate": 1.0367892976588629e-06, "loss": 1.0378, "step": 268 }, { "epoch": 0.8996655518394648, "grad_norm": 0.1767578125, "learning_rate": 1.0033444816053512e-06, "loss": 1.0566, "step": 269 }, { "epoch": 0.903010033444816, "grad_norm": 0.189453125, "learning_rate": 9.698996655518396e-07, "loss": 1.0836, "step": 270 }, { "epoch": 0.9063545150501672, "grad_norm": 0.1728515625, "learning_rate": 9.364548494983278e-07, "loss": 1.0268, "step": 271 }, { "epoch": 0.9096989966555183, "grad_norm": 0.16796875, "learning_rate": 9.030100334448161e-07, "loss": 0.9992, "step": 272 }, { "epoch": 0.9130434782608695, "grad_norm": 0.1796875, "learning_rate": 8.695652173913044e-07, "loss": 1.0239, "step": 273 }, { "epoch": 0.9163879598662207, "grad_norm": 0.1611328125, "learning_rate": 8.361204013377927e-07, "loss": 1.0171, "step": 274 }, { "epoch": 0.919732441471572, "grad_norm": 0.169921875, "learning_rate": 8.02675585284281e-07, "loss": 0.9939, "step": 275 }, { "epoch": 0.9230769230769231, "grad_norm": 0.17578125, "learning_rate": 7.692307692307694e-07, "loss": 1.0451, "step": 276 }, { "epoch": 0.9264214046822743, "grad_norm": 0.171875, "learning_rate": 7.357859531772576e-07, "loss": 1.0344, "step": 277 }, { "epoch": 0.9297658862876255, "grad_norm": 0.25390625, "learning_rate": 7.023411371237459e-07, "loss": 1.0243, "step": 278 }, { "epoch": 0.9331103678929766, "grad_norm": 0.291015625, "learning_rate": 6.688963210702341e-07, "loss": 1.1295, "step": 279 }, { "epoch": 0.9364548494983278, "grad_norm": 0.181640625, "learning_rate": 6.354515050167225e-07, "loss": 0.9785, "step": 280 }, { "epoch": 0.939799331103679, "grad_norm": 0.1845703125, "learning_rate": 6.020066889632107e-07, "loss": 0.9921, "step": 281 }, { "epoch": 0.9431438127090301, "grad_norm": 0.208984375, "learning_rate": 5.68561872909699e-07, "loss": 1.0714, "step": 282 }, { "epoch": 0.9464882943143813, "grad_norm": 0.1591796875, "learning_rate": 5.351170568561874e-07, "loss": 0.9891, "step": 283 }, { "epoch": 0.9498327759197325, "grad_norm": 0.166015625, "learning_rate": 5.016722408026756e-07, "loss": 1.02, "step": 284 }, { "epoch": 0.9531772575250836, "grad_norm": 0.16796875, "learning_rate": 4.682274247491639e-07, "loss": 1.0396, "step": 285 }, { "epoch": 0.9565217391304348, "grad_norm": 0.17578125, "learning_rate": 4.347826086956522e-07, "loss": 1.0039, "step": 286 }, { "epoch": 0.959866220735786, "grad_norm": 0.1943359375, "learning_rate": 4.013377926421405e-07, "loss": 0.9614, "step": 287 }, { "epoch": 0.9632107023411371, "grad_norm": 0.1650390625, "learning_rate": 3.678929765886288e-07, "loss": 0.9969, "step": 288 }, { "epoch": 0.9665551839464883, "grad_norm": 0.1826171875, "learning_rate": 3.3444816053511706e-07, "loss": 0.9786, "step": 289 }, { "epoch": 0.9698996655518395, "grad_norm": 0.169921875, "learning_rate": 3.010033444816054e-07, "loss": 1.02, "step": 290 }, { "epoch": 0.9732441471571907, "grad_norm": 0.1865234375, "learning_rate": 2.675585284280937e-07, "loss": 1.0448, "step": 291 }, { "epoch": 0.9765886287625418, "grad_norm": 0.2333984375, "learning_rate": 2.3411371237458194e-07, "loss": 0.9377, "step": 292 }, { "epoch": 0.979933110367893, "grad_norm": 0.255859375, "learning_rate": 2.0066889632107025e-07, "loss": 1.0096, "step": 293 }, { "epoch": 0.9832775919732442, "grad_norm": 0.162109375, "learning_rate": 1.6722408026755853e-07, "loss": 1.0489, "step": 294 }, { "epoch": 0.9866220735785953, "grad_norm": 0.2001953125, "learning_rate": 1.3377926421404684e-07, "loss": 1.0394, "step": 295 }, { "epoch": 0.9899665551839465, "grad_norm": 0.265625, "learning_rate": 1.0033444816053512e-07, "loss": 1.1321, "step": 296 }, { "epoch": 0.9933110367892977, "grad_norm": 0.173828125, "learning_rate": 6.688963210702342e-08, "loss": 1.0429, "step": 297 }, { "epoch": 0.9966555183946488, "grad_norm": 0.1845703125, "learning_rate": 3.344481605351171e-08, "loss": 1.0819, "step": 298 }, { "epoch": 1.0, "grad_norm": 0.185546875, "learning_rate": 0.0, "loss": 1.048, "step": 299 } ], "logging_steps": 1.0, "max_steps": 299, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.459430685052764e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }