{ "best_global_step": 200, "best_metric": 0.1393619030714035, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 6.99290780141844, "eval_steps": 100, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02364066193853428, "grad_norm": 2.0331270694732666, "learning_rate": 2.0000000000000003e-06, "loss": 3.1855, "step": 1 }, { "epoch": 0.02364066193853428, "eval_loss": 3.1986265182495117, "eval_runtime": 26.6879, "eval_samples_per_second": 147.969, "eval_steps_per_second": 1.162, "step": 1 }, { "epoch": 0.04728132387706856, "grad_norm": 1.9515141248703003, "learning_rate": 4.000000000000001e-06, "loss": 3.0592, "step": 2 }, { "epoch": 0.07092198581560284, "grad_norm": 1.994627833366394, "learning_rate": 6e-06, "loss": 3.1985, "step": 3 }, { "epoch": 0.09456264775413711, "grad_norm": 2.0075595378875732, "learning_rate": 8.000000000000001e-06, "loss": 3.1855, "step": 4 }, { "epoch": 0.1182033096926714, "grad_norm": 1.8345943689346313, "learning_rate": 1e-05, "loss": 3.0367, "step": 5 }, { "epoch": 0.14184397163120568, "grad_norm": 1.8345423936843872, "learning_rate": 1.2e-05, "loss": 2.9946, "step": 6 }, { "epoch": 0.16548463356973994, "grad_norm": 1.8188563585281372, "learning_rate": 1.4000000000000001e-05, "loss": 3.0418, "step": 7 }, { "epoch": 0.18912529550827423, "grad_norm": 1.7931904792785645, "learning_rate": 1.6000000000000003e-05, "loss": 2.9121, "step": 8 }, { "epoch": 0.2127659574468085, "grad_norm": 1.6637405157089233, "learning_rate": 1.8e-05, "loss": 2.8397, "step": 9 }, { "epoch": 0.2364066193853428, "grad_norm": 1.6762944459915161, "learning_rate": 2e-05, "loss": 2.866, "step": 10 }, { "epoch": 0.26004728132387706, "grad_norm": 1.564735770225525, "learning_rate": 2.2000000000000003e-05, "loss": 2.6867, "step": 11 }, { "epoch": 0.28368794326241137, "grad_norm": 1.5735734701156616, "learning_rate": 2.4e-05, "loss": 2.6368, "step": 12 }, { "epoch": 0.3073286052009456, "grad_norm": 1.4545141458511353, "learning_rate": 2.6000000000000002e-05, "loss": 2.607, "step": 13 }, { "epoch": 0.3309692671394799, "grad_norm": 1.2153321504592896, "learning_rate": 2.8000000000000003e-05, "loss": 2.4951, "step": 14 }, { "epoch": 0.3546099290780142, "grad_norm": 1.1503349542617798, "learning_rate": 3e-05, "loss": 2.3556, "step": 15 }, { "epoch": 0.37825059101654845, "grad_norm": 1.1345653533935547, "learning_rate": 3.2000000000000005e-05, "loss": 2.2471, "step": 16 }, { "epoch": 0.40189125295508277, "grad_norm": 1.1848257780075073, "learning_rate": 3.4000000000000007e-05, "loss": 2.1661, "step": 17 }, { "epoch": 0.425531914893617, "grad_norm": 1.1684925556182861, "learning_rate": 3.6e-05, "loss": 2.1487, "step": 18 }, { "epoch": 0.4491725768321513, "grad_norm": 1.161915898323059, "learning_rate": 3.8e-05, "loss": 2.0439, "step": 19 }, { "epoch": 0.4728132387706856, "grad_norm": 1.1294256448745728, "learning_rate": 4e-05, "loss": 1.9138, "step": 20 }, { "epoch": 0.49645390070921985, "grad_norm": 1.0700589418411255, "learning_rate": 4.2e-05, "loss": 1.7674, "step": 21 }, { "epoch": 0.5200945626477541, "grad_norm": 1.0576059818267822, "learning_rate": 4.4000000000000006e-05, "loss": 1.7025, "step": 22 }, { "epoch": 0.5437352245862884, "grad_norm": 0.9991436004638672, "learning_rate": 4.600000000000001e-05, "loss": 1.6008, "step": 23 }, { "epoch": 0.5673758865248227, "grad_norm": 1.2524056434631348, "learning_rate": 4.8e-05, "loss": 1.5145, "step": 24 }, { "epoch": 0.5910165484633569, "grad_norm": 1.0108604431152344, "learning_rate": 5e-05, "loss": 1.4335, "step": 25 }, { "epoch": 0.6146572104018913, "grad_norm": 1.0798797607421875, "learning_rate": 5.2000000000000004e-05, "loss": 1.3745, "step": 26 }, { "epoch": 0.6382978723404256, "grad_norm": 1.2048861980438232, "learning_rate": 5.4000000000000005e-05, "loss": 1.3614, "step": 27 }, { "epoch": 0.6619385342789598, "grad_norm": 1.1013531684875488, "learning_rate": 5.6000000000000006e-05, "loss": 1.233, "step": 28 }, { "epoch": 0.6855791962174941, "grad_norm": 0.8776018023490906, "learning_rate": 5.8e-05, "loss": 1.1453, "step": 29 }, { "epoch": 0.7092198581560284, "grad_norm": 0.7686753273010254, "learning_rate": 6e-05, "loss": 1.1305, "step": 30 }, { "epoch": 0.7328605200945626, "grad_norm": 0.6882123947143555, "learning_rate": 6.2e-05, "loss": 1.0684, "step": 31 }, { "epoch": 0.7565011820330969, "grad_norm": 0.7795872092247009, "learning_rate": 6.400000000000001e-05, "loss": 0.9884, "step": 32 }, { "epoch": 0.7801418439716312, "grad_norm": 0.9709330201148987, "learning_rate": 6.6e-05, "loss": 0.9794, "step": 33 }, { "epoch": 0.8037825059101655, "grad_norm": 0.9680677056312561, "learning_rate": 6.800000000000001e-05, "loss": 0.9371, "step": 34 }, { "epoch": 0.8274231678486997, "grad_norm": 0.7505691051483154, "learning_rate": 7e-05, "loss": 0.8933, "step": 35 }, { "epoch": 0.851063829787234, "grad_norm": 0.6132867932319641, "learning_rate": 7.2e-05, "loss": 0.834, "step": 36 }, { "epoch": 0.8747044917257684, "grad_norm": 0.5645897388458252, "learning_rate": 7.4e-05, "loss": 0.7905, "step": 37 }, { "epoch": 0.8983451536643026, "grad_norm": 0.5998417735099792, "learning_rate": 7.6e-05, "loss": 0.8084, "step": 38 }, { "epoch": 0.9219858156028369, "grad_norm": 0.6559155583381653, "learning_rate": 7.800000000000001e-05, "loss": 0.7144, "step": 39 }, { "epoch": 0.9456264775413712, "grad_norm": 0.5562840700149536, "learning_rate": 8e-05, "loss": 0.6862, "step": 40 }, { "epoch": 0.9692671394799054, "grad_norm": 0.4943620562553406, "learning_rate": 8.2e-05, "loss": 0.6627, "step": 41 }, { "epoch": 0.9929078014184397, "grad_norm": 0.4989017844200134, "learning_rate": 8.4e-05, "loss": 0.6604, "step": 42 }, { "epoch": 1.0, "grad_norm": 0.6533769965171814, "learning_rate": 8.6e-05, "loss": 0.675, "step": 43 }, { "epoch": 1.0236406619385343, "grad_norm": 0.5205526947975159, "learning_rate": 8.800000000000001e-05, "loss": 0.5789, "step": 44 }, { "epoch": 1.0472813238770686, "grad_norm": 0.5255026817321777, "learning_rate": 9e-05, "loss": 0.586, "step": 45 }, { "epoch": 1.070921985815603, "grad_norm": 0.4527199864387512, "learning_rate": 9.200000000000001e-05, "loss": 0.5268, "step": 46 }, { "epoch": 1.094562647754137, "grad_norm": 0.46885204315185547, "learning_rate": 9.4e-05, "loss": 0.5221, "step": 47 }, { "epoch": 1.1182033096926713, "grad_norm": 0.44935673475265503, "learning_rate": 9.6e-05, "loss": 0.4869, "step": 48 }, { "epoch": 1.1418439716312057, "grad_norm": 0.36143723130226135, "learning_rate": 9.8e-05, "loss": 0.4791, "step": 49 }, { "epoch": 1.16548463356974, "grad_norm": 0.4179462790489197, "learning_rate": 0.0001, "loss": 0.4307, "step": 50 }, { "epoch": 1.1891252955082743, "grad_norm": 0.4182082712650299, "learning_rate": 0.00010200000000000001, "loss": 0.447, "step": 51 }, { "epoch": 1.2127659574468086, "grad_norm": 0.37552374601364136, "learning_rate": 0.00010400000000000001, "loss": 0.4257, "step": 52 }, { "epoch": 1.2364066193853427, "grad_norm": 0.33822715282440186, "learning_rate": 0.00010600000000000002, "loss": 0.405, "step": 53 }, { "epoch": 1.260047281323877, "grad_norm": 0.3773220479488373, "learning_rate": 0.00010800000000000001, "loss": 0.4175, "step": 54 }, { "epoch": 1.2836879432624113, "grad_norm": 0.38153576850891113, "learning_rate": 0.00011000000000000002, "loss": 0.4161, "step": 55 }, { "epoch": 1.3073286052009456, "grad_norm": 0.3315293788909912, "learning_rate": 0.00011200000000000001, "loss": 0.369, "step": 56 }, { "epoch": 1.33096926713948, "grad_norm": 0.35625559091567993, "learning_rate": 0.00011399999999999999, "loss": 0.3661, "step": 57 }, { "epoch": 1.3546099290780143, "grad_norm": 0.30330926179885864, "learning_rate": 0.000116, "loss": 0.3424, "step": 58 }, { "epoch": 1.3782505910165486, "grad_norm": 0.3344309628009796, "learning_rate": 0.000118, "loss": 0.354, "step": 59 }, { "epoch": 1.4018912529550827, "grad_norm": 0.37152042984962463, "learning_rate": 0.00012, "loss": 0.3198, "step": 60 }, { "epoch": 1.425531914893617, "grad_norm": 0.31594333052635193, "learning_rate": 0.000122, "loss": 0.3431, "step": 61 }, { "epoch": 1.4491725768321513, "grad_norm": 0.3176936209201813, "learning_rate": 0.000124, "loss": 0.3296, "step": 62 }, { "epoch": 1.4728132387706856, "grad_norm": 0.31608521938323975, "learning_rate": 0.000126, "loss": 0.3249, "step": 63 }, { "epoch": 1.49645390070922, "grad_norm": 0.3292820453643799, "learning_rate": 0.00012800000000000002, "loss": 0.3228, "step": 64 }, { "epoch": 1.520094562647754, "grad_norm": 0.2671797573566437, "learning_rate": 0.00013000000000000002, "loss": 0.3128, "step": 65 }, { "epoch": 1.5437352245862885, "grad_norm": 0.31343743205070496, "learning_rate": 0.000132, "loss": 0.2949, "step": 66 }, { "epoch": 1.5673758865248226, "grad_norm": 0.26248085498809814, "learning_rate": 0.000134, "loss": 0.2644, "step": 67 }, { "epoch": 1.591016548463357, "grad_norm": 0.3162164092063904, "learning_rate": 0.00013600000000000003, "loss": 0.2714, "step": 68 }, { "epoch": 1.6146572104018913, "grad_norm": 0.36199912428855896, "learning_rate": 0.000138, "loss": 0.2729, "step": 69 }, { "epoch": 1.6382978723404256, "grad_norm": 0.2817542254924774, "learning_rate": 0.00014, "loss": 0.2744, "step": 70 }, { "epoch": 1.6619385342789599, "grad_norm": 0.2607613205909729, "learning_rate": 0.000142, "loss": 0.2339, "step": 71 }, { "epoch": 1.685579196217494, "grad_norm": 0.32725006341934204, "learning_rate": 0.000144, "loss": 0.2578, "step": 72 }, { "epoch": 1.7092198581560285, "grad_norm": 0.25260740518569946, "learning_rate": 0.000146, "loss": 0.2283, "step": 73 }, { "epoch": 1.7328605200945626, "grad_norm": 0.25773733854293823, "learning_rate": 0.000148, "loss": 0.2372, "step": 74 }, { "epoch": 1.756501182033097, "grad_norm": 0.28829723596572876, "learning_rate": 0.00015000000000000001, "loss": 0.2336, "step": 75 }, { "epoch": 1.7801418439716312, "grad_norm": 0.23049677908420563, "learning_rate": 0.000152, "loss": 0.217, "step": 76 }, { "epoch": 1.8037825059101655, "grad_norm": 0.3179403841495514, "learning_rate": 0.000154, "loss": 0.2455, "step": 77 }, { "epoch": 1.8274231678486998, "grad_norm": 0.2766501009464264, "learning_rate": 0.00015600000000000002, "loss": 0.2469, "step": 78 }, { "epoch": 1.851063829787234, "grad_norm": 0.2457883358001709, "learning_rate": 0.00015800000000000002, "loss": 0.2117, "step": 79 }, { "epoch": 1.8747044917257685, "grad_norm": 0.37009197473526, "learning_rate": 0.00016, "loss": 0.2203, "step": 80 }, { "epoch": 1.8983451536643026, "grad_norm": 0.241745725274086, "learning_rate": 0.000162, "loss": 0.212, "step": 81 }, { "epoch": 1.9219858156028369, "grad_norm": 0.31778717041015625, "learning_rate": 0.000164, "loss": 0.2096, "step": 82 }, { "epoch": 1.9456264775413712, "grad_norm": 0.27776938676834106, "learning_rate": 0.000166, "loss": 0.2119, "step": 83 }, { "epoch": 1.9692671394799053, "grad_norm": 0.24296718835830688, "learning_rate": 0.000168, "loss": 0.2183, "step": 84 }, { "epoch": 1.9929078014184398, "grad_norm": 0.29314127564430237, "learning_rate": 0.00017, "loss": 0.2104, "step": 85 }, { "epoch": 2.0, "grad_norm": 0.4557657837867737, "learning_rate": 0.000172, "loss": 0.2116, "step": 86 }, { "epoch": 2.023640661938534, "grad_norm": 0.32158565521240234, "learning_rate": 0.000174, "loss": 0.2032, "step": 87 }, { "epoch": 2.0472813238770686, "grad_norm": 0.3410867154598236, "learning_rate": 0.00017600000000000002, "loss": 0.1856, "step": 88 }, { "epoch": 2.0709219858156027, "grad_norm": 0.33976373076438904, "learning_rate": 0.00017800000000000002, "loss": 0.1761, "step": 89 }, { "epoch": 2.0945626477541373, "grad_norm": 0.29102054238319397, "learning_rate": 0.00018, "loss": 0.1818, "step": 90 }, { "epoch": 2.1182033096926713, "grad_norm": 0.2819984257221222, "learning_rate": 0.000182, "loss": 0.1679, "step": 91 }, { "epoch": 2.141843971631206, "grad_norm": 0.22580578923225403, "learning_rate": 0.00018400000000000003, "loss": 0.1684, "step": 92 }, { "epoch": 2.16548463356974, "grad_norm": 0.3175092935562134, "learning_rate": 0.00018600000000000002, "loss": 0.1529, "step": 93 }, { "epoch": 2.189125295508274, "grad_norm": 0.2446371167898178, "learning_rate": 0.000188, "loss": 0.1824, "step": 94 }, { "epoch": 2.2127659574468086, "grad_norm": 0.3556613624095917, "learning_rate": 0.00019, "loss": 0.1871, "step": 95 }, { "epoch": 2.2364066193853427, "grad_norm": 0.26111075282096863, "learning_rate": 0.000192, "loss": 0.1506, "step": 96 }, { "epoch": 2.260047281323877, "grad_norm": 0.2725120186805725, "learning_rate": 0.000194, "loss": 0.1729, "step": 97 }, { "epoch": 2.2836879432624113, "grad_norm": 0.29069072008132935, "learning_rate": 0.000196, "loss": 0.2048, "step": 98 }, { "epoch": 2.3073286052009454, "grad_norm": 0.29507631063461304, "learning_rate": 0.00019800000000000002, "loss": 0.1803, "step": 99 }, { "epoch": 2.33096926713948, "grad_norm": 0.2776130139827728, "learning_rate": 0.0002, "loss": 0.1676, "step": 100 }, { "epoch": 2.33096926713948, "eval_loss": 0.18998296558856964, "eval_runtime": 26.6152, "eval_samples_per_second": 148.374, "eval_steps_per_second": 1.165, "step": 100 }, { "epoch": 2.354609929078014, "grad_norm": 0.31718653440475464, "learning_rate": 0.0001999911398855782, "loss": 0.2016, "step": 101 }, { "epoch": 2.3782505910165486, "grad_norm": 0.25378698110580444, "learning_rate": 0.00019996456111234527, "loss": 0.1654, "step": 102 }, { "epoch": 2.4018912529550827, "grad_norm": 0.31272420287132263, "learning_rate": 0.00019992026839012067, "loss": 0.1814, "step": 103 }, { "epoch": 2.425531914893617, "grad_norm": 0.2293945997953415, "learning_rate": 0.0001998582695676762, "loss": 0.1553, "step": 104 }, { "epoch": 2.4491725768321513, "grad_norm": 0.24929125607013702, "learning_rate": 0.000199778575631345, "loss": 0.1633, "step": 105 }, { "epoch": 2.4728132387706854, "grad_norm": 0.2588082253932953, "learning_rate": 0.000199681200703075, "loss": 0.1703, "step": 106 }, { "epoch": 2.49645390070922, "grad_norm": 0.2273349165916443, "learning_rate": 0.00019956616203792635, "loss": 0.1492, "step": 107 }, { "epoch": 2.520094562647754, "grad_norm": 0.30980604887008667, "learning_rate": 0.00019943348002101371, "loss": 0.1869, "step": 108 }, { "epoch": 2.5437352245862885, "grad_norm": 0.22518759965896606, "learning_rate": 0.00019928317816389417, "loss": 0.1433, "step": 109 }, { "epoch": 2.5673758865248226, "grad_norm": 0.3739267587661743, "learning_rate": 0.00019911528310040074, "loss": 0.1608, "step": 110 }, { "epoch": 2.591016548463357, "grad_norm": 0.3102218210697174, "learning_rate": 0.00019892982458192288, "loss": 0.154, "step": 111 }, { "epoch": 2.6146572104018913, "grad_norm": 0.30436208844184875, "learning_rate": 0.00019872683547213446, "loss": 0.1489, "step": 112 }, { "epoch": 2.6382978723404253, "grad_norm": 0.2371385544538498, "learning_rate": 0.00019850635174117033, "loss": 0.1664, "step": 113 }, { "epoch": 2.66193853427896, "grad_norm": 0.33524012565612793, "learning_rate": 0.00019826841245925212, "loss": 0.1693, "step": 114 }, { "epoch": 2.685579196217494, "grad_norm": 0.2563704550266266, "learning_rate": 0.0001980130597897651, "loss": 0.1409, "step": 115 }, { "epoch": 2.7092198581560285, "grad_norm": 0.329569011926651, "learning_rate": 0.00019774033898178667, "loss": 0.1869, "step": 116 }, { "epoch": 2.7328605200945626, "grad_norm": 0.2803850769996643, "learning_rate": 0.00019745029836206813, "loss": 0.1508, "step": 117 }, { "epoch": 2.756501182033097, "grad_norm": 0.27739500999450684, "learning_rate": 0.00019714298932647098, "loss": 0.1661, "step": 118 }, { "epoch": 2.780141843971631, "grad_norm": 0.3814995586872101, "learning_rate": 0.00019681846633085967, "loss": 0.1577, "step": 119 }, { "epoch": 2.8037825059101653, "grad_norm": 0.24363325536251068, "learning_rate": 0.0001964767868814516, "loss": 0.1485, "step": 120 }, { "epoch": 2.8274231678487, "grad_norm": 0.4224121868610382, "learning_rate": 0.00019611801152462715, "loss": 0.1403, "step": 121 }, { "epoch": 2.851063829787234, "grad_norm": 0.24654380977153778, "learning_rate": 0.00019574220383620055, "loss": 0.1501, "step": 122 }, { "epoch": 2.8747044917257685, "grad_norm": 0.3177631199359894, "learning_rate": 0.00019534943041015423, "loss": 0.1299, "step": 123 }, { "epoch": 2.8983451536643026, "grad_norm": 0.3411446511745453, "learning_rate": 0.00019493976084683813, "loss": 0.1388, "step": 124 }, { "epoch": 2.921985815602837, "grad_norm": 0.21619445085525513, "learning_rate": 0.00019451326774063636, "loss": 0.1488, "step": 125 }, { "epoch": 2.945626477541371, "grad_norm": 0.32396766543388367, "learning_rate": 0.00019407002666710336, "loss": 0.141, "step": 126 }, { "epoch": 2.9692671394799053, "grad_norm": 0.31418880820274353, "learning_rate": 0.00019361011616957164, "loss": 0.1173, "step": 127 }, { "epoch": 2.99290780141844, "grad_norm": 0.28613993525505066, "learning_rate": 0.00019313361774523385, "loss": 0.1511, "step": 128 }, { "epoch": 3.0, "grad_norm": 0.5619244575500488, "learning_rate": 0.00019264061583070127, "loss": 0.1587, "step": 129 }, { "epoch": 3.023640661938534, "grad_norm": 0.19919118285179138, "learning_rate": 0.00019213119778704128, "loss": 0.1221, "step": 130 }, { "epoch": 3.0472813238770686, "grad_norm": 0.3841528594493866, "learning_rate": 0.00019160545388429708, "loss": 0.1219, "step": 131 }, { "epoch": 3.0709219858156027, "grad_norm": 0.3244837820529938, "learning_rate": 0.00019106347728549135, "loss": 0.1352, "step": 132 }, { "epoch": 3.0945626477541373, "grad_norm": 0.26563313603401184, "learning_rate": 0.0001905053640301176, "loss": 0.127, "step": 133 }, { "epoch": 3.1182033096926713, "grad_norm": 0.309192031621933, "learning_rate": 0.00018993121301712193, "loss": 0.1355, "step": 134 }, { "epoch": 3.141843971631206, "grad_norm": 0.27086764574050903, "learning_rate": 0.00018934112598737777, "loss": 0.1304, "step": 135 }, { "epoch": 3.16548463356974, "grad_norm": 0.22432225942611694, "learning_rate": 0.00018873520750565718, "loss": 0.1248, "step": 136 }, { "epoch": 3.189125295508274, "grad_norm": 0.225359246134758, "learning_rate": 0.00018811356494210165, "loss": 0.125, "step": 137 }, { "epoch": 3.2127659574468086, "grad_norm": 0.22801683843135834, "learning_rate": 0.00018747630845319612, "loss": 0.1261, "step": 138 }, { "epoch": 3.2364066193853427, "grad_norm": 0.20234979689121246, "learning_rate": 0.00018682355096224872, "loss": 0.1141, "step": 139 }, { "epoch": 3.260047281323877, "grad_norm": 0.2310786247253418, "learning_rate": 0.0001861554081393806, "loss": 0.1207, "step": 140 }, { "epoch": 3.2836879432624113, "grad_norm": 0.22035686671733856, "learning_rate": 0.00018547199838102904, "loss": 0.1225, "step": 141 }, { "epoch": 3.3073286052009454, "grad_norm": 0.26895958185195923, "learning_rate": 0.0001847734427889671, "loss": 0.1266, "step": 142 }, { "epoch": 3.33096926713948, "grad_norm": 0.22639593482017517, "learning_rate": 0.00018405986514884434, "loss": 0.1343, "step": 143 }, { "epoch": 3.354609929078014, "grad_norm": 0.22886884212493896, "learning_rate": 0.0001833313919082515, "loss": 0.1231, "step": 144 }, { "epoch": 3.3782505910165486, "grad_norm": 0.235995352268219, "learning_rate": 0.00018258815215431396, "loss": 0.1225, "step": 145 }, { "epoch": 3.4018912529550827, "grad_norm": 0.20816344022750854, "learning_rate": 0.0001818302775908169, "loss": 0.1128, "step": 146 }, { "epoch": 3.425531914893617, "grad_norm": 0.20572255551815033, "learning_rate": 0.0001810579025148674, "loss": 0.1183, "step": 147 }, { "epoch": 3.4491725768321513, "grad_norm": 0.21447816491127014, "learning_rate": 0.00018027116379309638, "loss": 0.1137, "step": 148 }, { "epoch": 3.4728132387706854, "grad_norm": 0.2198539823293686, "learning_rate": 0.00017947020083740575, "loss": 0.1206, "step": 149 }, { "epoch": 3.49645390070922, "grad_norm": 0.2855454683303833, "learning_rate": 0.00017865515558026428, "loss": 0.1029, "step": 150 }, { "epoch": 3.520094562647754, "grad_norm": 0.21288052201271057, "learning_rate": 0.0001778261724495566, "loss": 0.1202, "step": 151 }, { "epoch": 3.5437352245862885, "grad_norm": 0.24256151914596558, "learning_rate": 0.00017698339834299061, "loss": 0.1283, "step": 152 }, { "epoch": 3.5673758865248226, "grad_norm": 0.23208890855312347, "learning_rate": 0.00017612698260206666, "loss": 0.104, "step": 153 }, { "epoch": 3.591016548463357, "grad_norm": 0.22771845757961273, "learning_rate": 0.00017525707698561385, "loss": 0.1215, "step": 154 }, { "epoch": 3.6146572104018913, "grad_norm": 0.23404033482074738, "learning_rate": 0.00017437383564289816, "loss": 0.1207, "step": 155 }, { "epoch": 3.6382978723404253, "grad_norm": 0.22488094866275787, "learning_rate": 0.00017347741508630672, "loss": 0.1259, "step": 156 }, { "epoch": 3.66193853427896, "grad_norm": 0.25057360529899597, "learning_rate": 0.00017256797416361362, "loss": 0.1366, "step": 157 }, { "epoch": 3.685579196217494, "grad_norm": 0.2152586281299591, "learning_rate": 0.00017164567402983152, "loss": 0.1211, "step": 158 }, { "epoch": 3.7092198581560285, "grad_norm": 0.266215443611145, "learning_rate": 0.00017071067811865476, "loss": 0.122, "step": 159 }, { "epoch": 3.7328605200945626, "grad_norm": 0.21588127315044403, "learning_rate": 0.0001697631521134985, "loss": 0.112, "step": 160 }, { "epoch": 3.756501182033097, "grad_norm": 0.25429075956344604, "learning_rate": 0.00016880326391813916, "loss": 0.1093, "step": 161 }, { "epoch": 3.780141843971631, "grad_norm": 0.2217930555343628, "learning_rate": 0.00016783118362696163, "loss": 0.103, "step": 162 }, { "epoch": 3.8037825059101653, "grad_norm": 0.23772305250167847, "learning_rate": 0.00016684708349481804, "loss": 0.1138, "step": 163 }, { "epoch": 3.8274231678487, "grad_norm": 0.24107149243354797, "learning_rate": 0.00016585113790650388, "loss": 0.1108, "step": 164 }, { "epoch": 3.851063829787234, "grad_norm": 0.23748527467250824, "learning_rate": 0.00016484352334585653, "loss": 0.1479, "step": 165 }, { "epoch": 3.8747044917257685, "grad_norm": 0.296247661113739, "learning_rate": 0.00016382441836448202, "loss": 0.1136, "step": 166 }, { "epoch": 3.8983451536643026, "grad_norm": 0.20156867802143097, "learning_rate": 0.0001627940035501152, "loss": 0.1031, "step": 167 }, { "epoch": 3.921985815602837, "grad_norm": 0.2570931613445282, "learning_rate": 0.0001617524614946192, "loss": 0.1191, "step": 168 }, { "epoch": 3.945626477541371, "grad_norm": 0.26321014761924744, "learning_rate": 0.0001606999767616298, "loss": 0.1184, "step": 169 }, { "epoch": 3.9692671394799053, "grad_norm": 0.23430880904197693, "learning_rate": 0.00015963673585385016, "loss": 0.1117, "step": 170 }, { "epoch": 3.99290780141844, "grad_norm": 0.251304030418396, "learning_rate": 0.00015856292718000235, "loss": 0.121, "step": 171 }, { "epoch": 4.0, "grad_norm": 0.3299131691455841, "learning_rate": 0.0001574787410214407, "loss": 0.1291, "step": 172 }, { "epoch": 4.0236406619385345, "grad_norm": 0.2221948504447937, "learning_rate": 0.0001563843694984336, "loss": 0.0948, "step": 173 }, { "epoch": 4.047281323877068, "grad_norm": 0.20471301674842834, "learning_rate": 0.00015528000653611935, "loss": 0.0916, "step": 174 }, { "epoch": 4.070921985815603, "grad_norm": 0.22837211191654205, "learning_rate": 0.0001541658478301421, "loss": 0.105, "step": 175 }, { "epoch": 4.094562647754137, "grad_norm": 0.2241257280111313, "learning_rate": 0.00015304209081197425, "loss": 0.0953, "step": 176 }, { "epoch": 4.118203309692672, "grad_norm": 0.24325305223464966, "learning_rate": 0.00015190893461393108, "loss": 0.0999, "step": 177 }, { "epoch": 4.141843971631205, "grad_norm": 0.26276087760925293, "learning_rate": 0.000150766580033884, "loss": 0.1008, "step": 178 }, { "epoch": 4.16548463356974, "grad_norm": 0.18270154297351837, "learning_rate": 0.00014961522949967886, "loss": 0.0947, "step": 179 }, { "epoch": 4.1891252955082745, "grad_norm": 0.2478967308998108, "learning_rate": 0.00014845508703326504, "loss": 0.0985, "step": 180 }, { "epoch": 4.212765957446808, "grad_norm": 0.2307574599981308, "learning_rate": 0.00014728635821454255, "loss": 0.1029, "step": 181 }, { "epoch": 4.236406619385343, "grad_norm": 0.2134932577610016, "learning_rate": 0.0001461092501449326, "loss": 0.121, "step": 182 }, { "epoch": 4.260047281323877, "grad_norm": 0.2269890010356903, "learning_rate": 0.00014492397141067887, "loss": 0.1015, "step": 183 }, { "epoch": 4.283687943262412, "grad_norm": 0.21847684681415558, "learning_rate": 0.00014373073204588556, "loss": 0.1005, "step": 184 }, { "epoch": 4.307328605200945, "grad_norm": 0.19407232105731964, "learning_rate": 0.0001425297434952987, "loss": 0.0878, "step": 185 }, { "epoch": 4.33096926713948, "grad_norm": 0.2551313638687134, "learning_rate": 0.00014132121857683783, "loss": 0.0972, "step": 186 }, { "epoch": 4.3546099290780145, "grad_norm": 0.21003836393356323, "learning_rate": 0.00014010537144388416, "loss": 0.0892, "step": 187 }, { "epoch": 4.378250591016548, "grad_norm": 0.1882903277873993, "learning_rate": 0.00013888241754733208, "loss": 0.0984, "step": 188 }, { "epoch": 4.401891252955083, "grad_norm": 0.22186315059661865, "learning_rate": 0.00013765257359741063, "loss": 0.0865, "step": 189 }, { "epoch": 4.425531914893617, "grad_norm": 0.23070953786373138, "learning_rate": 0.00013641605752528224, "loss": 0.0913, "step": 190 }, { "epoch": 4.449172576832151, "grad_norm": 0.2062789797782898, "learning_rate": 0.0001351730884444245, "loss": 0.0836, "step": 191 }, { "epoch": 4.472813238770685, "grad_norm": 0.2151135951280594, "learning_rate": 0.00013392388661180303, "loss": 0.0889, "step": 192 }, { "epoch": 4.49645390070922, "grad_norm": 0.27931350469589233, "learning_rate": 0.0001326686733888413, "loss": 0.1053, "step": 193 }, { "epoch": 4.520094562647754, "grad_norm": 0.22662654519081116, "learning_rate": 0.0001314076712021949, "loss": 0.1026, "step": 194 }, { "epoch": 4.543735224586288, "grad_norm": 0.2030782252550125, "learning_rate": 0.000130141103504337, "loss": 0.0888, "step": 195 }, { "epoch": 4.567375886524823, "grad_norm": 0.23990431427955627, "learning_rate": 0.0001288691947339621, "loss": 0.0966, "step": 196 }, { "epoch": 4.591016548463357, "grad_norm": 0.20122289657592773, "learning_rate": 0.00012759217027621505, "loss": 0.1083, "step": 197 }, { "epoch": 4.614657210401891, "grad_norm": 0.22739186882972717, "learning_rate": 0.00012631025642275212, "loss": 0.0942, "step": 198 }, { "epoch": 4.638297872340425, "grad_norm": 0.23181433975696564, "learning_rate": 0.00012502368033164176, "loss": 0.0969, "step": 199 }, { "epoch": 4.66193853427896, "grad_norm": 0.22937090694904327, "learning_rate": 0.0001237326699871115, "loss": 0.0965, "step": 200 }, { "epoch": 4.66193853427896, "eval_loss": 0.1393619030714035, "eval_runtime": 26.6609, "eval_samples_per_second": 148.12, "eval_steps_per_second": 1.163, "step": 200 }, { "epoch": 4.685579196217494, "grad_norm": 0.24888208508491516, "learning_rate": 0.00012243745415914883, "loss": 0.0975, "step": 201 }, { "epoch": 4.709219858156028, "grad_norm": 0.22495469450950623, "learning_rate": 0.00012113826236296244, "loss": 0.0935, "step": 202 }, { "epoch": 4.732860520094563, "grad_norm": 0.20877662301063538, "learning_rate": 0.0001198353248183118, "loss": 0.095, "step": 203 }, { "epoch": 4.756501182033097, "grad_norm": 0.25470611453056335, "learning_rate": 0.00011852887240871145, "loss": 0.0952, "step": 204 }, { "epoch": 4.780141843971631, "grad_norm": 0.21587012708187103, "learning_rate": 0.00011721913664051813, "loss": 0.0836, "step": 205 }, { "epoch": 4.803782505910165, "grad_norm": 0.22231656312942505, "learning_rate": 0.00011590634960190721, "loss": 0.0911, "step": 206 }, { "epoch": 4.8274231678487, "grad_norm": 0.2475675344467163, "learning_rate": 0.00011459074392174618, "loss": 0.0937, "step": 207 }, { "epoch": 4.851063829787234, "grad_norm": 0.19742602109909058, "learning_rate": 0.00011327255272837221, "loss": 0.0973, "step": 208 }, { "epoch": 4.874704491725768, "grad_norm": 0.18842868506908417, "learning_rate": 0.00011195200960828139, "loss": 0.0888, "step": 209 }, { "epoch": 4.898345153664303, "grad_norm": 0.1946844905614853, "learning_rate": 0.00011062934856473655, "loss": 0.0903, "step": 210 }, { "epoch": 4.921985815602837, "grad_norm": 0.2090204656124115, "learning_rate": 0.00010930480397630145, "loss": 0.1069, "step": 211 }, { "epoch": 4.945626477541371, "grad_norm": 0.21296799182891846, "learning_rate": 0.00010797861055530831, "loss": 0.0993, "step": 212 }, { "epoch": 4.969267139479905, "grad_norm": 0.22559182345867157, "learning_rate": 0.00010665100330626625, "loss": 0.0937, "step": 213 }, { "epoch": 4.99290780141844, "grad_norm": 0.18918611109256744, "learning_rate": 0.00010532221748421787, "loss": 0.0943, "step": 214 }, { "epoch": 5.0, "grad_norm": 0.40800580382347107, "learning_rate": 0.00010399248855305176, "loss": 0.1196, "step": 215 }, { "epoch": 5.0236406619385345, "grad_norm": 0.23491446673870087, "learning_rate": 0.00010266205214377748, "loss": 0.0763, "step": 216 }, { "epoch": 5.047281323877068, "grad_norm": 0.24946476519107819, "learning_rate": 0.00010133114401277139, "loss": 0.0805, "step": 217 }, { "epoch": 5.070921985815603, "grad_norm": 0.23227405548095703, "learning_rate": 0.0001, "loss": 0.0732, "step": 218 }, { "epoch": 5.094562647754137, "grad_norm": 0.24616649746894836, "learning_rate": 9.866885598722863e-05, "loss": 0.0867, "step": 219 }, { "epoch": 5.118203309692672, "grad_norm": 0.24532361328601837, "learning_rate": 9.733794785622253e-05, "loss": 0.0908, "step": 220 }, { "epoch": 5.141843971631205, "grad_norm": 0.19941219687461853, "learning_rate": 9.600751144694827e-05, "loss": 0.0799, "step": 221 }, { "epoch": 5.16548463356974, "grad_norm": 0.20473811030387878, "learning_rate": 9.467778251578217e-05, "loss": 0.0796, "step": 222 }, { "epoch": 5.1891252955082745, "grad_norm": 0.222214475274086, "learning_rate": 9.334899669373379e-05, "loss": 0.0785, "step": 223 }, { "epoch": 5.212765957446808, "grad_norm": 0.21746733784675598, "learning_rate": 9.202138944469168e-05, "loss": 0.0725, "step": 224 }, { "epoch": 5.236406619385343, "grad_norm": 0.203547403216362, "learning_rate": 9.069519602369856e-05, "loss": 0.0773, "step": 225 }, { "epoch": 5.260047281323877, "grad_norm": 0.24523097276687622, "learning_rate": 8.937065143526347e-05, "loss": 0.082, "step": 226 }, { "epoch": 5.283687943262412, "grad_norm": 0.23100948333740234, "learning_rate": 8.804799039171863e-05, "loss": 0.0759, "step": 227 }, { "epoch": 5.307328605200945, "grad_norm": 0.2774072289466858, "learning_rate": 8.672744727162781e-05, "loss": 0.0857, "step": 228 }, { "epoch": 5.33096926713948, "grad_norm": 0.24797679483890533, "learning_rate": 8.540925607825384e-05, "loss": 0.0766, "step": 229 }, { "epoch": 5.3546099290780145, "grad_norm": 0.20143181085586548, "learning_rate": 8.409365039809281e-05, "loss": 0.0828, "step": 230 }, { "epoch": 5.378250591016548, "grad_norm": 0.2065824419260025, "learning_rate": 8.27808633594819e-05, "loss": 0.0742, "step": 231 }, { "epoch": 5.401891252955083, "grad_norm": 0.22358693182468414, "learning_rate": 8.147112759128859e-05, "loss": 0.0706, "step": 232 }, { "epoch": 5.425531914893617, "grad_norm": 0.24426457285881042, "learning_rate": 8.016467518168821e-05, "loss": 0.0773, "step": 233 }, { "epoch": 5.449172576832151, "grad_norm": 0.18924954533576965, "learning_rate": 7.886173763703757e-05, "loss": 0.0752, "step": 234 }, { "epoch": 5.472813238770685, "grad_norm": 0.24037088453769684, "learning_rate": 7.756254584085121e-05, "loss": 0.084, "step": 235 }, { "epoch": 5.49645390070922, "grad_norm": 0.2293759435415268, "learning_rate": 7.626733001288851e-05, "loss": 0.0669, "step": 236 }, { "epoch": 5.520094562647754, "grad_norm": 0.1983073204755783, "learning_rate": 7.497631966835828e-05, "loss": 0.0823, "step": 237 }, { "epoch": 5.543735224586288, "grad_norm": 0.2341061383485794, "learning_rate": 7.368974357724789e-05, "loss": 0.0882, "step": 238 }, { "epoch": 5.567375886524823, "grad_norm": 0.1973034292459488, "learning_rate": 7.240782972378496e-05, "loss": 0.0671, "step": 239 }, { "epoch": 5.591016548463357, "grad_norm": 0.19070158898830414, "learning_rate": 7.113080526603792e-05, "loss": 0.0837, "step": 240 }, { "epoch": 5.614657210401891, "grad_norm": 0.2356303334236145, "learning_rate": 6.985889649566305e-05, "loss": 0.0933, "step": 241 }, { "epoch": 5.638297872340425, "grad_norm": 0.2121330201625824, "learning_rate": 6.859232879780515e-05, "loss": 0.0823, "step": 242 }, { "epoch": 5.66193853427896, "grad_norm": 0.20877498388290405, "learning_rate": 6.73313266111587e-05, "loss": 0.0899, "step": 243 }, { "epoch": 5.685579196217494, "grad_norm": 0.21572048962116241, "learning_rate": 6.607611338819697e-05, "loss": 0.0749, "step": 244 }, { "epoch": 5.709219858156028, "grad_norm": 0.19401253759860992, "learning_rate": 6.48269115555755e-05, "loss": 0.0718, "step": 245 }, { "epoch": 5.732860520094563, "grad_norm": 0.20852094888687134, "learning_rate": 6.358394247471778e-05, "loss": 0.0754, "step": 246 }, { "epoch": 5.756501182033097, "grad_norm": 0.2070273458957672, "learning_rate": 6.234742640258938e-05, "loss": 0.0733, "step": 247 }, { "epoch": 5.780141843971631, "grad_norm": 0.1823720633983612, "learning_rate": 6.111758245266794e-05, "loss": 0.0636, "step": 248 }, { "epoch": 5.803782505910165, "grad_norm": 0.2146531492471695, "learning_rate": 5.9894628556115854e-05, "loss": 0.0821, "step": 249 }, { "epoch": 5.8274231678487, "grad_norm": 0.20586134493350983, "learning_rate": 5.867878142316221e-05, "loss": 0.0861, "step": 250 }, { "epoch": 5.851063829787234, "grad_norm": 0.1832318753004074, "learning_rate": 5.7470256504701347e-05, "loss": 0.0694, "step": 251 }, { "epoch": 5.874704491725768, "grad_norm": 0.17847847938537598, "learning_rate": 5.626926795411447e-05, "loss": 0.0748, "step": 252 }, { "epoch": 5.898345153664303, "grad_norm": 0.19474737346172333, "learning_rate": 5.507602858932113e-05, "loss": 0.0754, "step": 253 }, { "epoch": 5.921985815602837, "grad_norm": 0.20228345692157745, "learning_rate": 5.38907498550674e-05, "loss": 0.0741, "step": 254 }, { "epoch": 5.945626477541371, "grad_norm": 0.19571395218372345, "learning_rate": 5.27136417854575e-05, "loss": 0.0808, "step": 255 }, { "epoch": 5.969267139479905, "grad_norm": 0.1964896023273468, "learning_rate": 5.1544912966734994e-05, "loss": 0.0722, "step": 256 }, { "epoch": 5.99290780141844, "grad_norm": 0.21053136885166168, "learning_rate": 5.0384770500321176e-05, "loss": 0.0748, "step": 257 }, { "epoch": 6.0, "grad_norm": 0.32032114267349243, "learning_rate": 4.9233419966116036e-05, "loss": 0.0792, "step": 258 }, { "epoch": 6.0236406619385345, "grad_norm": 0.18689100444316864, "learning_rate": 4.809106538606896e-05, "loss": 0.0672, "step": 259 }, { "epoch": 6.047281323877068, "grad_norm": 0.19790929555892944, "learning_rate": 4.695790918802576e-05, "loss": 0.0612, "step": 260 }, { "epoch": 6.070921985815603, "grad_norm": 0.17803865671157837, "learning_rate": 4.58341521698579e-05, "loss": 0.0567, "step": 261 }, { "epoch": 6.094562647754137, "grad_norm": 0.16323284804821014, "learning_rate": 4.47199934638807e-05, "loss": 0.0623, "step": 262 }, { "epoch": 6.118203309692672, "grad_norm": 0.183246910572052, "learning_rate": 4.3615630501566384e-05, "loss": 0.0727, "step": 263 }, { "epoch": 6.141843971631205, "grad_norm": 0.1922691911458969, "learning_rate": 4.252125897855932e-05, "loss": 0.0729, "step": 264 }, { "epoch": 6.16548463356974, "grad_norm": 0.18657496571540833, "learning_rate": 4.143707281999767e-05, "loss": 0.0601, "step": 265 }, { "epoch": 6.1891252955082745, "grad_norm": 0.1704358607530594, "learning_rate": 4.036326414614985e-05, "loss": 0.0677, "step": 266 }, { "epoch": 6.212765957446808, "grad_norm": 0.1788199245929718, "learning_rate": 3.930002323837025e-05, "loss": 0.0605, "step": 267 }, { "epoch": 6.236406619385343, "grad_norm": 0.1892111450433731, "learning_rate": 3.824753850538082e-05, "loss": 0.0621, "step": 268 }, { "epoch": 6.260047281323877, "grad_norm": 0.1900961846113205, "learning_rate": 3.720599644988482e-05, "loss": 0.0727, "step": 269 }, { "epoch": 6.283687943262412, "grad_norm": 0.25505387783050537, "learning_rate": 3.617558163551802e-05, "loss": 0.0639, "step": 270 }, { "epoch": 6.307328605200945, "grad_norm": 0.17928794026374817, "learning_rate": 3.5156476654143497e-05, "loss": 0.0595, "step": 271 }, { "epoch": 6.33096926713948, "grad_norm": 0.17975100874900818, "learning_rate": 3.414886209349615e-05, "loss": 0.0697, "step": 272 }, { "epoch": 6.3546099290780145, "grad_norm": 0.16846145689487457, "learning_rate": 3.315291650518197e-05, "loss": 0.0593, "step": 273 }, { "epoch": 6.378250591016548, "grad_norm": 0.15943646430969238, "learning_rate": 3.216881637303839e-05, "loss": 0.0597, "step": 274 }, { "epoch": 6.401891252955083, "grad_norm": 0.16623468697071075, "learning_rate": 3.119673608186085e-05, "loss": 0.0595, "step": 275 }, { "epoch": 6.425531914893617, "grad_norm": 0.17790904641151428, "learning_rate": 3.0236847886501542e-05, "loss": 0.0604, "step": 276 }, { "epoch": 6.449172576832151, "grad_norm": 0.18511582911014557, "learning_rate": 2.9289321881345254e-05, "loss": 0.0678, "step": 277 }, { "epoch": 6.472813238770685, "grad_norm": 0.17497338354587555, "learning_rate": 2.8354325970168484e-05, "loss": 0.0568, "step": 278 }, { "epoch": 6.49645390070922, "grad_norm": 0.1610943078994751, "learning_rate": 2.743202583638641e-05, "loss": 0.068, "step": 279 }, { "epoch": 6.520094562647754, "grad_norm": 0.1880873739719391, "learning_rate": 2.6522584913693294e-05, "loss": 0.06, "step": 280 }, { "epoch": 6.543735224586288, "grad_norm": 0.17921674251556396, "learning_rate": 2.5626164357101857e-05, "loss": 0.0593, "step": 281 }, { "epoch": 6.567375886524823, "grad_norm": 0.17583315074443817, "learning_rate": 2.4742923014386156e-05, "loss": 0.0664, "step": 282 }, { "epoch": 6.591016548463357, "grad_norm": 0.19071973860263824, "learning_rate": 2.3873017397933327e-05, "loss": 0.0644, "step": 283 }, { "epoch": 6.614657210401891, "grad_norm": 0.1757621169090271, "learning_rate": 2.301660165700936e-05, "loss": 0.0612, "step": 284 }, { "epoch": 6.638297872340425, "grad_norm": 0.15712977945804596, "learning_rate": 2.2173827550443417e-05, "loss": 0.0663, "step": 285 }, { "epoch": 6.66193853427896, "grad_norm": 0.16134823858737946, "learning_rate": 2.1344844419735755e-05, "loss": 0.0551, "step": 286 }, { "epoch": 6.685579196217494, "grad_norm": 0.168061301112175, "learning_rate": 2.0529799162594244e-05, "loss": 0.06, "step": 287 }, { "epoch": 6.709219858156028, "grad_norm": 0.1770693063735962, "learning_rate": 1.9728836206903656e-05, "loss": 0.0664, "step": 288 }, { "epoch": 6.732860520094563, "grad_norm": 0.18103648722171783, "learning_rate": 1.8942097485132626e-05, "loss": 0.062, "step": 289 }, { "epoch": 6.756501182033097, "grad_norm": 0.18184252083301544, "learning_rate": 1.8169722409183097e-05, "loss": 0.059, "step": 290 }, { "epoch": 6.780141843971631, "grad_norm": 0.1702430248260498, "learning_rate": 1.741184784568608e-05, "loss": 0.062, "step": 291 }, { "epoch": 6.803782505910165, "grad_norm": 0.16067641973495483, "learning_rate": 1.6668608091748495e-05, "loss": 0.0574, "step": 292 }, { "epoch": 6.8274231678487, "grad_norm": 0.1779567003250122, "learning_rate": 1.5940134851155697e-05, "loss": 0.0593, "step": 293 }, { "epoch": 6.851063829787234, "grad_norm": 0.17295385897159576, "learning_rate": 1.522655721103291e-05, "loss": 0.0695, "step": 294 }, { "epoch": 6.874704491725768, "grad_norm": 0.1924130916595459, "learning_rate": 1.4528001618970966e-05, "loss": 0.0719, "step": 295 }, { "epoch": 6.898345153664303, "grad_norm": 0.17258019745349884, "learning_rate": 1.3844591860619383e-05, "loss": 0.0646, "step": 296 }, { "epoch": 6.921985815602837, "grad_norm": 0.17023594677448273, "learning_rate": 1.3176449037751293e-05, "loss": 0.0608, "step": 297 }, { "epoch": 6.945626477541371, "grad_norm": 0.1798073947429657, "learning_rate": 1.2523691546803873e-05, "loss": 0.0774, "step": 298 }, { "epoch": 6.969267139479905, "grad_norm": 0.1567268669605255, "learning_rate": 1.1886435057898337e-05, "loss": 0.0809, "step": 299 }, { "epoch": 6.99290780141844, "grad_norm": 0.1746884137392044, "learning_rate": 1.1264792494342857e-05, "loss": 0.0597, "step": 300 }, { "epoch": 6.99290780141844, "eval_loss": 0.13989537954330444, "eval_runtime": 26.7265, "eval_samples_per_second": 147.756, "eval_steps_per_second": 1.16, "step": 300 } ], "logging_steps": 1, "max_steps": 336, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.239523305766781e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }