{ "best_global_step": 200, "best_metric": 0.09608737379312515, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 6.37696335078534, "eval_steps": 100, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020942408376963352, "grad_norm": 1.0613059997558594, "learning_rate": 2.0000000000000003e-06, "loss": 0.8495, "step": 1 }, { "epoch": 0.020942408376963352, "eval_loss": 0.8575803637504578, "eval_runtime": 36.5623, "eval_samples_per_second": 41.819, "eval_steps_per_second": 0.328, "step": 1 }, { "epoch": 0.041884816753926704, "grad_norm": 1.0434430837631226, "learning_rate": 4.000000000000001e-06, "loss": 0.8553, "step": 2 }, { "epoch": 0.06282722513089005, "grad_norm": 1.0259255170822144, "learning_rate": 6e-06, "loss": 0.8551, "step": 3 }, { "epoch": 0.08376963350785341, "grad_norm": 0.9780888557434082, "learning_rate": 8.000000000000001e-06, "loss": 0.8348, "step": 4 }, { "epoch": 0.10471204188481675, "grad_norm": 0.9169363379478455, "learning_rate": 1e-05, "loss": 0.8222, "step": 5 }, { "epoch": 0.1256544502617801, "grad_norm": 0.8373011350631714, "learning_rate": 1.2e-05, "loss": 0.8203, "step": 6 }, { "epoch": 0.14659685863874344, "grad_norm": 0.7512023448944092, "learning_rate": 1.4000000000000001e-05, "loss": 0.8004, "step": 7 }, { "epoch": 0.16753926701570682, "grad_norm": 0.6537197232246399, "learning_rate": 1.6000000000000003e-05, "loss": 0.7681, "step": 8 }, { "epoch": 0.18848167539267016, "grad_norm": 0.5785489082336426, "learning_rate": 1.8e-05, "loss": 0.7547, "step": 9 }, { "epoch": 0.2094240837696335, "grad_norm": 0.5355616211891174, "learning_rate": 2e-05, "loss": 0.7317, "step": 10 }, { "epoch": 0.23036649214659685, "grad_norm": 0.5175966024398804, "learning_rate": 2.2000000000000003e-05, "loss": 0.6958, "step": 11 }, { "epoch": 0.2513089005235602, "grad_norm": 0.5501968264579773, "learning_rate": 2.4e-05, "loss": 0.662, "step": 12 }, { "epoch": 0.27225130890052357, "grad_norm": 0.6185138821601868, "learning_rate": 2.6000000000000002e-05, "loss": 0.6469, "step": 13 }, { "epoch": 0.2931937172774869, "grad_norm": 0.6637799143791199, "learning_rate": 2.8000000000000003e-05, "loss": 0.6092, "step": 14 }, { "epoch": 0.31413612565445026, "grad_norm": 0.7460299730300903, "learning_rate": 3e-05, "loss": 0.5674, "step": 15 }, { "epoch": 0.33507853403141363, "grad_norm": 0.7996035218238831, "learning_rate": 3.2000000000000005e-05, "loss": 0.5265, "step": 16 }, { "epoch": 0.35602094240837695, "grad_norm": 0.7589525580406189, "learning_rate": 3.4000000000000007e-05, "loss": 0.4673, "step": 17 }, { "epoch": 0.3769633507853403, "grad_norm": 0.8297927975654602, "learning_rate": 3.6e-05, "loss": 0.4246, "step": 18 }, { "epoch": 0.39790575916230364, "grad_norm": 1.5778415203094482, "learning_rate": 3.8e-05, "loss": 0.3782, "step": 19 }, { "epoch": 0.418848167539267, "grad_norm": 0.936192512512207, "learning_rate": 4e-05, "loss": 0.3322, "step": 20 }, { "epoch": 0.4397905759162304, "grad_norm": 0.6696107387542725, "learning_rate": 4.2e-05, "loss": 0.2931, "step": 21 }, { "epoch": 0.4607329842931937, "grad_norm": 0.7389624714851379, "learning_rate": 4.4000000000000006e-05, "loss": 0.2502, "step": 22 }, { "epoch": 0.4816753926701571, "grad_norm": 0.49148285388946533, "learning_rate": 4.600000000000001e-05, "loss": 0.2294, "step": 23 }, { "epoch": 0.5026178010471204, "grad_norm": 0.6197348237037659, "learning_rate": 4.8e-05, "loss": 0.2097, "step": 24 }, { "epoch": 0.5235602094240838, "grad_norm": 0.45649978518486023, "learning_rate": 5e-05, "loss": 0.1861, "step": 25 }, { "epoch": 0.5445026178010471, "grad_norm": 0.2728537619113922, "learning_rate": 5.2000000000000004e-05, "loss": 0.174, "step": 26 }, { "epoch": 0.5654450261780105, "grad_norm": 0.2067917436361313, "learning_rate": 5.4000000000000005e-05, "loss": 0.1652, "step": 27 }, { "epoch": 0.5863874345549738, "grad_norm": 0.21034236252307892, "learning_rate": 5.6000000000000006e-05, "loss": 0.1697, "step": 28 }, { "epoch": 0.6073298429319371, "grad_norm": 0.19581182301044464, "learning_rate": 5.8e-05, "loss": 0.1599, "step": 29 }, { "epoch": 0.6282722513089005, "grad_norm": 0.17186392843723297, "learning_rate": 6e-05, "loss": 0.1583, "step": 30 }, { "epoch": 0.6492146596858639, "grad_norm": 0.15808530151844025, "learning_rate": 6.2e-05, "loss": 0.1527, "step": 31 }, { "epoch": 0.6701570680628273, "grad_norm": 0.16403080523014069, "learning_rate": 6.400000000000001e-05, "loss": 0.1537, "step": 32 }, { "epoch": 0.6910994764397905, "grad_norm": 0.1503947675228119, "learning_rate": 6.6e-05, "loss": 0.1491, "step": 33 }, { "epoch": 0.7120418848167539, "grad_norm": 0.12099669128656387, "learning_rate": 6.800000000000001e-05, "loss": 0.1452, "step": 34 }, { "epoch": 0.7329842931937173, "grad_norm": 0.1100674644112587, "learning_rate": 7e-05, "loss": 0.1447, "step": 35 }, { "epoch": 0.7539267015706806, "grad_norm": 0.10442424565553665, "learning_rate": 7.2e-05, "loss": 0.1444, "step": 36 }, { "epoch": 0.774869109947644, "grad_norm": 0.10083310306072235, "learning_rate": 7.4e-05, "loss": 0.1432, "step": 37 }, { "epoch": 0.7958115183246073, "grad_norm": 0.09343631565570831, "learning_rate": 7.6e-05, "loss": 0.1378, "step": 38 }, { "epoch": 0.8167539267015707, "grad_norm": 0.09598446637392044, "learning_rate": 7.800000000000001e-05, "loss": 0.1384, "step": 39 }, { "epoch": 0.837696335078534, "grad_norm": 0.09703955799341202, "learning_rate": 8e-05, "loss": 0.1392, "step": 40 }, { "epoch": 0.8586387434554974, "grad_norm": 0.09334489703178406, "learning_rate": 8.2e-05, "loss": 0.1372, "step": 41 }, { "epoch": 0.8795811518324608, "grad_norm": 0.0734427273273468, "learning_rate": 8.4e-05, "loss": 0.1305, "step": 42 }, { "epoch": 0.900523560209424, "grad_norm": 0.08139330893754959, "learning_rate": 8.6e-05, "loss": 0.1305, "step": 43 }, { "epoch": 0.9214659685863874, "grad_norm": 0.0861673653125763, "learning_rate": 8.800000000000001e-05, "loss": 0.13, "step": 44 }, { "epoch": 0.9424083769633508, "grad_norm": 0.08221717923879623, "learning_rate": 9e-05, "loss": 0.1275, "step": 45 }, { "epoch": 0.9633507853403142, "grad_norm": 0.07106878608465195, "learning_rate": 9.200000000000001e-05, "loss": 0.1254, "step": 46 }, { "epoch": 0.9842931937172775, "grad_norm": 0.08307594060897827, "learning_rate": 9.4e-05, "loss": 0.1275, "step": 47 }, { "epoch": 1.0209424083769634, "grad_norm": 0.1994994878768921, "learning_rate": 9.6e-05, "loss": 0.2428, "step": 48 }, { "epoch": 1.0418848167539267, "grad_norm": 0.09307362884283066, "learning_rate": 9.8e-05, "loss": 0.1242, "step": 49 }, { "epoch": 1.0628272251308901, "grad_norm": 0.10429561883211136, "learning_rate": 0.0001, "loss": 0.128, "step": 50 }, { "epoch": 1.0837696335078535, "grad_norm": 0.07438132166862488, "learning_rate": 0.00010200000000000001, "loss": 0.1249, "step": 51 }, { "epoch": 1.1047120418848166, "grad_norm": 0.11682362109422684, "learning_rate": 0.00010400000000000001, "loss": 0.1238, "step": 52 }, { "epoch": 1.12565445026178, "grad_norm": 0.1056235060095787, "learning_rate": 0.00010600000000000002, "loss": 0.1233, "step": 53 }, { "epoch": 1.1465968586387434, "grad_norm": 0.06861541420221329, "learning_rate": 0.00010800000000000001, "loss": 0.1177, "step": 54 }, { "epoch": 1.1675392670157068, "grad_norm": 0.10255040973424911, "learning_rate": 0.00011000000000000002, "loss": 0.1231, "step": 55 }, { "epoch": 1.1884816753926701, "grad_norm": 0.08970703929662704, "learning_rate": 0.00011200000000000001, "loss": 0.1196, "step": 56 }, { "epoch": 1.2094240837696335, "grad_norm": 0.06586827337741852, "learning_rate": 0.00011399999999999999, "loss": 0.1183, "step": 57 }, { "epoch": 1.2303664921465969, "grad_norm": 0.10298311710357666, "learning_rate": 0.000116, "loss": 0.1217, "step": 58 }, { "epoch": 1.2513089005235603, "grad_norm": 0.08795659989118576, "learning_rate": 0.000118, "loss": 0.1129, "step": 59 }, { "epoch": 1.2722513089005236, "grad_norm": 0.08581340312957764, "learning_rate": 0.00012, "loss": 0.1167, "step": 60 }, { "epoch": 1.2931937172774868, "grad_norm": 0.11990631371736526, "learning_rate": 0.000122, "loss": 0.118, "step": 61 }, { "epoch": 1.3141361256544504, "grad_norm": 0.05951250344514847, "learning_rate": 0.000124, "loss": 0.1158, "step": 62 }, { "epoch": 1.3350785340314135, "grad_norm": 0.10062775760889053, "learning_rate": 0.000126, "loss": 0.1136, "step": 63 }, { "epoch": 1.356020942408377, "grad_norm": 0.06757929176092148, "learning_rate": 0.00012800000000000002, "loss": 0.1143, "step": 64 }, { "epoch": 1.3769633507853403, "grad_norm": 0.07045479863882065, "learning_rate": 0.00013000000000000002, "loss": 0.1144, "step": 65 }, { "epoch": 1.3979057591623036, "grad_norm": 0.07338303327560425, "learning_rate": 0.000132, "loss": 0.1088, "step": 66 }, { "epoch": 1.418848167539267, "grad_norm": 0.05409236252307892, "learning_rate": 0.000134, "loss": 0.1098, "step": 67 }, { "epoch": 1.4397905759162304, "grad_norm": 0.06339941173791885, "learning_rate": 0.00013600000000000003, "loss": 0.111, "step": 68 }, { "epoch": 1.4607329842931938, "grad_norm": 0.07639243453741074, "learning_rate": 0.000138, "loss": 0.1159, "step": 69 }, { "epoch": 1.4816753926701571, "grad_norm": 0.07121666520833969, "learning_rate": 0.00014, "loss": 0.1118, "step": 70 }, { "epoch": 1.5026178010471205, "grad_norm": 0.06110168993473053, "learning_rate": 0.000142, "loss": 0.1111, "step": 71 }, { "epoch": 1.5235602094240837, "grad_norm": 0.06323180347681046, "learning_rate": 0.000144, "loss": 0.1119, "step": 72 }, { "epoch": 1.5445026178010473, "grad_norm": 0.07290653139352798, "learning_rate": 0.000146, "loss": 0.1142, "step": 73 }, { "epoch": 1.5654450261780104, "grad_norm": 0.05730397254228592, "learning_rate": 0.000148, "loss": 0.1109, "step": 74 }, { "epoch": 1.5863874345549738, "grad_norm": 0.0647931769490242, "learning_rate": 0.00015000000000000001, "loss": 0.1112, "step": 75 }, { "epoch": 1.6073298429319371, "grad_norm": 0.059707365930080414, "learning_rate": 0.000152, "loss": 0.1066, "step": 76 }, { "epoch": 1.6282722513089005, "grad_norm": 0.061403073370456696, "learning_rate": 0.000154, "loss": 0.1136, "step": 77 }, { "epoch": 1.649214659685864, "grad_norm": 0.07303062081336975, "learning_rate": 0.00015600000000000002, "loss": 0.1089, "step": 78 }, { "epoch": 1.6701570680628273, "grad_norm": 0.07086256146430969, "learning_rate": 0.00015800000000000002, "loss": 0.1086, "step": 79 }, { "epoch": 1.6910994764397906, "grad_norm": 0.06083432585000992, "learning_rate": 0.00016, "loss": 0.1068, "step": 80 }, { "epoch": 1.7120418848167538, "grad_norm": 0.0719650462269783, "learning_rate": 0.000162, "loss": 0.1067, "step": 81 }, { "epoch": 1.7329842931937174, "grad_norm": 0.06711898744106293, "learning_rate": 0.000164, "loss": 0.1077, "step": 82 }, { "epoch": 1.7539267015706805, "grad_norm": 0.07270397245883942, "learning_rate": 0.000166, "loss": 0.1107, "step": 83 }, { "epoch": 1.7748691099476441, "grad_norm": 0.06899525970220566, "learning_rate": 0.000168, "loss": 0.1104, "step": 84 }, { "epoch": 1.7958115183246073, "grad_norm": 0.08055053651332855, "learning_rate": 0.00017, "loss": 0.1071, "step": 85 }, { "epoch": 1.8167539267015707, "grad_norm": 0.06272236257791519, "learning_rate": 0.000172, "loss": 0.1079, "step": 86 }, { "epoch": 1.837696335078534, "grad_norm": 0.07758674025535583, "learning_rate": 0.000174, "loss": 0.1045, "step": 87 }, { "epoch": 1.8586387434554974, "grad_norm": 0.09495542198419571, "learning_rate": 0.00017600000000000002, "loss": 0.1093, "step": 88 }, { "epoch": 1.8795811518324608, "grad_norm": 0.0627971813082695, "learning_rate": 0.00017800000000000002, "loss": 0.1026, "step": 89 }, { "epoch": 1.900523560209424, "grad_norm": 0.10044968128204346, "learning_rate": 0.00018, "loss": 0.1084, "step": 90 }, { "epoch": 1.9214659685863875, "grad_norm": 0.09752042591571808, "learning_rate": 0.000182, "loss": 0.108, "step": 91 }, { "epoch": 1.9424083769633507, "grad_norm": 0.07474254071712494, "learning_rate": 0.00018400000000000003, "loss": 0.1038, "step": 92 }, { "epoch": 1.9633507853403143, "grad_norm": 0.07099518179893494, "learning_rate": 0.00018600000000000002, "loss": 0.1019, "step": 93 }, { "epoch": 1.9842931937172774, "grad_norm": 0.10210294276475906, "learning_rate": 0.000188, "loss": 0.1075, "step": 94 }, { "epoch": 2.020942408376963, "grad_norm": 0.22472955286502838, "learning_rate": 0.00019, "loss": 0.2116, "step": 95 }, { "epoch": 2.0418848167539267, "grad_norm": 0.23824606835842133, "learning_rate": 0.000192, "loss": 0.1054, "step": 96 }, { "epoch": 2.06282722513089, "grad_norm": 0.23182818293571472, "learning_rate": 0.000194, "loss": 0.1037, "step": 97 }, { "epoch": 2.0837696335078535, "grad_norm": 0.08839768171310425, "learning_rate": 0.000196, "loss": 0.1021, "step": 98 }, { "epoch": 2.1047120418848166, "grad_norm": 0.2024339735507965, "learning_rate": 0.00019800000000000002, "loss": 0.1048, "step": 99 }, { "epoch": 2.1256544502617802, "grad_norm": 0.20248624682426453, "learning_rate": 0.0002, "loss": 0.1023, "step": 100 }, { "epoch": 2.1256544502617802, "eval_loss": 0.10521107167005539, "eval_runtime": 36.241, "eval_samples_per_second": 42.19, "eval_steps_per_second": 0.331, "step": 100 }, { "epoch": 2.1465968586387434, "grad_norm": 0.07539942115545273, "learning_rate": 0.00019999639534510347, "loss": 0.1037, "step": 101 }, { "epoch": 2.167539267015707, "grad_norm": 0.1776709109544754, "learning_rate": 0.00019998558164028465, "loss": 0.1047, "step": 102 }, { "epoch": 2.18848167539267, "grad_norm": 0.07409574091434479, "learning_rate": 0.000199967559665137, "loss": 0.1022, "step": 103 }, { "epoch": 2.2094240837696333, "grad_norm": 0.16068875789642334, "learning_rate": 0.00019994233071892056, "loss": 0.1063, "step": 104 }, { "epoch": 2.230366492146597, "grad_norm": 0.10198356211185455, "learning_rate": 0.00019990989662046818, "loss": 0.0997, "step": 105 }, { "epoch": 2.25130890052356, "grad_norm": 0.12020247429609299, "learning_rate": 0.00019987025970805448, "loss": 0.1018, "step": 106 }, { "epoch": 2.2722513089005236, "grad_norm": 0.09725957363843918, "learning_rate": 0.00019982342283922738, "loss": 0.0986, "step": 107 }, { "epoch": 2.2931937172774868, "grad_norm": 0.1063511073589325, "learning_rate": 0.00019976938939060172, "loss": 0.1005, "step": 108 }, { "epoch": 2.3141361256544504, "grad_norm": 0.10711782425642014, "learning_rate": 0.00019970816325761627, "loss": 0.1003, "step": 109 }, { "epoch": 2.3350785340314135, "grad_norm": 0.07888664305210114, "learning_rate": 0.00019963974885425266, "loss": 0.1011, "step": 110 }, { "epoch": 2.356020942408377, "grad_norm": 0.09453505277633667, "learning_rate": 0.00019956415111271712, "loss": 0.0996, "step": 111 }, { "epoch": 2.3769633507853403, "grad_norm": 0.07988961786031723, "learning_rate": 0.00019948137548308502, "loss": 0.1015, "step": 112 }, { "epoch": 2.3979057591623034, "grad_norm": 0.11309799551963806, "learning_rate": 0.00019939142793290798, "loss": 0.101, "step": 113 }, { "epoch": 2.418848167539267, "grad_norm": 0.06662476062774658, "learning_rate": 0.00019929431494678356, "loss": 0.093, "step": 114 }, { "epoch": 2.4397905759162306, "grad_norm": 0.10978271067142487, "learning_rate": 0.00019919004352588767, "loss": 0.0959, "step": 115 }, { "epoch": 2.4607329842931938, "grad_norm": 0.061751991510391235, "learning_rate": 0.00019907862118747022, "loss": 0.0989, "step": 116 }, { "epoch": 2.481675392670157, "grad_norm": 0.09909060597419739, "learning_rate": 0.00019896005596431264, "loss": 0.0948, "step": 117 }, { "epoch": 2.5026178010471205, "grad_norm": 0.08946210145950317, "learning_rate": 0.00019883435640414922, "loss": 0.0967, "step": 118 }, { "epoch": 2.5235602094240837, "grad_norm": 0.09633102267980576, "learning_rate": 0.00019870153156905068, "loss": 0.0982, "step": 119 }, { "epoch": 2.5445026178010473, "grad_norm": 0.08664353936910629, "learning_rate": 0.00019856159103477086, "loss": 0.0975, "step": 120 }, { "epoch": 2.5654450261780104, "grad_norm": 0.08524741977453232, "learning_rate": 0.00019841454489005636, "loss": 0.0964, "step": 121 }, { "epoch": 2.5863874345549736, "grad_norm": 0.08638955652713776, "learning_rate": 0.00019826040373591933, "loss": 0.1007, "step": 122 }, { "epoch": 2.607329842931937, "grad_norm": 0.0855293795466423, "learning_rate": 0.00019809917868487308, "loss": 0.1004, "step": 123 }, { "epoch": 2.6282722513089007, "grad_norm": 0.1074497401714325, "learning_rate": 0.000197930881360131, "loss": 0.0975, "step": 124 }, { "epoch": 2.649214659685864, "grad_norm": 0.07675390690565109, "learning_rate": 0.00019775552389476864, "loss": 0.0946, "step": 125 }, { "epoch": 2.670157068062827, "grad_norm": 0.09924424439668655, "learning_rate": 0.00019757311893084885, "loss": 0.0944, "step": 126 }, { "epoch": 2.6910994764397906, "grad_norm": 0.09917759150266647, "learning_rate": 0.00019738367961851064, "loss": 0.0979, "step": 127 }, { "epoch": 2.712041884816754, "grad_norm": 0.08026636391878128, "learning_rate": 0.0001971872196150208, "loss": 0.0994, "step": 128 }, { "epoch": 2.7329842931937174, "grad_norm": 0.1067788228392601, "learning_rate": 0.00019698375308378974, "loss": 0.0994, "step": 129 }, { "epoch": 2.7539267015706805, "grad_norm": 0.08681651949882507, "learning_rate": 0.0001967732946933499, "loss": 0.0966, "step": 130 }, { "epoch": 2.774869109947644, "grad_norm": 0.10136966407299042, "learning_rate": 0.00019655585961629867, "loss": 0.0944, "step": 131 }, { "epoch": 2.7958115183246073, "grad_norm": 0.10188725590705872, "learning_rate": 0.0001963314635282044, "loss": 0.0962, "step": 132 }, { "epoch": 2.816753926701571, "grad_norm": 0.07988719642162323, "learning_rate": 0.00019610012260647618, "loss": 0.0966, "step": 133 }, { "epoch": 2.837696335078534, "grad_norm": 0.09458421915769577, "learning_rate": 0.0001958618535291978, "loss": 0.0996, "step": 134 }, { "epoch": 2.858638743455497, "grad_norm": 0.09394548088312149, "learning_rate": 0.00019561667347392508, "loss": 0.0971, "step": 135 }, { "epoch": 2.8795811518324608, "grad_norm": 0.08295969665050507, "learning_rate": 0.0001953646001164479, "loss": 0.0935, "step": 136 }, { "epoch": 2.900523560209424, "grad_norm": 0.09273595362901688, "learning_rate": 0.00019510565162951537, "loss": 0.0919, "step": 137 }, { "epoch": 2.9214659685863875, "grad_norm": 0.08656938374042511, "learning_rate": 0.00019483984668152617, "loss": 0.0963, "step": 138 }, { "epoch": 2.9424083769633507, "grad_norm": 0.12363547831773758, "learning_rate": 0.00019456720443518247, "loss": 0.0983, "step": 139 }, { "epoch": 2.9633507853403143, "grad_norm": 0.07976437360048294, "learning_rate": 0.00019428774454610843, "loss": 0.0955, "step": 140 }, { "epoch": 2.9842931937172774, "grad_norm": 0.0959707498550415, "learning_rate": 0.00019400148716143317, "loss": 0.0961, "step": 141 }, { "epoch": 3.020942408376963, "grad_norm": 0.22410845756530762, "learning_rate": 0.00019370845291833837, "loss": 0.1888, "step": 142 }, { "epoch": 3.0418848167539267, "grad_norm": 0.15948638319969177, "learning_rate": 0.00019340866294257042, "loss": 0.0938, "step": 143 }, { "epoch": 3.06282722513089, "grad_norm": 0.08823582530021667, "learning_rate": 0.0001931021388469174, "loss": 0.0899, "step": 144 }, { "epoch": 3.0837696335078535, "grad_norm": 0.16904295980930328, "learning_rate": 0.00019278890272965096, "loss": 0.0948, "step": 145 }, { "epoch": 3.1047120418848166, "grad_norm": 0.09499840438365936, "learning_rate": 0.00019246897717293315, "loss": 0.0907, "step": 146 }, { "epoch": 3.1256544502617802, "grad_norm": 0.14423631131649017, "learning_rate": 0.0001921423852411885, "loss": 0.0947, "step": 147 }, { "epoch": 3.1465968586387434, "grad_norm": 0.10220122337341309, "learning_rate": 0.00019180915047944112, "loss": 0.0914, "step": 148 }, { "epoch": 3.167539267015707, "grad_norm": 0.12001827359199524, "learning_rate": 0.00019146929691161727, "loss": 0.0946, "step": 149 }, { "epoch": 3.18848167539267, "grad_norm": 0.10218129307031631, "learning_rate": 0.0001911228490388136, "loss": 0.0956, "step": 150 }, { "epoch": 3.2094240837696333, "grad_norm": 0.09977877885103226, "learning_rate": 0.00019076983183753045, "loss": 0.0881, "step": 151 }, { "epoch": 3.230366492146597, "grad_norm": 0.11310956627130508, "learning_rate": 0.0001904102707578715, "loss": 0.0899, "step": 152 }, { "epoch": 3.25130890052356, "grad_norm": 0.12192897498607635, "learning_rate": 0.00019004419172170887, "loss": 0.0917, "step": 153 }, { "epoch": 3.2722513089005236, "grad_norm": 0.09461195021867752, "learning_rate": 0.00018967162112081438, "loss": 0.0857, "step": 154 }, { "epoch": 3.2931937172774868, "grad_norm": 0.13657408952713013, "learning_rate": 0.00018929258581495685, "loss": 0.0887, "step": 155 }, { "epoch": 3.3141361256544504, "grad_norm": 0.07357782125473022, "learning_rate": 0.0001889071131299657, "loss": 0.093, "step": 156 }, { "epoch": 3.3350785340314135, "grad_norm": 0.12468763440847397, "learning_rate": 0.00018851523085576096, "loss": 0.0896, "step": 157 }, { "epoch": 3.356020942408377, "grad_norm": 0.08894907683134079, "learning_rate": 0.00018811696724434983, "loss": 0.089, "step": 158 }, { "epoch": 3.3769633507853403, "grad_norm": 0.09679420292377472, "learning_rate": 0.0001877123510077898, "loss": 0.0916, "step": 159 }, { "epoch": 3.3979057591623034, "grad_norm": 0.09011490643024445, "learning_rate": 0.00018730141131611882, "loss": 0.0904, "step": 160 }, { "epoch": 3.418848167539267, "grad_norm": 0.07602863758802414, "learning_rate": 0.0001868841777952524, "loss": 0.0928, "step": 161 }, { "epoch": 3.4397905759162306, "grad_norm": 0.09943581372499466, "learning_rate": 0.00018646068052484755, "loss": 0.0865, "step": 162 }, { "epoch": 3.4607329842931938, "grad_norm": 0.07819810509681702, "learning_rate": 0.0001860309500361345, "loss": 0.0907, "step": 163 }, { "epoch": 3.481675392670157, "grad_norm": 0.10208116471767426, "learning_rate": 0.00018559501730971544, "loss": 0.0888, "step": 164 }, { "epoch": 3.5026178010471205, "grad_norm": 0.06591776013374329, "learning_rate": 0.00018515291377333112, "loss": 0.0887, "step": 165 }, { "epoch": 3.5235602094240837, "grad_norm": 0.10797963291406631, "learning_rate": 0.0001847046712995951, "loss": 0.092, "step": 166 }, { "epoch": 3.5445026178010473, "grad_norm": 0.09793265908956528, "learning_rate": 0.00018425032220369589, "loss": 0.0879, "step": 167 }, { "epoch": 3.5654450261780104, "grad_norm": 0.08764797449111938, "learning_rate": 0.00018378989924106736, "loss": 0.09, "step": 168 }, { "epoch": 3.5863874345549736, "grad_norm": 0.10197529196739197, "learning_rate": 0.0001833234356050273, "loss": 0.0929, "step": 169 }, { "epoch": 3.607329842931937, "grad_norm": 0.09853745996952057, "learning_rate": 0.00018285096492438424, "loss": 0.0882, "step": 170 }, { "epoch": 3.6282722513089007, "grad_norm": 0.07436931878328323, "learning_rate": 0.00018237252126101323, "loss": 0.0894, "step": 171 }, { "epoch": 3.649214659685864, "grad_norm": 0.08946939557790756, "learning_rate": 0.0001818881391074002, "loss": 0.091, "step": 172 }, { "epoch": 3.670157068062827, "grad_norm": 0.07849813997745514, "learning_rate": 0.00018139785338415517, "loss": 0.0869, "step": 173 }, { "epoch": 3.6910994764397906, "grad_norm": 0.10748233646154404, "learning_rate": 0.00018090169943749476, "loss": 0.0903, "step": 174 }, { "epoch": 3.712041884816754, "grad_norm": 0.07644122838973999, "learning_rate": 0.00018039971303669407, "loss": 0.0896, "step": 175 }, { "epoch": 3.7329842931937174, "grad_norm": 0.09647019952535629, "learning_rate": 0.00017989193037150784, "loss": 0.086, "step": 176 }, { "epoch": 3.7539267015706805, "grad_norm": 0.11045010387897491, "learning_rate": 0.0001793783880495615, "loss": 0.0902, "step": 177 }, { "epoch": 3.774869109947644, "grad_norm": 0.08213932067155838, "learning_rate": 0.00017885912309371192, "loss": 0.0871, "step": 178 }, { "epoch": 3.7958115183246073, "grad_norm": 0.08789657056331635, "learning_rate": 0.00017833417293937847, "loss": 0.0896, "step": 179 }, { "epoch": 3.816753926701571, "grad_norm": 0.09543125331401825, "learning_rate": 0.00017780357543184397, "loss": 0.0883, "step": 180 }, { "epoch": 3.837696335078534, "grad_norm": 0.07702449709177017, "learning_rate": 0.0001772673688235265, "loss": 0.0874, "step": 181 }, { "epoch": 3.858638743455497, "grad_norm": 0.09097757190465927, "learning_rate": 0.00017672559177122165, "loss": 0.089, "step": 182 }, { "epoch": 3.8795811518324608, "grad_norm": 0.07636068016290665, "learning_rate": 0.00017617828333331545, "loss": 0.0834, "step": 183 }, { "epoch": 3.900523560209424, "grad_norm": 0.07845642417669296, "learning_rate": 0.00017562548296696875, "loss": 0.0886, "step": 184 }, { "epoch": 3.9214659685863875, "grad_norm": 0.08484441787004471, "learning_rate": 0.00017506723052527242, "loss": 0.0887, "step": 185 }, { "epoch": 3.9424083769633507, "grad_norm": 0.08102953433990479, "learning_rate": 0.0001745035662543745, "loss": 0.0847, "step": 186 }, { "epoch": 3.9633507853403143, "grad_norm": 0.08981820195913315, "learning_rate": 0.00017393453079057847, "loss": 0.0867, "step": 187 }, { "epoch": 3.9842931937172774, "grad_norm": 0.08697811514139175, "learning_rate": 0.00017336016515741366, "loss": 0.0896, "step": 188 }, { "epoch": 4.020942408376963, "grad_norm": 0.21698719263076782, "learning_rate": 0.00017278051076267796, "loss": 0.168, "step": 189 }, { "epoch": 4.041884816753926, "grad_norm": 0.1936062127351761, "learning_rate": 0.00017219560939545246, "loss": 0.086, "step": 190 }, { "epoch": 4.06282722513089, "grad_norm": 0.12648779153823853, "learning_rate": 0.00017160550322308863, "loss": 0.0863, "step": 191 }, { "epoch": 4.0837696335078535, "grad_norm": 0.1570994108915329, "learning_rate": 0.00017101023478816857, "loss": 0.0822, "step": 192 }, { "epoch": 4.104712041884817, "grad_norm": 0.14297914505004883, "learning_rate": 0.00017040984700543793, "loss": 0.0833, "step": 193 }, { "epoch": 4.12565445026178, "grad_norm": 0.15018002688884735, "learning_rate": 0.00016980438315871178, "loss": 0.085, "step": 194 }, { "epoch": 4.146596858638744, "grad_norm": 0.1094595193862915, "learning_rate": 0.00016919388689775464, "loss": 0.0783, "step": 195 }, { "epoch": 4.167539267015707, "grad_norm": 0.1448257714509964, "learning_rate": 0.00016857840223513315, "loss": 0.0865, "step": 196 }, { "epoch": 4.18848167539267, "grad_norm": 0.10195231437683105, "learning_rate": 0.00016795797354304345, "loss": 0.0793, "step": 197 }, { "epoch": 4.209424083769633, "grad_norm": 0.11689773947000504, "learning_rate": 0.00016733264555011195, "loss": 0.0824, "step": 198 }, { "epoch": 4.230366492146596, "grad_norm": 0.11591346561908722, "learning_rate": 0.00016670246333817088, "loss": 0.083, "step": 199 }, { "epoch": 4.2513089005235605, "grad_norm": 0.11457241326570511, "learning_rate": 0.00016606747233900815, "loss": 0.083, "step": 200 }, { "epoch": 4.2513089005235605, "eval_loss": 0.09608737379312515, "eval_runtime": 36.2211, "eval_samples_per_second": 42.213, "eval_steps_per_second": 0.331, "step": 200 }, { "epoch": 4.272251308900524, "grad_norm": 0.10457887500524521, "learning_rate": 0.0001654277183310921, "loss": 0.082, "step": 201 }, { "epoch": 4.293193717277487, "grad_norm": 0.10059670358896255, "learning_rate": 0.00016478324743627101, "loss": 0.0858, "step": 202 }, { "epoch": 4.31413612565445, "grad_norm": 0.11438459903001785, "learning_rate": 0.00016413410611644825, "loss": 0.084, "step": 203 }, { "epoch": 4.335078534031414, "grad_norm": 0.06561236828565598, "learning_rate": 0.00016348034117023258, "loss": 0.0822, "step": 204 }, { "epoch": 4.356020942408377, "grad_norm": 0.11459755152463913, "learning_rate": 0.00016282199972956425, "loss": 0.0826, "step": 205 }, { "epoch": 4.37696335078534, "grad_norm": 0.07275859266519547, "learning_rate": 0.00016215912925631723, "loss": 0.081, "step": 206 }, { "epoch": 4.397905759162303, "grad_norm": 0.10688365250825882, "learning_rate": 0.00016149177753887746, "loss": 0.0804, "step": 207 }, { "epoch": 4.418848167539267, "grad_norm": 0.07950358837842941, "learning_rate": 0.00016081999268869766, "loss": 0.0817, "step": 208 }, { "epoch": 4.439790575916231, "grad_norm": 0.09884137660264969, "learning_rate": 0.00016014382313682881, "loss": 0.0818, "step": 209 }, { "epoch": 4.460732984293194, "grad_norm": 0.09773270040750504, "learning_rate": 0.00015946331763042867, "loss": 0.0841, "step": 210 }, { "epoch": 4.481675392670157, "grad_norm": 0.07958532869815826, "learning_rate": 0.00015877852522924732, "loss": 0.081, "step": 211 }, { "epoch": 4.50261780104712, "grad_norm": 0.10386484861373901, "learning_rate": 0.0001580894953020904, "loss": 0.0821, "step": 212 }, { "epoch": 4.523560209424084, "grad_norm": 0.0741114616394043, "learning_rate": 0.00015739627752325996, "loss": 0.081, "step": 213 }, { "epoch": 4.544502617801047, "grad_norm": 0.10303428024053574, "learning_rate": 0.00015669892186897318, "loss": 0.0811, "step": 214 }, { "epoch": 4.56544502617801, "grad_norm": 0.07302123308181763, "learning_rate": 0.00015599747861375955, "loss": 0.0824, "step": 215 }, { "epoch": 4.5863874345549736, "grad_norm": 0.09889702498912811, "learning_rate": 0.00015529199832683635, "loss": 0.0798, "step": 216 }, { "epoch": 4.607329842931938, "grad_norm": 0.0839948058128357, "learning_rate": 0.00015458253186846301, "loss": 0.084, "step": 217 }, { "epoch": 4.628272251308901, "grad_norm": 0.07695591449737549, "learning_rate": 0.0001538691303862744, "loss": 0.0823, "step": 218 }, { "epoch": 4.649214659685864, "grad_norm": 0.09040986001491547, "learning_rate": 0.0001531518453115934, "loss": 0.0783, "step": 219 }, { "epoch": 4.670157068062827, "grad_norm": 0.09041474014520645, "learning_rate": 0.00015243072835572318, "loss": 0.0825, "step": 220 }, { "epoch": 4.69109947643979, "grad_norm": 0.08503681421279907, "learning_rate": 0.00015170583150621905, "loss": 0.0818, "step": 221 }, { "epoch": 4.712041884816754, "grad_norm": 0.08206664770841599, "learning_rate": 0.00015097720702314055, "loss": 0.0799, "step": 222 }, { "epoch": 4.732984293193717, "grad_norm": 0.08691777288913727, "learning_rate": 0.00015024490743528393, "loss": 0.0818, "step": 223 }, { "epoch": 4.7539267015706805, "grad_norm": 0.07108013331890106, "learning_rate": 0.00014950898553639505, "loss": 0.0796, "step": 224 }, { "epoch": 4.774869109947644, "grad_norm": 0.09734012186527252, "learning_rate": 0.00014876949438136347, "loss": 0.0848, "step": 225 }, { "epoch": 4.795811518324607, "grad_norm": 0.07660133391618729, "learning_rate": 0.00014802648728239742, "loss": 0.0823, "step": 226 }, { "epoch": 4.816753926701571, "grad_norm": 0.0771099254488945, "learning_rate": 0.0001472800178051805, "loss": 0.0816, "step": 227 }, { "epoch": 4.837696335078534, "grad_norm": 0.08631302416324615, "learning_rate": 0.00014653013976500975, "loss": 0.0824, "step": 228 }, { "epoch": 4.858638743455497, "grad_norm": 0.07344726473093033, "learning_rate": 0.00014577690722291622, "loss": 0.0785, "step": 229 }, { "epoch": 4.879581151832461, "grad_norm": 0.08363424241542816, "learning_rate": 0.00014502037448176734, "loss": 0.0796, "step": 230 }, { "epoch": 4.900523560209424, "grad_norm": 0.07857396453619003, "learning_rate": 0.00014426059608235208, "loss": 0.0806, "step": 231 }, { "epoch": 4.9214659685863875, "grad_norm": 0.08594755083322525, "learning_rate": 0.00014349762679944896, "loss": 0.0812, "step": 232 }, { "epoch": 4.942408376963351, "grad_norm": 0.06925872713327408, "learning_rate": 0.00014273152163787726, "loss": 0.0808, "step": 233 }, { "epoch": 4.963350785340314, "grad_norm": 0.08095414191484451, "learning_rate": 0.0001419623358285314, "loss": 0.0796, "step": 234 }, { "epoch": 4.984293193717278, "grad_norm": 0.07342205196619034, "learning_rate": 0.0001411901248243993, "loss": 0.0806, "step": 235 }, { "epoch": 5.020942408376963, "grad_norm": 0.23300093412399292, "learning_rate": 0.00014041494429656442, "loss": 0.1565, "step": 236 }, { "epoch": 5.041884816753926, "grad_norm": 0.12772393226623535, "learning_rate": 0.0001396368501301925, "loss": 0.0732, "step": 237 }, { "epoch": 5.06282722513089, "grad_norm": 0.10052972286939621, "learning_rate": 0.00013885589842050253, "loss": 0.0738, "step": 238 }, { "epoch": 5.0837696335078535, "grad_norm": 0.1047709733247757, "learning_rate": 0.00013807214546872256, "loss": 0.075, "step": 239 }, { "epoch": 5.104712041884817, "grad_norm": 0.10601655393838882, "learning_rate": 0.00013728564777803088, "loss": 0.0737, "step": 240 }, { "epoch": 5.12565445026178, "grad_norm": 0.08795251697301865, "learning_rate": 0.00013649646204948255, "loss": 0.0717, "step": 241 }, { "epoch": 5.146596858638744, "grad_norm": 0.10085717588663101, "learning_rate": 0.00013570464517792153, "loss": 0.0751, "step": 242 }, { "epoch": 5.167539267015707, "grad_norm": 0.09512262046337128, "learning_rate": 0.00013491025424787915, "loss": 0.073, "step": 243 }, { "epoch": 5.18848167539267, "grad_norm": 0.08350583910942078, "learning_rate": 0.0001341133465294585, "loss": 0.0761, "step": 244 }, { "epoch": 5.209424083769633, "grad_norm": 0.09553380310535431, "learning_rate": 0.00013331397947420576, "loss": 0.0747, "step": 245 }, { "epoch": 5.230366492146596, "grad_norm": 0.07822317630052567, "learning_rate": 0.00013251221071096836, "loss": 0.0745, "step": 246 }, { "epoch": 5.2513089005235605, "grad_norm": 0.10339541733264923, "learning_rate": 0.00013170809804174022, "loss": 0.0762, "step": 247 }, { "epoch": 5.272251308900524, "grad_norm": 0.08298144489526749, "learning_rate": 0.00013090169943749476, "loss": 0.0751, "step": 248 }, { "epoch": 5.293193717277487, "grad_norm": 0.0966297909617424, "learning_rate": 0.00013009307303400556, "loss": 0.0724, "step": 249 }, { "epoch": 5.31413612565445, "grad_norm": 0.09534008800983429, "learning_rate": 0.00012928227712765504, "loss": 0.0731, "step": 250 }, { "epoch": 5.335078534031414, "grad_norm": 0.08681947737932205, "learning_rate": 0.00012846937017123197, "loss": 0.075, "step": 251 }, { "epoch": 5.356020942408377, "grad_norm": 0.09559471905231476, "learning_rate": 0.00012765441076971712, "loss": 0.0717, "step": 252 }, { "epoch": 5.37696335078534, "grad_norm": 0.08022520691156387, "learning_rate": 0.00012683745767605846, "loss": 0.0766, "step": 253 }, { "epoch": 5.397905759162303, "grad_norm": 0.10284972935914993, "learning_rate": 0.0001260185697869353, "loss": 0.0704, "step": 254 }, { "epoch": 5.418848167539267, "grad_norm": 0.08318132907152176, "learning_rate": 0.00012519780613851254, "loss": 0.0746, "step": 255 }, { "epoch": 5.439790575916231, "grad_norm": 0.08917541056871414, "learning_rate": 0.00012437522590218417, "loss": 0.0733, "step": 256 }, { "epoch": 5.460732984293194, "grad_norm": 0.08834797143936157, "learning_rate": 0.00012355088838030776, "loss": 0.075, "step": 257 }, { "epoch": 5.481675392670157, "grad_norm": 0.08001340925693512, "learning_rate": 0.00012272485300192902, "loss": 0.0731, "step": 258 }, { "epoch": 5.50261780104712, "grad_norm": 0.07309938222169876, "learning_rate": 0.00012189717931849731, "loss": 0.0719, "step": 259 }, { "epoch": 5.523560209424084, "grad_norm": 0.07951314002275467, "learning_rate": 0.00012106792699957263, "loss": 0.0741, "step": 260 }, { "epoch": 5.544502617801047, "grad_norm": 0.07957018166780472, "learning_rate": 0.00012023715582852357, "loss": 0.0738, "step": 261 }, { "epoch": 5.56544502617801, "grad_norm": 0.076540008187294, "learning_rate": 0.00011940492569821753, "loss": 0.0714, "step": 262 }, { "epoch": 5.5863874345549736, "grad_norm": 0.08407393842935562, "learning_rate": 0.00011857129660670281, "loss": 0.0777, "step": 263 }, { "epoch": 5.607329842931938, "grad_norm": 0.0788414478302002, "learning_rate": 0.00011773632865288309, "loss": 0.0732, "step": 264 }, { "epoch": 5.628272251308901, "grad_norm": 0.0724525973200798, "learning_rate": 0.00011690008203218493, "loss": 0.0783, "step": 265 }, { "epoch": 5.649214659685864, "grad_norm": 0.0882321372628212, "learning_rate": 0.00011606261703221772, "loss": 0.0781, "step": 266 }, { "epoch": 5.670157068062827, "grad_norm": 0.07683246582746506, "learning_rate": 0.00011522399402842783, "loss": 0.0706, "step": 267 }, { "epoch": 5.69109947643979, "grad_norm": 0.07433947920799255, "learning_rate": 0.00011438427347974554, "loss": 0.074, "step": 268 }, { "epoch": 5.712041884816754, "grad_norm": 0.07308503985404968, "learning_rate": 0.00011354351592422665, "loss": 0.0729, "step": 269 }, { "epoch": 5.732984293193717, "grad_norm": 0.08603333681821823, "learning_rate": 0.00011270178197468789, "loss": 0.0752, "step": 270 }, { "epoch": 5.7539267015706805, "grad_norm": 0.07910820841789246, "learning_rate": 0.00011185913231433733, "loss": 0.0752, "step": 271 }, { "epoch": 5.774869109947644, "grad_norm": 0.07771284133195877, "learning_rate": 0.00011101562769239946, "loss": 0.0739, "step": 272 }, { "epoch": 5.795811518324607, "grad_norm": 0.08137574046850204, "learning_rate": 0.0001101713289197356, "loss": 0.0704, "step": 273 }, { "epoch": 5.816753926701571, "grad_norm": 0.07771284133195877, "learning_rate": 0.00010932629686445986, "loss": 0.0766, "step": 274 }, { "epoch": 5.837696335078534, "grad_norm": 0.07269325852394104, "learning_rate": 0.00010848059244755093, "loss": 0.0738, "step": 275 }, { "epoch": 5.858638743455497, "grad_norm": 0.09434104710817337, "learning_rate": 0.00010763427663846015, "loss": 0.0754, "step": 276 }, { "epoch": 5.879581151832461, "grad_norm": 0.07986113429069519, "learning_rate": 0.00010678741045071609, "loss": 0.0727, "step": 277 }, { "epoch": 5.900523560209424, "grad_norm": 0.08152402937412262, "learning_rate": 0.00010594005493752568, "loss": 0.0763, "step": 278 }, { "epoch": 5.9214659685863875, "grad_norm": 0.08020524680614471, "learning_rate": 0.00010509227118737298, "loss": 0.0728, "step": 279 }, { "epoch": 5.942408376963351, "grad_norm": 0.08128321915864944, "learning_rate": 0.00010424412031961484, "loss": 0.0726, "step": 280 }, { "epoch": 5.963350785340314, "grad_norm": 0.09842672944068909, "learning_rate": 0.00010339566348007487, "loss": 0.0738, "step": 281 }, { "epoch": 5.984293193717278, "grad_norm": 0.0821060761809349, "learning_rate": 0.00010254696183663511, "loss": 0.0741, "step": 282 }, { "epoch": 6.020942408376963, "grad_norm": 0.24885313212871552, "learning_rate": 0.00010169807657482623, "loss": 0.1464, "step": 283 }, { "epoch": 6.041884816753926, "grad_norm": 0.10097737610340118, "learning_rate": 0.00010084906889341656, "loss": 0.0664, "step": 284 }, { "epoch": 6.06282722513089, "grad_norm": 0.10555354505777359, "learning_rate": 0.0001, "loss": 0.0672, "step": 285 }, { "epoch": 6.0837696335078535, "grad_norm": 0.10018379241228104, "learning_rate": 9.915093110658346e-05, "loss": 0.0678, "step": 286 }, { "epoch": 6.104712041884817, "grad_norm": 0.09475495666265488, "learning_rate": 9.830192342517379e-05, "loss": 0.0662, "step": 287 }, { "epoch": 6.12565445026178, "grad_norm": 0.10079528391361237, "learning_rate": 9.745303816336489e-05, "loss": 0.0645, "step": 288 }, { "epoch": 6.146596858638744, "grad_norm": 0.08908044546842575, "learning_rate": 9.660433651992514e-05, "loss": 0.0671, "step": 289 }, { "epoch": 6.167539267015707, "grad_norm": 0.10220327973365784, "learning_rate": 9.57558796803852e-05, "loss": 0.0652, "step": 290 }, { "epoch": 6.18848167539267, "grad_norm": 0.0913078561425209, "learning_rate": 9.490772881262709e-05, "loss": 0.0654, "step": 291 }, { "epoch": 6.209424083769633, "grad_norm": 0.09089622646570206, "learning_rate": 9.405994506247432e-05, "loss": 0.0659, "step": 292 }, { "epoch": 6.230366492146596, "grad_norm": 0.10402899235486984, "learning_rate": 9.321258954928393e-05, "loss": 0.0672, "step": 293 }, { "epoch": 6.2513089005235605, "grad_norm": 0.09207270294427872, "learning_rate": 9.236572336153986e-05, "loss": 0.0688, "step": 294 }, { "epoch": 6.272251308900524, "grad_norm": 0.10593326389789581, "learning_rate": 9.151940755244912e-05, "loss": 0.0655, "step": 295 }, { "epoch": 6.293193717277487, "grad_norm": 0.09085794538259506, "learning_rate": 9.067370313554015e-05, "loss": 0.0663, "step": 296 }, { "epoch": 6.31413612565445, "grad_norm": 0.106470987200737, "learning_rate": 8.982867108026442e-05, "loss": 0.0659, "step": 297 }, { "epoch": 6.335078534031414, "grad_norm": 0.08805633336305618, "learning_rate": 8.898437230760058e-05, "loss": 0.0672, "step": 298 }, { "epoch": 6.356020942408377, "grad_norm": 0.10489070415496826, "learning_rate": 8.814086768566272e-05, "loss": 0.0665, "step": 299 }, { "epoch": 6.37696335078534, "grad_norm": 0.11271199584007263, "learning_rate": 8.729821802531212e-05, "loss": 0.0685, "step": 300 }, { "epoch": 6.37696335078534, "eval_loss": 0.09648442268371582, "eval_runtime": 36.2137, "eval_samples_per_second": 42.222, "eval_steps_per_second": 0.331, "step": 300 } ], "logging_steps": 1, "max_steps": 470, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.071804808450802e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }