{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0005134788189987,
  "eval_steps": 244,
  "global_step": 974,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0010269576379974327,
      "grad_norm": 3.957693576812744,
      "learning_rate": 2e-05,
      "loss": 3.6592,
      "step": 1
    },
    {
      "epoch": 0.0010269576379974327,
      "eval_loss": 3.853421926498413,
      "eval_runtime": 26.7657,
      "eval_samples_per_second": 15.318,
      "eval_steps_per_second": 7.659,
      "step": 1
    },
    {
      "epoch": 0.0020539152759948653,
      "grad_norm": 4.336184501647949,
      "learning_rate": 4e-05,
      "loss": 4.1936,
      "step": 2
    },
    {
      "epoch": 0.0030808729139922978,
      "grad_norm": 3.962000846862793,
      "learning_rate": 6e-05,
      "loss": 3.801,
      "step": 3
    },
    {
      "epoch": 0.004107830551989731,
      "grad_norm": 4.063741207122803,
      "learning_rate": 8e-05,
      "loss": 3.8505,
      "step": 4
    },
    {
      "epoch": 0.005134788189987163,
      "grad_norm": 3.923612594604492,
      "learning_rate": 0.0001,
      "loss": 3.5915,
      "step": 5
    },
    {
      "epoch": 0.0061617458279845955,
      "grad_norm": 3.681149959564209,
      "learning_rate": 0.00012,
      "loss": 3.4027,
      "step": 6
    },
    {
      "epoch": 0.007188703465982028,
      "grad_norm": 2.262376546859741,
      "learning_rate": 0.00014,
      "loss": 2.8853,
      "step": 7
    },
    {
      "epoch": 0.008215661103979461,
      "grad_norm": 1.8211342096328735,
      "learning_rate": 0.00016,
      "loss": 2.8695,
      "step": 8
    },
    {
      "epoch": 0.009242618741976894,
      "grad_norm": 3.154785633087158,
      "learning_rate": 0.00018,
      "loss": 2.9279,
      "step": 9
    },
    {
      "epoch": 0.010269576379974325,
      "grad_norm": 2.458535671234131,
      "learning_rate": 0.0002,
      "loss": 2.9412,
      "step": 10
    },
    {
      "epoch": 0.011296534017971758,
      "grad_norm": 3.2424135208129883,
      "learning_rate": 0.0001999994689745966,
      "loss": 2.979,
      "step": 11
    },
    {
      "epoch": 0.012323491655969191,
      "grad_norm": 2.4848523139953613,
      "learning_rate": 0.00019999787590402606,
      "loss": 2.9422,
      "step": 12
    },
    {
      "epoch": 0.013350449293966624,
      "grad_norm": 1.9753702878952026,
      "learning_rate": 0.00019999522080520765,
      "loss": 2.7611,
      "step": 13
    },
    {
      "epoch": 0.014377406931964057,
      "grad_norm": 2.2294538021087646,
      "learning_rate": 0.00019999150370633988,
      "loss": 2.5417,
      "step": 14
    },
    {
      "epoch": 0.01540436456996149,
      "grad_norm": 2.10558819770813,
      "learning_rate": 0.00019998672464690022,
      "loss": 2.6994,
      "step": 15
    },
    {
      "epoch": 0.016431322207958923,
      "grad_norm": 1.6060516834259033,
      "learning_rate": 0.00019998088367764467,
      "loss": 2.6246,
      "step": 16
    },
    {
      "epoch": 0.017458279845956354,
      "grad_norm": 1.5200241804122925,
      "learning_rate": 0.00019997398086060735,
      "loss": 2.4818,
      "step": 17
    },
    {
      "epoch": 0.01848523748395379,
      "grad_norm": 1.8856867551803589,
      "learning_rate": 0.00019996601626909964,
      "loss": 2.895,
      "step": 18
    },
    {
      "epoch": 0.01951219512195122,
      "grad_norm": 3.8515360355377197,
      "learning_rate": 0.00019995698998770956,
      "loss": 3.6369,
      "step": 19
    },
    {
      "epoch": 0.02053915275994865,
      "grad_norm": 2.049919366836548,
      "learning_rate": 0.00019994690211230082,
      "loss": 2.6389,
      "step": 20
    },
    {
      "epoch": 0.021566110397946085,
      "grad_norm": 1.6166138648986816,
      "learning_rate": 0.00019993575275001175,
      "loss": 2.4489,
      "step": 21
    },
    {
      "epoch": 0.022593068035943516,
      "grad_norm": 2.4144058227539062,
      "learning_rate": 0.00019992354201925428,
      "loss": 2.6399,
      "step": 22
    },
    {
      "epoch": 0.02362002567394095,
      "grad_norm": 1.8632601499557495,
      "learning_rate": 0.00019991027004971255,
      "loss": 2.4482,
      "step": 23
    },
    {
      "epoch": 0.024646983311938382,
      "grad_norm": 1.7382563352584839,
      "learning_rate": 0.00019989593698234163,
      "loss": 2.4847,
      "step": 24
    },
    {
      "epoch": 0.025673940949935817,
      "grad_norm": 1.5046484470367432,
      "learning_rate": 0.000199880542969366,
      "loss": 2.1829,
      "step": 25
    },
    {
      "epoch": 0.026700898587933248,
      "grad_norm": 1.998173475265503,
      "learning_rate": 0.0001998640881742778,
      "loss": 2.5759,
      "step": 26
    },
    {
      "epoch": 0.02772785622593068,
      "grad_norm": 1.9964630603790283,
      "learning_rate": 0.00019984657277183544,
      "loss": 2.1866,
      "step": 27
    },
    {
      "epoch": 0.028754813863928114,
      "grad_norm": 1.5282529592514038,
      "learning_rate": 0.00019982799694806135,
      "loss": 2.4439,
      "step": 28
    },
    {
      "epoch": 0.029781771501925545,
      "grad_norm": 1.4352314472198486,
      "learning_rate": 0.0001998083609002402,
      "loss": 2.3027,
      "step": 29
    },
    {
      "epoch": 0.03080872913992298,
      "grad_norm": 1.9962196350097656,
      "learning_rate": 0.00019978766483691676,
      "loss": 2.696,
      "step": 30
    },
    {
      "epoch": 0.03183568677792041,
      "grad_norm": 1.603895902633667,
      "learning_rate": 0.00019976590897789382,
      "loss": 2.5533,
      "step": 31
    },
    {
      "epoch": 0.032862644415917845,
      "grad_norm": 1.7145624160766602,
      "learning_rate": 0.00019974309355422963,
      "loss": 2.4172,
      "step": 32
    },
    {
      "epoch": 0.03388960205391527,
      "grad_norm": 1.640822172164917,
      "learning_rate": 0.00019971921880823553,
      "loss": 2.5851,
      "step": 33
    },
    {
      "epoch": 0.03491655969191271,
      "grad_norm": 1.5972627401351929,
      "learning_rate": 0.0001996942849934735,
      "loss": 2.378,
      "step": 34
    },
    {
      "epoch": 0.03594351732991014,
      "grad_norm": 1.2989180088043213,
      "learning_rate": 0.0001996682923747533,
      "loss": 2.2183,
      "step": 35
    },
    {
      "epoch": 0.03697047496790758,
      "grad_norm": 1.5472369194030762,
      "learning_rate": 0.00019964124122812975,
      "loss": 2.4299,
      "step": 36
    },
    {
      "epoch": 0.037997432605905004,
      "grad_norm": 1.8404388427734375,
      "learning_rate": 0.0001996131318408998,
      "loss": 2.6303,
      "step": 37
    },
    {
      "epoch": 0.03902439024390244,
      "grad_norm": 1.7805739641189575,
      "learning_rate": 0.00019958396451159936,
      "loss": 2.5277,
      "step": 38
    },
    {
      "epoch": 0.040051347881899874,
      "grad_norm": 1.4411464929580688,
      "learning_rate": 0.0001995537395500004,
      "loss": 2.5659,
      "step": 39
    },
    {
      "epoch": 0.0410783055198973,
      "grad_norm": 2.1974802017211914,
      "learning_rate": 0.00019952245727710723,
      "loss": 2.4178,
      "step": 40
    },
    {
      "epoch": 0.042105263157894736,
      "grad_norm": 1.7201310396194458,
      "learning_rate": 0.00019949011802515356,
      "loss": 2.3348,
      "step": 41
    },
    {
      "epoch": 0.04313222079589217,
      "grad_norm": 1.501447319984436,
      "learning_rate": 0.0001994567221375987,
      "loss": 2.2731,
      "step": 42
    },
    {
      "epoch": 0.044159178433889605,
      "grad_norm": 2.366774797439575,
      "learning_rate": 0.00019942226996912384,
      "loss": 2.4477,
      "step": 43
    },
    {
      "epoch": 0.04518613607188703,
      "grad_norm": 1.6313987970352173,
      "learning_rate": 0.00019938676188562863,
      "loss": 2.6674,
      "step": 44
    },
    {
      "epoch": 0.04621309370988447,
      "grad_norm": 1.5255392789840698,
      "learning_rate": 0.00019935019826422692,
      "loss": 2.428,
      "step": 45
    },
    {
      "epoch": 0.0472400513478819,
      "grad_norm": 1.4218732118606567,
      "learning_rate": 0.00019931257949324288,
      "loss": 2.3119,
      "step": 46
    },
    {
      "epoch": 0.04826700898587933,
      "grad_norm": 1.8226125240325928,
      "learning_rate": 0.0001992739059722071,
      "loss": 2.6293,
      "step": 47
    },
    {
      "epoch": 0.049293966623876764,
      "grad_norm": 1.7285653352737427,
      "learning_rate": 0.00019923417811185186,
      "loss": 2.5319,
      "step": 48
    },
    {
      "epoch": 0.0503209242618742,
      "grad_norm": 1.639489769935608,
      "learning_rate": 0.00019919339633410737,
      "loss": 2.5006,
      "step": 49
    },
    {
      "epoch": 0.051347881899871634,
      "grad_norm": 1.6802177429199219,
      "learning_rate": 0.00019915156107209675,
      "loss": 2.4881,
      "step": 50
    },
    {
      "epoch": 0.05237483953786906,
      "grad_norm": 1.5928065776824951,
      "learning_rate": 0.0001991086727701317,
      "loss": 2.3264,
      "step": 51
    },
    {
      "epoch": 0.053401797175866496,
      "grad_norm": 1.5361305475234985,
      "learning_rate": 0.0001990647318837079,
      "loss": 2.3946,
      "step": 52
    },
    {
      "epoch": 0.05442875481386393,
      "grad_norm": 1.5837557315826416,
      "learning_rate": 0.0001990197388794998,
      "loss": 2.4201,
      "step": 53
    },
    {
      "epoch": 0.05545571245186136,
      "grad_norm": 1.4728038311004639,
      "learning_rate": 0.000198973694235356,
      "loss": 2.3488,
      "step": 54
    },
    {
      "epoch": 0.05648267008985879,
      "grad_norm": 1.5199151039123535,
      "learning_rate": 0.00019892659844029397,
      "loss": 2.2515,
      "step": 55
    },
    {
      "epoch": 0.05750962772785623,
      "grad_norm": 1.5088133811950684,
      "learning_rate": 0.00019887845199449504,
      "loss": 2.2909,
      "step": 56
    },
    {
      "epoch": 0.05853658536585366,
      "grad_norm": 1.553422451019287,
      "learning_rate": 0.00019882925540929888,
      "loss": 2.6474,
      "step": 57
    },
    {
      "epoch": 0.05956354300385109,
      "grad_norm": 1.7582780122756958,
      "learning_rate": 0.00019877900920719827,
      "loss": 2.4533,
      "step": 58
    },
    {
      "epoch": 0.060590500641848524,
      "grad_norm": 1.7055028676986694,
      "learning_rate": 0.00019872771392183332,
      "loss": 2.4449,
      "step": 59
    },
    {
      "epoch": 0.06161745827984596,
      "grad_norm": 1.4432878494262695,
      "learning_rate": 0.0001986753700979861,
      "loss": 2.2296,
      "step": 60
    },
    {
      "epoch": 0.0626444159178434,
      "grad_norm": 1.9076869487762451,
      "learning_rate": 0.00019862197829157457,
      "loss": 2.5907,
      "step": 61
    },
    {
      "epoch": 0.06367137355584082,
      "grad_norm": 1.348199725151062,
      "learning_rate": 0.00019856753906964686,
      "loss": 2.2372,
      "step": 62
    },
    {
      "epoch": 0.06469833119383825,
      "grad_norm": 1.7192755937576294,
      "learning_rate": 0.0001985120530103752,
      "loss": 2.4893,
      "step": 63
    },
    {
      "epoch": 0.06572528883183569,
      "grad_norm": 1.456470251083374,
      "learning_rate": 0.00019845552070304966,
      "loss": 2.3985,
      "step": 64
    },
    {
      "epoch": 0.06675224646983312,
      "grad_norm": 1.9413789510726929,
      "learning_rate": 0.00019839794274807213,
      "loss": 2.3872,
      "step": 65
    },
    {
      "epoch": 0.06777920410783055,
      "grad_norm": 1.5768566131591797,
      "learning_rate": 0.0001983393197569497,
      "loss": 2.1808,
      "step": 66
    },
    {
      "epoch": 0.06880616174582799,
      "grad_norm": 1.5114476680755615,
      "learning_rate": 0.00019827965235228834,
      "loss": 2.0469,
      "step": 67
    },
    {
      "epoch": 0.06983311938382541,
      "grad_norm": 2.0698795318603516,
      "learning_rate": 0.00019821894116778615,
      "loss": 2.3916,
      "step": 68
    },
    {
      "epoch": 0.07086007702182286,
      "grad_norm": 1.5041264295578003,
      "learning_rate": 0.00019815718684822688,
      "loss": 2.4773,
      "step": 69
    },
    {
      "epoch": 0.07188703465982028,
      "grad_norm": 1.5561423301696777,
      "learning_rate": 0.00019809439004947268,
      "loss": 2.2211,
      "step": 70
    },
    {
      "epoch": 0.07291399229781771,
      "grad_norm": 1.5526721477508545,
      "learning_rate": 0.00019803055143845745,
      "loss": 2.2947,
      "step": 71
    },
    {
      "epoch": 0.07394094993581515,
      "grad_norm": 2.027614116668701,
      "learning_rate": 0.00019796567169317973,
      "loss": 2.4629,
      "step": 72
    },
    {
      "epoch": 0.07496790757381258,
      "grad_norm": 1.4370557069778442,
      "learning_rate": 0.00019789975150269536,
      "loss": 2.3772,
      "step": 73
    },
    {
      "epoch": 0.07599486521181001,
      "grad_norm": 1.493373990058899,
      "learning_rate": 0.00019783279156711022,
      "loss": 2.169,
      "step": 74
    },
    {
      "epoch": 0.07702182284980745,
      "grad_norm": 1.905671238899231,
      "learning_rate": 0.00019776479259757287,
      "loss": 2.3834,
      "step": 75
    },
    {
      "epoch": 0.07804878048780488,
      "grad_norm": 1.5494719743728638,
      "learning_rate": 0.00019769575531626695,
      "loss": 2.1067,
      "step": 76
    },
    {
      "epoch": 0.0790757381258023,
      "grad_norm": 1.686172366142273,
      "learning_rate": 0.00019762568045640343,
      "loss": 2.3663,
      "step": 77
    },
    {
      "epoch": 0.08010269576379975,
      "grad_norm": 1.640641689300537,
      "learning_rate": 0.0001975545687622129,
      "loss": 2.4287,
      "step": 78
    },
    {
      "epoch": 0.08112965340179717,
      "grad_norm": 1.7262331247329712,
      "learning_rate": 0.0001974824209889377,
      "loss": 2.5115,
      "step": 79
    },
    {
      "epoch": 0.0821566110397946,
      "grad_norm": 1.6362113952636719,
      "learning_rate": 0.00019740923790282389,
      "loss": 2.2623,
      "step": 80
    },
    {
      "epoch": 0.08318356867779204,
      "grad_norm": 1.9334293603897095,
      "learning_rate": 0.00019733502028111295,
      "loss": 2.3749,
      "step": 81
    },
    {
      "epoch": 0.08421052631578947,
      "grad_norm": 1.7188832759857178,
      "learning_rate": 0.00019725976891203376,
      "loss": 2.3507,
      "step": 82
    },
    {
      "epoch": 0.0852374839537869,
      "grad_norm": 1.7757607698440552,
      "learning_rate": 0.0001971834845947941,
      "loss": 2.3925,
      "step": 83
    },
    {
      "epoch": 0.08626444159178434,
      "grad_norm": 1.2982068061828613,
      "learning_rate": 0.00019710616813957218,
      "loss": 2.0348,
      "step": 84
    },
    {
      "epoch": 0.08729139922978177,
      "grad_norm": 1.8079991340637207,
      "learning_rate": 0.000197027820367508,
      "loss": 2.3944,
      "step": 85
    },
    {
      "epoch": 0.08831835686777921,
      "grad_norm": 1.812174916267395,
      "learning_rate": 0.00019694844211069477,
      "loss": 2.2011,
      "step": 86
    },
    {
      "epoch": 0.08934531450577664,
      "grad_norm": 1.7926369905471802,
      "learning_rate": 0.00019686803421216985,
      "loss": 2.5529,
      "step": 87
    },
    {
      "epoch": 0.09037227214377407,
      "grad_norm": 1.5705771446228027,
      "learning_rate": 0.00019678659752590602,
      "loss": 2.2078,
      "step": 88
    },
    {
      "epoch": 0.09139922978177151,
      "grad_norm": 1.5993576049804688,
      "learning_rate": 0.00019670413291680223,
      "loss": 2.4257,
      "step": 89
    },
    {
      "epoch": 0.09242618741976893,
      "grad_norm": 1.531992793083191,
      "learning_rate": 0.00019662064126067452,
      "loss": 2.3995,
      "step": 90
    },
    {
      "epoch": 0.09345314505776636,
      "grad_norm": 1.2814476490020752,
      "learning_rate": 0.0001965361234442467,
      "loss": 2.4486,
      "step": 91
    },
    {
      "epoch": 0.0944801026957638,
      "grad_norm": 1.4734814167022705,
      "learning_rate": 0.00019645058036514096,
      "loss": 2.1195,
      "step": 92
    },
    {
      "epoch": 0.09550706033376123,
      "grad_norm": 1.9309577941894531,
      "learning_rate": 0.00019636401293186823,
      "loss": 2.1903,
      "step": 93
    },
    {
      "epoch": 0.09653401797175866,
      "grad_norm": 1.780761480331421,
      "learning_rate": 0.00019627642206381863,
      "loss": 2.1368,
      "step": 94
    },
    {
      "epoch": 0.0975609756097561,
      "grad_norm": 1.5235934257507324,
      "learning_rate": 0.00019618780869125172,
      "loss": 2.2518,
      "step": 95
    },
    {
      "epoch": 0.09858793324775353,
      "grad_norm": 2.0815932750701904,
      "learning_rate": 0.0001960981737552865,
      "loss": 2.2663,
      "step": 96
    },
    {
      "epoch": 0.09961489088575096,
      "grad_norm": 1.7994256019592285,
      "learning_rate": 0.00019600751820789152,
      "loss": 2.2666,
      "step": 97
    },
    {
      "epoch": 0.1006418485237484,
      "grad_norm": 1.9231226444244385,
      "learning_rate": 0.00019591584301187478,
      "loss": 2.1893,
      "step": 98
    },
    {
      "epoch": 0.10166880616174583,
      "grad_norm": 1.6883149147033691,
      "learning_rate": 0.00019582314914087342,
      "loss": 2.3377,
      "step": 99
    },
    {
      "epoch": 0.10269576379974327,
      "grad_norm": 1.4279382228851318,
      "learning_rate": 0.00019572943757934348,
      "loss": 2.3135,
      "step": 100
    },
    {
      "epoch": 0.1037227214377407,
      "grad_norm": 1.853316307067871,
      "learning_rate": 0.00019563470932254932,
      "loss": 2.0813,
      "step": 101
    },
    {
      "epoch": 0.10474967907573812,
      "grad_norm": 1.6416527032852173,
      "learning_rate": 0.00019553896537655318,
      "loss": 2.3543,
      "step": 102
    },
    {
      "epoch": 0.10577663671373556,
      "grad_norm": 1.6184829473495483,
      "learning_rate": 0.00019544220675820438,
      "loss": 2.1147,
      "step": 103
    },
    {
      "epoch": 0.10680359435173299,
      "grad_norm": 1.7264267206192017,
      "learning_rate": 0.00019534443449512862,
      "loss": 2.294,
      "step": 104
    },
    {
      "epoch": 0.10783055198973042,
      "grad_norm": 2.0255539417266846,
      "learning_rate": 0.00019524564962571702,
      "loss": 2.3318,
      "step": 105
    },
    {
      "epoch": 0.10885750962772786,
      "grad_norm": 2.9923765659332275,
      "learning_rate": 0.0001951458531991151,
      "loss": 2.3253,
      "step": 106
    },
    {
      "epoch": 0.10988446726572529,
      "grad_norm": 2.349273443222046,
      "learning_rate": 0.00019504504627521153,
      "loss": 2.3938,
      "step": 107
    },
    {
      "epoch": 0.11091142490372272,
      "grad_norm": 1.649088978767395,
      "learning_rate": 0.00019494322992462716,
      "loss": 2.3371,
      "step": 108
    },
    {
      "epoch": 0.11193838254172016,
      "grad_norm": 2.8484344482421875,
      "learning_rate": 0.00019484040522870332,
      "loss": 2.0286,
      "step": 109
    },
    {
      "epoch": 0.11296534017971759,
      "grad_norm": 1.859044075012207,
      "learning_rate": 0.00019473657327949054,
      "loss": 2.2637,
      "step": 110
    },
    {
      "epoch": 0.11399229781771501,
      "grad_norm": 1.8111779689788818,
      "learning_rate": 0.00019463173517973682,
      "loss": 2.2017,
      "step": 111
    },
    {
      "epoch": 0.11501925545571245,
      "grad_norm": 2.0917441844940186,
      "learning_rate": 0.00019452589204287612,
      "loss": 2.2183,
      "step": 112
    },
    {
      "epoch": 0.11604621309370988,
      "grad_norm": 1.8554123640060425,
      "learning_rate": 0.0001944190449930163,
      "loss": 2.3178,
      "step": 113
    },
    {
      "epoch": 0.11707317073170732,
      "grad_norm": 1.534638524055481,
      "learning_rate": 0.00019431119516492726,
      "loss": 2.4167,
      "step": 114
    },
    {
      "epoch": 0.11810012836970475,
      "grad_norm": 1.856016755104065,
      "learning_rate": 0.00019420234370402906,
      "loss": 2.226,
      "step": 115
    },
    {
      "epoch": 0.11912708600770218,
      "grad_norm": 2.0289316177368164,
      "learning_rate": 0.00019409249176637945,
      "loss": 2.4272,
      "step": 116
    },
    {
      "epoch": 0.12015404364569962,
      "grad_norm": 1.7497624158859253,
      "learning_rate": 0.00019398164051866184,
      "loss": 2.5199,
      "step": 117
    },
    {
      "epoch": 0.12118100128369705,
      "grad_norm": 1.3328431844711304,
      "learning_rate": 0.00019386979113817282,
      "loss": 2.1596,
      "step": 118
    },
    {
      "epoch": 0.12220795892169448,
      "grad_norm": 1.2402849197387695,
      "learning_rate": 0.00019375694481280965,
      "loss": 2.138,
      "step": 119
    },
    {
      "epoch": 0.12323491655969192,
      "grad_norm": 1.7122734785079956,
      "learning_rate": 0.00019364310274105757,
      "loss": 2.1925,
      "step": 120
    },
    {
      "epoch": 0.12426187419768935,
      "grad_norm": 1.8203654289245605,
      "learning_rate": 0.00019352826613197726,
      "loss": 2.2711,
      "step": 121
    },
    {
      "epoch": 0.1252888318356868,
      "grad_norm": 1.6196566820144653,
      "learning_rate": 0.0001934124362051919,
      "loss": 2.3819,
      "step": 122
    },
    {
      "epoch": 0.12631578947368421,
      "grad_norm": 1.8960175514221191,
      "learning_rate": 0.0001932956141908741,
      "loss": 2.1812,
      "step": 123
    },
    {
      "epoch": 0.12734274711168164,
      "grad_norm": 1.275739312171936,
      "learning_rate": 0.00019317780132973303,
      "loss": 2.2348,
      "step": 124
    },
    {
      "epoch": 0.12836970474967907,
      "grad_norm": 1.3010280132293701,
      "learning_rate": 0.00019305899887300112,
      "loss": 2.2978,
      "step": 125
    },
    {
      "epoch": 0.1293966623876765,
      "grad_norm": 1.5741634368896484,
      "learning_rate": 0.00019293920808242083,
      "loss": 2.5089,
      "step": 126
    },
    {
      "epoch": 0.13042362002567395,
      "grad_norm": 1.253029227256775,
      "learning_rate": 0.00019281843023023122,
      "loss": 2.3474,
      "step": 127
    },
    {
      "epoch": 0.13145057766367138,
      "grad_norm": 1.4684383869171143,
      "learning_rate": 0.00019269666659915444,
      "loss": 2.5293,
      "step": 128
    },
    {
      "epoch": 0.1324775353016688,
      "grad_norm": 1.4969675540924072,
      "learning_rate": 0.00019257391848238214,
      "loss": 2.4311,
      "step": 129
    },
    {
      "epoch": 0.13350449293966624,
      "grad_norm": 1.6982190608978271,
      "learning_rate": 0.0001924501871835616,
      "loss": 2.4293,
      "step": 130
    },
    {
      "epoch": 0.13453145057766366,
      "grad_norm": 1.5805507898330688,
      "learning_rate": 0.00019232547401678218,
      "loss": 2.5934,
      "step": 131
    },
    {
      "epoch": 0.1355584082156611,
      "grad_norm": 1.6854584217071533,
      "learning_rate": 0.00019219978030656103,
      "loss": 2.3307,
      "step": 132
    },
    {
      "epoch": 0.13658536585365855,
      "grad_norm": 1.4633373022079468,
      "learning_rate": 0.00019207310738782922,
      "loss": 2.3324,
      "step": 133
    },
    {
      "epoch": 0.13761232349165597,
      "grad_norm": 1.3780254125595093,
      "learning_rate": 0.00019194545660591752,
      "loss": 2.3986,
      "step": 134
    },
    {
      "epoch": 0.1386392811296534,
      "grad_norm": 1.321736216545105,
      "learning_rate": 0.00019181682931654202,
      "loss": 2.5895,
      "step": 135
    },
    {
      "epoch": 0.13966623876765083,
      "grad_norm": 1.1892857551574707,
      "learning_rate": 0.00019168722688578998,
      "loss": 2.2351,
      "step": 136
    },
    {
      "epoch": 0.14069319640564826,
      "grad_norm": 1.3525614738464355,
      "learning_rate": 0.00019155665069010497,
      "loss": 2.4157,
      "step": 137
    },
    {
      "epoch": 0.1417201540436457,
      "grad_norm": 1.760393500328064,
      "learning_rate": 0.00019142510211627264,
      "loss": 2.5114,
      "step": 138
    },
    {
      "epoch": 0.14274711168164314,
      "grad_norm": 1.3307132720947266,
      "learning_rate": 0.00019129258256140555,
      "loss": 2.4521,
      "step": 139
    },
    {
      "epoch": 0.14377406931964057,
      "grad_norm": 1.3006818294525146,
      "learning_rate": 0.0001911590934329288,
      "loss": 2.172,
      "step": 140
    },
    {
      "epoch": 0.144801026957638,
      "grad_norm": 1.4609280824661255,
      "learning_rate": 0.00019102463614856474,
      "loss": 2.3233,
      "step": 141
    },
    {
      "epoch": 0.14582798459563542,
      "grad_norm": 1.4396119117736816,
      "learning_rate": 0.000190889212136318,
      "loss": 1.9802,
      "step": 142
    },
    {
      "epoch": 0.14685494223363285,
      "grad_norm": 1.3639447689056396,
      "learning_rate": 0.00019075282283446043,
      "loss": 1.9724,
      "step": 143
    },
    {
      "epoch": 0.1478818998716303,
      "grad_norm": 1.2768394947052002,
      "learning_rate": 0.0001906154696915157,
      "loss": 2.2345,
      "step": 144
    },
    {
      "epoch": 0.14890885750962773,
      "grad_norm": 1.6951898336410522,
      "learning_rate": 0.00019047715416624402,
      "loss": 2.1663,
      "step": 145
    },
    {
      "epoch": 0.14993581514762516,
      "grad_norm": 1.5235414505004883,
      "learning_rate": 0.00019033787772762645,
      "loss": 2.3048,
      "step": 146
    },
    {
      "epoch": 0.1509627727856226,
      "grad_norm": 1.5483514070510864,
      "learning_rate": 0.0001901976418548496,
      "loss": 1.9881,
      "step": 147
    },
    {
      "epoch": 0.15198973042362002,
      "grad_norm": 1.3355050086975098,
      "learning_rate": 0.00019005644803728967,
      "loss": 2.1689,
      "step": 148
    },
    {
      "epoch": 0.15301668806161745,
      "grad_norm": 1.7160605192184448,
      "learning_rate": 0.00018991429777449672,
      "loss": 2.363,
      "step": 149
    },
    {
      "epoch": 0.1540436456996149,
      "grad_norm": 1.6425933837890625,
      "learning_rate": 0.00018977119257617878,
      "loss": 2.2472,
      "step": 150
    },
    {
      "epoch": 0.15507060333761233,
      "grad_norm": 1.3999041318893433,
      "learning_rate": 0.00018962713396218574,
      "loss": 2.2772,
      "step": 151
    },
    {
      "epoch": 0.15609756097560976,
      "grad_norm": 1.7348566055297852,
      "learning_rate": 0.00018948212346249333,
      "loss": 2.1869,
      "step": 152
    },
    {
      "epoch": 0.15712451861360718,
      "grad_norm": 1.8590364456176758,
      "learning_rate": 0.0001893361626171867,
      "loss": 2.3029,
      "step": 153
    },
    {
      "epoch": 0.1581514762516046,
      "grad_norm": 1.3040502071380615,
      "learning_rate": 0.00018918925297644416,
      "loss": 2.3171,
      "step": 154
    },
    {
      "epoch": 0.15917843388960207,
      "grad_norm": 1.3961435556411743,
      "learning_rate": 0.00018904139610052077,
      "loss": 2.3829,
      "step": 155
    },
    {
      "epoch": 0.1602053915275995,
      "grad_norm": 1.4264347553253174,
      "learning_rate": 0.00018889259355973163,
      "loss": 2.0808,
      "step": 156
    },
    {
      "epoch": 0.16123234916559692,
      "grad_norm": 1.4675887823104858,
      "learning_rate": 0.00018874284693443536,
      "loss": 2.6039,
      "step": 157
    },
    {
      "epoch": 0.16225930680359435,
      "grad_norm": 1.436353087425232,
      "learning_rate": 0.00018859215781501725,
      "loss": 2.0781,
      "step": 158
    },
    {
      "epoch": 0.16328626444159178,
      "grad_norm": 1.6552093029022217,
      "learning_rate": 0.0001884405278018722,
      "loss": 2.3886,
      "step": 159
    },
    {
      "epoch": 0.1643132220795892,
      "grad_norm": 1.694690465927124,
      "learning_rate": 0.00018828795850538805,
      "loss": 2.327,
      "step": 160
    },
    {
      "epoch": 0.16534017971758666,
      "grad_norm": 1.3449724912643433,
      "learning_rate": 0.00018813445154592826,
      "loss": 2.347,
      "step": 161
    },
    {
      "epoch": 0.1663671373555841,
      "grad_norm": 2.1681737899780273,
      "learning_rate": 0.0001879800085538147,
      "loss": 2.31,
      "step": 162
    },
    {
      "epoch": 0.16739409499358152,
      "grad_norm": 1.3670250177383423,
      "learning_rate": 0.00018782463116931043,
      "loss": 2.263,
      "step": 163
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 1.5687174797058105,
      "learning_rate": 0.0001876683210426022,
      "loss": 2.2114,
      "step": 164
    },
    {
      "epoch": 0.16944801026957637,
      "grad_norm": 1.7429057359695435,
      "learning_rate": 0.000187511079833783,
      "loss": 2.2278,
      "step": 165
    },
    {
      "epoch": 0.1704749679075738,
      "grad_norm": 1.7417389154434204,
      "learning_rate": 0.0001873529092128343,
      "loss": 2.294,
      "step": 166
    },
    {
      "epoch": 0.17150192554557125,
      "grad_norm": 1.469560980796814,
      "learning_rate": 0.0001871938108596085,
      "loss": 2.2767,
      "step": 167
    },
    {
      "epoch": 0.17252888318356868,
      "grad_norm": 1.9978508949279785,
      "learning_rate": 0.00018703378646381098,
      "loss": 2.4065,
      "step": 168
    },
    {
      "epoch": 0.1735558408215661,
      "grad_norm": 1.6224966049194336,
      "learning_rate": 0.00018687283772498206,
      "loss": 2.2886,
      "step": 169
    },
    {
      "epoch": 0.17458279845956354,
      "grad_norm": 1.9054279327392578,
      "learning_rate": 0.00018671096635247914,
      "loss": 2.471,
      "step": 170
    },
    {
      "epoch": 0.17560975609756097,
      "grad_norm": 2.048701763153076,
      "learning_rate": 0.00018654817406545845,
      "loss": 2.2566,
      "step": 171
    },
    {
      "epoch": 0.17663671373555842,
      "grad_norm": 1.327681303024292,
      "learning_rate": 0.00018638446259285678,
      "loss": 2.1956,
      "step": 172
    },
    {
      "epoch": 0.17766367137355585,
      "grad_norm": 1.559768557548523,
      "learning_rate": 0.00018621983367337315,
      "loss": 2.4249,
      "step": 173
    },
    {
      "epoch": 0.17869062901155328,
      "grad_norm": 1.3940227031707764,
      "learning_rate": 0.00018605428905545032,
      "loss": 2.435,
      "step": 174
    },
    {
      "epoch": 0.1797175866495507,
      "grad_norm": 1.5285940170288086,
      "learning_rate": 0.00018588783049725623,
      "loss": 2.2861,
      "step": 175
    },
    {
      "epoch": 0.18074454428754813,
      "grad_norm": 1.4059717655181885,
      "learning_rate": 0.00018572045976666534,
      "loss": 2.3312,
      "step": 176
    },
    {
      "epoch": 0.18177150192554556,
      "grad_norm": 1.327272891998291,
      "learning_rate": 0.0001855521786412399,
      "loss": 2.267,
      "step": 177
    },
    {
      "epoch": 0.18279845956354301,
      "grad_norm": 1.8960320949554443,
      "learning_rate": 0.0001853829889082109,
      "loss": 2.2957,
      "step": 178
    },
    {
      "epoch": 0.18382541720154044,
      "grad_norm": 2.1734304428100586,
      "learning_rate": 0.0001852128923644593,
      "loss": 2.3058,
      "step": 179
    },
    {
      "epoch": 0.18485237483953787,
      "grad_norm": 1.4910588264465332,
      "learning_rate": 0.00018504189081649676,
      "loss": 2.3334,
      "step": 180
    },
    {
      "epoch": 0.1858793324775353,
      "grad_norm": 1.785188913345337,
      "learning_rate": 0.00018486998608044667,
      "loss": 2.3145,
      "step": 181
    },
    {
      "epoch": 0.18690629011553272,
      "grad_norm": 1.7088462114334106,
      "learning_rate": 0.00018469717998202462,
      "loss": 2.3375,
      "step": 182
    },
    {
      "epoch": 0.18793324775353018,
      "grad_norm": 1.542039155960083,
      "learning_rate": 0.0001845234743565192,
      "loss": 2.3347,
      "step": 183
    },
    {
      "epoch": 0.1889602053915276,
      "grad_norm": 1.6210240125656128,
      "learning_rate": 0.00018434887104877242,
      "loss": 2.3396,
      "step": 184
    },
    {
      "epoch": 0.18998716302952504,
      "grad_norm": 1.2093369960784912,
      "learning_rate": 0.00018417337191316003,
      "loss": 2.2205,
      "step": 185
    },
    {
      "epoch": 0.19101412066752246,
      "grad_norm": 1.4520978927612305,
      "learning_rate": 0.00018399697881357212,
      "loss": 2.3782,
      "step": 186
    },
    {
      "epoch": 0.1920410783055199,
      "grad_norm": 1.655849814414978,
      "learning_rate": 0.00018381969362339298,
      "loss": 2.4586,
      "step": 187
    },
    {
      "epoch": 0.19306803594351732,
      "grad_norm": 1.3043372631072998,
      "learning_rate": 0.00018364151822548142,
      "loss": 2.3635,
      "step": 188
    },
    {
      "epoch": 0.19409499358151477,
      "grad_norm": 2.0884976387023926,
      "learning_rate": 0.00018346245451215067,
      "loss": 2.2686,
      "step": 189
    },
    {
      "epoch": 0.1951219512195122,
      "grad_norm": 1.5124359130859375,
      "learning_rate": 0.00018328250438514836,
      "loss": 2.4451,
      "step": 190
    },
    {
      "epoch": 0.19614890885750963,
      "grad_norm": 1.4146149158477783,
      "learning_rate": 0.00018310166975563625,
      "loss": 2.2692,
      "step": 191
    },
    {
      "epoch": 0.19717586649550706,
      "grad_norm": 1.1612133979797363,
      "learning_rate": 0.00018291995254417,
      "loss": 2.2671,
      "step": 192
    },
    {
      "epoch": 0.19820282413350448,
      "grad_norm": 1.9117320775985718,
      "learning_rate": 0.00018273735468067872,
      "loss": 2.3375,
      "step": 193
    },
    {
      "epoch": 0.1992297817715019,
      "grad_norm": 1.1931228637695312,
      "learning_rate": 0.00018255387810444448,
      "loss": 2.1428,
      "step": 194
    },
    {
      "epoch": 0.20025673940949937,
      "grad_norm": 1.616768717765808,
      "learning_rate": 0.0001823695247640817,
      "loss": 2.3382,
      "step": 195
    },
    {
      "epoch": 0.2012836970474968,
      "grad_norm": 1.5892282724380493,
      "learning_rate": 0.0001821842966175166,
      "loss": 2.4363,
      "step": 196
    },
    {
      "epoch": 0.20231065468549422,
      "grad_norm": 1.5550988912582397,
      "learning_rate": 0.00018199819563196617,
      "loss": 2.2785,
      "step": 197
    },
    {
      "epoch": 0.20333761232349165,
      "grad_norm": 2.0319631099700928,
      "learning_rate": 0.0001818112237839174,
      "loss": 2.3412,
      "step": 198
    },
    {
      "epoch": 0.20436456996148908,
      "grad_norm": 1.500908613204956,
      "learning_rate": 0.00018162338305910636,
      "loss": 2.2796,
      "step": 199
    },
    {
      "epoch": 0.20539152759948653,
      "grad_norm": 1.1863993406295776,
      "learning_rate": 0.00018143467545249692,
      "loss": 1.9584,
      "step": 200
    },
    {
      "epoch": 0.20641848523748396,
      "grad_norm": 1.2843319177627563,
      "learning_rate": 0.00018124510296825983,
      "loss": 2.2401,
      "step": 201
    },
    {
      "epoch": 0.2074454428754814,
      "grad_norm": 1.886354684829712,
      "learning_rate": 0.00018105466761975109,
      "loss": 2.4187,
      "step": 202
    },
    {
      "epoch": 0.20847240051347882,
      "grad_norm": 1.7772157192230225,
      "learning_rate": 0.00018086337142949094,
      "loss": 2.4715,
      "step": 203
    },
    {
      "epoch": 0.20949935815147624,
      "grad_norm": 1.5349968671798706,
      "learning_rate": 0.00018067121642914206,
      "loss": 2.2919,
      "step": 204
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 1.6492220163345337,
      "learning_rate": 0.00018047820465948817,
      "loss": 2.2219,
      "step": 205
    },
    {
      "epoch": 0.21155327342747113,
      "grad_norm": 1.8684515953063965,
      "learning_rate": 0.00018028433817041236,
      "loss": 2.4697,
      "step": 206
    },
    {
      "epoch": 0.21258023106546856,
      "grad_norm": 1.287240743637085,
      "learning_rate": 0.00018008961902087528,
      "loss": 2.3128,
      "step": 207
    },
    {
      "epoch": 0.21360718870346598,
      "grad_norm": 2.5276641845703125,
      "learning_rate": 0.00017989404927889316,
      "loss": 2.3647,
      "step": 208
    },
    {
      "epoch": 0.2146341463414634,
      "grad_norm": 1.3976030349731445,
      "learning_rate": 0.00017969763102151603,
      "loss": 2.1668,
      "step": 209
    },
    {
      "epoch": 0.21566110397946084,
      "grad_norm": 1.5781503915786743,
      "learning_rate": 0.00017950036633480556,
      "loss": 2.2656,
      "step": 210
    },
    {
      "epoch": 0.21668806161745827,
      "grad_norm": 1.918054223060608,
      "learning_rate": 0.00017930225731381302,
      "loss": 2.5755,
      "step": 211
    },
    {
      "epoch": 0.21771501925545572,
      "grad_norm": 1.5252821445465088,
      "learning_rate": 0.00017910330606255682,
      "loss": 2.1814,
      "step": 212
    },
    {
      "epoch": 0.21874197689345315,
      "grad_norm": 1.4365805387496948,
      "learning_rate": 0.00017890351469400034,
      "loss": 2.4923,
      "step": 213
    },
    {
      "epoch": 0.21976893453145058,
      "grad_norm": 1.3583849668502808,
      "learning_rate": 0.00017870288533002938,
      "loss": 2.3919,
      "step": 214
    },
    {
      "epoch": 0.220795892169448,
      "grad_norm": 1.389434576034546,
      "learning_rate": 0.00017850142010142982,
      "loss": 2.3916,
      "step": 215
    },
    {
      "epoch": 0.22182284980744543,
      "grad_norm": 1.134041428565979,
      "learning_rate": 0.00017829912114786462,
      "loss": 1.9883,
      "step": 216
    },
    {
      "epoch": 0.2228498074454429,
      "grad_norm": 1.3355865478515625,
      "learning_rate": 0.00017809599061785155,
      "loss": 2.1283,
      "step": 217
    },
    {
      "epoch": 0.22387676508344032,
      "grad_norm": 1.5802854299545288,
      "learning_rate": 0.00017789203066873998,
      "loss": 2.3526,
      "step": 218
    },
    {
      "epoch": 0.22490372272143774,
      "grad_norm": 1.919191837310791,
      "learning_rate": 0.0001776872434666882,
      "loss": 2.2701,
      "step": 219
    },
    {
      "epoch": 0.22593068035943517,
      "grad_norm": 1.7387783527374268,
      "learning_rate": 0.0001774816311866404,
      "loss": 2.4047,
      "step": 220
    },
    {
      "epoch": 0.2269576379974326,
      "grad_norm": 1.3135710954666138,
      "learning_rate": 0.0001772751960123034,
      "loss": 2.2767,
      "step": 221
    },
    {
      "epoch": 0.22798459563543003,
      "grad_norm": 2.7130353450775146,
      "learning_rate": 0.00017706794013612364,
      "loss": 2.269,
      "step": 222
    },
    {
      "epoch": 0.22901155327342748,
      "grad_norm": 1.4474828243255615,
      "learning_rate": 0.00017685986575926386,
      "loss": 2.3064,
      "step": 223
    },
    {
      "epoch": 0.2300385109114249,
      "grad_norm": 2.2177886962890625,
      "learning_rate": 0.00017665097509157962,
      "loss": 2.2529,
      "step": 224
    },
    {
      "epoch": 0.23106546854942234,
      "grad_norm": 1.9783529043197632,
      "learning_rate": 0.00017644127035159596,
      "loss": 2.1634,
      "step": 225
    },
    {
      "epoch": 0.23209242618741976,
      "grad_norm": 1.5306183099746704,
      "learning_rate": 0.00017623075376648376,
      "loss": 2.0591,
      "step": 226
    },
    {
      "epoch": 0.2331193838254172,
      "grad_norm": 1.4206281900405884,
      "learning_rate": 0.00017601942757203612,
      "loss": 2.1645,
      "step": 227
    },
    {
      "epoch": 0.23414634146341465,
      "grad_norm": 1.2981724739074707,
      "learning_rate": 0.0001758072940126446,
      "loss": 2.3063,
      "step": 228
    },
    {
      "epoch": 0.23517329910141208,
      "grad_norm": 1.5360183715820312,
      "learning_rate": 0.00017559435534127534,
      "loss": 2.3622,
      "step": 229
    },
    {
      "epoch": 0.2362002567394095,
      "grad_norm": 1.329416036605835,
      "learning_rate": 0.00017538061381944524,
      "loss": 2.3301,
      "step": 230
    },
    {
      "epoch": 0.23722721437740693,
      "grad_norm": 1.191867470741272,
      "learning_rate": 0.00017516607171719786,
      "loss": 2.0651,
      "step": 231
    },
    {
      "epoch": 0.23825417201540436,
      "grad_norm": 1.8169773817062378,
      "learning_rate": 0.00017495073131307932,
      "loss": 2.4059,
      "step": 232
    },
    {
      "epoch": 0.2392811296534018,
      "grad_norm": 1.5405203104019165,
      "learning_rate": 0.00017473459489411415,
      "loss": 2.4716,
      "step": 233
    },
    {
      "epoch": 0.24030808729139924,
      "grad_norm": 1.899759292602539,
      "learning_rate": 0.0001745176647557809,
      "loss": 2.3427,
      "step": 234
    },
    {
      "epoch": 0.24133504492939667,
      "grad_norm": 1.3432451486587524,
      "learning_rate": 0.00017429994320198786,
      "loss": 2.1323,
      "step": 235
    },
    {
      "epoch": 0.2423620025673941,
      "grad_norm": 1.5715628862380981,
      "learning_rate": 0.00017408143254504856,
      "loss": 2.3001,
      "step": 236
    },
    {
      "epoch": 0.24338896020539152,
      "grad_norm": 1.7504726648330688,
      "learning_rate": 0.00017386213510565715,
      "loss": 2.2933,
      "step": 237
    },
    {
      "epoch": 0.24441591784338895,
      "grad_norm": 2.6738271713256836,
      "learning_rate": 0.00017364205321286394,
      "loss": 2.1217,
      "step": 238
    },
    {
      "epoch": 0.24544287548138638,
      "grad_norm": 1.704504132270813,
      "learning_rate": 0.00017342118920405034,
      "loss": 2.3705,
      "step": 239
    },
    {
      "epoch": 0.24646983311938384,
      "grad_norm": 1.2226909399032593,
      "learning_rate": 0.00017319954542490445,
      "loss": 2.1303,
      "step": 240
    },
    {
      "epoch": 0.24749679075738126,
      "grad_norm": 1.40041184425354,
      "learning_rate": 0.00017297712422939573,
      "loss": 2.2961,
      "step": 241
    },
    {
      "epoch": 0.2485237483953787,
      "grad_norm": 1.618201494216919,
      "learning_rate": 0.00017275392797975032,
      "loss": 2.3552,
      "step": 242
    },
    {
      "epoch": 0.24955070603337612,
      "grad_norm": 1.7513333559036255,
      "learning_rate": 0.0001725299590464258,
      "loss": 2.4547,
      "step": 243
    },
    {
      "epoch": 0.2505776636713736,
      "grad_norm": 1.9280955791473389,
      "learning_rate": 0.000172305219808086,
      "loss": 2.3315,
      "step": 244
    },
    {
      "epoch": 0.2505776636713736,
      "eval_loss": 2.3158633708953857,
      "eval_runtime": 26.8137,
      "eval_samples_per_second": 15.291,
      "eval_steps_per_second": 7.645,
      "step": 244
    },
    {
      "epoch": 0.251604621309371,
      "grad_norm": 1.8800264596939087,
      "learning_rate": 0.00017207971265157586,
      "loss": 2.2771,
      "step": 245
    },
    {
      "epoch": 0.25263157894736843,
      "grad_norm": 2.2899258136749268,
      "learning_rate": 0.00017185343997189588,
      "loss": 2.4253,
      "step": 246
    },
    {
      "epoch": 0.25365853658536586,
      "grad_norm": 1.3957444429397583,
      "learning_rate": 0.00017162640417217695,
      "loss": 2.2387,
      "step": 247
    },
    {
      "epoch": 0.2546854942233633,
      "grad_norm": 1.471025824546814,
      "learning_rate": 0.00017139860766365457,
      "loss": 2.3249,
      "step": 248
    },
    {
      "epoch": 0.2557124518613607,
      "grad_norm": 1.5242273807525635,
      "learning_rate": 0.00017117005286564342,
      "loss": 2.274,
      "step": 249
    },
    {
      "epoch": 0.25673940949935814,
      "grad_norm": 2.3328254222869873,
      "learning_rate": 0.00017094074220551158,
      "loss": 2.3465,
      "step": 250
    },
    {
      "epoch": 0.25776636713735557,
      "grad_norm": 1.3928115367889404,
      "learning_rate": 0.00017071067811865476,
      "loss": 2.1276,
      "step": 251
    },
    {
      "epoch": 0.258793324775353,
      "grad_norm": 1.318634271621704,
      "learning_rate": 0.00017047986304847044,
      "loss": 2.3943,
      "step": 252
    },
    {
      "epoch": 0.2598202824133504,
      "grad_norm": 1.8041123151779175,
      "learning_rate": 0.00017024829944633195,
      "loss": 2.2782,
      "step": 253
    },
    {
      "epoch": 0.2608472400513479,
      "grad_norm": 1.4186384677886963,
      "learning_rate": 0.0001700159897715624,
      "loss": 2.2574,
      "step": 254
    },
    {
      "epoch": 0.26187419768934533,
      "grad_norm": 1.4350296258926392,
      "learning_rate": 0.00016978293649140853,
      "loss": 2.103,
      "step": 255
    },
    {
      "epoch": 0.26290115532734276,
      "grad_norm": 1.6985528469085693,
      "learning_rate": 0.0001695491420810146,
      "loss": 2.426,
      "step": 256
    },
    {
      "epoch": 0.2639281129653402,
      "grad_norm": 1.5659419298171997,
      "learning_rate": 0.00016931460902339608,
      "loss": 2.3175,
      "step": 257
    },
    {
      "epoch": 0.2649550706033376,
      "grad_norm": 1.5348899364471436,
      "learning_rate": 0.00016907933980941312,
      "loss": 2.0148,
      "step": 258
    },
    {
      "epoch": 0.26598202824133504,
      "grad_norm": 1.5878114700317383,
      "learning_rate": 0.00016884333693774437,
      "loss": 2.3478,
      "step": 259
    },
    {
      "epoch": 0.26700898587933247,
      "grad_norm": 1.6665490865707397,
      "learning_rate": 0.0001686066029148602,
      "loss": 2.2672,
      "step": 260
    },
    {
      "epoch": 0.2680359435173299,
      "grad_norm": 2.0139546394348145,
      "learning_rate": 0.00016836914025499623,
      "loss": 2.2764,
      "step": 261
    },
    {
      "epoch": 0.2690629011553273,
      "grad_norm": 1.5040104389190674,
      "learning_rate": 0.0001681309514801265,
      "loss": 2.4329,
      "step": 262
    },
    {
      "epoch": 0.27008985879332476,
      "grad_norm": 1.9814090728759766,
      "learning_rate": 0.0001678920391199369,
      "loss": 2.1721,
      "step": 263
    },
    {
      "epoch": 0.2711168164313222,
      "grad_norm": 1.6174941062927246,
      "learning_rate": 0.00016765240571179802,
      "loss": 2.2285,
      "step": 264
    },
    {
      "epoch": 0.27214377406931967,
      "grad_norm": 1.6301368474960327,
      "learning_rate": 0.00016741205380073842,
      "loss": 2.1531,
      "step": 265
    },
    {
      "epoch": 0.2731707317073171,
      "grad_norm": 1.551831603050232,
      "learning_rate": 0.00016717098593941752,
      "loss": 2.1856,
      "step": 266
    },
    {
      "epoch": 0.2741976893453145,
      "grad_norm": 1.327530860900879,
      "learning_rate": 0.00016692920468809846,
      "loss": 2.4251,
      "step": 267
    },
    {
      "epoch": 0.27522464698331195,
      "grad_norm": 1.6990398168563843,
      "learning_rate": 0.00016668671261462102,
      "loss": 2.2068,
      "step": 268
    },
    {
      "epoch": 0.2762516046213094,
      "grad_norm": 2.0649585723876953,
      "learning_rate": 0.00016644351229437416,
      "loss": 2.5073,
      "step": 269
    },
    {
      "epoch": 0.2772785622593068,
      "grad_norm": 2.4592831134796143,
      "learning_rate": 0.00016619960631026888,
      "loss": 2.507,
      "step": 270
    },
    {
      "epoch": 0.27830551989730423,
      "grad_norm": 1.1428396701812744,
      "learning_rate": 0.00016595499725271067,
      "loss": 2.33,
      "step": 271
    },
    {
      "epoch": 0.27933247753530166,
      "grad_norm": 1.5621798038482666,
      "learning_rate": 0.00016570968771957196,
      "loss": 2.4772,
      "step": 272
    },
    {
      "epoch": 0.2803594351732991,
      "grad_norm": 2.22113037109375,
      "learning_rate": 0.00016546368031616465,
      "loss": 2.1417,
      "step": 273
    },
    {
      "epoch": 0.2813863928112965,
      "grad_norm": 1.6532412767410278,
      "learning_rate": 0.0001652169776552123,
      "loss": 2.1814,
      "step": 274
    },
    {
      "epoch": 0.28241335044929394,
      "grad_norm": 1.35118567943573,
      "learning_rate": 0.0001649695823568226,
      "loss": 2.4149,
      "step": 275
    },
    {
      "epoch": 0.2834403080872914,
      "grad_norm": 1.3244826793670654,
      "learning_rate": 0.00016472149704845927,
      "loss": 2.3371,
      "step": 276
    },
    {
      "epoch": 0.28446726572528885,
      "grad_norm": 1.3463726043701172,
      "learning_rate": 0.00016447272436491433,
      "loss": 2.1964,
      "step": 277
    },
    {
      "epoch": 0.2854942233632863,
      "grad_norm": 1.2562531232833862,
      "learning_rate": 0.00016422326694828007,
      "loss": 2.3121,
      "step": 278
    },
    {
      "epoch": 0.2865211810012837,
      "grad_norm": 1.1574952602386475,
      "learning_rate": 0.000163973127447921,
      "loss": 2.0812,
      "step": 279
    },
    {
      "epoch": 0.28754813863928114,
      "grad_norm": 1.3550206422805786,
      "learning_rate": 0.0001637223085204457,
      "loss": 2.3015,
      "step": 280
    },
    {
      "epoch": 0.28857509627727856,
      "grad_norm": 1.6482678651809692,
      "learning_rate": 0.0001634708128296786,
      "loss": 2.4281,
      "step": 281
    },
    {
      "epoch": 0.289602053915276,
      "grad_norm": 1.4810844659805298,
      "learning_rate": 0.00016321864304663173,
      "loss": 2.4883,
      "step": 282
    },
    {
      "epoch": 0.2906290115532734,
      "grad_norm": 1.8965210914611816,
      "learning_rate": 0.00016296580184947633,
      "loss": 2.5187,
      "step": 283
    },
    {
      "epoch": 0.29165596919127085,
      "grad_norm": 1.3743977546691895,
      "learning_rate": 0.00016271229192351428,
      "loss": 2.1911,
      "step": 284
    },
    {
      "epoch": 0.2926829268292683,
      "grad_norm": 1.7670851945877075,
      "learning_rate": 0.0001624581159611499,
      "loss": 2.0569,
      "step": 285
    },
    {
      "epoch": 0.2937098844672657,
      "grad_norm": 1.865688443183899,
      "learning_rate": 0.000162203276661861,
      "loss": 2.3579,
      "step": 286
    },
    {
      "epoch": 0.29473684210526313,
      "grad_norm": 1.762488842010498,
      "learning_rate": 0.00016194777673217043,
      "loss": 2.1302,
      "step": 287
    },
    {
      "epoch": 0.2957637997432606,
      "grad_norm": 1.545452356338501,
      "learning_rate": 0.00016169161888561723,
      "loss": 2.3168,
      "step": 288
    },
    {
      "epoch": 0.29679075738125804,
      "grad_norm": 1.5264406204223633,
      "learning_rate": 0.00016143480584272793,
      "loss": 2.3294,
      "step": 289
    },
    {
      "epoch": 0.29781771501925547,
      "grad_norm": 1.4859471321105957,
      "learning_rate": 0.00016117734033098744,
      "loss": 2.4337,
      "step": 290
    },
    {
      "epoch": 0.2988446726572529,
      "grad_norm": 2.1913015842437744,
      "learning_rate": 0.0001609192250848104,
      "loss": 2.314,
      "step": 291
    },
    {
      "epoch": 0.2998716302952503,
      "grad_norm": 1.3715288639068604,
      "learning_rate": 0.00016066046284551178,
      "loss": 2.3759,
      "step": 292
    },
    {
      "epoch": 0.30089858793324775,
      "grad_norm": 1.243543267250061,
      "learning_rate": 0.00016040105636127807,
      "loss": 2.289,
      "step": 293
    },
    {
      "epoch": 0.3019255455712452,
      "grad_norm": 1.3851499557495117,
      "learning_rate": 0.00016014100838713797,
      "loss": 2.5364,
      "step": 294
    },
    {
      "epoch": 0.3029525032092426,
      "grad_norm": 1.318569540977478,
      "learning_rate": 0.000159880321684933,
      "loss": 2.2201,
      "step": 295
    },
    {
      "epoch": 0.30397946084724004,
      "grad_norm": 1.7080748081207275,
      "learning_rate": 0.00015961899902328845,
      "loss": 2.4458,
      "step": 296
    },
    {
      "epoch": 0.30500641848523746,
      "grad_norm": 1.462457299232483,
      "learning_rate": 0.0001593570431775837,
      "loss": 2.1665,
      "step": 297
    },
    {
      "epoch": 0.3060333761232349,
      "grad_norm": 1.3777559995651245,
      "learning_rate": 0.000159094456929923,
      "loss": 2.2534,
      "step": 298
    },
    {
      "epoch": 0.3070603337612324,
      "grad_norm": 2.4722256660461426,
      "learning_rate": 0.00015883124306910565,
      "loss": 2.4955,
      "step": 299
    },
    {
      "epoch": 0.3080872913992298,
      "grad_norm": 1.4936336278915405,
      "learning_rate": 0.0001585674043905966,
      "loss": 2.3775,
      "step": 300
    },
    {
      "epoch": 0.30911424903722723,
      "grad_norm": 1.8284447193145752,
      "learning_rate": 0.00015830294369649668,
      "loss": 2.1124,
      "step": 301
    },
    {
      "epoch": 0.31014120667522466,
      "grad_norm": 1.5503596067428589,
      "learning_rate": 0.0001580378637955128,
      "loss": 2.3898,
      "step": 302
    },
    {
      "epoch": 0.3111681643132221,
      "grad_norm": 1.868800163269043,
      "learning_rate": 0.00015777216750292823,
      "loss": 2.2951,
      "step": 303
    },
    {
      "epoch": 0.3121951219512195,
      "grad_norm": 1.6728500127792358,
      "learning_rate": 0.0001575058576405725,
      "loss": 2.2314,
      "step": 304
    },
    {
      "epoch": 0.31322207958921694,
      "grad_norm": 1.3890676498413086,
      "learning_rate": 0.00015723893703679172,
      "loss": 2.1032,
      "step": 305
    },
    {
      "epoch": 0.31424903722721437,
      "grad_norm": 1.9047855138778687,
      "learning_rate": 0.00015697140852641834,
      "loss": 2.1308,
      "step": 306
    },
    {
      "epoch": 0.3152759948652118,
      "grad_norm": 1.7245334386825562,
      "learning_rate": 0.00015670327495074103,
      "loss": 2.0928,
      "step": 307
    },
    {
      "epoch": 0.3163029525032092,
      "grad_norm": 1.295081615447998,
      "learning_rate": 0.00015643453915747455,
      "loss": 2.2695,
      "step": 308
    },
    {
      "epoch": 0.31732991014120665,
      "grad_norm": 1.3265256881713867,
      "learning_rate": 0.00015616520400072963,
      "loss": 2.3772,
      "step": 309
    },
    {
      "epoch": 0.31835686777920413,
      "grad_norm": 1.2527427673339844,
      "learning_rate": 0.00015589527234098247,
      "loss": 2.3704,
      "step": 310
    },
    {
      "epoch": 0.31938382541720156,
      "grad_norm": 1.2341771125793457,
      "learning_rate": 0.00015562474704504438,
      "loss": 2.1607,
      "step": 311
    },
    {
      "epoch": 0.320410783055199,
      "grad_norm": 1.3696041107177734,
      "learning_rate": 0.00015535363098603152,
      "loss": 2.3136,
      "step": 312
    },
    {
      "epoch": 0.3214377406931964,
      "grad_norm": 1.7420108318328857,
      "learning_rate": 0.00015508192704333413,
      "loss": 2.4447,
      "step": 313
    },
    {
      "epoch": 0.32246469833119384,
      "grad_norm": 1.5119073390960693,
      "learning_rate": 0.00015480963810258613,
      "loss": 2.3928,
      "step": 314
    },
    {
      "epoch": 0.32349165596919127,
      "grad_norm": 1.2545535564422607,
      "learning_rate": 0.00015453676705563444,
      "loss": 2.4365,
      "step": 315
    },
    {
      "epoch": 0.3245186136071887,
      "grad_norm": 1.4043221473693848,
      "learning_rate": 0.00015426331680050824,
      "loss": 2.5759,
      "step": 316
    },
    {
      "epoch": 0.3255455712451861,
      "grad_norm": 1.4104640483856201,
      "learning_rate": 0.00015398929024138807,
      "loss": 2.4133,
      "step": 317
    },
    {
      "epoch": 0.32657252888318355,
      "grad_norm": 1.2791924476623535,
      "learning_rate": 0.00015371469028857532,
      "loss": 2.0351,
      "step": 318
    },
    {
      "epoch": 0.327599486521181,
      "grad_norm": 1.2710974216461182,
      "learning_rate": 0.00015343951985846095,
      "loss": 2.0818,
      "step": 319
    },
    {
      "epoch": 0.3286264441591784,
      "grad_norm": 1.5024012327194214,
      "learning_rate": 0.00015316378187349474,
      "loss": 2.1922,
      "step": 320
    },
    {
      "epoch": 0.3296534017971759,
      "grad_norm": 1.1681658029556274,
      "learning_rate": 0.00015288747926215418,
      "loss": 1.9304,
      "step": 321
    },
    {
      "epoch": 0.3306803594351733,
      "grad_norm": 1.3742514848709106,
      "learning_rate": 0.00015261061495891345,
      "loss": 2.3971,
      "step": 322
    },
    {
      "epoch": 0.33170731707317075,
      "grad_norm": 2.3611044883728027,
      "learning_rate": 0.00015233319190421197,
      "loss": 2.3189,
      "step": 323
    },
    {
      "epoch": 0.3327342747111682,
      "grad_norm": 1.5957375764846802,
      "learning_rate": 0.00015205521304442366,
      "loss": 2.4309,
      "step": 324
    },
    {
      "epoch": 0.3337612323491656,
      "grad_norm": 1.3179574012756348,
      "learning_rate": 0.00015177668133182522,
      "loss": 2.2626,
      "step": 325
    },
    {
      "epoch": 0.33478818998716303,
      "grad_norm": 1.4424761533737183,
      "learning_rate": 0.0001514975997245649,
      "loss": 2.2325,
      "step": 326
    },
    {
      "epoch": 0.33581514762516046,
      "grad_norm": 1.499611496925354,
      "learning_rate": 0.00015121797118663124,
      "loss": 2.5169,
      "step": 327
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 2.3086583614349365,
      "learning_rate": 0.0001509377986878213,
      "loss": 2.4343,
      "step": 328
    },
    {
      "epoch": 0.3378690629011553,
      "grad_norm": 2.3896758556365967,
      "learning_rate": 0.00015065708520370944,
      "loss": 2.2872,
      "step": 329
    },
    {
      "epoch": 0.33889602053915274,
      "grad_norm": 2.070495367050171,
      "learning_rate": 0.00015037583371561535,
      "loss": 2.1122,
      "step": 330
    },
    {
      "epoch": 0.33992297817715017,
      "grad_norm": 1.6736974716186523,
      "learning_rate": 0.0001500940472105729,
      "loss": 2.2805,
      "step": 331
    },
    {
      "epoch": 0.3409499358151476,
      "grad_norm": 1.298316240310669,
      "learning_rate": 0.00014981172868129786,
      "loss": 2.2484,
      "step": 332
    },
    {
      "epoch": 0.3419768934531451,
      "grad_norm": 1.763777732849121,
      "learning_rate": 0.00014952888112615645,
      "loss": 2.3688,
      "step": 333
    },
    {
      "epoch": 0.3430038510911425,
      "grad_norm": 1.5646326541900635,
      "learning_rate": 0.0001492455075491334,
      "loss": 2.2685,
      "step": 334
    },
    {
      "epoch": 0.34403080872913994,
      "grad_norm": 2.415956735610962,
      "learning_rate": 0.00014896161095980008,
      "loss": 2.2984,
      "step": 335
    },
    {
      "epoch": 0.34505776636713736,
      "grad_norm": 1.9530178308486938,
      "learning_rate": 0.00014867719437328252,
      "loss": 2.3111,
      "step": 336
    },
    {
      "epoch": 0.3460847240051348,
      "grad_norm": 1.4584389925003052,
      "learning_rate": 0.00014839226081022938,
      "loss": 2.4523,
      "step": 337
    },
    {
      "epoch": 0.3471116816431322,
      "grad_norm": 2.4603986740112305,
      "learning_rate": 0.00014810681329677987,
      "loss": 2.2778,
      "step": 338
    },
    {
      "epoch": 0.34813863928112965,
      "grad_norm": 1.2564218044281006,
      "learning_rate": 0.00014782085486453154,
      "loss": 2.171,
      "step": 339
    },
    {
      "epoch": 0.3491655969191271,
      "grad_norm": 8.110305786132812,
      "learning_rate": 0.00014753438855050828,
      "loss": 2.3417,
      "step": 340
    },
    {
      "epoch": 0.3501925545571245,
      "grad_norm": 1.9910926818847656,
      "learning_rate": 0.00014724741739712794,
      "loss": 2.0176,
      "step": 341
    },
    {
      "epoch": 0.35121951219512193,
      "grad_norm": 1.5625698566436768,
      "learning_rate": 0.00014695994445216985,
      "loss": 2.4374,
      "step": 342
    },
    {
      "epoch": 0.35224646983311936,
      "grad_norm": 1.5060590505599976,
      "learning_rate": 0.00014667197276874286,
      "loss": 2.3906,
      "step": 343
    },
    {
      "epoch": 0.35327342747111684,
      "grad_norm": 1.2828539609909058,
      "learning_rate": 0.00014638350540525246,
      "loss": 2.2784,
      "step": 344
    },
    {
      "epoch": 0.35430038510911427,
      "grad_norm": 1.1949716806411743,
      "learning_rate": 0.0001460945454253687,
      "loss": 2.2366,
      "step": 345
    },
    {
      "epoch": 0.3553273427471117,
      "grad_norm": 1.9188710451126099,
      "learning_rate": 0.00014580509589799329,
      "loss": 2.2652,
      "step": 346
    },
    {
      "epoch": 0.3563543003851091,
      "grad_norm": 1.220718502998352,
      "learning_rate": 0.00014551515989722733,
      "loss": 2.1333,
      "step": 347
    },
    {
      "epoch": 0.35738125802310655,
      "grad_norm": 1.4678255319595337,
      "learning_rate": 0.00014522474050233846,
      "loss": 2.1317,
      "step": 348
    },
    {
      "epoch": 0.358408215661104,
      "grad_norm": 1.2910618782043457,
      "learning_rate": 0.00014493384079772813,
      "loss": 2.4736,
      "step": 349
    },
    {
      "epoch": 0.3594351732991014,
      "grad_norm": 1.3866629600524902,
      "learning_rate": 0.00014464246387289913,
      "loss": 2.1866,
      "step": 350
    },
    {
      "epoch": 0.36046213093709883,
      "grad_norm": 1.3790191411972046,
      "learning_rate": 0.00014435061282242232,
      "loss": 2.2616,
      "step": 351
    },
    {
      "epoch": 0.36148908857509626,
      "grad_norm": 1.481720209121704,
      "learning_rate": 0.00014405829074590424,
      "loss": 2.2923,
      "step": 352
    },
    {
      "epoch": 0.3625160462130937,
      "grad_norm": 1.427502155303955,
      "learning_rate": 0.00014376550074795375,
      "loss": 2.3331,
      "step": 353
    },
    {
      "epoch": 0.3635430038510911,
      "grad_norm": 1.3853408098220825,
      "learning_rate": 0.00014347224593814944,
      "loss": 2.3992,
      "step": 354
    },
    {
      "epoch": 0.3645699614890886,
      "grad_norm": 1.7792441844940186,
      "learning_rate": 0.00014317852943100643,
      "loss": 2.4461,
      "step": 355
    },
    {
      "epoch": 0.36559691912708603,
      "grad_norm": 1.5491658449172974,
      "learning_rate": 0.00014288435434594315,
      "loss": 2.1179,
      "step": 356
    },
    {
      "epoch": 0.36662387676508346,
      "grad_norm": 1.3137096166610718,
      "learning_rate": 0.00014258972380724858,
      "loss": 2.0743,
      "step": 357
    },
    {
      "epoch": 0.3676508344030809,
      "grad_norm": 1.8642239570617676,
      "learning_rate": 0.00014229464094404865,
      "loss": 2.3197,
      "step": 358
    },
    {
      "epoch": 0.3686777920410783,
      "grad_norm": 1.9754403829574585,
      "learning_rate": 0.00014199910889027334,
      "loss": 2.4367,
      "step": 359
    },
    {
      "epoch": 0.36970474967907574,
      "grad_norm": 1.3563640117645264,
      "learning_rate": 0.00014170313078462317,
      "loss": 2.3651,
      "step": 360
    },
    {
      "epoch": 0.37073170731707317,
      "grad_norm": 1.7116272449493408,
      "learning_rate": 0.00014140670977053603,
      "loss": 2.2974,
      "step": 361
    },
    {
      "epoch": 0.3717586649550706,
      "grad_norm": 1.1759010553359985,
      "learning_rate": 0.00014110984899615367,
      "loss": 2.0675,
      "step": 362
    },
    {
      "epoch": 0.372785622593068,
      "grad_norm": 1.3649141788482666,
      "learning_rate": 0.00014081255161428838,
      "loss": 2.3528,
      "step": 363
    },
    {
      "epoch": 0.37381258023106545,
      "grad_norm": 1.641750454902649,
      "learning_rate": 0.00014051482078238932,
      "loss": 2.439,
      "step": 364
    },
    {
      "epoch": 0.3748395378690629,
      "grad_norm": 1.3235962390899658,
      "learning_rate": 0.00014021665966250927,
      "loss": 2.3409,
      "step": 365
    },
    {
      "epoch": 0.37586649550706036,
      "grad_norm": 1.463796615600586,
      "learning_rate": 0.0001399180714212708,
|
"loss": 2.427, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.3768934531450578, |
|
"grad_norm": 1.3167238235473633, |
|
"learning_rate": 0.0001396190592298327, |
|
"loss": 2.1034, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.3779204107830552, |
|
"grad_norm": 2.2455053329467773, |
|
"learning_rate": 0.0001393196262638564, |
|
"loss": 2.2139, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.37894736842105264, |
|
"grad_norm": 1.6802003383636475, |
|
"learning_rate": 0.0001390197757034721, |
|
"loss": 2.3074, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.37997432605905007, |
|
"grad_norm": 1.349055290222168, |
|
"learning_rate": 0.00013871951073324507, |
|
"loss": 2.2607, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3810012836970475, |
|
"grad_norm": 1.5269880294799805, |
|
"learning_rate": 0.00013841883454214195, |
|
"loss": 2.3848, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.3820282413350449, |
|
"grad_norm": 1.4355286359786987, |
|
"learning_rate": 0.00013811775032349655, |
|
"loss": 2.2389, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.38305519897304235, |
|
"grad_norm": 1.4193495512008667, |
|
"learning_rate": 0.00013781626127497631, |
|
"loss": 2.574, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.3840821566110398, |
|
"grad_norm": 1.2355318069458008, |
|
"learning_rate": 0.0001375143705985481, |
|
"loss": 2.0485, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.3851091142490372, |
|
"grad_norm": 1.8594748973846436, |
|
"learning_rate": 0.0001372120815004442, |
|
"loss": 2.1466, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.38613607188703464, |
|
"grad_norm": 2.062476873397827, |
|
"learning_rate": 0.0001369093971911285, |
|
"loss": 2.264, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.38716302952503207, |
|
"grad_norm": 1.016082525253296, |
|
"learning_rate": 0.00013660632088526213, |
|
"loss": 1.8858, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.38818998716302955, |
|
"grad_norm": 1.8727707862854004, |
|
"learning_rate": 0.00013630285580166945, |
|
"loss": 2.3701, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.389216944801027, |
|
"grad_norm": 2.2555723190307617, |
|
"learning_rate": 0.00013599900516330382, |
|
"loss": 2.34, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.3902439024390244, |
|
"grad_norm": 1.6758098602294922, |
|
"learning_rate": 0.00013569477219721335, |
|
"loss": 2.3075, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.39127086007702183, |
|
"grad_norm": 1.789185643196106, |
|
"learning_rate": 0.0001353901601345068, |
|
"loss": 2.2605, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.39229781771501926, |
|
"grad_norm": 1.6649662256240845, |
|
"learning_rate": 0.000135085172210319, |
|
"loss": 2.3859, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.3933247753530167, |
|
"grad_norm": 1.5336881875991821, |
|
"learning_rate": 0.00013477981166377663, |
|
"loss": 2.4461, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.3943517329910141, |
|
"grad_norm": 1.6705595254898071, |
|
"learning_rate": 0.00013447408173796385, |
|
"loss": 2.2572, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.39537869062901154, |
|
"grad_norm": 1.7873886823654175, |
|
"learning_rate": 0.00013416798567988784, |
|
"loss": 2.1611, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.39640564826700897, |
|
"grad_norm": 1.7348014116287231, |
|
"learning_rate": 0.00013386152674044422, |
|
"loss": 2.2431, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.3974326059050064, |
|
"grad_norm": 1.687648057937622, |
|
"learning_rate": 0.00013355470817438264, |
|
"loss": 2.2149, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.3984595635430038, |
|
"grad_norm": 1.5006660223007202, |
|
"learning_rate": 0.00013324753324027216, |
|
"loss": 2.5911, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.3994865211810013, |
|
"grad_norm": 1.5582588911056519, |
|
"learning_rate": 0.00013294000520046664, |
|
"loss": 2.2855, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.40051347881899874, |
|
"grad_norm": 1.5377285480499268, |
|
"learning_rate": 0.00013263212732107012, |
|
"loss": 2.3232, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.40154043645699616, |
|
"grad_norm": 1.326240062713623, |
|
"learning_rate": 0.00013232390287190208, |
|
"loss": 2.205, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.4025673940949936, |
|
"grad_norm": 1.4398608207702637, |
|
"learning_rate": 0.0001320153351264628, |
|
"loss": 2.4615, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.403594351732991, |
|
"grad_norm": 1.499045729637146, |
|
"learning_rate": 0.0001317064273618985, |
|
"loss": 2.3906, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.40462130937098845, |
|
"grad_norm": 1.3966516256332397, |
|
"learning_rate": 0.00013139718285896655, |
|
"loss": 2.3382, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.4056482670089859, |
|
"grad_norm": 1.1530864238739014, |
|
"learning_rate": 0.0001310876049020007, |
|
"loss": 2.2139, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.4066752246469833, |
|
"grad_norm": 1.4689910411834717, |
|
"learning_rate": 0.00013077769677887619, |
|
"loss": 2.3199, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.40770218228498073, |
|
"grad_norm": 1.7335143089294434, |
|
"learning_rate": 0.00013046746178097467, |
|
"loss": 2.1957, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.40872913992297816, |
|
"grad_norm": 1.2896586656570435, |
|
"learning_rate": 0.00013015690320314954, |
|
"loss": 2.4031, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.4097560975609756, |
|
"grad_norm": 1.3342877626419067, |
|
"learning_rate": 0.0001298460243436906, |
|
"loss": 2.2962, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.41078305519897307, |
|
"grad_norm": 1.4110368490219116, |
|
"learning_rate": 0.00012953482850428926, |
|
"loss": 2.316, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4118100128369705, |
|
"grad_norm": 1.4386307001113892, |
|
"learning_rate": 0.00012922331899000353, |
|
"loss": 2.269, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.4128369704749679, |
|
"grad_norm": 1.1738072633743286, |
|
"learning_rate": 0.00012891149910922267, |
|
"loss": 2.3723, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.41386392811296535, |
|
"grad_norm": 1.9212470054626465, |
|
"learning_rate": 0.00012859937217363224, |
|
"loss": 2.1092, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.4148908857509628, |
|
"grad_norm": 1.2879332304000854, |
|
"learning_rate": 0.00012828694149817887, |
|
"loss": 2.228, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.4159178433889602, |
|
"grad_norm": 1.5181375741958618, |
|
"learning_rate": 0.00012797421040103513, |
|
"loss": 2.5877, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.41694480102695763, |
|
"grad_norm": 1.5899111032485962, |
|
"learning_rate": 0.00012766118220356408, |
|
"loss": 2.3006, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.41797175866495506, |
|
"grad_norm": 1.445020318031311, |
|
"learning_rate": 0.00012734786023028423, |
|
"loss": 2.1708, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.4189987163029525, |
|
"grad_norm": 1.396371603012085, |
|
"learning_rate": 0.0001270342478088342, |
|
"loss": 2.1616, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.4200256739409499, |
|
"grad_norm": 1.4456965923309326, |
|
"learning_rate": 0.00012672034826993715, |
|
"loss": 2.285, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": 1.3749362230300903, |
|
"learning_rate": 0.0001264061649473657, |
|
"loss": 2.2865, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.42207958921694483, |
|
"grad_norm": 1.549967646598816, |
|
"learning_rate": 0.0001260917011779064, |
|
"loss": 2.2219, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.42310654685494226, |
|
"grad_norm": 1.52138090133667, |
|
"learning_rate": 0.00012577696030132421, |
|
"loss": 2.2663, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.4241335044929397, |
|
"grad_norm": 1.315598726272583, |
|
"learning_rate": 0.00012546194566032714, |
|
"loss": 2.5049, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.4251604621309371, |
|
"grad_norm": 1.5495781898498535, |
|
"learning_rate": 0.00012514666060053076, |
|
"loss": 2.2123, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.42618741976893454, |
|
"grad_norm": 1.3094850778579712, |
|
"learning_rate": 0.00012483110847042256, |
|
"loss": 2.3119, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.42721437740693197, |
|
"grad_norm": 1.3870692253112793, |
|
"learning_rate": 0.0001245152926213265, |
|
"loss": 2.2084, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.4282413350449294, |
|
"grad_norm": 1.5510514974594116, |
|
"learning_rate": 0.0001241992164073674, |
|
"loss": 2.3628, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.4292682926829268, |
|
"grad_norm": 1.489369511604309, |
|
"learning_rate": 0.00012388288318543512, |
|
"loss": 2.2547, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.43029525032092425, |
|
"grad_norm": 1.5452933311462402, |
|
"learning_rate": 0.00012356629631514929, |
|
"loss": 2.2346, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.4313222079589217, |
|
"grad_norm": 1.330915927886963, |
|
"learning_rate": 0.00012324945915882332, |
|
"loss": 2.0298, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.4323491655969191, |
|
"grad_norm": 1.3958618640899658, |
|
"learning_rate": 0.00012293237508142877, |
|
"loss": 2.2811, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.43337612323491653, |
|
"grad_norm": 1.4904903173446655, |
|
"learning_rate": 0.00012261504745055964, |
|
"loss": 2.1233, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.434403080872914, |
|
"grad_norm": 1.2206223011016846, |
|
"learning_rate": 0.00012229747963639654, |
|
"loss": 2.196, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.43543003851091144, |
|
"grad_norm": 1.7518811225891113, |
|
"learning_rate": 0.00012197967501167112, |
|
"loss": 2.4287, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.43645699614890887, |
|
"grad_norm": 1.3162510395050049, |
|
"learning_rate": 0.00012166163695162983, |
|
"loss": 2.1072, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.4374839537869063, |
|
"grad_norm": 1.5282611846923828, |
|
"learning_rate": 0.00012134336883399855, |
|
"loss": 2.2738, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.4385109114249037, |
|
"grad_norm": 1.561740517616272, |
|
"learning_rate": 0.00012102487403894633, |
|
"loss": 2.4247, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.43953786906290115, |
|
"grad_norm": 1.488586664199829, |
|
"learning_rate": 0.00012070615594904977, |
|
"loss": 2.3052, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.4405648267008986, |
|
"grad_norm": 1.209370493888855, |
|
"learning_rate": 0.00012038721794925689, |
|
"loss": 2.1714, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.441591784338896, |
|
"grad_norm": 1.4916038513183594, |
|
"learning_rate": 0.00012006806342685126, |
|
"loss": 2.1914, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.44261874197689344, |
|
"grad_norm": 1.618943691253662, |
|
"learning_rate": 0.00011974869577141611, |
|
"loss": 2.2324, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.44364569961489086, |
|
"grad_norm": 1.1960707902908325, |
|
"learning_rate": 0.00011942911837479817, |
|
"loss": 2.1326, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.4446726572528883, |
|
"grad_norm": 1.9583168029785156, |
|
"learning_rate": 0.0001191093346310718, |
|
"loss": 2.5026, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.4456996148908858, |
|
"grad_norm": 1.3039475679397583, |
|
"learning_rate": 0.00011878934793650273, |
|
"loss": 2.3416, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.4467265725288832, |
|
"grad_norm": 1.7960904836654663, |
|
"learning_rate": 0.00011846916168951232, |
|
"loss": 2.2458, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.44775353016688063, |
|
"grad_norm": 1.5260133743286133, |
|
"learning_rate": 0.00011814877929064118, |
|
"loss": 2.2471, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.44878048780487806, |
|
"grad_norm": 1.8603578805923462, |
|
"learning_rate": 0.00011782820414251314, |
|
"loss": 2.2966, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.4498074454428755, |
|
"grad_norm": 2.106266498565674, |
|
"learning_rate": 0.00011750743964979918, |
|
"loss": 2.3253, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.4508344030808729, |
|
"grad_norm": 1.6676055192947388, |
|
"learning_rate": 0.00011718648921918112, |
|
"loss": 2.302, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.45186136071887034, |
|
"grad_norm": 1.521597146987915, |
|
"learning_rate": 0.00011686535625931565, |
|
"loss": 2.4547, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.45288831835686777, |
|
"grad_norm": 2.344489097595215, |
|
"learning_rate": 0.00011654404418079794, |
|
"loss": 2.2382, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.4539152759948652, |
|
"grad_norm": 1.9089165925979614, |
|
"learning_rate": 0.00011622255639612554, |
|
"loss": 2.2597, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.4549422336328626, |
|
"grad_norm": 1.8408923149108887, |
|
"learning_rate": 0.00011590089631966206, |
|
"loss": 2.3862, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.45596919127086005, |
|
"grad_norm": 1.6886488199234009, |
|
"learning_rate": 0.00011557906736760089, |
|
"loss": 2.2685, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.45699614890885754, |
|
"grad_norm": 1.1247152090072632, |
|
"learning_rate": 0.00011525707295792907, |
|
"loss": 1.9322, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.45802310654685496, |
|
"grad_norm": 1.5255829095840454, |
|
"learning_rate": 0.00011493491651039077, |
|
"loss": 2.162, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.4590500641848524, |
|
"grad_norm": 1.3797430992126465, |
|
"learning_rate": 0.00011461260144645119, |
|
"loss": 2.2511, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.4600770218228498, |
|
"grad_norm": 1.63080632686615, |
|
"learning_rate": 0.00011429013118926002, |
|
"loss": 2.2, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.46110397946084725, |
|
"grad_norm": 1.5671621561050415, |
|
"learning_rate": 0.00011396750916361524, |
|
"loss": 2.036, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.4621309370988447, |
|
"grad_norm": 1.4500336647033691, |
|
"learning_rate": 0.00011364473879592674, |
|
"loss": 2.4217, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4631578947368421, |
|
"grad_norm": 1.1536897420883179, |
|
"learning_rate": 0.00011332182351417975, |
|
"loss": 2.027, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.46418485237483953, |
|
"grad_norm": 1.3846821784973145, |
|
"learning_rate": 0.00011299876674789864, |
|
"loss": 2.3609, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.46521181001283696, |
|
"grad_norm": 1.307686448097229, |
|
"learning_rate": 0.00011267557192811038, |
|
"loss": 2.1803, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.4662387676508344, |
|
"grad_norm": 1.2983853816986084, |
|
"learning_rate": 0.0001123522424873082, |
|
"loss": 2.1518, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.4672657252888318, |
|
"grad_norm": 1.341827154159546, |
|
"learning_rate": 0.00011202878185941501, |
|
"loss": 2.2907, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.4682926829268293, |
|
"grad_norm": 1.4334162473678589, |
|
"learning_rate": 0.00011170519347974704, |
|
"loss": 2.1504, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.4693196405648267, |
|
"grad_norm": 1.19010591506958, |
|
"learning_rate": 0.00011138148078497728, |
|
"loss": 2.2356, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.47034659820282415, |
|
"grad_norm": 2.3501694202423096, |
|
"learning_rate": 0.000111057647213099, |
|
"loss": 2.3976, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.4713735558408216, |
|
"grad_norm": 1.4157503843307495, |
|
"learning_rate": 0.00011073369620338928, |
|
"loss": 2.2394, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.472400513478819, |
|
"grad_norm": 1.45404052734375, |
|
"learning_rate": 0.0001104096311963724, |
|
"loss": 2.354, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.47342747111681643, |
|
"grad_norm": 1.3356945514678955, |
|
"learning_rate": 0.00011008545563378346, |
|
"loss": 2.2715, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.47445442875481386, |
|
"grad_norm": 1.5381147861480713, |
|
"learning_rate": 0.00010976117295853154, |
|
"loss": 2.4286, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.4754813863928113, |
|
"grad_norm": 1.5712486505508423, |
|
"learning_rate": 0.00010943678661466346, |
|
"loss": 2.4077, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.4765083440308087, |
|
"grad_norm": 1.5998138189315796, |
|
"learning_rate": 0.00010911230004732703, |
|
"loss": 2.2756, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.47753530166880614, |
|
"grad_norm": 1.5580840110778809, |
|
"learning_rate": 0.0001087877167027344, |
|
"loss": 2.3054, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.4785622593068036, |
|
"grad_norm": 1.1897094249725342, |
|
"learning_rate": 0.00010846304002812564, |
|
"loss": 2.0802, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.479589216944801, |
|
"grad_norm": 1.3887132406234741, |
|
"learning_rate": 0.00010813827347173195, |
|
"loss": 2.1605, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.4806161745827985, |
|
"grad_norm": 1.958724856376648, |
|
"learning_rate": 0.00010781342048273921, |
|
"loss": 2.2558, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.4816431322207959, |
|
"grad_norm": 1.5566227436065674, |
|
"learning_rate": 0.0001074884845112512, |
|
"loss": 2.2141, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.48267008985879334, |
|
"grad_norm": 1.616424560546875, |
|
"learning_rate": 0.00010716346900825299, |
|
"loss": 2.1406, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.48369704749679077, |
|
"grad_norm": 1.6108357906341553, |
|
"learning_rate": 0.00010683837742557436, |
|
"loss": 2.4381, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.4847240051347882, |
|
"grad_norm": 1.4207314252853394, |
|
"learning_rate": 0.00010651321321585315, |
|
"loss": 2.1121, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.4857509627727856, |
|
"grad_norm": 1.2405275106430054, |
|
"learning_rate": 0.00010618797983249841, |
|
"loss": 2.2362, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.48677792041078305, |
|
"grad_norm": 1.5273611545562744, |
|
"learning_rate": 0.00010586268072965396, |
|
"loss": 2.0912, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.4878048780487805, |
|
"grad_norm": 1.4871629476547241, |
|
"learning_rate": 0.00010553731936216149, |
|
"loss": 2.1009, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.4888318356867779, |
|
"grad_norm": 1.320486068725586, |
|
"learning_rate": 0.00010521189918552406, |
|
"loss": 2.1408, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.48985879332477533, |
|
"grad_norm": 1.4156553745269775, |
|
"learning_rate": 0.0001048864236558693, |
|
"loss": 2.1874, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.49088575096277276, |
|
"grad_norm": 1.620944857597351, |
|
"learning_rate": 0.00010456089622991263, |
|
"loss": 2.0141, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.49191270860077024, |
|
"grad_norm": 1.3774328231811523, |
|
"learning_rate": 0.00010423532036492077, |
|
"loss": 2.0491, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.49293966623876767, |
|
"grad_norm": 1.6792056560516357, |
|
"learning_rate": 0.00010390969951867482, |
|
"loss": 2.3037, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4939666238767651, |
|
"grad_norm": 1.4954179525375366, |
|
"learning_rate": 0.00010358403714943357, |
|
"loss": 2.3587, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.4949935815147625, |
|
"grad_norm": 1.3356521129608154, |
|
"learning_rate": 0.00010325833671589687, |
|
"loss": 2.205, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.49602053915275995, |
|
"grad_norm": 1.6689985990524292, |
|
"learning_rate": 0.00010293260167716876, |
|
"loss": 2.2734, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.4970474967907574, |
|
"grad_norm": 1.5499637126922607, |
|
"learning_rate": 0.00010260683549272089, |
|
"loss": 2.1623, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.4980744544287548, |
|
"grad_norm": 1.2015153169631958, |
|
"learning_rate": 0.00010228104162235563, |
|
"loss": 2.2428, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.49910141206675224, |
|
"grad_norm": 1.4437837600708008, |
|
"learning_rate": 0.00010195522352616943, |
|
"loss": 2.1359, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.5001283697047497, |
|
"grad_norm": 1.5427740812301636, |
|
"learning_rate": 0.00010162938466451599, |
|
"loss": 2.29, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.5011553273427471, |
|
"grad_norm": 1.5538071393966675, |
|
"learning_rate": 0.00010130352849796958, |
|
"loss": 2.1563, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.5011553273427471, |
|
"eval_loss": 2.2764453887939453, |
|
"eval_runtime": 26.8115, |
|
"eval_samples_per_second": 15.292, |
|
"eval_steps_per_second": 7.646, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.5021822849807446, |
|
"grad_norm": 1.645787239074707, |
|
"learning_rate": 0.00010097765848728823, |
|
"loss": 2.4307, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.503209242618742, |
|
"grad_norm": 1.417846441268921, |
|
"learning_rate": 0.00010065177809337702, |
|
"loss": 2.2206, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5042362002567394, |
|
"grad_norm": 2.257739782333374, |
|
"learning_rate": 0.00010032589077725134, |
|
"loss": 2.2328, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.5052631578947369, |
|
"grad_norm": 1.419081211090088, |
|
"learning_rate": 0.0001, |
|
"loss": 2.1812, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.5062901155327343, |
|
"grad_norm": 1.5131466388702393, |
|
"learning_rate": 9.967410922274868e-05, |
|
"loss": 2.3043, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.5073170731707317, |
|
"grad_norm": 1.4304825067520142, |
|
"learning_rate": 9.934822190662299e-05, |
|
"loss": 2.2831, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.5083440308087291, |
|
"grad_norm": 1.3889083862304688, |
|
"learning_rate": 9.902234151271177e-05, |
|
"loss": 2.2284, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.5093709884467266, |
|
"grad_norm": 1.4664725065231323, |
|
"learning_rate": 9.869647150203046e-05, |
|
"loss": 2.2311, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.510397946084724, |
|
"grad_norm": 1.3767751455307007, |
|
"learning_rate": 9.837061533548403e-05, |
|
"loss": 2.1475, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.5114249037227214, |
|
"grad_norm": 1.3076868057250977, |
|
"learning_rate": 9.80447764738306e-05, |
|
"loss": 2.198, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.5124518613607189, |
|
"grad_norm": 1.886834979057312, |
|
"learning_rate": 9.771895837764439e-05, |
|
"loss": 2.1954, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.5134788189987163, |
|
"grad_norm": 1.3807141780853271, |
|
"learning_rate": 9.739316450727913e-05, |
|
"loss": 1.8945, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5145057766367137, |
|
"grad_norm": 1.3954834938049316, |
|
"learning_rate": 9.706739832283127e-05, |
|
"loss": 2.2309, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.5155327342747111, |
|
"grad_norm": 1.31797456741333, |
|
"learning_rate": 9.674166328410318e-05, |
|
"loss": 2.2839, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.5165596919127086, |
|
"grad_norm": 1.9284818172454834, |
|
"learning_rate": 9.641596285056648e-05, |
|
"loss": 2.3199, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.517586649550706, |
|
"grad_norm": 2.1243088245391846, |
|
"learning_rate": 9.609030048132523e-05, |
|
"loss": 2.3362, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.5186136071887034, |
|
"grad_norm": 1.7423335313796997, |
|
"learning_rate": 9.576467963507925e-05, |
|
"loss": 2.2644, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.5196405648267008, |
|
"grad_norm": 1.9456878900527954, |
|
"learning_rate": 9.543910377008742e-05, |
|
"loss": 2.3168, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.5206675224646984, |
|
"grad_norm": 1.2658746242523193, |
|
"learning_rate": 9.511357634413075e-05, |
|
"loss": 2.46, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.5216944801026958, |
|
"grad_norm": 1.1964818239212036, |
|
"learning_rate": 9.478810081447595e-05, |
|
"loss": 2.0302, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.5227214377406932, |
|
"grad_norm": 1.313723087310791, |
|
"learning_rate": 9.446268063783853e-05, |
|
"loss": 2.3145, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.5237483953786907, |
|
"grad_norm": 1.8463740348815918, |
|
"learning_rate": 9.413731927034605e-05, |
|
"loss": 2.1669, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5247753530166881, |
|
"grad_norm": 1.1602845191955566, |
|
"learning_rate": 9.381202016750158e-05, |
|
"loss": 2.198, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.5258023106546855, |
|
"grad_norm": 1.203307032585144, |
|
"learning_rate": 9.348678678414686e-05, |
|
"loss": 2.2497, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.526829268292683, |
|
"grad_norm": 1.3734437227249146, |
|
"learning_rate": 9.316162257442562e-05, |
|
"loss": 2.5348, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.5278562259306804, |
|
"grad_norm": 1.2321196794509888, |
|
"learning_rate": 9.283653099174704e-05, |
|
"loss": 2.1953, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.5288831835686778, |
|
"grad_norm": 1.2227696180343628, |
|
"learning_rate": 9.251151548874884e-05, |
|
"loss": 2.2543, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.5299101412066752, |
|
"grad_norm": 1.7139776945114136, |
|
"learning_rate": 9.21865795172608e-05, |
|
"loss": 2.2894, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.5309370988446727, |
|
"grad_norm": 1.5949795246124268, |
|
"learning_rate": 9.186172652826808e-05, |
|
"loss": 2.2051, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.5319640564826701, |
|
"grad_norm": 1.3807586431503296, |
|
"learning_rate": 9.15369599718744e-05, |
|
"loss": 2.2512, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.5329910141206675, |
|
"grad_norm": 1.3648223876953125, |
|
"learning_rate": 9.121228329726563e-05, |
|
"loss": 2.4064, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.5340179717586649, |
|
"grad_norm": 1.2418193817138672, |
|
"learning_rate": 9.0887699952673e-05, |
|
"loss": 2.2983, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5350449293966624, |
|
"grad_norm": 1.6278191804885864, |
|
"learning_rate": 9.056321338533656e-05, |
|
"loss": 2.5757, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.5360718870346598, |
|
"grad_norm": 4.154942989349365, |
|
"learning_rate": 9.023882704146848e-05, |
|
"loss": 2.1376, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.5370988446726572, |
|
"grad_norm": 1.5529977083206177, |
|
"learning_rate": 8.991454436621657e-05, |
|
"loss": 2.235, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.5381258023106547, |
|
"grad_norm": 1.4370229244232178, |
|
"learning_rate": 8.959036880362763e-05, |
|
"loss": 2.1873, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.5391527599486521, |
|
"grad_norm": 1.117822289466858, |
|
"learning_rate": 8.926630379661075e-05, |
|
"loss": 1.9378, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.5401797175866495, |
|
"grad_norm": 1.476722240447998, |
|
"learning_rate": 8.894235278690104e-05, |
|
"loss": 2.3088, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.5412066752246469, |
|
"grad_norm": 1.3249425888061523, |
|
"learning_rate": 8.861851921502275e-05, |
|
"loss": 2.2814, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.5422336328626444, |
|
"grad_norm": 1.800880789756775, |
|
"learning_rate": 8.829480652025297e-05, |
|
"loss": 2.2492, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.5432605905006418, |
|
"grad_norm": 1.4842054843902588, |
|
"learning_rate": 8.797121814058501e-05, |
|
"loss": 2.3106, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.5442875481386393, |
|
"grad_norm": 1.6607662439346313, |
|
"learning_rate": 8.764775751269182e-05, |
|
"loss": 2.437, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5453145057766368, |
|
"grad_norm": 1.5801904201507568, |
|
"learning_rate": 8.732442807188965e-05, |
|
"loss": 2.4136, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.5463414634146342, |
|
"grad_norm": 1.321534514427185, |
|
"learning_rate": 8.70012332521014e-05, |
|
"loss": 2.3567, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.5473684210526316, |
|
"grad_norm": 1.2890071868896484, |
|
"learning_rate": 8.66781764858203e-05, |
|
"loss": 2.1654, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.548395378690629, |
|
"grad_norm": 1.3918139934539795, |
|
"learning_rate": 8.635526120407329e-05, |
|
"loss": 2.5423, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.5494223363286265, |
|
"grad_norm": 2.1062328815460205, |
|
"learning_rate": 8.603249083638477e-05, |
|
"loss": 2.1476, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.5504492939666239, |
|
"grad_norm": 1.81232750415802, |
|
"learning_rate": 8.570986881074003e-05, |
|
"loss": 2.2798, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.5514762516046213, |
|
"grad_norm": 1.62819242477417, |
|
"learning_rate": 8.538739855354886e-05, |
|
"loss": 2.2052, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.5525032092426188, |
|
"grad_norm": 1.3328535556793213, |
|
"learning_rate": 8.506508348960924e-05, |
|
"loss": 2.2646, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.5535301668806162, |
|
"grad_norm": 1.2471153736114502, |
|
"learning_rate": 8.474292704207094e-05, |
|
"loss": 2.23, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.5545571245186136, |
|
"grad_norm": 1.668730616569519, |
|
"learning_rate": 8.442093263239912e-05, |
|
"loss": 2.2528, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.555584082156611, |
|
"grad_norm": 2.2429211139678955, |
|
"learning_rate": 8.409910368033795e-05, |
|
"loss": 2.2875, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.5566110397946085, |
|
"grad_norm": 1.9282841682434082, |
|
"learning_rate": 8.377744360387447e-05, |
|
"loss": 2.1459, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.5576379974326059, |
|
"grad_norm": 1.3737177848815918, |
|
"learning_rate": 8.345595581920205e-05, |
|
"loss": 2.31, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.5586649550706033, |
|
"grad_norm": 1.400428056716919, |
|
"learning_rate": 8.313464374068437e-05, |
|
"loss": 1.9801, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.5596919127086007, |
|
"grad_norm": 1.5178526639938354, |
|
"learning_rate": 8.28135107808189e-05, |
|
"loss": 2.3441, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.5607188703465982, |
|
"grad_norm": 1.2568373680114746, |
|
"learning_rate": 8.249256035020086e-05, |
|
"loss": 2.2817, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.5617458279845956, |
|
"grad_norm": 1.4138679504394531, |
|
"learning_rate": 8.217179585748688e-05, |
|
"loss": 2.3209, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.562772785622593, |
|
"grad_norm": 1.4604218006134033, |
|
"learning_rate": 8.185122070935884e-05, |
|
"loss": 2.1436, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.5637997432605905, |
|
"grad_norm": 1.4326988458633423, |
|
"learning_rate": 8.15308383104877e-05, |
|
"loss": 2.4009, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.5648267008985879, |
|
"grad_norm": 1.4617232084274292, |
|
"learning_rate": 8.121065206349729e-05, |
|
"loss": 2.1219, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5658536585365853, |
|
"grad_norm": 1.1367815732955933, |
|
"learning_rate": 8.089066536892824e-05, |
|
"loss": 2.1921, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.5668806161745829, |
|
"grad_norm": 1.0669057369232178, |
|
"learning_rate": 8.057088162520186e-05, |
|
"loss": 2.0411, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.5679075738125803, |
|
"grad_norm": 1.1332672834396362, |
|
"learning_rate": 8.02513042285839e-05, |
|
"loss": 2.1418, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.5689345314505777, |
|
"grad_norm": 1.5040186643600464, |
|
"learning_rate": 7.993193657314875e-05, |
|
"loss": 2.2761, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.5699614890885751, |
|
"grad_norm": 1.1837241649627686, |
|
"learning_rate": 7.961278205074313e-05, |
|
"loss": 2.2972, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.5709884467265726, |
|
"grad_norm": 1.3050509691238403, |
|
"learning_rate": 7.929384405095025e-05, |
|
"loss": 2.2907, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.57201540436457, |
|
"grad_norm": 1.4179966449737549, |
|
"learning_rate": 7.897512596105368e-05, |
|
"loss": 2.4073, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.5730423620025674, |
|
"grad_norm": 1.321635365486145, |
|
"learning_rate": 7.865663116600148e-05, |
|
"loss": 2.3307, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.5740693196405648, |
|
"grad_norm": 1.3887020349502563, |
|
"learning_rate": 7.833836304837021e-05, |
|
"loss": 2.1635, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.5750962772785623, |
|
"grad_norm": 1.4184433221817017, |
|
"learning_rate": 7.802032498832895e-05, |
|
"loss": 2.1665, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5761232349165597, |
|
"grad_norm": 1.585056185722351, |
|
"learning_rate": 7.770252036360351e-05, |
|
"loss": 2.366, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.5771501925545571, |
|
"grad_norm": 1.7346497774124146, |
|
"learning_rate": 7.738495254944042e-05, |
|
"loss": 2.3033, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.5781771501925546, |
|
"grad_norm": 1.9621151685714722, |
|
"learning_rate": 7.706762491857126e-05, |
|
"loss": 2.2521, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.579204107830552, |
|
"grad_norm": 2.09137225151062, |
|
"learning_rate": 7.675054084117672e-05, |
|
"loss": 2.3874, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.5802310654685494, |
|
"grad_norm": 1.4368287324905396, |
|
"learning_rate": 7.643370368485072e-05, |
|
"loss": 2.1901, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.5812580231065468, |
|
"grad_norm": 1.4019243717193604, |
|
"learning_rate": 7.611711681456493e-05, |
|
"loss": 2.2313, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.5822849807445443, |
|
"grad_norm": 1.383966326713562, |
|
"learning_rate": 7.580078359263267e-05, |
|
"loss": 2.2297, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.5833119383825417, |
|
"grad_norm": 1.407023310661316, |
|
"learning_rate": 7.54847073786735e-05, |
|
"loss": 2.4025, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.5843388960205391, |
|
"grad_norm": 1.3167904615402222, |
|
"learning_rate": 7.516889152957744e-05, |
|
"loss": 2.2873, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.5853658536585366, |
|
"grad_norm": 1.6377153396606445, |
|
"learning_rate": 7.485333939946926e-05, |
|
"loss": 2.2427, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.586392811296534, |
|
"grad_norm": 2.095268726348877, |
|
"learning_rate": 7.453805433967287e-05, |
|
"loss": 2.3737, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.5874197689345314, |
|
"grad_norm": 1.2016150951385498, |
|
"learning_rate": 7.422303969867581e-05, |
|
"loss": 2.2338, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.5884467265725288, |
|
"grad_norm": 1.3786977529525757, |
|
"learning_rate": 7.39082988220936e-05, |
|
"loss": 2.1732, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.5894736842105263, |
|
"grad_norm": 1.3394079208374023, |
|
"learning_rate": 7.359383505263431e-05, |
|
"loss": 2.2412, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.5905006418485238, |
|
"grad_norm": 1.994136095046997, |
|
"learning_rate": 7.327965173006286e-05, |
|
"loss": 2.455, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.5915275994865212, |
|
"grad_norm": 1.2922275066375732, |
|
"learning_rate": 7.296575219116582e-05, |
|
"loss": 2.2607, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.5925545571245187, |
|
"grad_norm": 1.4127111434936523, |
|
"learning_rate": 7.265213976971577e-05, |
|
"loss": 2.3233, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.5935815147625161, |
|
"grad_norm": 1.5525174140930176, |
|
"learning_rate": 7.233881779643594e-05, |
|
"loss": 2.2218, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.5946084724005135, |
|
"grad_norm": 1.2909669876098633, |
|
"learning_rate": 7.202578959896491e-05, |
|
"loss": 2.4499, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.5956354300385109, |
|
"grad_norm": 1.5280107259750366, |
|
"learning_rate": 7.171305850182113e-05, |
|
"loss": 2.5343, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5966623876765084, |
|
"grad_norm": 1.4098429679870605, |
|
"learning_rate": 7.140062782636777e-05, |
|
"loss": 2.2938, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.5976893453145058, |
|
"grad_norm": 1.3740651607513428, |
|
"learning_rate": 7.108850089077735e-05, |
|
"loss": 2.3292, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.5987163029525032, |
|
"grad_norm": 1.922861099243164, |
|
"learning_rate": 7.077668100999648e-05, |
|
"loss": 2.6001, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.5997432605905006, |
|
"grad_norm": 1.2894479036331177, |
|
"learning_rate": 7.046517149571075e-05, |
|
"loss": 2.1586, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.6007702182284981, |
|
"grad_norm": 1.6118210554122925, |
|
"learning_rate": 7.015397565630944e-05, |
|
"loss": 2.5227, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.6017971758664955, |
|
"grad_norm": 1.51421058177948, |
|
"learning_rate": 6.98430967968505e-05, |
|
"loss": 2.4502, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.6028241335044929, |
|
"grad_norm": 1.5112583637237549, |
|
"learning_rate": 6.953253821902532e-05, |
|
"loss": 2.1649, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.6038510911424904, |
|
"grad_norm": 1.23862624168396, |
|
"learning_rate": 6.922230322112382e-05, |
|
"loss": 2.2405, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.6048780487804878, |
|
"grad_norm": 1.5707911252975464, |
|
"learning_rate": 6.891239509799931e-05, |
|
"loss": 2.3034, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.6059050064184852, |
|
"grad_norm": 1.347216010093689, |
|
"learning_rate": 6.86028171410335e-05, |
|
"loss": 2.1914, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6069319640564826, |
|
"grad_norm": 1.8234333992004395, |
|
"learning_rate": 6.829357263810156e-05, |
|
"loss": 2.215, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.6079589216944801, |
|
"grad_norm": 1.4702991247177124, |
|
"learning_rate": 6.798466487353723e-05, |
|
"loss": 2.3195, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.6089858793324775, |
|
"grad_norm": 1.6644995212554932, |
|
"learning_rate": 6.767609712809793e-05, |
|
"loss": 2.3335, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.6100128369704749, |
|
"grad_norm": 2.144955635070801, |
|
"learning_rate": 6.736787267892991e-05, |
|
"loss": 2.4013, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.6110397946084724, |
|
"grad_norm": 1.5009002685546875, |
|
"learning_rate": 6.705999479953338e-05, |
|
"loss": 2.1646, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.6120667522464698, |
|
"grad_norm": 1.6303348541259766, |
|
"learning_rate": 6.675246675972789e-05, |
|
"loss": 2.1712, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.6130937098844673, |
|
"grad_norm": 1.4405221939086914, |
|
"learning_rate": 6.644529182561739e-05, |
|
"loss": 2.2961, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.6141206675224647, |
|
"grad_norm": 1.6782268285751343, |
|
"learning_rate": 6.613847325955578e-05, |
|
"loss": 2.2071, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.6151476251604622, |
|
"grad_norm": 1.2012258768081665, |
|
"learning_rate": 6.583201432011217e-05, |
|
"loss": 1.8268, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.6161745827984596, |
|
"grad_norm": 1.4990489482879639, |
|
"learning_rate": 6.552591826203616e-05, |
|
"loss": 2.0925, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.617201540436457, |
|
"grad_norm": 1.6155550479888916, |
|
"learning_rate": 6.522018833622338e-05, |
|
"loss": 2.2946, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.6182284980744545, |
|
"grad_norm": 1.6897152662277222, |
|
"learning_rate": 6.491482778968104e-05, |
|
"loss": 2.212, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.6192554557124519, |
|
"grad_norm": 1.3177330493927002, |
|
"learning_rate": 6.460983986549321e-05, |
|
"loss": 1.7627, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.6202824133504493, |
|
"grad_norm": 1.4330686330795288, |
|
"learning_rate": 6.430522780278663e-05, |
|
"loss": 2.4068, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.6213093709884467, |
|
"grad_norm": 1.3086313009262085, |
|
"learning_rate": 6.400099483669621e-05, |
|
"loss": 2.2901, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.6223363286264442, |
|
"grad_norm": 1.5514755249023438, |
|
"learning_rate": 6.369714419833056e-05, |
|
"loss": 2.0377, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.6233632862644416, |
|
"grad_norm": 1.43644380569458, |
|
"learning_rate": 6.339367911473788e-05, |
|
"loss": 2.0917, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.624390243902439, |
|
"grad_norm": 1.5321091413497925, |
|
"learning_rate": 6.309060280887151e-05, |
|
"loss": 2.2432, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.6254172015404365, |
|
"grad_norm": 1.3088302612304688, |
|
"learning_rate": 6.278791849955583e-05, |
|
"loss": 2.2123, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.6264441591784339, |
|
"grad_norm": 1.256137728691101, |
|
"learning_rate": 6.248562940145195e-05, |
|
"loss": 2.2079, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6274711168164313, |
|
"grad_norm": 1.4580830335617065, |
|
"learning_rate": 6.21837387250237e-05, |
|
"loss": 2.4283, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.6284980744544287, |
|
"grad_norm": 1.6171642541885376, |
|
"learning_rate": 6.188224967650347e-05, |
|
"loss": 2.3705, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.6295250320924262, |
|
"grad_norm": 1.3267604112625122, |
|
"learning_rate": 6.158116545785809e-05, |
|
"loss": 2.2335, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.6305519897304236, |
|
"grad_norm": 1.5025784969329834, |
|
"learning_rate": 6.128048926675494e-05, |
|
"loss": 2.3648, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.631578947368421, |
|
"grad_norm": 1.0958548784255981, |
|
"learning_rate": 6.098022429652794e-05, |
|
"loss": 2.1022, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.6326059050064184, |
|
"grad_norm": 1.2155437469482422, |
|
"learning_rate": 6.068037373614364e-05, |
|
"loss": 2.2298, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.6336328626444159, |
|
"grad_norm": 1.4382355213165283, |
|
"learning_rate": 6.0380940770167336e-05, |
|
"loss": 2.3538, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.6346598202824133, |
|
"grad_norm": 1.2357529401779175, |
|
"learning_rate": 6.008192857872923e-05, |
|
"loss": 1.8669, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.6356867779204107, |
|
"grad_norm": 1.3660916090011597, |
|
"learning_rate": 5.9783340337490754e-05, |
|
"loss": 2.3527, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.6367137355584083, |
|
"grad_norm": 1.899346947669983, |
|
"learning_rate": 5.94851792176107e-05, |
|
"loss": 2.3702, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6377406931964057, |
|
"grad_norm": 1.207220196723938, |
|
"learning_rate": 5.9187448385711685e-05, |
|
"loss": 2.2377, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.6387676508344031, |
|
"grad_norm": 1.7264626026153564, |
|
"learning_rate": 5.889015100384636e-05, |
|
"loss": 2.3115, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.6397946084724005, |
|
"grad_norm": 1.4954859018325806, |
|
"learning_rate": 5.859329022946399e-05, |
|
"loss": 2.1911, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.640821566110398, |
|
"grad_norm": 1.2664867639541626, |
|
"learning_rate": 5.8296869215376846e-05, |
|
"loss": 2.0781, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.6418485237483954, |
|
"grad_norm": 1.6995880603790283, |
|
"learning_rate": 5.8000891109726706e-05, |
|
"loss": 2.2242, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.6428754813863928, |
|
"grad_norm": 1.3912755250930786, |
|
"learning_rate": 5.770535905595138e-05, |
|
"loss": 2.1297, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.6439024390243903, |
|
"grad_norm": 1.3863202333450317, |
|
"learning_rate": 5.741027619275146e-05, |
|
"loss": 2.4886, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.6449293966623877, |
|
"grad_norm": 1.896565318107605, |
|
"learning_rate": 5.7115645654056815e-05, |
|
"loss": 2.4106, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.6459563543003851, |
|
"grad_norm": 1.4082729816436768, |
|
"learning_rate": 5.6821470568993606e-05, |
|
"loss": 2.4035, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.6469833119383825, |
|
"grad_norm": 1.357062578201294, |
|
"learning_rate": 5.6527754061850554e-05, |
|
"loss": 2.0427, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.64801026957638, |
|
"grad_norm": 1.3760401010513306, |
|
"learning_rate": 5.623449925204627e-05, |
|
"loss": 2.0807, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.6490372272143774, |
|
"grad_norm": 1.399768590927124, |
|
"learning_rate": 5.594170925409579e-05, |
|
"loss": 2.1833, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.6500641848523748, |
|
"grad_norm": 1.6240482330322266, |
|
"learning_rate": 5.564938717757766e-05, |
|
"loss": 2.4324, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.6510911424903723, |
|
"grad_norm": 1.388628363609314, |
|
"learning_rate": 5.5357536127100904e-05, |
|
"loss": 2.2536, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.6521181001283697, |
|
"grad_norm": 1.4866918325424194, |
|
"learning_rate": 5.506615920227186e-05, |
|
"loss": 2.1934, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.6531450577663671, |
|
"grad_norm": 1.1647313833236694, |
|
"learning_rate": 5.4775259497661555e-05, |
|
"loss": 2.1951, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.6541720154043645, |
|
"grad_norm": 1.377734661102295, |
|
"learning_rate": 5.448484010277267e-05, |
|
"loss": 2.3885, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.655198973042362, |
|
"grad_norm": 1.5145090818405151, |
|
"learning_rate": 5.419490410200675e-05, |
|
"loss": 2.2318, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.6562259306803594, |
|
"grad_norm": 1.4447916746139526, |
|
"learning_rate": 5.390545457463134e-05, |
|
"loss": 2.1195, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.6572528883183568, |
|
"grad_norm": 2.2646050453186035, |
|
"learning_rate": 5.361649459474756e-05, |
|
"loss": 2.1927, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6582798459563542, |
|
"grad_norm": 1.7940598726272583, |
|
"learning_rate": 5.332802723125716e-05, |
|
"loss": 2.2811, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.6593068035943518, |
|
"grad_norm": 1.3245995044708252, |
|
"learning_rate": 5.304005554783015e-05, |
|
"loss": 2.2386, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.6603337612323492, |
|
"grad_norm": 2.0891315937042236, |
|
"learning_rate": 5.275258260287211e-05, |
|
"loss": 2.2728, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.6613607188703466, |
|
"grad_norm": 1.4078350067138672, |
|
"learning_rate": 5.246561144949173e-05, |
|
"loss": 2.1705, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.6623876765083441, |
|
"grad_norm": 1.688930630683899, |
|
"learning_rate": 5.217914513546848e-05, |
|
"loss": 2.5033, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.6634146341463415, |
|
"grad_norm": 1.6725249290466309, |
|
"learning_rate": 5.1893186703220165e-05, |
|
"loss": 2.4268, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.6644415917843389, |
|
"grad_norm": 1.7176076173782349, |
|
"learning_rate": 5.160773918977061e-05, |
|
"loss": 2.388, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.6654685494223364, |
|
"grad_norm": 1.360771894454956, |
|
"learning_rate": 5.13228056267175e-05, |
|
"loss": 2.2381, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.6664955070603338, |
|
"grad_norm": 1.610059142112732, |
|
"learning_rate": 5.103838904019993e-05, |
|
"loss": 1.8292, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.6675224646983312, |
|
"grad_norm": 1.5176039934158325, |
|
"learning_rate": 5.0754492450866607e-05, |
|
"loss": 2.283, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6685494223363286, |
|
"grad_norm": 1.2839123010635376, |
|
"learning_rate": 5.047111887384357e-05, |
|
"loss": 2.4285, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.6695763799743261, |
|
"grad_norm": 1.2786331176757812, |
|
"learning_rate": 5.018827131870214e-05, |
|
"loss": 2.2895, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.6706033376123235, |
|
"grad_norm": 1.634473204612732, |
|
"learning_rate": 4.9905952789427126e-05, |
|
"loss": 2.0957, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.6716302952503209, |
|
"grad_norm": 1.3561400175094604, |
|
"learning_rate": 4.9624166284384656e-05, |
|
"loss": 2.1695, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.6726572528883183, |
|
"grad_norm": 1.4681189060211182, |
|
"learning_rate": 4.934291479629063e-05, |
|
"loss": 2.415, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.6736842105263158, |
|
"grad_norm": 1.7720856666564941, |
|
"learning_rate": 4.9062201312178725e-05, |
|
"loss": 2.2386, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.6747111681643132, |
|
"grad_norm": 1.505091667175293, |
|
"learning_rate": 4.8782028813368786e-05, |
|
"loss": 2.2751, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.6757381258023106, |
|
"grad_norm": 1.2972487211227417, |
|
"learning_rate": 4.850240027543509e-05, |
|
"loss": 2.4914, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.6767650834403081, |
|
"grad_norm": 2.298083543777466, |
|
"learning_rate": 4.822331866817478e-05, |
|
"loss": 2.3478, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.6777920410783055, |
|
"grad_norm": 1.8115476369857788, |
|
"learning_rate": 4.7944786955576313e-05, |
|
"loss": 2.4564, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6788189987163029, |
|
"grad_norm": 1.3283237218856812, |
|
"learning_rate": 4.766680809578804e-05, |
|
"loss": 2.0925, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.6798459563543003, |
|
"grad_norm": 2.0927574634552, |
|
"learning_rate": 4.738938504108659e-05, |
|
"loss": 2.1346, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.6808729139922978, |
|
"grad_norm": 1.2842458486557007, |
|
"learning_rate": 4.7112520737845814e-05, |
|
"loss": 2.176, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.6818998716302952, |
|
"grad_norm": 1.6555302143096924, |
|
"learning_rate": 4.683621812650525e-05, |
|
"loss": 2.3672, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.6829268292682927, |
|
"grad_norm": 1.4594167470932007, |
|
"learning_rate": 4.6560480141539044e-05, |
|
"loss": 2.2314, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.6839537869062902, |
|
"grad_norm": 1.2699614763259888, |
|
"learning_rate": 4.628530971142471e-05, |
|
"loss": 2.1865, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.6849807445442876, |
|
"grad_norm": 1.110546588897705, |
|
"learning_rate": 4.601070975861194e-05, |
|
"loss": 1.8842, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.686007702182285, |
|
"grad_norm": 1.594329833984375, |
|
"learning_rate": 4.573668319949179e-05, |
|
"loss": 2.3873, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.6870346598202824, |
|
"grad_norm": 1.4520328044891357, |
|
"learning_rate": 4.5463232944365554e-05, |
|
"loss": 2.3261, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.6880616174582799, |
|
"grad_norm": 1.9483394622802734, |
|
"learning_rate": 4.519036189741386e-05, |
|
"loss": 2.2976, |
|
"step": 670 |
|
}, |
|
    {
      "epoch": 0.6890885750962773,
      "grad_norm": 2.0844573974609375,
      "learning_rate": 4.4918072956665915e-05,
      "loss": 2.1116,
      "step": 671
    },
    {
      "epoch": 0.6901155327342747,
      "grad_norm": 1.7560206651687622,
      "learning_rate": 4.464636901396852e-05,
      "loss": 2.3037,
      "step": 672
    },
    {
      "epoch": 0.6911424903722722,
      "grad_norm": 1.24983549118042,
      "learning_rate": 4.4375252954955635e-05,
      "loss": 2.0582,
      "step": 673
    },
    {
      "epoch": 0.6921694480102696,
      "grad_norm": 1.5504297018051147,
      "learning_rate": 4.410472765901755e-05,
      "loss": 2.4398,
      "step": 674
    },
    {
      "epoch": 0.693196405648267,
      "grad_norm": 1.6074193716049194,
      "learning_rate": 4.3834795999270364e-05,
      "loss": 2.2749,
      "step": 675
    },
    {
      "epoch": 0.6942233632862644,
      "grad_norm": 2.164050817489624,
      "learning_rate": 4.356546084252548e-05,
      "loss": 2.3882,
      "step": 676
    },
    {
      "epoch": 0.6952503209242619,
      "grad_norm": 1.8083585500717163,
      "learning_rate": 4.3296725049259015e-05,
      "loss": 2.244,
      "step": 677
    },
    {
      "epoch": 0.6962772785622593,
      "grad_norm": 1.108267068862915,
      "learning_rate": 4.302859147358168e-05,
      "loss": 1.9859,
      "step": 678
    },
    {
      "epoch": 0.6973042362002567,
      "grad_norm": 1.5166776180267334,
      "learning_rate": 4.2761062963208275e-05,
      "loss": 2.296,
      "step": 679
    },
    {
      "epoch": 0.6983311938382541,
      "grad_norm": 1.389072060585022,
      "learning_rate": 4.249414235942755e-05,
      "loss": 2.1956,
      "step": 680
    },
    {
      "epoch": 0.6993581514762516,
      "grad_norm": 1.3236082792282104,
      "learning_rate": 4.222783249707184e-05,
      "loss": 2.0828,
      "step": 681
    },
    {
      "epoch": 0.700385109114249,
      "grad_norm": 1.5802843570709229,
      "learning_rate": 4.196213620448723e-05,
      "loss": 2.2064,
      "step": 682
    },
    {
      "epoch": 0.7014120667522464,
      "grad_norm": 1.5572673082351685,
      "learning_rate": 4.169705630350335e-05,
      "loss": 2.3091,
      "step": 683
    },
    {
      "epoch": 0.7024390243902439,
      "grad_norm": 1.3279824256896973,
      "learning_rate": 4.143259560940341e-05,
      "loss": 2.1509,
      "step": 684
    },
    {
      "epoch": 0.7034659820282413,
      "grad_norm": 1.4098100662231445,
      "learning_rate": 4.116875693089439e-05,
      "loss": 2.352,
      "step": 685
    },
    {
      "epoch": 0.7044929396662387,
      "grad_norm": 1.4625084400177002,
      "learning_rate": 4.0905543070077036e-05,
      "loss": 2.0663,
      "step": 686
    },
    {
      "epoch": 0.7055198973042363,
      "grad_norm": 1.8245489597320557,
      "learning_rate": 4.064295682241631e-05,
      "loss": 2.4885,
      "step": 687
    },
    {
      "epoch": 0.7065468549422337,
      "grad_norm": 1.1943061351776123,
      "learning_rate": 4.038100097671155e-05,
      "loss": 2.2384,
      "step": 688
    },
    {
      "epoch": 0.7075738125802311,
      "grad_norm": 1.149429440498352,
      "learning_rate": 4.0119678315067025e-05,
      "loss": 2.0727,
      "step": 689
    },
    {
      "epoch": 0.7086007702182285,
      "grad_norm": 1.3723962306976318,
      "learning_rate": 3.985899161286205e-05,
      "loss": 2.3764,
      "step": 690
    },
    {
      "epoch": 0.709627727856226,
      "grad_norm": 1.4548147916793823,
      "learning_rate": 3.959894363872192e-05,
      "loss": 2.1269,
      "step": 691
    },
    {
      "epoch": 0.7106546854942234,
      "grad_norm": 1.5420607328414917,
      "learning_rate": 3.933953715448822e-05,
      "loss": 2.1683,
      "step": 692
    },
    {
      "epoch": 0.7116816431322208,
      "grad_norm": 1.3956791162490845,
      "learning_rate": 3.90807749151896e-05,
      "loss": 2.2853,
      "step": 693
    },
    {
      "epoch": 0.7127086007702182,
      "grad_norm": 1.335822582244873,
      "learning_rate": 3.882265966901257e-05,
      "loss": 2.4303,
      "step": 694
    },
    {
      "epoch": 0.7137355584082157,
      "grad_norm": 1.4300553798675537,
      "learning_rate": 3.85651941572721e-05,
      "loss": 2.2918,
      "step": 695
    },
    {
      "epoch": 0.7147625160462131,
      "grad_norm": 1.398274540901184,
      "learning_rate": 3.8308381114382776e-05,
      "loss": 2.3322,
      "step": 696
    },
    {
      "epoch": 0.7157894736842105,
      "grad_norm": 2.2710065841674805,
      "learning_rate": 3.805222326782958e-05,
      "loss": 2.2784,
      "step": 697
    },
    {
      "epoch": 0.716816431322208,
      "grad_norm": 1.4800862073898315,
      "learning_rate": 3.7796723338138995e-05,
      "loss": 2.276,
      "step": 698
    },
    {
      "epoch": 0.7178433889602054,
      "grad_norm": 1.4330739974975586,
      "learning_rate": 3.7541884038850125e-05,
      "loss": 2.2517,
      "step": 699
    },
    {
      "epoch": 0.7188703465982028,
      "grad_norm": 1.2923405170440674,
      "learning_rate": 3.728770807648574e-05,
      "loss": 2.0609,
      "step": 700
    },
    {
      "epoch": 0.7198973042362002,
      "grad_norm": 1.1865580081939697,
      "learning_rate": 3.703419815052371e-05,
      "loss": 2.0863,
      "step": 701
    },
    {
      "epoch": 0.7209242618741977,
      "grad_norm": 1.1879284381866455,
      "learning_rate": 3.6781356953368284e-05,
      "loss": 2.1486,
      "step": 702
    },
    {
      "epoch": 0.7219512195121951,
      "grad_norm": 1.1846188306808472,
      "learning_rate": 3.6529187170321446e-05,
      "loss": 2.2775,
      "step": 703
    },
    {
      "epoch": 0.7229781771501925,
      "grad_norm": 1.822152853012085,
      "learning_rate": 3.627769147955433e-05,
      "loss": 1.9519,
      "step": 704
    },
    {
      "epoch": 0.72400513478819,
      "grad_norm": 1.2924715280532837,
      "learning_rate": 3.602687255207903e-05,
      "loss": 2.1845,
      "step": 705
    },
    {
      "epoch": 0.7250320924261874,
      "grad_norm": 1.6589620113372803,
      "learning_rate": 3.5776733051719936e-05,
      "loss": 2.4154,
      "step": 706
    },
    {
      "epoch": 0.7260590500641848,
      "grad_norm": 1.5250493288040161,
      "learning_rate": 3.5527275635085666e-05,
      "loss": 2.447,
      "step": 707
    },
    {
      "epoch": 0.7270860077021822,
      "grad_norm": 1.4836504459381104,
      "learning_rate": 3.527850295154075e-05,
      "loss": 2.2225,
      "step": 708
    },
    {
      "epoch": 0.7281129653401797,
      "grad_norm": 1.7604604959487915,
      "learning_rate": 3.5030417643177415e-05,
      "loss": 2.396,
      "step": 709
    },
    {
      "epoch": 0.7291399229781772,
      "grad_norm": 1.4900736808776855,
      "learning_rate": 3.47830223447877e-05,
      "loss": 2.3763,
      "step": 710
    },
    {
      "epoch": 0.7301668806161746,
      "grad_norm": 1.5196295976638794,
      "learning_rate": 3.453631968383538e-05,
      "loss": 2.2122,
      "step": 711
    },
    {
      "epoch": 0.7311938382541721,
      "grad_norm": 1.2298855781555176,
      "learning_rate": 3.4290312280428064e-05,
      "loss": 2.1634,
      "step": 712
    },
    {
      "epoch": 0.7322207958921695,
      "grad_norm": 1.4192733764648438,
      "learning_rate": 3.404500274728938e-05,
      "loss": 1.9621,
      "step": 713
    },
    {
      "epoch": 0.7332477535301669,
      "grad_norm": 1.2010129690170288,
      "learning_rate": 3.3800393689731146e-05,
      "loss": 2.14,
      "step": 714
    },
    {
      "epoch": 0.7342747111681643,
      "grad_norm": 1.346863031387329,
      "learning_rate": 3.355648770562587e-05,
      "loss": 2.3788,
      "step": 715
    },
    {
      "epoch": 0.7353016688061618,
      "grad_norm": 1.6066118478775024,
      "learning_rate": 3.331328738537902e-05,
      "loss": 2.5079,
      "step": 716
    },
    {
      "epoch": 0.7363286264441592,
      "grad_norm": 1.403447151184082,
      "learning_rate": 3.307079531190155e-05,
      "loss": 2.2288,
      "step": 717
    },
    {
      "epoch": 0.7373555840821566,
      "grad_norm": 1.413053035736084,
      "learning_rate": 3.28290140605825e-05,
      "loss": 1.987,
      "step": 718
    },
    {
      "epoch": 0.738382541720154,
      "grad_norm": 1.3956674337387085,
      "learning_rate": 3.2587946199261586e-05,
      "loss": 2.2841,
      "step": 719
    },
    {
      "epoch": 0.7394094993581515,
      "grad_norm": 1.6530338525772095,
      "learning_rate": 3.2347594288201976e-05,
      "loss": 2.2004,
      "step": 720
    },
    {
      "epoch": 0.7404364569961489,
      "grad_norm": 1.269932508468628,
      "learning_rate": 3.2107960880063094e-05,
      "loss": 2.1639,
      "step": 721
    },
    {
      "epoch": 0.7414634146341463,
      "grad_norm": 1.6402631998062134,
      "learning_rate": 3.186904851987351e-05,
      "loss": 2.3119,
      "step": 722
    },
    {
      "epoch": 0.7424903722721438,
      "grad_norm": 1.7420886754989624,
      "learning_rate": 3.1630859745003794e-05,
      "loss": 2.2652,
      "step": 723
    },
    {
      "epoch": 0.7435173299101412,
      "grad_norm": 1.468622088432312,
      "learning_rate": 3.139339708513981e-05,
      "loss": 2.1195,
      "step": 724
    },
    {
      "epoch": 0.7445442875481386,
      "grad_norm": 1.339032530784607,
      "learning_rate": 3.115666306225562e-05,
      "loss": 2.1675,
      "step": 725
    },
    {
      "epoch": 0.745571245186136,
      "grad_norm": 1.675392746925354,
      "learning_rate": 3.092066019058689e-05,
      "loss": 2.2438,
      "step": 726
    },
    {
      "epoch": 0.7465982028241335,
      "grad_norm": 1.2472889423370361,
      "learning_rate": 3.0685390976603945e-05,
      "loss": 2.2506,
      "step": 727
    },
    {
      "epoch": 0.7476251604621309,
      "grad_norm": 1.4236087799072266,
      "learning_rate": 3.0450857918985387e-05,
      "loss": 2.0269,
      "step": 728
    },
    {
      "epoch": 0.7486521181001283,
      "grad_norm": 1.3883495330810547,
      "learning_rate": 3.021706350859147e-05,
      "loss": 2.3359,
      "step": 729
    },
    {
      "epoch": 0.7496790757381258,
      "grad_norm": 1.3572190999984741,
      "learning_rate": 2.998401022843761e-05,
      "loss": 2.0806,
      "step": 730
    },
    {
      "epoch": 0.7507060333761232,
      "grad_norm": 1.2549355030059814,
      "learning_rate": 2.9751700553668072e-05,
      "loss": 2.1428,
      "step": 731
    },
    {
      "epoch": 0.7517329910141207,
      "grad_norm": 1.676809310913086,
      "learning_rate": 2.9520136951529576e-05,
      "loss": 2.4592,
      "step": 732
    },
    {
      "epoch": 0.7517329910141207,
      "eval_loss": 2.252448081970215,
      "eval_runtime": 26.8305,
      "eval_samples_per_second": 15.281,
      "eval_steps_per_second": 7.641,
      "step": 732
    },
    {
      "epoch": 0.7527599486521181,
      "grad_norm": 1.2231281995773315,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 2.0821,
      "step": 733
    },
    {
      "epoch": 0.7537869062901156,
      "grad_norm": 1.4395884275436401,
      "learning_rate": 2.9059257794488424e-05,
      "loss": 2.2183,
      "step": 734
    },
    {
      "epoch": 0.754813863928113,
      "grad_norm": 1.5501489639282227,
      "learning_rate": 2.882994713435658e-05,
      "loss": 2.4913,
      "step": 735
    },
    {
      "epoch": 0.7558408215661104,
      "grad_norm": 1.4353766441345215,
      "learning_rate": 2.860139233634547e-05,
      "loss": 2.2493,
      "step": 736
    },
    {
      "epoch": 0.7568677792041079,
      "grad_norm": 1.2123923301696777,
      "learning_rate": 2.8373595827823086e-05,
      "loss": 2.3579,
      "step": 737
    },
    {
      "epoch": 0.7578947368421053,
      "grad_norm": 1.1628131866455078,
      "learning_rate": 2.8146560028104153e-05,
      "loss": 2.1711,
      "step": 738
    },
    {
      "epoch": 0.7589216944801027,
      "grad_norm": 1.0057865381240845,
      "learning_rate": 2.792028734842418e-05,
      "loss": 1.7073,
      "step": 739
    },
    {
      "epoch": 0.7599486521181001,
      "grad_norm": 1.5157403945922852,
      "learning_rate": 2.7694780191914006e-05,
      "loss": 2.3561,
      "step": 740
    },
    {
      "epoch": 0.7609756097560976,
      "grad_norm": 1.333644151687622,
      "learning_rate": 2.7470040953574238e-05,
      "loss": 2.3166,
      "step": 741
    },
    {
      "epoch": 0.762002567394095,
      "grad_norm": 1.4937711954116821,
      "learning_rate": 2.724607202024969e-05,
      "loss": 2.2235,
      "step": 742
    },
    {
      "epoch": 0.7630295250320924,
      "grad_norm": 1.6282092332839966,
      "learning_rate": 2.7022875770604284e-05,
      "loss": 2.3438,
      "step": 743
    },
    {
      "epoch": 0.7640564826700899,
      "grad_norm": 1.3984050750732422,
      "learning_rate": 2.6800454575095567e-05,
      "loss": 2.2035,
      "step": 744
    },
    {
      "epoch": 0.7650834403080873,
      "grad_norm": 1.3804353475570679,
      "learning_rate": 2.6578810795949682e-05,
      "loss": 2.4949,
      "step": 745
    },
    {
      "epoch": 0.7661103979460847,
      "grad_norm": 1.403314232826233,
      "learning_rate": 2.6357946787136113e-05,
      "loss": 2.4336,
      "step": 746
    },
    {
      "epoch": 0.7671373555840821,
      "grad_norm": 1.34967041015625,
      "learning_rate": 2.613786489434287e-05,
      "loss": 2.2571,
      "step": 747
    },
    {
      "epoch": 0.7681643132220796,
      "grad_norm": 1.5277897119522095,
      "learning_rate": 2.591856745495148e-05,
      "loss": 2.1579,
      "step": 748
    },
    {
      "epoch": 0.769191270860077,
      "grad_norm": 1.5782662630081177,
      "learning_rate": 2.5700056798012163e-05,
      "loss": 2.2695,
      "step": 749
    },
    {
      "epoch": 0.7702182284980744,
      "grad_norm": 1.5577448606491089,
      "learning_rate": 2.548233524421911e-05,
      "loss": 2.234,
      "step": 750
    },
    {
      "epoch": 0.7712451861360718,
      "grad_norm": 1.2873456478118896,
      "learning_rate": 2.5265405105885855e-05,
      "loss": 2.0918,
      "step": 751
    },
    {
      "epoch": 0.7722721437740693,
      "grad_norm": 1.7132203578948975,
      "learning_rate": 2.5049268686920667e-05,
      "loss": 2.2356,
      "step": 752
    },
    {
      "epoch": 0.7732991014120667,
      "grad_norm": 1.3685425519943237,
      "learning_rate": 2.4833928282802132e-05,
      "loss": 2.1547,
      "step": 753
    },
    {
      "epoch": 0.7743260590500641,
      "grad_norm": 1.3102649450302124,
      "learning_rate": 2.461938618055478e-05,
      "loss": 2.1503,
      "step": 754
    },
    {
      "epoch": 0.7753530166880617,
      "grad_norm": 1.4933326244354248,
      "learning_rate": 2.440564465872469e-05,
      "loss": 2.2991,
      "step": 755
    },
    {
      "epoch": 0.7763799743260591,
      "grad_norm": 1.284360647201538,
      "learning_rate": 2.4192705987355424e-05,
      "loss": 2.145,
      "step": 756
    },
    {
      "epoch": 0.7774069319640565,
      "grad_norm": 1.3690736293792725,
      "learning_rate": 2.3980572427963887e-05,
      "loss": 2.2586,
      "step": 757
    },
    {
      "epoch": 0.778433889602054,
      "grad_norm": 1.6226760149002075,
      "learning_rate": 2.3769246233516242e-05,
      "loss": 2.4489,
      "step": 758
    },
    {
      "epoch": 0.7794608472400514,
      "grad_norm": 1.2062180042266846,
      "learning_rate": 2.3558729648404065e-05,
      "loss": 2.013,
      "step": 759
    },
    {
      "epoch": 0.7804878048780488,
      "grad_norm": 1.2239223718643188,
      "learning_rate": 2.33490249084204e-05,
      "loss": 2.2947,
      "step": 760
    },
    {
      "epoch": 0.7815147625160462,
      "grad_norm": 1.7042533159255981,
      "learning_rate": 2.3140134240736168e-05,
      "loss": 2.1473,
      "step": 761
    },
    {
      "epoch": 0.7825417201540437,
      "grad_norm": 1.2973995208740234,
      "learning_rate": 2.2932059863876365e-05,
      "loss": 2.1991,
      "step": 762
    },
    {
      "epoch": 0.7835686777920411,
      "grad_norm": 1.2338628768920898,
      "learning_rate": 2.272480398769662e-05,
      "loss": 2.1853,
      "step": 763
    },
    {
      "epoch": 0.7845956354300385,
      "grad_norm": 2.069164991378784,
      "learning_rate": 2.2518368813359637e-05,
      "loss": 2.2275,
      "step": 764
    },
    {
      "epoch": 0.785622593068036,
      "grad_norm": 1.2554938793182373,
      "learning_rate": 2.231275653331181e-05,
      "loss": 2.2557,
      "step": 765
    },
    {
      "epoch": 0.7866495507060334,
      "grad_norm": 1.2675412893295288,
      "learning_rate": 2.2107969331260048e-05,
      "loss": 2.3231,
      "step": 766
    },
    {
      "epoch": 0.7876765083440308,
      "grad_norm": 1.4239006042480469,
      "learning_rate": 2.1904009382148472e-05,
      "loss": 2.2516,
      "step": 767
    },
    {
      "epoch": 0.7887034659820282,
      "grad_norm": 1.2731049060821533,
      "learning_rate": 2.170087885213541e-05,
      "loss": 2.1673,
      "step": 768
    },
    {
      "epoch": 0.7897304236200257,
      "grad_norm": 1.8812044858932495,
      "learning_rate": 2.1498579898570227e-05,
      "loss": 2.1782,
      "step": 769
    },
    {
      "epoch": 0.7907573812580231,
      "grad_norm": 1.426591396331787,
      "learning_rate": 2.1297114669970618e-05,
      "loss": 1.9592,
      "step": 770
    },
    {
      "epoch": 0.7917843388960205,
      "grad_norm": 1.302981972694397,
      "learning_rate": 2.109648530599968e-05,
      "loss": 2.2618,
      "step": 771
    },
    {
      "epoch": 0.7928112965340179,
      "grad_norm": 1.8140078783035278,
      "learning_rate": 2.089669393744319e-05,
      "loss": 2.3927,
      "step": 772
    },
    {
      "epoch": 0.7938382541720154,
      "grad_norm": 2.2119557857513428,
      "learning_rate": 2.0697742686187017e-05,
      "loss": 2.2843,
      "step": 773
    },
    {
      "epoch": 0.7948652118100128,
      "grad_norm": 1.7960866689682007,
      "learning_rate": 2.049963366519446e-05,
      "loss": 2.4243,
      "step": 774
    },
    {
      "epoch": 0.7958921694480102,
      "grad_norm": 1.459788203239441,
      "learning_rate": 2.030236897848402e-05,
      "loss": 2.2595,
      "step": 775
    },
    {
      "epoch": 0.7969191270860077,
      "grad_norm": 1.232600450515747,
      "learning_rate": 2.0105950721106894e-05,
      "loss": 2.1295,
      "step": 776
    },
    {
      "epoch": 0.7979460847240052,
      "grad_norm": 1.4180529117584229,
      "learning_rate": 1.9910380979124754e-05,
      "loss": 2.1814,
      "step": 777
    },
    {
      "epoch": 0.7989730423620026,
      "grad_norm": 1.501356601715088,
      "learning_rate": 1.971566182958765e-05,
      "loss": 2.4455,
      "step": 778
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.6094731092453003,
      "learning_rate": 1.952179534051183e-05,
      "loss": 2.3726,
      "step": 779
    },
    {
      "epoch": 0.8010269576379975,
      "grad_norm": 1.3195991516113281,
      "learning_rate": 1.9328783570857957e-05,
      "loss": 2.2679,
      "step": 780
    },
    {
      "epoch": 0.8020539152759949,
      "grad_norm": 1.4381358623504639,
      "learning_rate": 1.9136628570509063e-05,
      "loss": 2.0012,
      "step": 781
    },
    {
      "epoch": 0.8030808729139923,
      "grad_norm": 1.612783670425415,
      "learning_rate": 1.8945332380248913e-05,
      "loss": 2.2622,
      "step": 782
    },
    {
      "epoch": 0.8041078305519898,
      "grad_norm": 1.2924425601959229,
      "learning_rate": 1.8754897031740192e-05,
      "loss": 2.0476,
      "step": 783
    },
    {
      "epoch": 0.8051347881899872,
      "grad_norm": 1.5139434337615967,
      "learning_rate": 1.856532454750307e-05,
      "loss": 2.0912,
      "step": 784
    },
    {
      "epoch": 0.8061617458279846,
      "grad_norm": 1.4951143264770508,
      "learning_rate": 1.8376616940893654e-05,
      "loss": 2.3838,
      "step": 785
    },
    {
      "epoch": 0.807188703465982,
      "grad_norm": 1.432755947113037,
      "learning_rate": 1.8188776216082603e-05,
      "loss": 2.2717,
      "step": 786
    },
    {
      "epoch": 0.8082156611039795,
      "grad_norm": 1.244824767112732,
      "learning_rate": 1.800180436803386e-05,
      "loss": 1.8481,
      "step": 787
    },
    {
      "epoch": 0.8092426187419769,
      "grad_norm": 1.570754885673523,
      "learning_rate": 1.7815703382483417e-05,
      "loss": 2.3096,
      "step": 788
    },
    {
      "epoch": 0.8102695763799743,
      "grad_norm": 1.3875128030776978,
      "learning_rate": 1.7630475235918308e-05,
      "loss": 2.0865,
      "step": 789
    },
    {
      "epoch": 0.8112965340179717,
      "grad_norm": 1.4651671648025513,
      "learning_rate": 1.7446121895555555e-05,
      "loss": 2.4152,
      "step": 790
    },
    {
      "epoch": 0.8123234916559692,
      "grad_norm": 1.1608806848526,
      "learning_rate": 1.7262645319321324e-05,
      "loss": 2.214,
      "step": 791
    },
    {
      "epoch": 0.8133504492939666,
      "grad_norm": 1.622000813484192,
      "learning_rate": 1.708004745583003e-05,
      "loss": 2.2874,
      "step": 792
    },
    {
      "epoch": 0.814377406931964,
      "grad_norm": 1.3545804023742676,
      "learning_rate": 1.689833024436377e-05,
      "loss": 2.3003,
      "step": 793
    },
    {
      "epoch": 0.8154043645699615,
      "grad_norm": 1.4039793014526367,
      "learning_rate": 1.6717495614851652e-05,
      "loss": 1.8358,
      "step": 794
    },
    {
      "epoch": 0.8164313222079589,
      "grad_norm": 1.3507652282714844,
      "learning_rate": 1.6537545487849336e-05,
      "loss": 2.2534,
      "step": 795
    },
    {
      "epoch": 0.8174582798459563,
      "grad_norm": 1.4168148040771484,
      "learning_rate": 1.6358481774518606e-05,
      "loss": 2.2372,
      "step": 796
    },
    {
      "epoch": 0.8184852374839537,
      "grad_norm": 1.266169786453247,
      "learning_rate": 1.6180306376607035e-05,
      "loss": 2.387,
      "step": 797
    },
    {
      "epoch": 0.8195121951219512,
      "grad_norm": 1.277815818786621,
      "learning_rate": 1.6003021186427893e-05,
      "loss": 2.2015,
      "step": 798
    },
    {
      "epoch": 0.8205391527599486,
      "grad_norm": 1.7137157917022705,
      "learning_rate": 1.5826628086839968e-05,
      "loss": 2.1036,
      "step": 799
    },
    {
      "epoch": 0.8215661103979461,
      "grad_norm": 1.4451020956039429,
      "learning_rate": 1.5651128951227612e-05,
      "loss": 2.0347,
      "step": 800
    },
    {
      "epoch": 0.8225930680359436,
      "grad_norm": 1.3843578100204468,
      "learning_rate": 1.547652564348082e-05,
      "loss": 2.1694,
      "step": 801
    },
    {
      "epoch": 0.823620025673941,
      "grad_norm": 1.3804799318313599,
      "learning_rate": 1.5302820017975394e-05,
      "loss": 2.0706,
      "step": 802
    },
    {
      "epoch": 0.8246469833119384,
      "grad_norm": 1.2779897451400757,
      "learning_rate": 1.5130013919553355e-05,
      "loss": 2.2537,
      "step": 803
    },
    {
      "epoch": 0.8256739409499358,
      "grad_norm": 1.3886756896972656,
      "learning_rate": 1.4958109183503243e-05,
      "loss": 2.412,
      "step": 804
    },
    {
      "epoch": 0.8267008985879333,
      "grad_norm": 1.5321778059005737,
      "learning_rate": 1.4787107635540732e-05,
      "loss": 2.0181,
      "step": 805
    },
    {
      "epoch": 0.8277278562259307,
      "grad_norm": 1.277282953262329,
      "learning_rate": 1.4617011091789135e-05,
      "loss": 2.2371,
      "step": 806
    },
    {
      "epoch": 0.8287548138639281,
      "grad_norm": 1.8173282146453857,
      "learning_rate": 1.4447821358760127e-05,
      "loss": 2.3445,
      "step": 807
    },
    {
      "epoch": 0.8297817715019256,
      "grad_norm": 2.2534754276275635,
      "learning_rate": 1.4279540233334665e-05,
      "loss": 2.3177,
      "step": 808
    },
    {
      "epoch": 0.830808729139923,
      "grad_norm": 1.2119274139404297,
      "learning_rate": 1.4112169502743799e-05,
      "loss": 2.0937,
      "step": 809
    },
    {
      "epoch": 0.8318356867779204,
      "grad_norm": 1.1014364957809448,
      "learning_rate": 1.3945710944549706e-05,
      "loss": 2.0666,
      "step": 810
    },
    {
      "epoch": 0.8328626444159178,
      "grad_norm": 1.0724173784255981,
      "learning_rate": 1.3780166326626876e-05,
      "loss": 2.383,
      "step": 811
    },
    {
      "epoch": 0.8338896020539153,
      "grad_norm": 1.3168890476226807,
      "learning_rate": 1.361553740714323e-05,
      "loss": 2.0837,
      "step": 812
    },
    {
      "epoch": 0.8349165596919127,
      "grad_norm": 1.224159598350525,
      "learning_rate": 1.3451825934541551e-05,
      "loss": 2.3732,
      "step": 813
    },
    {
      "epoch": 0.8359435173299101,
      "grad_norm": 1.204418659210205,
      "learning_rate": 1.3289033647520877e-05,
      "loss": 2.2253,
      "step": 814
    },
    {
      "epoch": 0.8369704749679076,
      "grad_norm": 1.26673424243927,
      "learning_rate": 1.3127162275017957e-05,
      "loss": 2.2487,
      "step": 815
    },
    {
      "epoch": 0.837997432605905,
      "grad_norm": 1.131012201309204,
      "learning_rate": 1.2966213536189032e-05,
      "loss": 1.9738,
      "step": 816
    },
    {
      "epoch": 0.8390243902439024,
      "grad_norm": 1.244232177734375,
      "learning_rate": 1.2806189140391489e-05,
      "loss": 2.1122,
      "step": 817
    },
    {
      "epoch": 0.8400513478818998,
      "grad_norm": 1.446234941482544,
      "learning_rate": 1.2647090787165694e-05,
      "loss": 2.3183,
      "step": 818
    },
    {
      "epoch": 0.8410783055198973,
      "grad_norm": 1.4866114854812622,
      "learning_rate": 1.2488920166217032e-05,
      "loss": 2.273,
      "step": 819
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 1.590286374092102,
      "learning_rate": 1.2331678957397819e-05,
      "loss": 2.2149,
      "step": 820
    },
    {
      "epoch": 0.8431322207958921,
      "grad_norm": 1.6288813352584839,
      "learning_rate": 1.2175368830689593e-05,
      "loss": 2.204,
      "step": 821
    },
    {
      "epoch": 0.8441591784338897,
      "grad_norm": 1.644627571105957,
      "learning_rate": 1.2019991446185309e-05,
      "loss": 2.3602,
      "step": 822
    },
    {
      "epoch": 0.8451861360718871,
      "grad_norm": 3.2406749725341797,
      "learning_rate": 1.186554845407174e-05,
      "loss": 2.2797,
      "step": 823
    },
    {
      "epoch": 0.8462130937098845,
      "grad_norm": 1.201799988746643,
      "learning_rate": 1.1712041494611958e-05,
      "loss": 2.2419,
      "step": 824
    },
    {
      "epoch": 0.8472400513478819,
      "grad_norm": 3.3227691650390625,
      "learning_rate": 1.1559472198127818e-05,
      "loss": 2.3199,
      "step": 825
    },
    {
      "epoch": 0.8482670089858794,
      "grad_norm": 1.7569915056228638,
      "learning_rate": 1.1407842184982786e-05,
      "loss": 2.2096,
      "step": 826
    },
    {
      "epoch": 0.8492939666238768,
      "grad_norm": 1.4387773275375366,
      "learning_rate": 1.125715306556464e-05,
      "loss": 2.2704,
      "step": 827
    },
    {
      "epoch": 0.8503209242618742,
      "grad_norm": 1.222449779510498,
      "learning_rate": 1.1107406440268376e-05,
      "loss": 2.1525,
      "step": 828
    },
    {
      "epoch": 0.8513478818998717,
      "grad_norm": 1.3529927730560303,
      "learning_rate": 1.0958603899479281e-05,
      "loss": 2.3474,
      "step": 829
    },
    {
      "epoch": 0.8523748395378691,
      "grad_norm": 1.2619078159332275,
      "learning_rate": 1.0810747023555878e-05,
      "loss": 2.1297,
      "step": 830
    },
    {
      "epoch": 0.8534017971758665,
      "grad_norm": 1.312843918800354,
      "learning_rate": 1.0663837382813336e-05,
      "loss": 2.1388,
      "step": 831
    },
    {
      "epoch": 0.8544287548138639,
      "grad_norm": 2.356374979019165,
      "learning_rate": 1.0517876537506687e-05,
      "loss": 2.2537,
      "step": 832
    },
    {
      "epoch": 0.8554557124518614,
      "grad_norm": 1.5887354612350464,
      "learning_rate": 1.0372866037814277e-05,
      "loss": 2.1951,
      "step": 833
    },
    {
      "epoch": 0.8564826700898588,
      "grad_norm": 1.3657351732254028,
      "learning_rate": 1.0228807423821263e-05,
      "loss": 2.3044,
      "step": 834
    },
    {
      "epoch": 0.8575096277278562,
      "grad_norm": 1.2927043437957764,
      "learning_rate": 1.0085702225503313e-05,
      "loss": 2.0492,
      "step": 835
    },
    {
      "epoch": 0.8585365853658536,
      "grad_norm": 1.8013806343078613,
      "learning_rate": 9.943551962710362e-06,
      "loss": 2.2268,
      "step": 836
    },
    {
      "epoch": 0.8595635430038511,
      "grad_norm": 1.4299966096878052,
      "learning_rate": 9.802358145150425e-06,
      "loss": 2.404,
      "step": 837
    },
    {
      "epoch": 0.8605905006418485,
      "grad_norm": 1.9017884731292725,
      "learning_rate": 9.662122272373575e-06,
      "loss": 2.2539,
      "step": 838
    },
    {
      "epoch": 0.8616174582798459,
      "grad_norm": 1.2827292680740356,
      "learning_rate": 9.522845833756e-06,
      "loss": 2.1102,
      "step": 839
    },
    {
      "epoch": 0.8626444159178434,
      "grad_norm": 1.567421555519104,
      "learning_rate": 9.384530308484273e-06,
      "loss": 2.4192,
      "step": 840
    },
    {
      "epoch": 0.8636713735558408,
      "grad_norm": 1.4824482202529907,
      "learning_rate": 9.247177165539556e-06,
      "loss": 2.3457,
      "step": 841
    },
    {
      "epoch": 0.8646983311938382,
      "grad_norm": 2.1519415378570557,
      "learning_rate": 9.110787863682002e-06,
      "loss": 2.2903,
      "step": 842
    },
    {
      "epoch": 0.8657252888318356,
      "grad_norm": 1.3354597091674805,
      "learning_rate": 8.97536385143527e-06,
      "loss": 2.2387,
      "step": 843
    },
    {
      "epoch": 0.8667522464698331,
      "grad_norm": 1.363411784172058,
      "learning_rate": 8.840906567071194e-06,
      "loss": 2.3414,
      "step": 844
    },
    {
      "epoch": 0.8677792041078306,
      "grad_norm": 1.2312959432601929,
      "learning_rate": 8.707417438594445e-06,
      "loss": 2.1462,
      "step": 845
    },
    {
      "epoch": 0.868806161745828,
      "grad_norm": 1.4682419300079346,
      "learning_rate": 8.574897883727384e-06,
      "loss": 2.3364,
      "step": 846
    },
    {
      "epoch": 0.8698331193838255,
      "grad_norm": 1.9188342094421387,
      "learning_rate": 8.443349309895032e-06,
      "loss": 2.1837,
      "step": 847
    },
    {
      "epoch": 0.8708600770218229,
      "grad_norm": 1.4827263355255127,
      "learning_rate": 8.312773114210049e-06,
      "loss": 2.2226,
      "step": 848
    },
    {
      "epoch": 0.8718870346598203,
      "grad_norm": 1.3923516273498535,
      "learning_rate": 8.183170683457986e-06,
      "loss": 2.4293,
      "step": 849
    },
    {
      "epoch": 0.8729139922978177,
      "grad_norm": 1.3553410768508911,
      "learning_rate": 8.054543394082504e-06,
      "loss": 2.0766,
      "step": 850
    },
    {
      "epoch": 0.8739409499358152,
      "grad_norm": 3.3262252807617188,
      "learning_rate": 7.926892612170777e-06,
      "loss": 2.2739,
      "step": 851
    },
    {
      "epoch": 0.8749679075738126,
      "grad_norm": 1.1720492839813232,
      "learning_rate": 7.800219693438981e-06,
      "loss": 2.1775,
      "step": 852
    },
    {
      "epoch": 0.87599486521181,
      "grad_norm": 1.3928182125091553,
      "learning_rate": 7.674525983217828e-06,
      "loss": 2.1329,
      "step": 853
    },
    {
      "epoch": 0.8770218228498075,
      "grad_norm": 2.7582273483276367,
      "learning_rate": 7.5498128164383955e-06,
      "loss": 2.2974,
      "step": 854
    },
    {
      "epoch": 0.8780487804878049,
      "grad_norm": 1.4647380113601685,
      "learning_rate": 7.426081517617889e-06,
      "loss": 2.532,
      "step": 855
    },
    {
      "epoch": 0.8790757381258023,
      "grad_norm": 1.2066142559051514,
      "learning_rate": 7.30333340084558e-06,
      "loss": 2.1876,
      "step": 856
    },
    {
      "epoch": 0.8801026957637997,
      "grad_norm": 1.2079867124557495,
      "learning_rate": 7.181569769768792e-06,
      "loss": 2.201,
      "step": 857
    },
    {
      "epoch": 0.8811296534017972,
      "grad_norm": 1.4736223220825195,
      "learning_rate": 7.0607919175791796e-06,
      "loss": 2.2242,
      "step": 858
    },
    {
      "epoch": 0.8821566110397946,
      "grad_norm": 1.428174376487732,
      "learning_rate": 6.941001126998892e-06,
      "loss": 2.3792,
      "step": 859
    },
    {
      "epoch": 0.883183568677792,
      "grad_norm": 1.695448875427246,
      "learning_rate": 6.822198670266988e-06,
      "loss": 2.3633,
      "step": 860
    },
    {
      "epoch": 0.8842105263157894,
      "grad_norm": 1.5708576440811157,
      "learning_rate": 6.7043858091259235e-06,
      "loss": 2.297,
      "step": 861
    },
    {
      "epoch": 0.8852374839537869,
      "grad_norm": 1.4929567575454712,
      "learning_rate": 6.587563794808127e-06,
      "loss": 2.3336,
      "step": 862
    },
    {
      "epoch": 0.8862644415917843,
      "grad_norm": 1.305184245109558,
      "learning_rate": 6.471733868022744e-06,
      "loss": 2.4024,
      "step": 863
    },
    {
      "epoch": 0.8872913992297817,
      "grad_norm": 1.4627037048339844,
      "learning_rate": 6.356897258942451e-06,
      "loss": 2.1668,
      "step": 864
    },
    {
      "epoch": 0.8883183568677792,
      "grad_norm": 1.36262845993042,
      "learning_rate": 6.243055187190383e-06,
      "loss": 2.0239,
      "step": 865
    },
    {
      "epoch": 0.8893453145057766,
      "grad_norm": 1.3705987930297852,
      "learning_rate": 6.130208861827202e-06,
      "loss": 2.1388,
      "step": 866
    },
    {
      "epoch": 0.8903722721437741,
      "grad_norm": 1.4911210536956787,
      "learning_rate": 6.018359481338176e-06,
      "loss": 2.3255,
      "step": 867
    },
    {
      "epoch": 0.8913992297817716,
      "grad_norm": 1.2989243268966675,
      "learning_rate": 5.907508233620573e-06,
      "loss": 2.2913,
      "step": 868
    },
    {
      "epoch": 0.892426187419769,
      "grad_norm": 1.4845342636108398,
      "learning_rate": 5.797656295970955e-06,
      "loss": 2.3531,
      "step": 869
    },
    {
      "epoch": 0.8934531450577664,
      "grad_norm": 1.3806096315383911,
      "learning_rate": 5.688804835072748e-06,
      "loss": 2.1876,
      "step": 870
    },
    {
      "epoch": 0.8944801026957638,
      "grad_norm": 1.2086135149002075,
      "learning_rate": 5.580955006983735e-06,
      "loss": 2.2474,
      "step": 871
    },
    {
      "epoch": 0.8955070603337613,
      "grad_norm": 1.488832712173462,
      "learning_rate": 5.474107957123886e-06,
      "loss": 2.2586,
      "step": 872
    },
    {
      "epoch": 0.8965340179717587,
      "grad_norm": 2.129424571990967,
      "learning_rate": 5.3682648202631695e-06,
      "loss": 2.2862,
      "step": 873
    },
    {
      "epoch": 0.8975609756097561,
      "grad_norm": 1.4406683444976807,
      "learning_rate": 5.263426720509468e-06,
      "loss": 1.9062,
      "step": 874
    },
    {
      "epoch": 0.8985879332477535,
      "grad_norm": 1.2508238554000854,
      "learning_rate": 5.159594771296683e-06,
      "loss": 2.0821,
      "step": 875
    },
    {
      "epoch": 0.899614890885751,
      "grad_norm": 1.6048517227172852,
      "learning_rate": 5.056770075372841e-06,
      "loss": 2.0023,
      "step": 876
    },
    {
      "epoch": 0.9006418485237484,
      "grad_norm": 1.4277167320251465,
      "learning_rate": 4.954953724788469e-06,
      "loss": 2.3978,
      "step": 877
    },
    {
      "epoch": 0.9016688061617458,
      "grad_norm": 1.236256718635559,
      "learning_rate": 4.8541468008849285e-06,
      "loss": 2.0407,
      "step": 878
    },
    {
      "epoch": 0.9026957637997433,
      "grad_norm": 1.405112624168396,
      "learning_rate": 4.754350374283001e-06,
      "loss": 2.2371,
      "step": 879
    },
    {
      "epoch": 0.9037227214377407,
      "grad_norm": 1.4664572477340698,
      "learning_rate": 4.6555655048713955e-06,
      "loss": 2.4453,
      "step": 880
    },
    {
      "epoch": 0.9047496790757381,
      "grad_norm": 1.3141064643859863,
      "learning_rate": 4.5577932417956495e-06,
      "loss": 2.1578,
      "step": 881
    },
    {
      "epoch": 0.9057766367137355,
      "grad_norm": 1.4012268781661987,
      "learning_rate": 4.461034623446847e-06,
      "loss": 2.4731,
      "step": 882
    },
    {
      "epoch": 0.906803594351733,
      "grad_norm": 1.1742385625839233,
      "learning_rate": 4.3652906774506955e-06,
      "loss": 2.1617,
      "step": 883
    },
    {
      "epoch": 0.9078305519897304,
      "grad_norm": 1.2050607204437256,
      "learning_rate": 4.270562420656543e-06,
      "loss": 2.2211,
      "step": 884
    },
    {
      "epoch": 0.9088575096277278,
      "grad_norm": 1.6956837177276611,
      "learning_rate": 4.176850859126591e-06,
      "loss": 2.3175,
      "step": 885
    },
    {
      "epoch": 0.9098844672657252,
      "grad_norm": 1.1956061124801636,
      "learning_rate": 4.084156988125231e-06,
      "loss": 2.2572,
      "step": 886
    },
    {
      "epoch": 0.9109114249037227,
      "grad_norm": 1.2421371936798096,
      "learning_rate": 3.992481792108493e-06,
      "loss": 2.0939,
      "step": 887
    },
    {
      "epoch": 0.9119383825417201,
      "grad_norm": 1.1449053287506104,
      "learning_rate": 3.901826244713525e-06,
      "loss": 2.0271,
      "step": 888
    },
    {
      "epoch": 0.9129653401797175,
      "grad_norm": 1.5248461961746216,
      "learning_rate": 3.812191308748303e-06,
      "loss": 2.3376,
      "step": 889
    },
    {
      "epoch": 0.9139922978177151,
      "grad_norm": 1.607293725013733,
      "learning_rate": 3.723577936181366e-06,
      "loss": 2.3735,
      "step": 890
    },
    {
      "epoch": 0.9150192554557125,
      "grad_norm": 1.536257266998291,
      "learning_rate": 3.6359870681317743e-06,
      "loss": 2.2889,
      "step": 891
    },
    {
      "epoch": 0.9160462130937099,
      "grad_norm": 1.1060168743133545,
      "learning_rate": 3.5494196348590415e-06,
      "loss": 1.8348,
      "step": 892
    },
    {
      "epoch": 0.9170731707317074,
      "grad_norm": 1.3008726835250854,
      "learning_rate": 3.4638765557532983e-06,
      "loss": 2.2412,
      "step": 893
    },
    {
      "epoch": 0.9181001283697048,
      "grad_norm": 1.4789313077926636,
      "learning_rate": 3.3793587393255e-06,
      "loss": 2.1869,
      "step": 894
    },
    {
      "epoch": 0.9191270860077022,
      "grad_norm": 1.9860131740570068,
      "learning_rate": 3.295867083197801e-06,
      "loss": 2.3982,
      "step": 895
    },
    {
      "epoch": 0.9201540436456996,
      "grad_norm": 1.4276963472366333,
      "learning_rate": 3.213402474093996e-06,
      "loss": 2.4385,
      "step": 896
    },
    {
      "epoch": 0.9211810012836971,
      "grad_norm": 1.2232952117919922,
      "learning_rate": 3.131965787830149e-06,
      "loss": 2.1511,
      "step": 897
    },
    {
      "epoch": 0.9222079589216945,
      "grad_norm": 1.288060188293457,
      "learning_rate": 3.0515578893052344e-06,
      "loss": 2.2703,
      "step": 898
    },
    {
      "epoch": 0.9232349165596919,
      "grad_norm": 2.902099847793579,
      "learning_rate": 2.9721796324919893e-06,
      "loss": 2.4419,
      "step": 899
    },
    {
      "epoch": 0.9242618741976893,
      "grad_norm": 1.4718948602676392,
      "learning_rate": 2.8938318604278314e-06,
      "loss": 2.2219,
      "step": 900
    },
    {
      "epoch": 0.9252888318356868,
      "grad_norm": 1.2647035121917725,
      "learning_rate": 2.8165154052058997e-06,
      "loss": 1.9561,
      "step": 901
    },
    {
      "epoch": 0.9263157894736842,
      "grad_norm": 1.5724482536315918,
      "learning_rate": 2.7402310879662497e-06,
      "loss": 2.2152,
      "step": 902
    },
    {
      "epoch": 0.9273427471116816,
      "grad_norm": 1.4992693662643433,
      "learning_rate": 2.664979718887073e-06,
      "loss": 2.3404,
      "step": 903
    },
    {
      "epoch": 0.9283697047496791,
      "grad_norm": 1.3174737691879272,
      "learning_rate": 2.590762097176136e-06,
      "loss": 2.1052,
      "step": 904
    },
    {
      "epoch": 0.9293966623876765,
      "grad_norm": 1.3780856132507324,
      "learning_rate": 2.517579011062299e-06,
      "loss": 2.1754,
      "step": 905
    },
    {
      "epoch": 0.9304236200256739,
      "grad_norm": 1.4942290782928467,
      "learning_rate": 2.44543123778711e-06,
      "loss": 2.2278,
      "step": 906
    },
    {
      "epoch": 0.9314505776636713,
      "grad_norm": 1.3182977437973022,
      "learning_rate": 2.3743195435966036e-06,
      "loss": 2.416,
      "step": 907
    },
    {
      "epoch": 0.9324775353016688,
      "grad_norm": 1.462436318397522,
      "learning_rate": 2.304244683733059e-06,
      "loss": 2.4912,
      "step": 908
    },
    {
      "epoch": 0.9335044929396662,
      "grad_norm": 1.334877610206604,
      "learning_rate": 2.2352074024271195e-06,
      "loss": 2.1424,
      "step": 909
    },
    {
      "epoch": 0.9345314505776636,
      "grad_norm": 1.6479849815368652,
      "learning_rate": 2.167208432889789e-06,
      "loss": 2.1548,
      "step": 910
    },
    {
      "epoch": 0.935558408215661,
      "grad_norm": 1.6695164442062378,
      "learning_rate": 2.1002484973046577e-06,
      "loss": 2.2078,
      "step": 911
    },
    {
      "epoch": 0.9365853658536586,
      "grad_norm": 1.5034846067428589,
      "learning_rate": 2.034328306820288e-06,
      "loss": 2.3183,
      "step": 912
    },
    {
      "epoch": 0.937612323491656,
      "grad_norm": 1.3325846195220947,
      "learning_rate": 1.969448561542553e-06,
      "loss": 2.1897,
      "step": 913
    },
    {
      "epoch": 0.9386392811296534,
      "grad_norm": 1.5285733938217163,
      "learning_rate": 1.9056099505273427e-06,
      "loss": 2.4954,
      "step": 914
    },
    {
      "epoch": 0.9396662387676509,
      "grad_norm": 1.4107942581176758,
      "learning_rate": 1.8428131517731373e-06,
      "loss": 2.4118,
      "step": 915
    },
    {
      "epoch": 0.9406931964056483,
      "grad_norm": 1.5880571603775024,
      "learning_rate": 1.7810588322138222e-06,
      "loss": 2.1729,
      "step": 916
    },
    {
      "epoch": 0.9417201540436457,
      "grad_norm": 1.3712589740753174,
      "learning_rate": 1.7203476477116843e-06,
      "loss": 2.0461,
      "step": 917
    },
    {
      "epoch": 0.9427471116816432,
      "grad_norm": 1.2501283884048462,
      "learning_rate": 1.6606802430503166e-06,
      "loss": 2.2321,
      "step": 918
    },
    {
      "epoch": 0.9437740693196406,
      "grad_norm": 1.9402281045913696,
      "learning_rate": 1.6020572519278908e-06,
      "loss": 2.2923,
      "step": 919
    },
    {
      "epoch": 0.944801026957638,
      "grad_norm": 1.619081735610962,
      "learning_rate": 1.5444792969503407e-06,
      "loss": 2.4231,
      "step": 920
    },
    {
      "epoch": 0.9458279845956354,
      "grad_norm": 1.4488154649734497,
      "learning_rate": 1.487946989624811e-06,
      "loss": 2.3659,
      "step": 921
    },
    {
      "epoch": 0.9468549422336329,
      "grad_norm": 1.4263108968734741,
      "learning_rate": 1.43246093035313e-06,
      "loss": 2.2714,
      "step": 922
    },
    {
      "epoch": 0.9478818998716303,
      "grad_norm": 1.1710644960403442,
      "learning_rate": 1.3780217084254366e-06,
      "loss": 2.128,
      "step": 923
    },
    {
      "epoch": 0.9489088575096277,
      "grad_norm": 1.3074594736099243,
      "learning_rate": 1.3246299020139185e-06,
      "loss": 2.0772,
      "step": 924
    },
    {
      "epoch": 0.9499358151476252,
      "grad_norm": 1.3507771492004395,
      "learning_rate": 1.2722860781666956e-06,
      "loss": 2.2176,
      "step": 925
    },
    {
      "epoch": 0.9509627727856226,
      "grad_norm": 1.6228179931640625,
      "learning_rate": 1.2209907928017795e-06,
      "loss": 2.3079,
      "step": 926
    },
    {
      "epoch": 0.95198973042362,
      "grad_norm": 1.375373125076294,
      "learning_rate": 1.1707445907011339e-06,
      "loss": 2.1246,
      "step": 927
    },
    {
      "epoch": 0.9530166880616174,
      "grad_norm": 1.8349575996398926,
      "learning_rate": 1.1215480055049798e-06,
      "loss": 2.2632,
      "step": 928
    },
    {
      "epoch": 0.9540436456996149,
      "grad_norm": 1.3294469118118286,
      "learning_rate": 1.073401559706022e-06,
      "loss": 2.1936,
      "step": 929
    },
    {
      "epoch": 0.9550706033376123,
      "grad_norm": 1.4897345304489136,
      "learning_rate": 1.0263057646440199e-06,
      "loss": 2.2939,
      "step": 930
    },
    {
      "epoch": 0.9560975609756097,
      "grad_norm": 1.434954047203064,
      "learning_rate": 9.802611205002032e-07,
      "loss": 2.1473,
      "step": 931
    },
    {
      "epoch": 0.9571245186136071,
      "grad_norm": 1.6770613193511963,
      "learning_rate": 9.352681162920984e-07,
      "loss": 2.2041,
      "step": 932
    },
    {
      "epoch": 0.9581514762516046,
      "grad_norm": 1.771236538887024,
      "learning_rate": 8.913272298682773e-07,
      "loss": 2.1005,
      "step": 933
    },
    {
      "epoch": 0.959178433889602,
      "grad_norm": 1.6060534715652466,
      "learning_rate": 8.484389279032834e-07,
      "loss": 2.3045,
      "step": 934
    },
    {
      "epoch": 0.9602053915275995,
      "grad_norm": 1.5178483724594116,
      "learning_rate": 8.066036658926579e-07,
      "loss": 2.2922,
      "step": 935
    },
    {
      "epoch": 0.961232349165597,
      "grad_norm": 1.6939451694488525,
      "learning_rate": 7.658218881481439e-07,
      "loss": 2.2083,
      "step": 936
    },
    {
      "epoch": 0.9622593068035944,
      "grad_norm": 1.2474678754806519,
      "learning_rate": 7.260940277929451e-07,
      "loss": 2.1818,
      "step": 937
    },
    {
      "epoch": 0.9632862644415918,
      "grad_norm": 1.459787368774414,
      "learning_rate": 6.874205067571083e-07,
      "loss": 2.3195,
      "step": 938
    },
    {
      "epoch": 0.9643132220795892,
      "grad_norm": 1.278882384300232,
      "learning_rate": 6.498017357731034e-07,
      "loss": 2.1238,
      "step": 939
    },
    {
      "epoch": 0.9653401797175867,
      "grad_norm": 1.3337883949279785,
      "learning_rate": 6.132381143713728e-07,
      "loss": 2.1721,
      "step": 940
    },
    {
      "epoch": 0.9663671373555841,
      "grad_norm": 1.6333835124969482,
      "learning_rate": 5.777300308761446e-07,
      "loss": 2.2194,
      "step": 941
    },
    {
      "epoch": 0.9673940949935815,
      "grad_norm": 1.384678602218628,
      "learning_rate": 5.432778624013257e-07,
      "loss": 2.4414,
      "step": 942
    },
    {
      "epoch": 0.968421052631579,
      "grad_norm": 1.3769394159317017,
      "learning_rate": 5.098819748464378e-07,
      "loss": 2.3543,
      "step": 943
    },
    {
      "epoch": 0.9694480102695764,
      "grad_norm": 1.353691577911377,
      "learning_rate": 4.775427228927765e-07,
      "loss": 2.1787,
      "step": 944
    },
    {
      "epoch": 0.9704749679075738,
      "grad_norm": 1.647281289100647,
      "learning_rate": 4.462604499996248e-07,
      "loss": 2.2027,
      "step": 945
    },
    {
      "epoch": 0.9715019255455712,
      "grad_norm": 1.2839142084121704,
      "learning_rate": 4.1603548840062345e-07,
      "loss": 2.1197,
      "step": 946
    },
    {
      "epoch": 0.9725288831835687,
      "grad_norm": 1.1656153202056885,
      "learning_rate": 3.8686815910021767e-07,
      "loss": 2.1826,
      "step": 947
    },
    {
      "epoch": 0.9735558408215661,
      "grad_norm": 1.508293628692627,
      "learning_rate": 3.5875877187024896e-07,
      "loss": 2.4654,
      "step": 948
    },
    {
      "epoch": 0.9745827984595635,
      "grad_norm": 1.6085854768753052,
      "learning_rate": 3.317076252467133e-07,
      "loss": 2.2999,
      "step": 949
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 1.520034670829773,
      "learning_rate": 3.0571500652651907e-07,
      "loss": 1.7566,
      "step": 950
    },
    {
      "epoch": 0.9766367137355584,
      "grad_norm": 1.2050639390945435,
      "learning_rate": 2.807811917644898e-07,
      "loss": 2.2555,
      "step": 951
    },
    {
      "epoch": 0.9776636713735558,
      "grad_norm": 1.7655271291732788,
      "learning_rate": 2.5690644577039956e-07,
      "loss": 2.2677,
      "step": 952
    },
    {
      "epoch": 0.9786906290115532,
      "grad_norm": 1.795967698097229,
      "learning_rate": 2.340910221061754e-07,
      "loss": 2.399,
      "step": 953
    },
    {
      "epoch": 0.9797175866495507,
      "grad_norm": 1.9675099849700928,
      "learning_rate": 2.1233516308323264e-07,
      "loss": 2.2401,
      "step": 954
    },
    {
      "epoch": 0.9807445442875481,
      "grad_norm": 1.5977956056594849,
      "learning_rate": 1.9163909975982164e-07,
      "loss": 1.8277,
      "step": 955
    },
    {
      "epoch": 0.9817715019255455,
      "grad_norm": 1.4226264953613281,
      "learning_rate": 1.7200305193866284e-07,
      "loss": 2.127,
      "step": 956
    },
    {
      "epoch": 0.9827984595635431,
      "grad_norm": 1.2429808378219604,
      "learning_rate": 1.534272281645488e-07,
      "loss": 2.3365,
      "step": 957
    },
    {
      "epoch": 0.9838254172015405,
      "grad_norm": 1.4925501346588135,
      "learning_rate": 1.359118257221903e-07,
      "loss": 2.2891,
      "step": 958
    },
    {
      "epoch": 0.9848523748395379,
      "grad_norm": 1.5961631536483765,
      "learning_rate": 1.1945703063402924e-07,
      "loss": 2.2596,
      "step": 959
    },
    {
      "epoch": 0.9858793324775353,
      "grad_norm": 1.4775968790054321,
      "learning_rate": 1.0406301765837345e-07,
      "loss": 2.2753,
      "step": 960
    },
    {
      "epoch": 0.9869062901155328,
      "grad_norm": 1.4262019395828247,
      "learning_rate": 8.972995028745379e-08,
      "loss": 2.2151,
      "step": 961
    },
    {
      "epoch": 0.9879332477535302,
      "grad_norm": 1.8904521465301514,
      "learning_rate": 7.645798074572552e-08,
      "loss": 2.3732,
      "step": 962
    },
    {
      "epoch": 0.9889602053915276,
      "grad_norm": 1.2109746932983398,
      "learning_rate": 6.424724998825848e-08,
      "loss": 2.2444,
      "step": 963
    },
    {
      "epoch": 0.989987163029525,
      "grad_norm": 1.230819582939148,
      "learning_rate": 5.3097887699193885e-08,
      "loss": 2.1996,
      "step": 964
    },
    {
      "epoch": 0.9910141206675225,
      "grad_norm": 1.2803691625595093,
      "learning_rate": 4.3010012290445324e-08,
      "loss": 2.3936,
      "step": 965
    },
    {
      "epoch": 0.9920410783055199,
      "grad_norm": 1.3575743436813354,
      "learning_rate": 3.3983730900377655e-08,
      "loss": 2.3142,
      "step": 966
    },
    {
      "epoch": 0.9930680359435173,
      "grad_norm": 1.3123668432235718,
      "learning_rate": 2.601913939266343e-08,
      "loss": 2.158,
      "step": 967
    },
    {
      "epoch": 0.9940949935815148,
      "grad_norm": 1.5591670274734497,
      "learning_rate": 1.9116322355339222e-08,
      "loss": 2.1941,
      "step": 968
    },
    {
      "epoch": 0.9951219512195122,
      "grad_norm": 1.1492890119552612,
      "learning_rate": 1.3275353099795329e-08,
      "loss": 1.9275,
      "step": 969
    },
    {
      "epoch": 0.9961489088575096,
      "grad_norm": 1.3780685663223267,
      "learning_rate": 8.496293660120724e-09,
      "loss": 2.2059,
      "step": 970
    },
    {
      "epoch": 0.997175866495507,
      "grad_norm": 1.3340226411819458,
      "learning_rate": 4.779194792348119e-09,
      "loss": 2.2606,
      "step": 971
    },
    {
      "epoch": 0.9982028241335045,
      "grad_norm": 1.5144405364990234,
      "learning_rate": 2.124095973954354e-09,
      "loss": 2.181,
      "step": 972
    },
    {
      "epoch": 0.9992297817715019,
      "grad_norm": 1.2708326578140259,
      "learning_rate": 5.310254034274209e-10,
      "loss": 1.9886,
      "step": 973
    },
    {
      "epoch": 1.0005134788189987,
      "grad_norm": 2.398379325866699,
      "learning_rate": 0.0,
      "loss": 3.4023,
      "step": 974
    }
  ],
  "logging_steps": 1,
  "max_steps": 974,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 244,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.7208949874753536e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}