|
{ |
|
"best_metric": 0.723136305809021, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-900", |
|
"epoch": 0.06728190841023855, |
|
"eval_steps": 100, |
|
"global_step": 966, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 6.965000870625109e-05, |
|
"grad_norm": 1.0377936363220215, |
|
"learning_rate": 2e-05, |
|
"loss": 2.3142, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 6.965000870625109e-05, |
|
"eval_loss": 2.563941717147827, |
|
"eval_runtime": 694.6699, |
|
"eval_samples_per_second": 7.198, |
|
"eval_steps_per_second": 1.799, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00013930001741250218, |
|
"grad_norm": 1.5463413000106812, |
|
"learning_rate": 4e-05, |
|
"loss": 2.6219, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00020895002611875328, |
|
"grad_norm": 0.9576510787010193, |
|
"learning_rate": 6e-05, |
|
"loss": 2.4328, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00027860003482500437, |
|
"grad_norm": 0.9721766710281372, |
|
"learning_rate": 8e-05, |
|
"loss": 2.3784, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00034825004353125546, |
|
"grad_norm": 1.0480666160583496, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5397, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00041790005223750655, |
|
"grad_norm": 8.270813941955566, |
|
"learning_rate": 0.00012, |
|
"loss": 2.3505, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.00048755006094375764, |
|
"grad_norm": 0.8568385243415833, |
|
"learning_rate": 0.00014, |
|
"loss": 2.2338, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0005572000696500087, |
|
"grad_norm": 0.7186582088470459, |
|
"learning_rate": 0.00016, |
|
"loss": 2.1361, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0006268500783562598, |
|
"grad_norm": 0.8668791055679321, |
|
"learning_rate": 0.00018, |
|
"loss": 2.1404, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0006965000870625109, |
|
"grad_norm": 0.6644730567932129, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9522, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.000766150095768762, |
|
"grad_norm": 0.7633213996887207, |
|
"learning_rate": 0.00019999946004996418, |
|
"loss": 1.9414, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0008358001044750131, |
|
"grad_norm": 0.8425551652908325, |
|
"learning_rate": 0.00019999784020568754, |
|
"loss": 1.7541, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0009054501131812642, |
|
"grad_norm": 0.7430157661437988, |
|
"learning_rate": 0.00019999514048466284, |
|
"loss": 2.0221, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0009751001218875153, |
|
"grad_norm": 0.6478707194328308, |
|
"learning_rate": 0.00019999136091604434, |
|
"loss": 2.061, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0010447501305937664, |
|
"grad_norm": 0.6233001351356506, |
|
"learning_rate": 0.00019998650154064764, |
|
"loss": 1.7651, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0011144001393000175, |
|
"grad_norm": 0.5294800400733948, |
|
"learning_rate": 0.0001999805624109491, |
|
"loss": 1.8399, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0011840501480062686, |
|
"grad_norm": 0.5066989660263062, |
|
"learning_rate": 0.0001999735435910854, |
|
"loss": 1.9775, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0012537001567125197, |
|
"grad_norm": 0.5490643978118896, |
|
"learning_rate": 0.00019996544515685281, |
|
"loss": 1.7321, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0013233501654187707, |
|
"grad_norm": 0.8576249480247498, |
|
"learning_rate": 0.00019995626719570626, |
|
"loss": 1.7238, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0013930001741250218, |
|
"grad_norm": 0.6412836313247681, |
|
"learning_rate": 0.00019994600980675862, |
|
"loss": 1.9291, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.001462650182831273, |
|
"grad_norm": 0.5579408407211304, |
|
"learning_rate": 0.0001999346731007794, |
|
"loss": 1.7642, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.001532300191537524, |
|
"grad_norm": 0.5578712224960327, |
|
"learning_rate": 0.00019992225720019376, |
|
"loss": 1.7988, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0016019502002437751, |
|
"grad_norm": 0.5795004963874817, |
|
"learning_rate": 0.00019990876223908093, |
|
"loss": 1.8818, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0016716002089500262, |
|
"grad_norm": 0.48596304655075073, |
|
"learning_rate": 0.00019989418836317304, |
|
"loss": 1.7715, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0017412502176562773, |
|
"grad_norm": 0.672593355178833, |
|
"learning_rate": 0.00019987853572985342, |
|
"loss": 1.6647, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0018109002263625284, |
|
"grad_norm": 0.6527593731880188, |
|
"learning_rate": 0.00019986180450815485, |
|
"loss": 1.6806, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0018805502350687795, |
|
"grad_norm": 0.6159369945526123, |
|
"learning_rate": 0.00019984399487875778, |
|
"loss": 1.6252, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0019502002437750306, |
|
"grad_norm": 0.6747246384620667, |
|
"learning_rate": 0.00019982510703398843, |
|
"loss": 1.6697, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0020198502524812817, |
|
"grad_norm": 0.6250666975975037, |
|
"learning_rate": 0.00019980514117781667, |
|
"loss": 1.7791, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0020895002611875328, |
|
"grad_norm": 0.5189153552055359, |
|
"learning_rate": 0.00019978409752585376, |
|
"loss": 1.6126, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.002159150269893784, |
|
"grad_norm": 0.6016886234283447, |
|
"learning_rate": 0.00019976197630535014, |
|
"loss": 1.8378, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.002228800278600035, |
|
"grad_norm": 0.6658028364181519, |
|
"learning_rate": 0.00019973877775519285, |
|
"loss": 1.5398, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.002298450287306286, |
|
"grad_norm": 0.6278268098831177, |
|
"learning_rate": 0.0001997145021259031, |
|
"loss": 1.6046, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.002368100296012537, |
|
"grad_norm": 0.6449319124221802, |
|
"learning_rate": 0.00019968914967963337, |
|
"loss": 1.6949, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.002437750304718788, |
|
"grad_norm": 0.5705320239067078, |
|
"learning_rate": 0.0001996627206901648, |
|
"loss": 1.699, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0025074003134250393, |
|
"grad_norm": 0.698817253112793, |
|
"learning_rate": 0.00019963521544290403, |
|
"loss": 1.3933, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0025770503221312904, |
|
"grad_norm": 0.6723275780677795, |
|
"learning_rate": 0.00019960663423488026, |
|
"loss": 1.3995, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0026467003308375415, |
|
"grad_norm": 0.6986438632011414, |
|
"learning_rate": 0.00019957697737474196, |
|
"loss": 1.5379, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0027163503395437926, |
|
"grad_norm": 0.7800816297531128, |
|
"learning_rate": 0.0001995462451827536, |
|
"loss": 1.5991, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0027860003482500437, |
|
"grad_norm": 0.7049386501312256, |
|
"learning_rate": 0.00019951443799079215, |
|
"loss": 1.4532, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0028556503569562948, |
|
"grad_norm": 0.7777565717697144, |
|
"learning_rate": 0.0001994815561423435, |
|
"loss": 1.8033, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.002925300365662546, |
|
"grad_norm": 0.7464177012443542, |
|
"learning_rate": 0.00019944759999249872, |
|
"loss": 1.5926, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.002994950374368797, |
|
"grad_norm": 0.5269952416419983, |
|
"learning_rate": 0.0001994125699079503, |
|
"loss": 1.7358, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.003064600383075048, |
|
"grad_norm": 0.6710164546966553, |
|
"learning_rate": 0.00019937646626698823, |
|
"loss": 1.3895, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.003134250391781299, |
|
"grad_norm": 0.6867531538009644, |
|
"learning_rate": 0.00019933928945949564, |
|
"loss": 1.3977, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0032039004004875502, |
|
"grad_norm": 0.6760386824607849, |
|
"learning_rate": 0.000199301039886945, |
|
"loss": 1.5334, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.0032735504091938013, |
|
"grad_norm": 0.8017314076423645, |
|
"learning_rate": 0.0001992617179623934, |
|
"loss": 1.3485, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0033432004179000524, |
|
"grad_norm": 0.8706843256950378, |
|
"learning_rate": 0.00019922132411047833, |
|
"loss": 1.6267, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0034128504266063035, |
|
"grad_norm": 0.8783407807350159, |
|
"learning_rate": 0.0001991798587674131, |
|
"loss": 1.5161, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0034825004353125546, |
|
"grad_norm": 0.7308568954467773, |
|
"learning_rate": 0.0001991373223809819, |
|
"loss": 1.6129, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0035521504440188057, |
|
"grad_norm": 0.7637537717819214, |
|
"learning_rate": 0.00019909371541053524, |
|
"loss": 1.5135, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0036218004527250568, |
|
"grad_norm": 0.7845759987831116, |
|
"learning_rate": 0.00019904903832698484, |
|
"loss": 1.5176, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.003691450461431308, |
|
"grad_norm": 0.7081618309020996, |
|
"learning_rate": 0.0001990032916127985, |
|
"loss": 1.5891, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.003761100470137559, |
|
"grad_norm": 0.7322244048118591, |
|
"learning_rate": 0.00019895647576199506, |
|
"loss": 1.3892, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.00383075047884381, |
|
"grad_norm": 0.8247037529945374, |
|
"learning_rate": 0.0001989085912801389, |
|
"loss": 1.229, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.003900400487550061, |
|
"grad_norm": 0.7730288505554199, |
|
"learning_rate": 0.00019885963868433463, |
|
"loss": 1.4962, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.003970050496256312, |
|
"grad_norm": 0.8732311129570007, |
|
"learning_rate": 0.00019880961850322128, |
|
"loss": 1.507, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.004039700504962563, |
|
"grad_norm": 0.7709734439849854, |
|
"learning_rate": 0.00019875853127696692, |
|
"loss": 1.5573, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.004109350513668814, |
|
"grad_norm": 0.6652419567108154, |
|
"learning_rate": 0.00019870637755726244, |
|
"loss": 1.4967, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0041790005223750655, |
|
"grad_norm": 0.7002225518226624, |
|
"learning_rate": 0.000198653157907316, |
|
"loss": 1.6385, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.004248650531081317, |
|
"grad_norm": 0.7703307867050171, |
|
"learning_rate": 0.00019859887290184656, |
|
"loss": 1.4653, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.004318300539787568, |
|
"grad_norm": 0.7544863820075989, |
|
"learning_rate": 0.00019854352312707798, |
|
"loss": 1.492, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.004387950548493819, |
|
"grad_norm": 0.8162996768951416, |
|
"learning_rate": 0.00019848710918073247, |
|
"loss": 1.1976, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.00445760055720007, |
|
"grad_norm": 0.5825150012969971, |
|
"learning_rate": 0.00019842963167202433, |
|
"loss": 1.4162, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.004527250565906321, |
|
"grad_norm": 0.6794354319572449, |
|
"learning_rate": 0.00019837109122165317, |
|
"loss": 1.4261, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.004596900574612572, |
|
"grad_norm": 0.724295437335968, |
|
"learning_rate": 0.0001983114884617974, |
|
"loss": 1.4105, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.004666550583318823, |
|
"grad_norm": 0.8312812447547913, |
|
"learning_rate": 0.00019825082403610725, |
|
"loss": 1.4328, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.004736200592025074, |
|
"grad_norm": 0.7422550320625305, |
|
"learning_rate": 0.0001981890985996979, |
|
"loss": 1.4478, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.004805850600731325, |
|
"grad_norm": 0.8899093866348267, |
|
"learning_rate": 0.00019812631281914233, |
|
"loss": 1.1302, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.004875500609437576, |
|
"grad_norm": 0.838991105556488, |
|
"learning_rate": 0.0001980624673724643, |
|
"loss": 1.5665, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0049451506181438275, |
|
"grad_norm": 0.7630224823951721, |
|
"learning_rate": 0.0001979975629491308, |
|
"loss": 1.3839, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.005014800626850079, |
|
"grad_norm": 0.7271626591682434, |
|
"learning_rate": 0.00019793160025004475, |
|
"loss": 1.1867, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.00508445063555633, |
|
"grad_norm": 0.6928589344024658, |
|
"learning_rate": 0.00019786457998753737, |
|
"loss": 1.6149, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.005154100644262581, |
|
"grad_norm": 0.8479191660881042, |
|
"learning_rate": 0.00019779650288536058, |
|
"loss": 1.2566, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.005223750652968832, |
|
"grad_norm": 0.7954538464546204, |
|
"learning_rate": 0.000197727369678679, |
|
"loss": 1.1289, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.005293400661675083, |
|
"grad_norm": 0.8336564302444458, |
|
"learning_rate": 0.00019765718111406218, |
|
"loss": 1.227, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.005363050670381334, |
|
"grad_norm": 0.7440236806869507, |
|
"learning_rate": 0.00019758593794947648, |
|
"loss": 1.4401, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.005432700679087585, |
|
"grad_norm": 0.5975192785263062, |
|
"learning_rate": 0.00019751364095427692, |
|
"loss": 1.4655, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.005502350687793836, |
|
"grad_norm": 0.7023612260818481, |
|
"learning_rate": 0.0001974402909091988, |
|
"loss": 1.5098, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.005572000696500087, |
|
"grad_norm": 0.6060627698898315, |
|
"learning_rate": 0.00019736588860634925, |
|
"loss": 1.4346, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.005641650705206338, |
|
"grad_norm": 0.6663565039634705, |
|
"learning_rate": 0.00019729043484919883, |
|
"loss": 1.1718, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.0057113007139125895, |
|
"grad_norm": 0.7931796908378601, |
|
"learning_rate": 0.00019721393045257277, |
|
"loss": 1.2598, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.005780950722618841, |
|
"grad_norm": 0.8470779061317444, |
|
"learning_rate": 0.000197136376242642, |
|
"loss": 1.0741, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.005850600731325092, |
|
"grad_norm": 0.6171009540557861, |
|
"learning_rate": 0.00019705777305691456, |
|
"loss": 1.4427, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.005920250740031343, |
|
"grad_norm": 0.8087684512138367, |
|
"learning_rate": 0.00019697812174422632, |
|
"loss": 1.4372, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.005989900748737594, |
|
"grad_norm": 0.6931564211845398, |
|
"learning_rate": 0.00019689742316473182, |
|
"loss": 1.1907, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.006059550757443845, |
|
"grad_norm": 0.6536969542503357, |
|
"learning_rate": 0.00019681567818989506, |
|
"loss": 1.3734, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.006129200766150096, |
|
"grad_norm": 0.6640751361846924, |
|
"learning_rate": 0.00019673288770248013, |
|
"loss": 1.4367, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.006198850774856347, |
|
"grad_norm": 0.564035177230835, |
|
"learning_rate": 0.00019664905259654156, |
|
"loss": 1.4644, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.006268500783562598, |
|
"grad_norm": 0.8123689889907837, |
|
"learning_rate": 0.0001965641737774147, |
|
"loss": 1.5373, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.006338150792268849, |
|
"grad_norm": 0.7990655899047852, |
|
"learning_rate": 0.00019647825216170597, |
|
"loss": 1.4824, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.0064078008009751004, |
|
"grad_norm": 0.7879489660263062, |
|
"learning_rate": 0.00019639128867728298, |
|
"loss": 1.3882, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.0064774508096813515, |
|
"grad_norm": 0.7157430648803711, |
|
"learning_rate": 0.00019630328426326448, |
|
"loss": 1.5377, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.006547100818387603, |
|
"grad_norm": 0.7268451452255249, |
|
"learning_rate": 0.00019621423987001014, |
|
"loss": 1.2801, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.006616750827093854, |
|
"grad_norm": 0.8534408807754517, |
|
"learning_rate": 0.00019612415645911047, |
|
"loss": 1.2232, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.006686400835800105, |
|
"grad_norm": 0.7566258907318115, |
|
"learning_rate": 0.00019603303500337628, |
|
"loss": 1.0665, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.006756050844506356, |
|
"grad_norm": 0.764929473400116, |
|
"learning_rate": 0.00019594087648682824, |
|
"loss": 1.0974, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.006825700853212607, |
|
"grad_norm": 0.8856674432754517, |
|
"learning_rate": 0.00019584768190468625, |
|
"loss": 1.2374, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.006895350861918858, |
|
"grad_norm": 0.8611932396888733, |
|
"learning_rate": 0.0001957534522633586, |
|
"loss": 1.5207, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.006965000870625109, |
|
"grad_norm": 0.7111679315567017, |
|
"learning_rate": 0.00019565818858043136, |
|
"loss": 1.3399, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.006965000870625109, |
|
"eval_loss": 1.2061141729354858, |
|
"eval_runtime": 699.7303, |
|
"eval_samples_per_second": 7.146, |
|
"eval_steps_per_second": 1.786, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.00703465087933136, |
|
"grad_norm": 0.7039173245429993, |
|
"learning_rate": 0.00019556189188465702, |
|
"loss": 1.4391, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.007104300888037611, |
|
"grad_norm": 0.8350788354873657, |
|
"learning_rate": 0.00019546456321594376, |
|
"loss": 1.1431, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.0071739508967438624, |
|
"grad_norm": 0.6535744667053223, |
|
"learning_rate": 0.0001953662036253438, |
|
"loss": 1.296, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.0072436009054501135, |
|
"grad_norm": 0.7496301531791687, |
|
"learning_rate": 0.00019526681417504258, |
|
"loss": 1.311, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.007313250914156365, |
|
"grad_norm": 0.7061691880226135, |
|
"learning_rate": 0.0001951663959383468, |
|
"loss": 1.3601, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.007382900922862616, |
|
"grad_norm": 0.8221380114555359, |
|
"learning_rate": 0.00019506494999967298, |
|
"loss": 1.3149, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.007452550931568867, |
|
"grad_norm": 0.9544386267662048, |
|
"learning_rate": 0.000194962477454536, |
|
"loss": 1.2967, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.007522200940275118, |
|
"grad_norm": 0.8127594590187073, |
|
"learning_rate": 0.00019485897940953688, |
|
"loss": 1.4015, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.007591850948981369, |
|
"grad_norm": 0.7376645803451538, |
|
"learning_rate": 0.0001947544569823511, |
|
"loss": 1.4958, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.00766150095768762, |
|
"grad_norm": 0.6602767705917358, |
|
"learning_rate": 0.00019464891130171647, |
|
"loss": 1.3593, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.007731150966393871, |
|
"grad_norm": 0.9318028092384338, |
|
"learning_rate": 0.0001945423435074208, |
|
"loss": 1.0125, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.007800800975100122, |
|
"grad_norm": 0.7048940062522888, |
|
"learning_rate": 0.00019443475475028983, |
|
"loss": 1.4342, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.007870450983806372, |
|
"grad_norm": 0.9778817892074585, |
|
"learning_rate": 0.00019432614619217459, |
|
"loss": 1.0368, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.007940100992512624, |
|
"grad_norm": 0.808047890663147, |
|
"learning_rate": 0.000194216519005939, |
|
"loss": 1.105, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.008009751001218875, |
|
"grad_norm": 0.7996501326560974, |
|
"learning_rate": 0.0001941058743754471, |
|
"loss": 1.1383, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.008079401009925127, |
|
"grad_norm": 1.0752230882644653, |
|
"learning_rate": 0.00019399421349555035, |
|
"loss": 1.3508, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.008149051018631377, |
|
"grad_norm": 0.7151166200637817, |
|
"learning_rate": 0.00019388153757207471, |
|
"loss": 1.4086, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.008218701027337629, |
|
"grad_norm": 0.7622511386871338, |
|
"learning_rate": 0.00019376784782180746, |
|
"loss": 1.1942, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.008288351036043879, |
|
"grad_norm": 0.6896407008171082, |
|
"learning_rate": 0.0001936531454724844, |
|
"loss": 1.2571, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.008358001044750131, |
|
"grad_norm": 0.7991106510162354, |
|
"learning_rate": 0.00019353743176277622, |
|
"loss": 1.2531, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.008427651053456381, |
|
"grad_norm": 0.8540248870849609, |
|
"learning_rate": 0.00019342070794227536, |
|
"loss": 1.223, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.008497301062162633, |
|
"grad_norm": 0.8329891562461853, |
|
"learning_rate": 0.00019330297527148246, |
|
"loss": 0.9099, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.008566951070868883, |
|
"grad_norm": 0.7838830351829529, |
|
"learning_rate": 0.00019318423502179272, |
|
"loss": 1.3098, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.008636601079575135, |
|
"grad_norm": 0.7665576338768005, |
|
"learning_rate": 0.00019306448847548216, |
|
"loss": 1.3633, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.008706251088281386, |
|
"grad_norm": 0.7157841324806213, |
|
"learning_rate": 0.00019294373692569383, |
|
"loss": 0.9222, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.008775901096987638, |
|
"grad_norm": 0.944957971572876, |
|
"learning_rate": 0.0001928219816764238, |
|
"loss": 1.0901, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.008845551105693888, |
|
"grad_norm": 0.636736273765564, |
|
"learning_rate": 0.0001926992240425071, |
|
"loss": 1.3484, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.00891520111440014, |
|
"grad_norm": 0.6209918260574341, |
|
"learning_rate": 0.0001925754653496035, |
|
"loss": 1.3551, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.00898485112310639, |
|
"grad_norm": 0.7056594491004944, |
|
"learning_rate": 0.00019245070693418322, |
|
"loss": 1.4229, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.009054501131812642, |
|
"grad_norm": 0.7279839515686035, |
|
"learning_rate": 0.00019232495014351246, |
|
"loss": 1.0699, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.009124151140518892, |
|
"grad_norm": 0.6324151754379272, |
|
"learning_rate": 0.00019219819633563891, |
|
"loss": 1.3833, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.009193801149225144, |
|
"grad_norm": 0.7449592351913452, |
|
"learning_rate": 0.00019207044687937703, |
|
"loss": 1.2067, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.009263451157931394, |
|
"grad_norm": 0.939274787902832, |
|
"learning_rate": 0.0001919417031542933, |
|
"loss": 1.3229, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.009333101166637646, |
|
"grad_norm": 0.8192336559295654, |
|
"learning_rate": 0.00019181196655069127, |
|
"loss": 1.1575, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.009402751175343897, |
|
"grad_norm": 0.7507984638214111, |
|
"learning_rate": 0.00019168123846959666, |
|
"loss": 1.0461, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.009472401184050148, |
|
"grad_norm": 0.6593666672706604, |
|
"learning_rate": 0.00019154952032274206, |
|
"loss": 1.3806, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.009542051192756399, |
|
"grad_norm": 0.6475424766540527, |
|
"learning_rate": 0.00019141681353255184, |
|
"loss": 0.9218, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.00961170120146265, |
|
"grad_norm": 0.7746126651763916, |
|
"learning_rate": 0.00019128311953212678, |
|
"loss": 0.8967, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.009681351210168901, |
|
"grad_norm": 0.7104780673980713, |
|
"learning_rate": 0.00019114843976522842, |
|
"loss": 1.1855, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.009751001218875153, |
|
"grad_norm": 0.597457230091095, |
|
"learning_rate": 0.00019101277568626374, |
|
"loss": 1.0809, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.009820651227581403, |
|
"grad_norm": 0.8071316480636597, |
|
"learning_rate": 0.00019087612876026908, |
|
"loss": 1.0129, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.009890301236287655, |
|
"grad_norm": 0.8741605877876282, |
|
"learning_rate": 0.00019073850046289484, |
|
"loss": 0.8784, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.009959951244993905, |
|
"grad_norm": 0.7503401637077332, |
|
"learning_rate": 0.00019059989228038902, |
|
"loss": 1.1498, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.010029601253700157, |
|
"grad_norm": 0.7068141102790833, |
|
"learning_rate": 0.0001904603057095815, |
|
"loss": 1.2644, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.010099251262406407, |
|
"grad_norm": 0.7954654097557068, |
|
"learning_rate": 0.0001903197422578678, |
|
"loss": 1.1108, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.01016890127111266, |
|
"grad_norm": 0.7548302412033081, |
|
"learning_rate": 0.0001901782034431927, |
|
"loss": 0.9177, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.01023855127981891, |
|
"grad_norm": 0.7617766261100769, |
|
"learning_rate": 0.00019003569079403395, |
|
"loss": 1.256, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.010308201288525162, |
|
"grad_norm": 0.7205716967582703, |
|
"learning_rate": 0.00018989220584938573, |
|
"loss": 1.3767, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.010377851297231412, |
|
"grad_norm": 0.6221201419830322, |
|
"learning_rate": 0.00018974775015874213, |
|
"loss": 1.3329, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.010447501305937664, |
|
"grad_norm": 0.565428614616394, |
|
"learning_rate": 0.00018960232528208022, |
|
"loss": 1.1155, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.010517151314643914, |
|
"grad_norm": 0.7672913074493408, |
|
"learning_rate": 0.00018945593278984333, |
|
"loss": 0.9654, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.010586801323350166, |
|
"grad_norm": 0.737074077129364, |
|
"learning_rate": 0.00018930857426292412, |
|
"loss": 1.0644, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.010656451332056416, |
|
"grad_norm": 0.6545393466949463, |
|
"learning_rate": 0.0001891602512926474, |
|
"loss": 1.2058, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.010726101340762668, |
|
"grad_norm": 0.8019453287124634, |
|
"learning_rate": 0.00018901096548075305, |
|
"loss": 1.3134, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.010795751349468918, |
|
"grad_norm": 0.8307440876960754, |
|
"learning_rate": 0.00018886071843937866, |
|
"loss": 1.152, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.01086540135817517, |
|
"grad_norm": 0.8050329089164734, |
|
"learning_rate": 0.00018870951179104212, |
|
"loss": 0.9473, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.01093505136688142, |
|
"grad_norm": 0.7510560154914856, |
|
"learning_rate": 0.00018855734716862417, |
|
"loss": 1.2265, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.011004701375587672, |
|
"grad_norm": 0.7653977274894714, |
|
"learning_rate": 0.00018840422621535066, |
|
"loss": 1.3356, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.011074351384293923, |
|
"grad_norm": 0.7661434412002563, |
|
"learning_rate": 0.00018825015058477481, |
|
"loss": 0.9601, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.011144001393000175, |
|
"grad_norm": 0.7829368114471436, |
|
"learning_rate": 0.00018809512194075957, |
|
"loss": 1.0675, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.011213651401706425, |
|
"grad_norm": 0.6673858761787415, |
|
"learning_rate": 0.00018793914195745933, |
|
"loss": 1.4312, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.011283301410412677, |
|
"grad_norm": 0.8060672879219055, |
|
"learning_rate": 0.00018778221231930203, |
|
"loss": 1.0241, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.011352951419118927, |
|
"grad_norm": 1.0137969255447388, |
|
"learning_rate": 0.00018762433472097097, |
|
"loss": 1.1867, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.011422601427825179, |
|
"grad_norm": 0.9313655495643616, |
|
"learning_rate": 0.0001874655108673864, |
|
"loss": 1.3046, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.01149225143653143, |
|
"grad_norm": 0.9493317008018494, |
|
"learning_rate": 0.00018730574247368732, |
|
"loss": 1.1123, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.011561901445237681, |
|
"grad_norm": 0.8069944977760315, |
|
"learning_rate": 0.0001871450312652126, |
|
"loss": 1.0592, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.011631551453943931, |
|
"grad_norm": 0.6559287905693054, |
|
"learning_rate": 0.00018698337897748283, |
|
"loss": 1.2388, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.011701201462650183, |
|
"grad_norm": 0.650059700012207, |
|
"learning_rate": 0.0001868207873561811, |
|
"loss": 0.9891, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.011770851471356434, |
|
"grad_norm": 0.6247674822807312, |
|
"learning_rate": 0.00018665725815713443, |
|
"loss": 1.2925, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.011840501480062686, |
|
"grad_norm": 0.7453685402870178, |
|
"learning_rate": 0.00018649279314629483, |
|
"loss": 1.06, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.011910151488768936, |
|
"grad_norm": 0.826835572719574, |
|
"learning_rate": 0.00018632739409972003, |
|
"loss": 0.9637, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.011979801497475188, |
|
"grad_norm": 0.7538785338401794, |
|
"learning_rate": 0.00018616106280355444, |
|
"loss": 1.0126, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.012049451506181438, |
|
"grad_norm": 0.8348299264907837, |
|
"learning_rate": 0.00018599380105400982, |
|
"loss": 0.988, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.01211910151488769, |
|
"grad_norm": 0.8298357725143433, |
|
"learning_rate": 0.00018582561065734604, |
|
"loss": 1.0608, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.01218875152359394, |
|
"grad_norm": 0.6961440443992615, |
|
"learning_rate": 0.00018565649342985118, |
|
"loss": 1.1564, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.012258401532300192, |
|
"grad_norm": 0.664256751537323, |
|
"learning_rate": 0.00018548645119782238, |
|
"loss": 1.1865, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.012328051541006442, |
|
"grad_norm": 0.7857444882392883, |
|
"learning_rate": 0.0001853154857975458, |
|
"loss": 0.9903, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.012397701549712694, |
|
"grad_norm": 0.758602499961853, |
|
"learning_rate": 0.0001851435990752769, |
|
"loss": 1.3456, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.012467351558418945, |
|
"grad_norm": 0.768666684627533, |
|
"learning_rate": 0.0001849707928872206, |
|
"loss": 0.9773, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.012537001567125197, |
|
"grad_norm": 0.8674852848052979, |
|
"learning_rate": 0.00018479706909951094, |
|
"loss": 1.0203, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.012606651575831447, |
|
"grad_norm": 0.6384921669960022, |
|
"learning_rate": 0.0001846224295881913, |
|
"loss": 1.1004, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.012676301584537699, |
|
"grad_norm": 0.6848528981208801, |
|
"learning_rate": 0.00018444687623919386, |
|
"loss": 1.0699, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.012745951593243949, |
|
"grad_norm": 0.6943731307983398, |
|
"learning_rate": 0.00018427041094831937, |
|
"loss": 1.1812, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.012815601601950201, |
|
"grad_norm": 1.0284762382507324, |
|
"learning_rate": 0.00018409303562121662, |
|
"loss": 1.1307, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.012885251610656451, |
|
"grad_norm": 0.7977420091629028, |
|
"learning_rate": 0.00018391475217336193, |
|
"loss": 1.0772, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.012954901619362703, |
|
"grad_norm": 0.678799569606781, |
|
"learning_rate": 0.0001837355625300383, |
|
"loss": 1.1816, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.013024551628068953, |
|
"grad_norm": 0.7933035492897034, |
|
"learning_rate": 0.00018355546862631493, |
|
"loss": 1.2014, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.013094201636775205, |
|
"grad_norm": 0.7373278737068176, |
|
"learning_rate": 0.00018337447240702594, |
|
"loss": 0.9163, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.013163851645481455, |
|
"grad_norm": 0.7306934595108032, |
|
"learning_rate": 0.00018319257582674964, |
|
"loss": 0.8467, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.013233501654187707, |
|
"grad_norm": 0.6722437739372253, |
|
"learning_rate": 0.00018300978084978735, |
|
"loss": 1.1145, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.013303151662893958, |
|
"grad_norm": 0.8375574350357056, |
|
"learning_rate": 0.00018282608945014217, |
|
"loss": 0.8763, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.01337280167160021, |
|
"grad_norm": 0.6876571774482727, |
|
"learning_rate": 0.0001826415036114976, |
|
"loss": 1.3694, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.01344245168030646, |
|
"grad_norm": 0.5936222076416016, |
|
"learning_rate": 0.0001824560253271963, |
|
"loss": 1.4071, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.013512101689012712, |
|
"grad_norm": 0.6679614782333374, |
|
"learning_rate": 0.00018226965660021836, |
|
"loss": 0.8098, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.013581751697718962, |
|
"grad_norm": 0.8226193189620972, |
|
"learning_rate": 0.00018208239944315978, |
|
"loss": 0.6594, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.013651401706425214, |
|
"grad_norm": 0.8376763463020325, |
|
"learning_rate": 0.0001818942558782108, |
|
"loss": 1.0417, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.013721051715131464, |
|
"grad_norm": 0.773747444152832, |
|
"learning_rate": 0.00018170522793713387, |
|
"loss": 0.7496, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.013790701723837716, |
|
"grad_norm": 0.8213014006614685, |
|
"learning_rate": 0.00018151531766124186, |
|
"loss": 0.842, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.013860351732543966, |
|
"grad_norm": 0.6993326544761658, |
|
"learning_rate": 0.000181324527101376, |
|
"loss": 1.1651, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.013930001741250218, |
|
"grad_norm": 0.550957977771759, |
|
"learning_rate": 0.00018113285831788365, |
|
"loss": 1.2762, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.013930001741250218, |
|
"eval_loss": 0.993212103843689, |
|
"eval_runtime": 699.7494, |
|
"eval_samples_per_second": 7.145, |
|
"eval_steps_per_second": 1.786, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.013999651749956469, |
|
"grad_norm": 0.6803005933761597, |
|
"learning_rate": 0.00018094031338059617, |
|
"loss": 1.2403, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.01406930175866272, |
|
"grad_norm": 0.6137078404426575, |
|
"learning_rate": 0.00018074689436880644, |
|
"loss": 0.9294, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.01413895176736897, |
|
"grad_norm": 0.6511885523796082, |
|
"learning_rate": 0.00018055260337124652, |
|
"loss": 1.2509, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.014208601776075223, |
|
"grad_norm": 0.6647017598152161, |
|
"learning_rate": 0.0001803574424860651, |
|
"loss": 1.1067, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.014278251784781473, |
|
"grad_norm": 0.7390187382698059, |
|
"learning_rate": 0.0001801614138208046, |
|
"loss": 1.0816, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.014347901793487725, |
|
"grad_norm": 0.7152518033981323, |
|
"learning_rate": 0.0001799645194923788, |
|
"loss": 0.9844, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.014417551802193975, |
|
"grad_norm": 0.8229650855064392, |
|
"learning_rate": 0.00017976676162704966, |
|
"loss": 1.1316, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.014487201810900227, |
|
"grad_norm": 0.7085878252983093, |
|
"learning_rate": 0.0001795681423604045, |
|
"loss": 0.9282, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.014556851819606477, |
|
"grad_norm": 0.8368147015571594, |
|
"learning_rate": 0.00017936866383733298, |
|
"loss": 0.8718, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.01462650182831273, |
|
"grad_norm": 0.7303407192230225, |
|
"learning_rate": 0.00017916832821200375, |
|
"loss": 0.8913, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.01469615183701898, |
|
"grad_norm": 0.6697463989257812, |
|
"learning_rate": 0.00017896713764784143, |
|
"loss": 1.0783, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.014765801845725231, |
|
"grad_norm": 0.5616613030433655, |
|
"learning_rate": 0.000178765094317503, |
|
"loss": 1.2869, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.014835451854431482, |
|
"grad_norm": 0.5711467266082764, |
|
"learning_rate": 0.00017856220040285458, |
|
"loss": 1.0144, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.014905101863137734, |
|
"grad_norm": 0.7759966850280762, |
|
"learning_rate": 0.00017835845809494768, |
|
"loss": 1.117, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.014974751871843984, |
|
"grad_norm": 0.5759698152542114, |
|
"learning_rate": 0.00017815386959399565, |
|
"loss": 1.1662, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.015044401880550236, |
|
"grad_norm": 0.6275411248207092, |
|
"learning_rate": 0.0001779484371093498, |
|
"loss": 1.2339, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.015114051889256486, |
|
"grad_norm": 0.803784191608429, |
|
"learning_rate": 0.00017774216285947576, |
|
"loss": 0.8127, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.015183701897962738, |
|
"grad_norm": 0.7878329157829285, |
|
"learning_rate": 0.00017753504907192923, |
|
"loss": 0.7944, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.015253351906668988, |
|
"grad_norm": 0.753667950630188, |
|
"learning_rate": 0.00017732709798333221, |
|
"loss": 1.2632, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.01532300191537524, |
|
"grad_norm": 0.6178960204124451, |
|
"learning_rate": 0.0001771183118393486, |
|
"loss": 0.9552, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.01539265192408149, |
|
"grad_norm": 0.6457561254501343, |
|
"learning_rate": 0.00017690869289466017, |
|
"loss": 0.9573, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.015462301932787742, |
|
"grad_norm": 0.7319156527519226, |
|
"learning_rate": 0.00017669824341294202, |
|
"loss": 0.8473, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.015531951941493993, |
|
"grad_norm": 0.6461290717124939, |
|
"learning_rate": 0.00017648696566683824, |
|
"loss": 1.0797, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.015601601950200245, |
|
"grad_norm": 0.7656479477882385, |
|
"learning_rate": 0.00017627486193793742, |
|
"loss": 0.9595, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.015671251958906496, |
|
"grad_norm": 0.7314528226852417, |
|
"learning_rate": 0.00017606193451674785, |
|
"loss": 1.1522, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.015740901967612745, |
|
"grad_norm": 0.5844183564186096, |
|
"learning_rate": 0.00017584818570267284, |
|
"loss": 0.6874, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.015810551976318997, |
|
"grad_norm": 0.756650447845459, |
|
"learning_rate": 0.00017563361780398613, |
|
"loss": 1.1152, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.01588020198502525, |
|
"grad_norm": 0.7920497059822083, |
|
"learning_rate": 0.00017541823313780647, |
|
"loss": 0.7904, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.0159498519937315, |
|
"grad_norm": 0.7280418872833252, |
|
"learning_rate": 0.00017520203403007312, |
|
"loss": 0.9489, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.01601950200243775, |
|
"grad_norm": 0.6644127368927002, |
|
"learning_rate": 0.0001749850228155203, |
|
"loss": 1.0123, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.016089152011144, |
|
"grad_norm": 0.6218852996826172, |
|
"learning_rate": 0.0001747672018376524, |
|
"loss": 1.1297, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.016158802019850253, |
|
"grad_norm": 0.7259179949760437, |
|
"learning_rate": 0.00017454857344871824, |
|
"loss": 1.2077, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.016228452028556505, |
|
"grad_norm": 0.6896301507949829, |
|
"learning_rate": 0.00017432914000968592, |
|
"loss": 1.4735, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.016298102037262754, |
|
"grad_norm": 0.6918095350265503, |
|
"learning_rate": 0.00017410890389021736, |
|
"loss": 1.1311, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.016367752045969006, |
|
"grad_norm": 0.7965865731239319, |
|
"learning_rate": 0.00017388786746864256, |
|
"loss": 1.2436, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.016437402054675258, |
|
"grad_norm": 0.7081993222236633, |
|
"learning_rate": 0.000173666033131934, |
|
"loss": 1.0674, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.01650705206338151, |
|
"grad_norm": 0.6959885358810425, |
|
"learning_rate": 0.00017344340327568082, |
|
"loss": 1.1808, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.016576702072087758, |
|
"grad_norm": 0.6657646298408508, |
|
"learning_rate": 0.000173219980304063, |
|
"loss": 0.9132, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.01664635208079401, |
|
"grad_norm": 0.5461063385009766, |
|
"learning_rate": 0.0001729957666298254, |
|
"loss": 1.2554, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.016716002089500262, |
|
"grad_norm": 0.5713803768157959, |
|
"learning_rate": 0.0001727707646742516, |
|
"loss": 1.236, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.016785652098206514, |
|
"grad_norm": 0.6570878624916077, |
|
"learning_rate": 0.00017254497686713797, |
|
"loss": 1.1216, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.016855302106912762, |
|
"grad_norm": 0.7191223502159119, |
|
"learning_rate": 0.0001723184056467671, |
|
"loss": 1.2225, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.016924952115619014, |
|
"grad_norm": 0.6774346232414246, |
|
"learning_rate": 0.0001720910534598818, |
|
"loss": 1.4341, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.016994602124325266, |
|
"grad_norm": 0.7842647433280945, |
|
"learning_rate": 0.0001718629227616585, |
|
"loss": 1.2086, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.01706425213303152, |
|
"grad_norm": 0.6781778931617737, |
|
"learning_rate": 0.00017163401601568077, |
|
"loss": 0.9324, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.017133902141737767, |
|
"grad_norm": 0.7419726252555847, |
|
"learning_rate": 0.00017140433569391275, |
|
"loss": 0.8826, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.01720355215044402, |
|
"grad_norm": 0.6957391500473022, |
|
"learning_rate": 0.00017117388427667236, |
|
"loss": 0.5565, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.01727320215915027, |
|
"grad_norm": 0.6904794573783875, |
|
"learning_rate": 0.0001709426642526046, |
|
"loss": 1.0979, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.017342852167856523, |
|
"grad_norm": 0.7743323445320129, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 0.6322, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.01741250217656277, |
|
"grad_norm": 0.6866056323051453, |
|
"learning_rate": 0.0001704779283800412, |
|
"loss": 0.9873, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.017482152185269023, |
|
"grad_norm": 0.5904546976089478, |
|
"learning_rate": 0.00017024441755022856, |
|
"loss": 1.0898, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.017551802193975275, |
|
"grad_norm": 0.6349841952323914, |
|
"learning_rate": 0.00017001014815090038, |
|
"loss": 1.0947, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.017621452202681527, |
|
"grad_norm": 0.6754809617996216, |
|
"learning_rate": 0.0001697751227119322, |
|
"loss": 0.9881, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.017691102211387776, |
|
"grad_norm": 0.6565687656402588, |
|
"learning_rate": 0.00016953934377136377, |
|
"loss": 1.0908, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.017760752220094028, |
|
"grad_norm": 0.5469555854797363, |
|
"learning_rate": 0.0001693028138753721, |
|
"loss": 0.8385, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.01783040222880028, |
|
"grad_norm": 0.6178275942802429, |
|
"learning_rate": 0.0001690655355782437, |
|
"loss": 0.9317, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.017900052237506528, |
|
"grad_norm": 0.8108107447624207, |
|
"learning_rate": 0.0001688275114423471, |
|
"loss": 0.8016, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.01796970224621278, |
|
"grad_norm": 0.6483268141746521, |
|
"learning_rate": 0.00016858874403810509, |
|
"loss": 1.0697, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.018039352254919032, |
|
"grad_norm": 0.7654364109039307, |
|
"learning_rate": 0.00016834923594396698, |
|
"loss": 1.1524, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.018109002263625284, |
|
"grad_norm": 0.6824004650115967, |
|
"learning_rate": 0.00016810898974638097, |
|
"loss": 1.31, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.018178652272331532, |
|
"grad_norm": 0.6116809248924255, |
|
"learning_rate": 0.00016786800803976585, |
|
"loss": 1.0788, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.018248302281037784, |
|
"grad_norm": 0.7678197026252747, |
|
"learning_rate": 0.00016762629342648318, |
|
"loss": 0.7855, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.018317952289744036, |
|
"grad_norm": 0.6764957904815674, |
|
"learning_rate": 0.00016738384851680937, |
|
"loss": 0.9709, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.018387602298450288, |
|
"grad_norm": 0.6751796007156372, |
|
"learning_rate": 0.0001671406759289071, |
|
"loss": 1.2517, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.018457252307156537, |
|
"grad_norm": 0.7578874230384827, |
|
"learning_rate": 0.00016689677828879738, |
|
"loss": 1.0033, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.01852690231586279, |
|
"grad_norm": 0.5653178095817566, |
|
"learning_rate": 0.0001666521582303309, |
|
"loss": 1.1913, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.01859655232456904, |
|
"grad_norm": 0.7313902974128723, |
|
"learning_rate": 0.00016640681839515993, |
|
"loss": 1.0418, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.018666202333275293, |
|
"grad_norm": 0.5821707248687744, |
|
"learning_rate": 0.0001661607614327095, |
|
"loss": 0.886, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.01873585234198154, |
|
"grad_norm": 0.6478776335716248, |
|
"learning_rate": 0.0001659139900001489, |
|
"loss": 1.2479, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.018805502350687793, |
|
"grad_norm": 0.6471793055534363, |
|
"learning_rate": 0.00016566650676236305, |
|
"loss": 0.9999, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.018875152359394045, |
|
"grad_norm": 0.6918301582336426, |
|
"learning_rate": 0.0001654183143919236, |
|
"loss": 0.8315, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.018944802368100297, |
|
"grad_norm": 0.62820965051651, |
|
"learning_rate": 0.0001651694155690601, |
|
"loss": 1.0534, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.019014452376806545, |
|
"grad_norm": 0.5358027219772339, |
|
"learning_rate": 0.00016491981298163118, |
|
"loss": 1.1642, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.019084102385512797, |
|
"grad_norm": 0.6293304562568665, |
|
"learning_rate": 0.0001646695093250953, |
|
"loss": 0.8443, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.01915375239421905, |
|
"grad_norm": 0.6544604301452637, |
|
"learning_rate": 0.00016441850730248184, |
|
"loss": 0.7902, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.0192234024029253, |
|
"grad_norm": 0.723544716835022, |
|
"learning_rate": 0.0001641668096243619, |
|
"loss": 0.7972, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.01929305241163155, |
|
"grad_norm": 0.6971920728683472, |
|
"learning_rate": 0.00016391441900881875, |
|
"loss": 1.0068, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.019362702420337802, |
|
"grad_norm": 0.6442938446998596, |
|
"learning_rate": 0.00016366133818141893, |
|
"loss": 0.9171, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.019432352429044054, |
|
"grad_norm": 0.5508981347084045, |
|
"learning_rate": 0.00016340756987518243, |
|
"loss": 1.2581, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.019502002437750306, |
|
"grad_norm": 0.6451659798622131, |
|
"learning_rate": 0.0001631531168305534, |
|
"loss": 0.692, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.019571652446456554, |
|
"grad_norm": 0.719409704208374, |
|
"learning_rate": 0.00016289798179537046, |
|
"loss": 1.0723, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.019641302455162806, |
|
"grad_norm": 0.6584640145301819, |
|
"learning_rate": 0.00016264216752483697, |
|
"loss": 1.0083, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.019710952463869058, |
|
"grad_norm": 0.6936922669410706, |
|
"learning_rate": 0.00016238567678149147, |
|
"loss": 1.1018, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.01978060247257531, |
|
"grad_norm": 0.8725325465202332, |
|
"learning_rate": 0.00016212851233517772, |
|
"loss": 1.0276, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.01985025248128156, |
|
"grad_norm": 0.6702690720558167, |
|
"learning_rate": 0.0001618706769630147, |
|
"loss": 1.0521, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.01991990248998781, |
|
"grad_norm": 0.604901909828186, |
|
"learning_rate": 0.0001616121734493668, |
|
"loss": 0.8782, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.019989552498694062, |
|
"grad_norm": 0.5754973292350769, |
|
"learning_rate": 0.00016135300458581365, |
|
"loss": 1.1281, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.020059202507400314, |
|
"grad_norm": 0.6314234137535095, |
|
"learning_rate": 0.00016109317317111995, |
|
"loss": 0.8964, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.020128852516106563, |
|
"grad_norm": 0.5530171990394592, |
|
"learning_rate": 0.0001608326820112054, |
|
"loss": 1.278, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.020198502524812815, |
|
"grad_norm": 0.7363768219947815, |
|
"learning_rate": 0.00016057153391911422, |
|
"loss": 1.0563, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.020268152533519067, |
|
"grad_norm": 0.634734570980072, |
|
"learning_rate": 0.00016030973171498477, |
|
"loss": 0.9834, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.02033780254222532, |
|
"grad_norm": 0.5349484086036682, |
|
"learning_rate": 0.00016004727822601934, |
|
"loss": 1.1927, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.020407452550931567, |
|
"grad_norm": 0.6138120889663696, |
|
"learning_rate": 0.00015978417628645326, |
|
"loss": 0.8267, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.02047710255963782, |
|
"grad_norm": 0.5792511701583862, |
|
"learning_rate": 0.0001595204287375246, |
|
"loss": 1.317, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.02054675256834407, |
|
"grad_norm": 0.648102879524231, |
|
"learning_rate": 0.00015925603842744334, |
|
"loss": 0.7643, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.020616402577050323, |
|
"grad_norm": 0.6310989856719971, |
|
"learning_rate": 0.00015899100821136064, |
|
"loss": 0.8994, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.02068605258575657, |
|
"grad_norm": 0.6773801445960999, |
|
"learning_rate": 0.00015872534095133793, |
|
"loss": 0.961, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.020755702594462824, |
|
"grad_norm": 0.6812910437583923, |
|
"learning_rate": 0.00015845903951631623, |
|
"loss": 0.8269, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.020825352603169076, |
|
"grad_norm": 0.7168356776237488, |
|
"learning_rate": 0.00015819210678208484, |
|
"loss": 1.2156, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.020895002611875328, |
|
"grad_norm": 0.6270495653152466, |
|
"learning_rate": 0.0001579245456312506, |
|
"loss": 1.029, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.020895002611875328, |
|
"eval_loss": 0.9014175534248352, |
|
"eval_runtime": 700.0853, |
|
"eval_samples_per_second": 7.142, |
|
"eval_steps_per_second": 1.785, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.020964652620581576, |
|
"grad_norm": 0.6509414315223694, |
|
"learning_rate": 0.00015765635895320656, |
|
"loss": 1.1077, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.021034302629287828, |
|
"grad_norm": 0.7492027282714844, |
|
"learning_rate": 0.00015738754964410084, |
|
"loss": 0.5395, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.02110395263799408, |
|
"grad_norm": 0.601356029510498, |
|
"learning_rate": 0.00015711812060680534, |
|
"loss": 1.0082, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.021173602646700332, |
|
"grad_norm": 0.7457994818687439, |
|
"learning_rate": 0.00015684807475088453, |
|
"loss": 1.318, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.02124325265540658, |
|
"grad_norm": 0.7976076602935791, |
|
"learning_rate": 0.00015657741499256367, |
|
"loss": 0.7, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.021312902664112832, |
|
"grad_norm": 0.7381129264831543, |
|
"learning_rate": 0.00015630614425469775, |
|
"loss": 0.9987, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.021382552672819084, |
|
"grad_norm": 0.8430412411689758, |
|
"learning_rate": 0.00015603426546673967, |
|
"loss": 1.0874, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.021452202681525336, |
|
"grad_norm": 0.6384485363960266, |
|
"learning_rate": 0.00015576178156470862, |
|
"loss": 1.2032, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.021521852690231585, |
|
"grad_norm": 0.788506031036377, |
|
"learning_rate": 0.0001554886954911585, |
|
"loss": 1.3688, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.021591502698937837, |
|
"grad_norm": 0.6341352462768555, |
|
"learning_rate": 0.00015521501019514597, |
|
"loss": 1.4594, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.02166115270764409, |
|
"grad_norm": 0.6707578897476196, |
|
"learning_rate": 0.00015494072863219874, |
|
"loss": 1.1494, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.02173080271635034, |
|
"grad_norm": 0.609851598739624, |
|
"learning_rate": 0.00015466585376428365, |
|
"loss": 0.9684, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.02180045272505659, |
|
"grad_norm": 0.7177265882492065, |
|
"learning_rate": 0.00015439038855977454, |
|
"loss": 0.8522, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.02187010273376284, |
|
"grad_norm": 0.6207813024520874, |
|
"learning_rate": 0.00015411433599342038, |
|
"loss": 0.4699, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.021939752742469093, |
|
"grad_norm": 0.6561682820320129, |
|
"learning_rate": 0.00015383769904631306, |
|
"loss": 0.7518, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.022009402751175345, |
|
"grad_norm": 0.7517587542533875, |
|
"learning_rate": 0.00015356048070585513, |
|
"loss": 1.2278, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.022079052759881593, |
|
"grad_norm": 0.6116645932197571, |
|
"learning_rate": 0.00015328268396572762, |
|
"loss": 0.9742, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.022148702768587845, |
|
"grad_norm": 0.5882527232170105, |
|
"learning_rate": 0.00015300431182585777, |
|
"loss": 0.8036, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.022218352777294097, |
|
"grad_norm": 0.5738014578819275, |
|
"learning_rate": 0.00015272536729238654, |
|
"loss": 0.7848, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.02228800278600035, |
|
"grad_norm": 0.7317819595336914, |
|
"learning_rate": 0.0001524458533776361, |
|
"loss": 1.0656, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.022357652794706598, |
|
"grad_norm": 0.6275020837783813, |
|
"learning_rate": 0.00015216577310007745, |
|
"loss": 0.9123, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.02242730280341285, |
|
"grad_norm": 0.8332412838935852, |
|
"learning_rate": 0.00015188512948429765, |
|
"loss": 1.1836, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.022496952812119102, |
|
"grad_norm": 0.6414222121238708, |
|
"learning_rate": 0.00015160392556096735, |
|
"loss": 0.8959, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.022566602820825354, |
|
"grad_norm": 0.6147682070732117, |
|
"learning_rate": 0.00015132216436680796, |
|
"loss": 0.937, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.022636252829531602, |
|
"grad_norm": 0.5949112176895142, |
|
"learning_rate": 0.00015103984894455878, |
|
"loss": 1.1365, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.022705902838237854, |
|
"grad_norm": 0.6494925022125244, |
|
"learning_rate": 0.00015075698234294423, |
|
"loss": 0.9603, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.022775552846944106, |
|
"grad_norm": 0.6222386956214905, |
|
"learning_rate": 0.00015047356761664098, |
|
"loss": 1.1083, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.022845202855650358, |
|
"grad_norm": 0.6448621153831482, |
|
"learning_rate": 0.00015018960782624486, |
|
"loss": 0.8984, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.022914852864356607, |
|
"grad_norm": 0.7695071697235107, |
|
"learning_rate": 0.00014990510603823782, |
|
"loss": 0.9996, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.02298450287306286, |
|
"grad_norm": 0.7322002649307251, |
|
"learning_rate": 0.00014962006532495488, |
|
"loss": 0.9976, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.02305415288176911, |
|
"grad_norm": 0.5676226615905762, |
|
"learning_rate": 0.00014933448876455096, |
|
"loss": 1.0891, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.023123802890475362, |
|
"grad_norm": 0.839449405670166, |
|
"learning_rate": 0.00014904837944096743, |
|
"loss": 0.6213, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.02319345289918161, |
|
"grad_norm": 0.6786718964576721, |
|
"learning_rate": 0.00014876174044389922, |
|
"loss": 1.0854, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.023263102907887863, |
|
"grad_norm": 0.7376294732093811, |
|
"learning_rate": 0.00014847457486876097, |
|
"loss": 0.9289, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.023332752916594115, |
|
"grad_norm": 0.71031653881073, |
|
"learning_rate": 0.00014818688581665396, |
|
"loss": 1.0325, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.023402402925300367, |
|
"grad_norm": 0.6212656497955322, |
|
"learning_rate": 0.00014789867639433248, |
|
"loss": 1.0627, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.023472052934006615, |
|
"grad_norm": 0.698070228099823, |
|
"learning_rate": 0.00014760994971417022, |
|
"loss": 1.1891, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.023541702942712867, |
|
"grad_norm": 0.7134040594100952, |
|
"learning_rate": 0.00014732070889412693, |
|
"loss": 1.0185, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.02361135295141912, |
|
"grad_norm": 0.5352413058280945, |
|
"learning_rate": 0.00014703095705771434, |
|
"loss": 0.3684, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.02368100296012537, |
|
"grad_norm": 0.6988404393196106, |
|
"learning_rate": 0.00014674069733396276, |
|
"loss": 0.947, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.02375065296883162, |
|
"grad_norm": 0.7194476127624512, |
|
"learning_rate": 0.00014644993285738717, |
|
"loss": 0.8271, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.02382030297753787, |
|
"grad_norm": 0.6885733604431152, |
|
"learning_rate": 0.00014615866676795334, |
|
"loss": 0.7825, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.023889952986244124, |
|
"grad_norm": 0.6990646123886108, |
|
"learning_rate": 0.00014586690221104397, |
|
"loss": 0.9145, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.023959602994950376, |
|
"grad_norm": 0.7719680070877075, |
|
"learning_rate": 0.00014557464233742477, |
|
"loss": 0.5737, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.024029253003656624, |
|
"grad_norm": 0.7187089323997498, |
|
"learning_rate": 0.00014528189030321029, |
|
"loss": 0.7873, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.024098903012362876, |
|
"grad_norm": 0.6850745677947998, |
|
"learning_rate": 0.00014498864926982996, |
|
"loss": 0.9, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.024168553021069128, |
|
"grad_norm": 0.8452913761138916, |
|
"learning_rate": 0.0001446949224039939, |
|
"loss": 0.9123, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.02423820302977538, |
|
"grad_norm": 0.6649196147918701, |
|
"learning_rate": 0.00014440071287765875, |
|
"loss": 0.8189, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.02430785303848163, |
|
"grad_norm": 0.7251694798469543, |
|
"learning_rate": 0.0001441060238679934, |
|
"loss": 1.0816, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.02437750304718788, |
|
"grad_norm": 0.6829720139503479, |
|
"learning_rate": 0.00014381085855734468, |
|
"loss": 0.9725, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.024447153055894132, |
|
"grad_norm": 0.7007995843887329, |
|
"learning_rate": 0.00014351522013320302, |
|
"loss": 1.047, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.024516803064600384, |
|
"grad_norm": 0.7575050592422485, |
|
"learning_rate": 0.0001432191117881679, |
|
"loss": 0.7961, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.024586453073306633, |
|
"grad_norm": 0.6370393633842468, |
|
"learning_rate": 0.0001429225367199136, |
|
"loss": 0.9137, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.024656103082012885, |
|
"grad_norm": 0.6170664429664612, |
|
"learning_rate": 0.0001426254981311545, |
|
"loss": 0.8138, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.024725753090719137, |
|
"grad_norm": 0.7749223709106445, |
|
"learning_rate": 0.00014232799922961052, |
|
"loss": 1.1226, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.02479540309942539, |
|
"grad_norm": 0.6036125421524048, |
|
"learning_rate": 0.00014203004322797252, |
|
"loss": 1.204, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.024865053108131637, |
|
"grad_norm": 0.6835645437240601, |
|
"learning_rate": 0.00014173163334386753, |
|
"loss": 0.8434, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.02493470311683789, |
|
"grad_norm": 0.6302729249000549, |
|
"learning_rate": 0.00014143277279982414, |
|
"loss": 0.6518, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.02500435312554414, |
|
"grad_norm": 0.5898759365081787, |
|
"learning_rate": 0.00014113346482323762, |
|
"loss": 0.6565, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.025074003134250393, |
|
"grad_norm": 0.6143885254859924, |
|
"learning_rate": 0.00014083371264633497, |
|
"loss": 1.2938, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.02514365314295664, |
|
"grad_norm": 0.5840321183204651, |
|
"learning_rate": 0.00014053351950614018, |
|
"loss": 0.7797, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.025213303151662893, |
|
"grad_norm": 0.6148191690444946, |
|
"learning_rate": 0.00014023288864443916, |
|
"loss": 0.7165, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.025282953160369145, |
|
"grad_norm": 0.6650532484054565, |
|
"learning_rate": 0.0001399318233077448, |
|
"loss": 1.0991, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.025352603169075397, |
|
"grad_norm": 0.5263816714286804, |
|
"learning_rate": 0.00013963032674726197, |
|
"loss": 0.5039, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.025422253177781646, |
|
"grad_norm": 0.8048628568649292, |
|
"learning_rate": 0.00013932840221885217, |
|
"loss": 1.19, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.025491903186487898, |
|
"grad_norm": 0.6668381094932556, |
|
"learning_rate": 0.0001390260529829986, |
|
"loss": 0.9708, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.02556155319519415, |
|
"grad_norm": 0.6639387607574463, |
|
"learning_rate": 0.00013872328230477086, |
|
"loss": 0.9414, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.025631203203900402, |
|
"grad_norm": 0.696017324924469, |
|
"learning_rate": 0.00013842009345378976, |
|
"loss": 0.9, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.02570085321260665, |
|
"grad_norm": 0.584456205368042, |
|
"learning_rate": 0.00013811648970419194, |
|
"loss": 1.0158, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.025770503221312902, |
|
"grad_norm": 0.7759786248207092, |
|
"learning_rate": 0.00013781247433459449, |
|
"loss": 0.9564, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.025840153230019154, |
|
"grad_norm": 0.7399227619171143, |
|
"learning_rate": 0.00013750805062805955, |
|
"loss": 0.887, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.025909803238725406, |
|
"grad_norm": 0.6674394607543945, |
|
"learning_rate": 0.00013720322187205897, |
|
"loss": 1.1418, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.025979453247431655, |
|
"grad_norm": 0.591126561164856, |
|
"learning_rate": 0.00013689799135843875, |
|
"loss": 1.1361, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.026049103256137907, |
|
"grad_norm": 0.6162034273147583, |
|
"learning_rate": 0.0001365923623833834, |
|
"loss": 0.9725, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.02611875326484416, |
|
"grad_norm": 0.6250083446502686, |
|
"learning_rate": 0.0001362863382473804, |
|
"loss": 0.8571, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.02618840327355041, |
|
"grad_norm": 0.5744304060935974, |
|
"learning_rate": 0.00013597992225518465, |
|
"loss": 1.2338, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.02625805328225666, |
|
"grad_norm": 0.6333332061767578, |
|
"learning_rate": 0.0001356731177157827, |
|
"loss": 1.0476, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.02632770329096291, |
|
"grad_norm": 0.7278969883918762, |
|
"learning_rate": 0.00013536592794235696, |
|
"loss": 0.9087, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.026397353299669163, |
|
"grad_norm": 0.6979010701179504, |
|
"learning_rate": 0.00013505835625225, |
|
"loss": 0.952, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.026467003308375415, |
|
"grad_norm": 0.6789504289627075, |
|
"learning_rate": 0.00013475040596692877, |
|
"loss": 1.0368, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.026536653317081663, |
|
"grad_norm": 0.7653933763504028, |
|
"learning_rate": 0.00013444208041194855, |
|
"loss": 0.8965, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.026606303325787915, |
|
"grad_norm": 0.5833761096000671, |
|
"learning_rate": 0.00013413338291691726, |
|
"loss": 0.8849, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.026675953334494167, |
|
"grad_norm": 0.742056131362915, |
|
"learning_rate": 0.00013382431681545942, |
|
"loss": 1.0168, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.02674560334320042, |
|
"grad_norm": 0.6038824915885925, |
|
"learning_rate": 0.00013351488544518004, |
|
"loss": 0.7484, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.026815253351906668, |
|
"grad_norm": 0.7503067851066589, |
|
"learning_rate": 0.00013320509214762868, |
|
"loss": 0.7915, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.02688490336061292, |
|
"grad_norm": 0.6701642274856567, |
|
"learning_rate": 0.00013289494026826336, |
|
"loss": 0.791, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.02695455336931917, |
|
"grad_norm": 0.6913783550262451, |
|
"learning_rate": 0.0001325844331564146, |
|
"loss": 0.7336, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.027024203378025424, |
|
"grad_norm": 0.5814367532730103, |
|
"learning_rate": 0.00013227357416524876, |
|
"loss": 0.9077, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.027093853386731672, |
|
"grad_norm": 0.6972191333770752, |
|
"learning_rate": 0.0001319623666517324, |
|
"loss": 0.9515, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.027163503395437924, |
|
"grad_norm": 0.6530499458312988, |
|
"learning_rate": 0.00013165081397659563, |
|
"loss": 0.6957, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.027233153404144176, |
|
"grad_norm": 0.5678091645240784, |
|
"learning_rate": 0.00013133891950429605, |
|
"loss": 0.8997, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.027302803412850428, |
|
"grad_norm": 0.6870533227920532, |
|
"learning_rate": 0.00013102668660298228, |
|
"loss": 1.0608, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.027372453421556676, |
|
"grad_norm": 0.8118611574172974, |
|
"learning_rate": 0.00013071411864445763, |
|
"loss": 0.7108, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.02744210343026293, |
|
"grad_norm": 0.6881155967712402, |
|
"learning_rate": 0.0001304012190041437, |
|
"loss": 1.0917, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.02751175343896918, |
|
"grad_norm": 0.647470235824585, |
|
"learning_rate": 0.00013008799106104397, |
|
"loss": 0.7477, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.027581403447675432, |
|
"grad_norm": 0.653819739818573, |
|
"learning_rate": 0.00012977443819770716, |
|
"loss": 0.8722, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.02765105345638168, |
|
"grad_norm": 0.6762019395828247, |
|
"learning_rate": 0.00012946056380019094, |
|
"loss": 1.0542, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.027720703465087933, |
|
"grad_norm": 0.5804311037063599, |
|
"learning_rate": 0.00012914637125802512, |
|
"loss": 1.2926, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.027790353473794185, |
|
"grad_norm": 0.6955252885818481, |
|
"learning_rate": 0.0001288318639641752, |
|
"loss": 1.0947, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.027860003482500437, |
|
"grad_norm": 0.7045977711677551, |
|
"learning_rate": 0.00012851704531500563, |
|
"loss": 1.1416, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.027860003482500437, |
|
"eval_loss": 0.8443693518638611, |
|
"eval_runtime": 700.1995, |
|
"eval_samples_per_second": 7.141, |
|
"eval_steps_per_second": 1.785, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.027929653491206685, |
|
"grad_norm": 0.6152036786079407, |
|
"learning_rate": 0.00012820191871024328, |
|
"loss": 0.8517, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.027999303499912937, |
|
"grad_norm": 0.6213567852973938, |
|
"learning_rate": 0.00012788648755294055, |
|
"loss": 0.861, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.02806895350861919, |
|
"grad_norm": 0.6279333233833313, |
|
"learning_rate": 0.00012757075524943873, |
|
"loss": 1.1324, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.02813860351732544, |
|
"grad_norm": 0.5852387547492981, |
|
"learning_rate": 0.0001272547252093312, |
|
"loss": 0.9501, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.02820825352603169, |
|
"grad_norm": 0.6280404329299927, |
|
"learning_rate": 0.00012693840084542662, |
|
"loss": 1.1233, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.02827790353473794, |
|
"grad_norm": 0.6563053131103516, |
|
"learning_rate": 0.00012662178557371198, |
|
"loss": 1.1278, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.028347553543444193, |
|
"grad_norm": 0.6248413920402527, |
|
"learning_rate": 0.00012630488281331585, |
|
"loss": 0.9008, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.028417203552150445, |
|
"grad_norm": 0.5682319402694702, |
|
"learning_rate": 0.00012598769598647135, |
|
"loss": 0.9898, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.028486853560856694, |
|
"grad_norm": 0.6207916736602783, |
|
"learning_rate": 0.00012567022851847927, |
|
"loss": 1.0291, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.028556503569562946, |
|
"grad_norm": 0.7249537706375122, |
|
"learning_rate": 0.000125352483837671, |
|
"loss": 0.9478, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.028626153578269198, |
|
"grad_norm": 0.8715054988861084, |
|
"learning_rate": 0.00012503446537537162, |
|
"loss": 1.0623, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.02869580358697545, |
|
"grad_norm": 0.6953936815261841, |
|
"learning_rate": 0.0001247161765658627, |
|
"loss": 1.089, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.0287654535956817, |
|
"grad_norm": 0.5827656388282776, |
|
"learning_rate": 0.0001243976208463453, |
|
"loss": 0.8708, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.02883510360438795, |
|
"grad_norm": 0.7496638298034668, |
|
"learning_rate": 0.00012407880165690287, |
|
"loss": 0.8053, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.028904753613094202, |
|
"grad_norm": 0.7032145261764526, |
|
"learning_rate": 0.00012375972244046415, |
|
"loss": 1.0352, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.028974403621800454, |
|
"grad_norm": 0.7112724184989929, |
|
"learning_rate": 0.00012344038664276568, |
|
"loss": 0.7082, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.029044053630506703, |
|
"grad_norm": 0.6337069869041443, |
|
"learning_rate": 0.0001231207977123151, |
|
"loss": 0.7147, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.029113703639212955, |
|
"grad_norm": 0.639981210231781, |
|
"learning_rate": 0.00012280095910035342, |
|
"loss": 0.4832, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.029183353647919207, |
|
"grad_norm": 0.6611121892929077, |
|
"learning_rate": 0.00012248087426081812, |
|
"loss": 0.9912, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.02925300365662546, |
|
"grad_norm": 0.5735837817192078, |
|
"learning_rate": 0.00012216054665030552, |
|
"loss": 1.2525, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.029322653665331707, |
|
"grad_norm": 0.7706820964813232, |
|
"learning_rate": 0.00012183997972803374, |
|
"loss": 0.8705, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.02939230367403796, |
|
"grad_norm": 0.5474764108657837, |
|
"learning_rate": 0.00012151917695580523, |
|
"loss": 0.7432, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.02946195368274421, |
|
"grad_norm": 0.5462170243263245, |
|
"learning_rate": 0.00012119814179796935, |
|
"loss": 1.0711, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.029531603691450463, |
|
"grad_norm": 0.673670768737793, |
|
"learning_rate": 0.000120876877721385, |
|
"loss": 1.3386, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.02960125370015671, |
|
"grad_norm": 0.7265173196792603, |
|
"learning_rate": 0.00012055538819538319, |
|
"loss": 1.1199, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.029670903708862963, |
|
"grad_norm": 0.5875483751296997, |
|
"learning_rate": 0.00012023367669172946, |
|
"loss": 1.0887, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.029740553717569215, |
|
"grad_norm": 0.6158230304718018, |
|
"learning_rate": 0.00011991174668458666, |
|
"loss": 0.9483, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.029810203726275467, |
|
"grad_norm": 0.6764160990715027, |
|
"learning_rate": 0.00011958960165047717, |
|
"loss": 0.9178, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.029879853734981716, |
|
"grad_norm": 0.6038265824317932, |
|
"learning_rate": 0.00011926724506824538, |
|
"loss": 0.9309, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.029949503743687968, |
|
"grad_norm": 0.5902111530303955, |
|
"learning_rate": 0.0001189446804190203, |
|
"loss": 0.8358, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.03001915375239422, |
|
"grad_norm": 0.6535676121711731, |
|
"learning_rate": 0.00011862191118617775, |
|
"loss": 0.6587, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.03008880376110047, |
|
"grad_norm": 0.6216766834259033, |
|
"learning_rate": 0.00011829894085530298, |
|
"loss": 0.7479, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.03015845376980672, |
|
"grad_norm": 0.6829842925071716, |
|
"learning_rate": 0.0001179757729141528, |
|
"loss": 0.6207, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.030228103778512972, |
|
"grad_norm": 0.7262370586395264, |
|
"learning_rate": 0.00011765241085261802, |
|
"loss": 1.0663, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.030297753787219224, |
|
"grad_norm": 0.6845910549163818, |
|
"learning_rate": 0.00011732885816268582, |
|
"loss": 0.7484, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.030367403795925476, |
|
"grad_norm": 0.7333625555038452, |
|
"learning_rate": 0.00011700511833840186, |
|
"loss": 0.8087, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.030437053804631724, |
|
"grad_norm": 0.6632218360900879, |
|
"learning_rate": 0.00011668119487583277, |
|
"loss": 1.2482, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.030506703813337976, |
|
"grad_norm": 0.5340752601623535, |
|
"learning_rate": 0.00011635709127302829, |
|
"loss": 0.866, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.03057635382204423, |
|
"grad_norm": 0.7423261404037476, |
|
"learning_rate": 0.0001160328110299834, |
|
"loss": 0.94, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.03064600383075048, |
|
"grad_norm": 0.5144674777984619, |
|
"learning_rate": 0.0001157083576486007, |
|
"loss": 0.9346, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.03071565383945673, |
|
"grad_norm": 0.5007227063179016, |
|
"learning_rate": 0.00011538373463265248, |
|
"loss": 1.0962, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.03078530384816298, |
|
"grad_norm": 0.5233269929885864, |
|
"learning_rate": 0.00011505894548774294, |
|
"loss": 0.6513, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.030854953856869233, |
|
"grad_norm": 0.6934007406234741, |
|
"learning_rate": 0.0001147339937212703, |
|
"loss": 0.7084, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.030924603865575485, |
|
"grad_norm": 0.6242351531982422, |
|
"learning_rate": 0.00011440888284238888, |
|
"loss": 0.6915, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.030994253874281733, |
|
"grad_norm": 0.5059527158737183, |
|
"learning_rate": 0.00011408361636197133, |
|
"loss": 1.2365, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.031063903882987985, |
|
"grad_norm": 0.5710117220878601, |
|
"learning_rate": 0.00011375819779257057, |
|
"loss": 0.7813, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.031133553891694237, |
|
"grad_norm": 0.5397061705589294, |
|
"learning_rate": 0.000113432630648382, |
|
"loss": 0.5191, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.03120320390040049, |
|
"grad_norm": 0.6234595775604248, |
|
"learning_rate": 0.00011310691844520543, |
|
"loss": 0.7069, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.03127285390910674, |
|
"grad_norm": 0.5587515830993652, |
|
"learning_rate": 0.00011278106470040717, |
|
"loss": 0.8174, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.03134250391781299, |
|
"grad_norm": 0.4725956618785858, |
|
"learning_rate": 0.00011245507293288204, |
|
"loss": 1.1901, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.03141215392651924, |
|
"grad_norm": 0.7420422434806824, |
|
"learning_rate": 0.00011212894666301536, |
|
"loss": 1.136, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.03148180393522549, |
|
"grad_norm": 0.6457960605621338, |
|
"learning_rate": 0.000111802689412645, |
|
"loss": 0.6502, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.031551453943931745, |
|
"grad_norm": 0.672398567199707, |
|
"learning_rate": 0.00011147630470502319, |
|
"loss": 0.9223, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.031621103952637994, |
|
"grad_norm": 0.7210835218429565, |
|
"learning_rate": 0.00011114979606477866, |
|
"loss": 1.151, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.03169075396134424, |
|
"grad_norm": 0.7231703996658325, |
|
"learning_rate": 0.00011082316701787843, |
|
"loss": 0.8565, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.0317604039700505, |
|
"grad_norm": 0.6620053648948669, |
|
"learning_rate": 0.00011049642109158981, |
|
"loss": 1.088, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.031830053978756746, |
|
"grad_norm": 0.8204821348190308, |
|
"learning_rate": 0.00011016956181444231, |
|
"loss": 1.1381, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.031899703987463, |
|
"grad_norm": 0.6240087747573853, |
|
"learning_rate": 0.00010984259271618947, |
|
"loss": 0.8316, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.03196935399616925, |
|
"grad_norm": 0.6648886203765869, |
|
"learning_rate": 0.00010951551732777083, |
|
"loss": 1.0288, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.0320390040048755, |
|
"grad_norm": 0.8034060001373291, |
|
"learning_rate": 0.00010918833918127376, |
|
"loss": 0.93, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.032108654013581754, |
|
"grad_norm": 0.5740483999252319, |
|
"learning_rate": 0.00010886106180989526, |
|
"loss": 0.7948, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.032178304022288, |
|
"grad_norm": 0.5168555378913879, |
|
"learning_rate": 0.00010853368874790392, |
|
"loss": 0.7923, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.03224795403099425, |
|
"grad_norm": 0.5505993962287903, |
|
"learning_rate": 0.0001082062235306017, |
|
"loss": 1.2188, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.03231760403970051, |
|
"grad_norm": 0.5564302206039429, |
|
"learning_rate": 0.00010787866969428569, |
|
"loss": 0.8798, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.032387254048406755, |
|
"grad_norm": 0.6746006011962891, |
|
"learning_rate": 0.00010755103077620998, |
|
"loss": 1.0295, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.03245690405711301, |
|
"grad_norm": 0.6252794861793518, |
|
"learning_rate": 0.00010722331031454748, |
|
"loss": 1.0924, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.03252655406581926, |
|
"grad_norm": 0.6156384944915771, |
|
"learning_rate": 0.00010689551184835176, |
|
"loss": 0.732, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.03259620407452551, |
|
"grad_norm": 0.7271072268486023, |
|
"learning_rate": 0.00010656763891751865, |
|
"loss": 0.5997, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.03266585408323176, |
|
"grad_norm": 0.6153301000595093, |
|
"learning_rate": 0.00010623969506274813, |
|
"loss": 0.9489, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.03273550409193801, |
|
"grad_norm": 0.7981113791465759, |
|
"learning_rate": 0.00010591168382550616, |
|
"loss": 0.8335, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.03280515410064426, |
|
"grad_norm": 0.781737744808197, |
|
"learning_rate": 0.00010558360874798631, |
|
"loss": 1.1474, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.032874804109350515, |
|
"grad_norm": 0.7384591102600098, |
|
"learning_rate": 0.0001052554733730716, |
|
"loss": 1.0917, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.032944454118056764, |
|
"grad_norm": 0.7052910923957825, |
|
"learning_rate": 0.00010492728124429618, |
|
"loss": 1.068, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.03301410412676302, |
|
"grad_norm": 0.6287469267845154, |
|
"learning_rate": 0.00010459903590580706, |
|
"loss": 0.6939, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.03308375413546927, |
|
"grad_norm": 0.5639947652816772, |
|
"learning_rate": 0.00010427074090232592, |
|
"loss": 0.737, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.033153404144175516, |
|
"grad_norm": 0.7723355293273926, |
|
"learning_rate": 0.00010394239977911068, |
|
"loss": 1.145, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.03322305415288177, |
|
"grad_norm": 0.7035319209098816, |
|
"learning_rate": 0.00010361401608191741, |
|
"loss": 0.584, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.03329270416158802, |
|
"grad_norm": 0.6127707362174988, |
|
"learning_rate": 0.00010328559335696188, |
|
"loss": 0.9795, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.03336235417029427, |
|
"grad_norm": 0.5730832815170288, |
|
"learning_rate": 0.00010295713515088134, |
|
"loss": 0.8133, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.033432004179000524, |
|
"grad_norm": 0.7129435539245605, |
|
"learning_rate": 0.00010262864501069617, |
|
"loss": 1.1408, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.03350165418770677, |
|
"grad_norm": 0.5180230736732483, |
|
"learning_rate": 0.00010230012648377162, |
|
"loss": 0.9543, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.03357130419641303, |
|
"grad_norm": 0.6325164437294006, |
|
"learning_rate": 0.00010197158311777957, |
|
"loss": 0.8672, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.033640954205119276, |
|
"grad_norm": 0.7068666815757751, |
|
"learning_rate": 0.00010164301846066, |
|
"loss": 0.9489, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.033710604213825525, |
|
"grad_norm": 0.6100176572799683, |
|
"learning_rate": 0.0001013144360605829, |
|
"loss": 0.9124, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.03378025422253178, |
|
"grad_norm": 0.6595302820205688, |
|
"learning_rate": 0.00010098583946590985, |
|
"loss": 0.6994, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.03384990423123803, |
|
"grad_norm": 0.6590490341186523, |
|
"learning_rate": 0.00010065723222515566, |
|
"loss": 0.6314, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.03391955423994428, |
|
"grad_norm": 0.619118869304657, |
|
"learning_rate": 0.00010032861788695024, |
|
"loss": 0.7488, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.03398920424865053, |
|
"grad_norm": 0.6756129264831543, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6419, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.03405885425735678, |
|
"grad_norm": 0.7198984026908875, |
|
"learning_rate": 9.967138211304978e-05, |
|
"loss": 0.8794, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.03412850426606304, |
|
"grad_norm": 0.684007465839386, |
|
"learning_rate": 9.934276777484436e-05, |
|
"loss": 1.1634, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.034198154274769285, |
|
"grad_norm": 0.5058736801147461, |
|
"learning_rate": 9.90141605340902e-05, |
|
"loss": 0.7194, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.034267804283475534, |
|
"grad_norm": 0.6622017025947571, |
|
"learning_rate": 9.868556393941713e-05, |
|
"loss": 1.059, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.03433745429218179, |
|
"grad_norm": 0.6841214895248413, |
|
"learning_rate": 9.835698153933999e-05, |
|
"loss": 0.8254, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.03440710430088804, |
|
"grad_norm": 0.6854826807975769, |
|
"learning_rate": 9.802841688222043e-05, |
|
"loss": 0.8211, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.034476754309594286, |
|
"grad_norm": 0.6080586314201355, |
|
"learning_rate": 9.769987351622836e-05, |
|
"loss": 0.8337, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.03454640431830054, |
|
"grad_norm": 0.5680797696113586, |
|
"learning_rate": 9.737135498930385e-05, |
|
"loss": 0.9282, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.03461605432700679, |
|
"grad_norm": 0.5402217507362366, |
|
"learning_rate": 9.704286484911868e-05, |
|
"loss": 0.7917, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.034685704335713045, |
|
"grad_norm": 0.5929046273231506, |
|
"learning_rate": 9.671440664303814e-05, |
|
"loss": 0.9316, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.034755354344419294, |
|
"grad_norm": 0.5998024940490723, |
|
"learning_rate": 9.638598391808261e-05, |
|
"loss": 1.173, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.03482500435312554, |
|
"grad_norm": 0.6345599889755249, |
|
"learning_rate": 9.605760022088934e-05, |
|
"loss": 0.7952, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.03482500435312554, |
|
"eval_loss": 0.7969969511032104, |
|
"eval_runtime": 700.3094, |
|
"eval_samples_per_second": 7.14, |
|
"eval_steps_per_second": 1.785, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0348946543618318, |
|
"grad_norm": 0.5795607566833496, |
|
"learning_rate": 9.572925909767412e-05, |
|
"loss": 0.4495, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.034964304370538046, |
|
"grad_norm": 0.6874101161956787, |
|
"learning_rate": 9.540096409419296e-05, |
|
"loss": 0.8444, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.035033954379244295, |
|
"grad_norm": 0.5595911145210266, |
|
"learning_rate": 9.507271875570381e-05, |
|
"loss": 0.9391, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.03510360438795055, |
|
"grad_norm": 0.525644063949585, |
|
"learning_rate": 9.474452662692838e-05, |
|
"loss": 0.7833, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.0351732543966568, |
|
"grad_norm": 0.6366891264915466, |
|
"learning_rate": 9.441639125201368e-05, |
|
"loss": 1.0472, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.035242904405363054, |
|
"grad_norm": 0.8487269878387451, |
|
"learning_rate": 9.408831617449385e-05, |
|
"loss": 1.0513, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.0353125544140693, |
|
"grad_norm": 0.7027648091316223, |
|
"learning_rate": 9.376030493725189e-05, |
|
"loss": 0.9505, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.03538220442277555, |
|
"grad_norm": 0.6772575974464417, |
|
"learning_rate": 9.343236108248139e-05, |
|
"loss": 1.0417, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.03545185443148181, |
|
"grad_norm": 0.5657368898391724, |
|
"learning_rate": 9.310448815164826e-05, |
|
"loss": 0.9236, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.035521504440188055, |
|
"grad_norm": 0.64215087890625, |
|
"learning_rate": 9.277668968545253e-05, |
|
"loss": 1.0035, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.035591154448894304, |
|
"grad_norm": 0.6276829242706299, |
|
"learning_rate": 9.244896922379007e-05, |
|
"loss": 0.8375, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.03566080445760056, |
|
"grad_norm": 0.5804170966148376, |
|
"learning_rate": 9.212133030571437e-05, |
|
"loss": 0.4934, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.03573045446630681, |
|
"grad_norm": 0.7230868935585022, |
|
"learning_rate": 9.17937764693983e-05, |
|
"loss": 0.9427, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.035800104475013056, |
|
"grad_norm": 0.6632394194602966, |
|
"learning_rate": 9.146631125209607e-05, |
|
"loss": 0.4176, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.03586975448371931, |
|
"grad_norm": 0.5885234475135803, |
|
"learning_rate": 9.113893819010475e-05, |
|
"loss": 0.6042, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.03593940449242556, |
|
"grad_norm": 0.5666863322257996, |
|
"learning_rate": 9.081166081872626e-05, |
|
"loss": 1.5152, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.036009054501131815, |
|
"grad_norm": 0.7007538676261902, |
|
"learning_rate": 9.048448267222918e-05, |
|
"loss": 0.9444, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.036078704509838064, |
|
"grad_norm": 0.6212923526763916, |
|
"learning_rate": 9.015740728381054e-05, |
|
"loss": 0.634, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.03614835451854431, |
|
"grad_norm": 0.6189596056938171, |
|
"learning_rate": 8.98304381855577e-05, |
|
"loss": 1.1091, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.03621800452725057, |
|
"grad_norm": 0.6159670948982239, |
|
"learning_rate": 8.95035789084102e-05, |
|
"loss": 0.787, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.036287654535956816, |
|
"grad_norm": 0.6371515989303589, |
|
"learning_rate": 8.917683298212158e-05, |
|
"loss": 0.6172, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.036357304544663065, |
|
"grad_norm": 0.6314066052436829, |
|
"learning_rate": 8.885020393522135e-05, |
|
"loss": 0.9702, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.03642695455336932, |
|
"grad_norm": 0.6285626888275146, |
|
"learning_rate": 8.852369529497679e-05, |
|
"loss": 0.9819, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.03649660456207557, |
|
"grad_norm": 0.5257949233055115, |
|
"learning_rate": 8.819731058735501e-05, |
|
"loss": 0.8288, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.036566254570781824, |
|
"grad_norm": 0.611438512802124, |
|
"learning_rate": 8.787105333698465e-05, |
|
"loss": 0.9246, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.03663590457948807, |
|
"grad_norm": 0.5995710492134094, |
|
"learning_rate": 8.754492706711798e-05, |
|
"loss": 0.6855, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.03670555458819432, |
|
"grad_norm": 0.681425154209137, |
|
"learning_rate": 8.721893529959287e-05, |
|
"loss": 1.1644, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.036775204596900576, |
|
"grad_norm": 0.7111718654632568, |
|
"learning_rate": 8.68930815547946e-05, |
|
"loss": 0.9181, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.036844854605606825, |
|
"grad_norm": 0.5794047713279724, |
|
"learning_rate": 8.656736935161802e-05, |
|
"loss": 1.061, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.03691450461431307, |
|
"grad_norm": 0.5971503257751465, |
|
"learning_rate": 8.624180220742946e-05, |
|
"loss": 0.5903, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.03698415462301933, |
|
"grad_norm": 0.7091482281684875, |
|
"learning_rate": 8.59163836380287e-05, |
|
"loss": 0.8907, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.03705380463172558, |
|
"grad_norm": 0.6185580492019653, |
|
"learning_rate": 8.559111715761114e-05, |
|
"loss": 0.8452, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.03712345464043183, |
|
"grad_norm": 0.68827223777771, |
|
"learning_rate": 8.52660062787297e-05, |
|
"loss": 0.8711, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.03719310464913808, |
|
"grad_norm": 0.6279632449150085, |
|
"learning_rate": 8.494105451225704e-05, |
|
"loss": 0.6453, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.03726275465784433, |
|
"grad_norm": 0.7252237200737, |
|
"learning_rate": 8.461626536734753e-05, |
|
"loss": 1.1148, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.037332404666550585, |
|
"grad_norm": 0.6377342939376831, |
|
"learning_rate": 8.429164235139931e-05, |
|
"loss": 1.0532, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.037402054675256834, |
|
"grad_norm": 0.7409278154373169, |
|
"learning_rate": 8.396718897001663e-05, |
|
"loss": 1.0161, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.03747170468396308, |
|
"grad_norm": 0.6048555970191956, |
|
"learning_rate": 8.364290872697173e-05, |
|
"loss": 1.012, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.03754135469266934, |
|
"grad_norm": 0.7676815390586853, |
|
"learning_rate": 8.331880512416724e-05, |
|
"loss": 0.9402, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.037611004701375586, |
|
"grad_norm": 0.6360906958580017, |
|
"learning_rate": 8.299488166159817e-05, |
|
"loss": 0.4591, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.03768065471008184, |
|
"grad_norm": 0.6816183924674988, |
|
"learning_rate": 8.267114183731421e-05, |
|
"loss": 0.661, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.03775030471878809, |
|
"grad_norm": 0.6955873966217041, |
|
"learning_rate": 8.234758914738199e-05, |
|
"loss": 0.8015, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.03781995472749434, |
|
"grad_norm": 0.787493884563446, |
|
"learning_rate": 8.20242270858472e-05, |
|
"loss": 0.6941, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.037889604736200594, |
|
"grad_norm": 0.5939062833786011, |
|
"learning_rate": 8.170105914469702e-05, |
|
"loss": 0.9034, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.03795925474490684, |
|
"grad_norm": 0.5235042572021484, |
|
"learning_rate": 8.137808881382226e-05, |
|
"loss": 1.0283, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.03802890475361309, |
|
"grad_norm": 0.7017082571983337, |
|
"learning_rate": 8.105531958097972e-05, |
|
"loss": 1.0407, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.038098554762319346, |
|
"grad_norm": 0.7762130498886108, |
|
"learning_rate": 8.073275493175464e-05, |
|
"loss": 0.7814, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.038168204771025595, |
|
"grad_norm": 0.588405191898346, |
|
"learning_rate": 8.041039834952287e-05, |
|
"loss": 0.8832, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.03823785477973185, |
|
"grad_norm": 0.7792285084724426, |
|
"learning_rate": 8.008825331541335e-05, |
|
"loss": 1.051, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.0383075047884381, |
|
"grad_norm": 0.6209467649459839, |
|
"learning_rate": 7.976632330827056e-05, |
|
"loss": 0.8802, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.03837715479714435, |
|
"grad_norm": 0.5231680274009705, |
|
"learning_rate": 7.944461180461686e-05, |
|
"loss": 0.7529, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.0384468048058506, |
|
"grad_norm": 0.6021607518196106, |
|
"learning_rate": 7.912312227861503e-05, |
|
"loss": 1.1235, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.03851645481455685, |
|
"grad_norm": 0.5573668479919434, |
|
"learning_rate": 7.880185820203065e-05, |
|
"loss": 0.6753, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.0385861048232631, |
|
"grad_norm": 0.5354910492897034, |
|
"learning_rate": 7.848082304419478e-05, |
|
"loss": 0.6843, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.038655754831969355, |
|
"grad_norm": 0.606436014175415, |
|
"learning_rate": 7.816002027196627e-05, |
|
"loss": 1.0557, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.038725404840675604, |
|
"grad_norm": 0.6580552458763123, |
|
"learning_rate": 7.783945334969451e-05, |
|
"loss": 0.6222, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.03879505484938186, |
|
"grad_norm": 0.6174128651618958, |
|
"learning_rate": 7.751912573918193e-05, |
|
"loss": 0.8194, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.03886470485808811, |
|
"grad_norm": 0.6724019646644592, |
|
"learning_rate": 7.719904089964658e-05, |
|
"loss": 1.0095, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.038934354866794356, |
|
"grad_norm": 0.7200993299484253, |
|
"learning_rate": 7.687920228768493e-05, |
|
"loss": 0.8115, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.03900400487550061, |
|
"grad_norm": 0.5682472586631775, |
|
"learning_rate": 7.655961335723433e-05, |
|
"loss": 0.7034, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.03907365488420686, |
|
"grad_norm": 0.7236086130142212, |
|
"learning_rate": 7.624027755953592e-05, |
|
"loss": 0.9028, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.03914330489291311, |
|
"grad_norm": 0.5866789221763611, |
|
"learning_rate": 7.592119834309715e-05, |
|
"loss": 0.8919, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.039212954901619364, |
|
"grad_norm": 0.6271937489509583, |
|
"learning_rate": 7.560237915365472e-05, |
|
"loss": 0.6447, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.03928260491032561, |
|
"grad_norm": 0.5319473147392273, |
|
"learning_rate": 7.528382343413734e-05, |
|
"loss": 1.0977, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.03935225491903187, |
|
"grad_norm": 0.673537015914917, |
|
"learning_rate": 7.49655346246284e-05, |
|
"loss": 0.6669, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.039421904927738116, |
|
"grad_norm": 0.7043957114219666, |
|
"learning_rate": 7.464751616232902e-05, |
|
"loss": 0.6334, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.039491554936444365, |
|
"grad_norm": 0.6532731652259827, |
|
"learning_rate": 7.432977148152074e-05, |
|
"loss": 0.659, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.03956120494515062, |
|
"grad_norm": 0.6882482767105103, |
|
"learning_rate": 7.401230401352866e-05, |
|
"loss": 0.711, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.03963085495385687, |
|
"grad_norm": 0.7171745896339417, |
|
"learning_rate": 7.369511718668418e-05, |
|
"loss": 0.941, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.03970050496256312, |
|
"grad_norm": 0.6474679708480835, |
|
"learning_rate": 7.337821442628805e-05, |
|
"loss": 0.8192, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.03977015497126937, |
|
"grad_norm": 0.7054280042648315, |
|
"learning_rate": 7.306159915457342e-05, |
|
"loss": 0.6327, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.03983980497997562, |
|
"grad_norm": 0.7624709606170654, |
|
"learning_rate": 7.274527479066883e-05, |
|
"loss": 0.8132, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.039909454988681876, |
|
"grad_norm": 0.6930527687072754, |
|
"learning_rate": 7.242924475056127e-05, |
|
"loss": 0.8482, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.039979104997388125, |
|
"grad_norm": 0.6599513292312622, |
|
"learning_rate": 7.211351244705946e-05, |
|
"loss": 0.6787, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.04004875500609437, |
|
"grad_norm": 0.7311400771141052, |
|
"learning_rate": 7.179808128975674e-05, |
|
"loss": 0.9747, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.04011840501480063, |
|
"grad_norm": 0.615138828754425, |
|
"learning_rate": 7.148295468499438e-05, |
|
"loss": 0.9404, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.04018805502350688, |
|
"grad_norm": 0.6401761174201965, |
|
"learning_rate": 7.116813603582482e-05, |
|
"loss": 0.4915, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.040257705032213126, |
|
"grad_norm": 0.6191440224647522, |
|
"learning_rate": 7.08536287419749e-05, |
|
"loss": 0.6031, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.04032735504091938, |
|
"grad_norm": 0.5751050710678101, |
|
"learning_rate": 7.053943619980907e-05, |
|
"loss": 0.8371, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.04039700504962563, |
|
"grad_norm": 0.518409252166748, |
|
"learning_rate": 7.022556180229285e-05, |
|
"loss": 0.4333, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.040466655058331885, |
|
"grad_norm": 0.5712803602218628, |
|
"learning_rate": 6.991200893895608e-05, |
|
"loss": 0.796, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.040536305067038134, |
|
"grad_norm": 0.661482036113739, |
|
"learning_rate": 6.959878099585635e-05, |
|
"loss": 0.8585, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.04060595507574438, |
|
"grad_norm": 0.6602011322975159, |
|
"learning_rate": 6.92858813555424e-05, |
|
"loss": 0.9474, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.04067560508445064, |
|
"grad_norm": 0.5971815586090088, |
|
"learning_rate": 6.897331339701776e-05, |
|
"loss": 0.7689, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.040745255093156886, |
|
"grad_norm": 0.571740448474884, |
|
"learning_rate": 6.866108049570397e-05, |
|
"loss": 0.9023, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.040814905101863135, |
|
"grad_norm": 0.6928638219833374, |
|
"learning_rate": 6.834918602340438e-05, |
|
"loss": 0.8899, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.04088455511056939, |
|
"grad_norm": 0.6468199491500854, |
|
"learning_rate": 6.803763334826763e-05, |
|
"loss": 0.8841, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.04095420511927564, |
|
"grad_norm": 0.6777251362800598, |
|
"learning_rate": 6.772642583475126e-05, |
|
"loss": 0.8491, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.041023855127981894, |
|
"grad_norm": 0.5866687297821045, |
|
"learning_rate": 6.741556684358545e-05, |
|
"loss": 0.6435, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.04109350513668814, |
|
"grad_norm": 0.5522730350494385, |
|
"learning_rate": 6.710505973173664e-05, |
|
"loss": 0.9188, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.04116315514539439, |
|
"grad_norm": 0.7048250436782837, |
|
"learning_rate": 6.679490785237137e-05, |
|
"loss": 0.911, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.041232805154100646, |
|
"grad_norm": 0.849677324295044, |
|
"learning_rate": 6.648511455482003e-05, |
|
"loss": 1.0408, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.041302455162806895, |
|
"grad_norm": 0.653287947177887, |
|
"learning_rate": 6.617568318454059e-05, |
|
"loss": 1.187, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.04137210517151314, |
|
"grad_norm": 0.5278560519218445, |
|
"learning_rate": 6.586661708308272e-05, |
|
"loss": 0.8789, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.0414417551802194, |
|
"grad_norm": 0.7803817987442017, |
|
"learning_rate": 6.555791958805147e-05, |
|
"loss": 0.8788, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.04151140518892565, |
|
"grad_norm": 0.6425774097442627, |
|
"learning_rate": 6.524959403307125e-05, |
|
"loss": 0.9296, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.0415810551976319, |
|
"grad_norm": 0.5787883400917053, |
|
"learning_rate": 6.494164374775e-05, |
|
"loss": 1.0127, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.04165070520633815, |
|
"grad_norm": 0.5686517357826233, |
|
"learning_rate": 6.463407205764305e-05, |
|
"loss": 0.7869, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.0417203552150444, |
|
"grad_norm": 0.5126462578773499, |
|
"learning_rate": 6.43268822842173e-05, |
|
"loss": 1.2029, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.041790005223750655, |
|
"grad_norm": 0.5618976950645447, |
|
"learning_rate": 6.402007774481536e-05, |
|
"loss": 0.5725, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.041790005223750655, |
|
"eval_loss": 0.7635987401008606, |
|
"eval_runtime": 701.6781, |
|
"eval_samples_per_second": 7.126, |
|
"eval_steps_per_second": 1.781, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.041859655232456904, |
|
"grad_norm": 0.6774680018424988, |
|
"learning_rate": 6.371366175261964e-05, |
|
"loss": 0.9805, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.04192930524116315, |
|
"grad_norm": 0.7227701544761658, |
|
"learning_rate": 6.340763761661665e-05, |
|
"loss": 0.933, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.04199895524986941, |
|
"grad_norm": 0.7895076870918274, |
|
"learning_rate": 6.310200864156126e-05, |
|
"loss": 0.9677, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.042068605258575656, |
|
"grad_norm": 0.6837015748023987, |
|
"learning_rate": 6.279677812794103e-05, |
|
"loss": 1.1069, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.04213825526728191, |
|
"grad_norm": 0.8501606583595276, |
|
"learning_rate": 6.249194937194047e-05, |
|
"loss": 0.961, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.04220790527598816, |
|
"grad_norm": 0.7296304106712341, |
|
"learning_rate": 6.218752566540554e-05, |
|
"loss": 0.9667, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.04227755528469441, |
|
"grad_norm": 0.5765381455421448, |
|
"learning_rate": 6.188351029580805e-05, |
|
"loss": 1.0982, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.042347205293400664, |
|
"grad_norm": 0.7557181119918823, |
|
"learning_rate": 6.157990654621024e-05, |
|
"loss": 0.9381, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.04241685530210691, |
|
"grad_norm": 0.6191427707672119, |
|
"learning_rate": 6.127671769522916e-05, |
|
"loss": 0.9322, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.04248650531081316, |
|
"grad_norm": 0.5968077778816223, |
|
"learning_rate": 6.097394701700145e-05, |
|
"loss": 0.9394, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.042556155319519416, |
|
"grad_norm": 0.5749527812004089, |
|
"learning_rate": 6.067159778114788e-05, |
|
"loss": 0.7593, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.042625805328225665, |
|
"grad_norm": 0.5655612945556641, |
|
"learning_rate": 6.036967325273807e-05, |
|
"loss": 1.0865, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.04269545533693192, |
|
"grad_norm": 0.7150444984436035, |
|
"learning_rate": 6.0068176692255175e-05, |
|
"loss": 0.612, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.04276510534563817, |
|
"grad_norm": 0.6594777703285217, |
|
"learning_rate": 5.976711135556086e-05, |
|
"loss": 0.6786, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.04283475535434442, |
|
"grad_norm": 0.6561244130134583, |
|
"learning_rate": 5.946648049385985e-05, |
|
"loss": 0.9041, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.04290440536305067, |
|
"grad_norm": 0.5820670127868652, |
|
"learning_rate": 5.916628735366505e-05, |
|
"loss": 0.6228, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.04297405537175692, |
|
"grad_norm": 0.7414914965629578, |
|
"learning_rate": 5.886653517676239e-05, |
|
"loss": 0.7384, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.04304370538046317, |
|
"grad_norm": 0.7077262997627258, |
|
"learning_rate": 5.8567227200175865e-05, |
|
"loss": 1.0201, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.043113355389169425, |
|
"grad_norm": 0.6975839734077454, |
|
"learning_rate": 5.8268366656132476e-05, |
|
"loss": 0.6453, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.04318300539787567, |
|
"grad_norm": 0.6871505379676819, |
|
"learning_rate": 5.796995677202753e-05, |
|
"loss": 1.0648, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.04325265540658193, |
|
"grad_norm": 0.6167171001434326, |
|
"learning_rate": 5.76720007703895e-05, |
|
"loss": 0.7303, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.04332230541528818, |
|
"grad_norm": 0.7851260900497437, |
|
"learning_rate": 5.7374501868845544e-05, |
|
"loss": 0.7858, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.043391955423994426, |
|
"grad_norm": 0.5275984406471252, |
|
"learning_rate": 5.7077463280086415e-05, |
|
"loss": 0.7998, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.04346160543270068, |
|
"grad_norm": 0.7553796768188477, |
|
"learning_rate": 5.6780888211832116e-05, |
|
"loss": 0.6115, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.04353125544140693, |
|
"grad_norm": 0.7186095118522644, |
|
"learning_rate": 5.648477986679703e-05, |
|
"loss": 0.9616, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.04360090545011318, |
|
"grad_norm": 0.7424410581588745, |
|
"learning_rate": 5.6189141442655325e-05, |
|
"loss": 0.8707, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.043670555458819434, |
|
"grad_norm": 0.6303914189338684, |
|
"learning_rate": 5.589397613200662e-05, |
|
"loss": 0.8386, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.04374020546752568, |
|
"grad_norm": 0.7636226415634155, |
|
"learning_rate": 5.559928712234126e-05, |
|
"loss": 0.8905, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.04380985547623194, |
|
"grad_norm": 0.6990499496459961, |
|
"learning_rate": 5.530507759600614e-05, |
|
"loss": 0.964, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.043879505484938186, |
|
"grad_norm": 0.6701223254203796, |
|
"learning_rate": 5.501135073017008e-05, |
|
"loss": 0.8774, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.043949155493644435, |
|
"grad_norm": 0.5796250104904175, |
|
"learning_rate": 5.471810969678975e-05, |
|
"loss": 0.6749, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.04401880550235069, |
|
"grad_norm": 0.6239587664604187, |
|
"learning_rate": 5.442535766257525e-05, |
|
"loss": 0.9801, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.04408845551105694, |
|
"grad_norm": 0.8477646112442017, |
|
"learning_rate": 5.413309778895602e-05, |
|
"loss": 0.6404, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.04415810551976319, |
|
"grad_norm": 0.7139285802841187, |
|
"learning_rate": 5.3841333232046654e-05, |
|
"loss": 1.1062, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.04422775552846944, |
|
"grad_norm": 0.5378491878509521, |
|
"learning_rate": 5.355006714261285e-05, |
|
"loss": 1.2571, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.04429740553717569, |
|
"grad_norm": 0.647861659526825, |
|
"learning_rate": 5.325930266603724e-05, |
|
"loss": 1.2096, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.044367055545881946, |
|
"grad_norm": 0.7343048453330994, |
|
"learning_rate": 5.296904294228569e-05, |
|
"loss": 0.9278, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.044436705554588195, |
|
"grad_norm": 0.5826293230056763, |
|
"learning_rate": 5.267929110587307e-05, |
|
"loss": 1.0683, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.04450635556329444, |
|
"grad_norm": 0.6172500848770142, |
|
"learning_rate": 5.2390050285829786e-05, |
|
"loss": 0.9441, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.0445760055720007, |
|
"grad_norm": 0.7326881885528564, |
|
"learning_rate": 5.210132360566755e-05, |
|
"loss": 0.7529, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.04464565558070695, |
|
"grad_norm": 0.7021967768669128, |
|
"learning_rate": 5.181311418334608e-05, |
|
"loss": 0.606, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.044715305589413196, |
|
"grad_norm": 0.6962524652481079, |
|
"learning_rate": 5.1525425131239056e-05, |
|
"loss": 0.8838, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.04478495559811945, |
|
"grad_norm": 0.535213828086853, |
|
"learning_rate": 5.123825955610079e-05, |
|
"loss": 0.8108, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.0448546056068257, |
|
"grad_norm": 0.5601661801338196, |
|
"learning_rate": 5.0951620559032573e-05, |
|
"loss": 0.5116, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.044924255615531955, |
|
"grad_norm": 0.6015167832374573, |
|
"learning_rate": 5.066551123544907e-05, |
|
"loss": 0.7486, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.044993905624238204, |
|
"grad_norm": 0.8018868565559387, |
|
"learning_rate": 5.0379934675045145e-05, |
|
"loss": 0.9923, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.04506355563294445, |
|
"grad_norm": 0.6844683289527893, |
|
"learning_rate": 5.009489396176221e-05, |
|
"loss": 0.9141, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.04513320564165071, |
|
"grad_norm": 0.5720611810684204, |
|
"learning_rate": 4.9810392173755194e-05, |
|
"loss": 0.7879, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.045202855650356956, |
|
"grad_norm": 0.5712713599205017, |
|
"learning_rate": 4.9526432383359036e-05, |
|
"loss": 0.9627, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.045272505659063204, |
|
"grad_norm": 0.5877520442008972, |
|
"learning_rate": 4.92430176570558e-05, |
|
"loss": 0.6014, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.04534215566776946, |
|
"grad_norm": 0.639779806137085, |
|
"learning_rate": 4.896015105544124e-05, |
|
"loss": 0.6532, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.04541180567647571, |
|
"grad_norm": 0.5214322209358215, |
|
"learning_rate": 4.867783563319206e-05, |
|
"loss": 0.6277, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.045481455685181964, |
|
"grad_norm": 0.6788254380226135, |
|
"learning_rate": 4.8396074439032604e-05, |
|
"loss": 0.5997, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.04555110569388821, |
|
"grad_norm": 0.7286319732666016, |
|
"learning_rate": 4.811487051570235e-05, |
|
"loss": 0.9064, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.04562075570259446, |
|
"grad_norm": 0.6942530870437622, |
|
"learning_rate": 4.783422689992256e-05, |
|
"loss": 1.2174, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.045690405711300716, |
|
"grad_norm": 0.6202605366706848, |
|
"learning_rate": 4.7554146622363914e-05, |
|
"loss": 0.9942, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.045760055720006965, |
|
"grad_norm": 0.6402217745780945, |
|
"learning_rate": 4.727463270761346e-05, |
|
"loss": 0.9941, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.04582970572871321, |
|
"grad_norm": 0.5262777209281921, |
|
"learning_rate": 4.699568817414224e-05, |
|
"loss": 0.8669, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.04589935573741947, |
|
"grad_norm": 0.6133191585540771, |
|
"learning_rate": 4.6717316034272394e-05, |
|
"loss": 0.9069, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.04596900574612572, |
|
"grad_norm": 0.7493846416473389, |
|
"learning_rate": 4.643951929414493e-05, |
|
"loss": 0.6228, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.04603865575483197, |
|
"grad_norm": 0.642196774482727, |
|
"learning_rate": 4.616230095368697e-05, |
|
"loss": 1.012, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.04610830576353822, |
|
"grad_norm": 0.726894736289978, |
|
"learning_rate": 4.5885664006579645e-05, |
|
"loss": 1.0356, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.04617795577224447, |
|
"grad_norm": 0.7074050307273865, |
|
"learning_rate": 4.5609611440225474e-05, |
|
"loss": 1.0333, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.046247605780950725, |
|
"grad_norm": 0.7056405544281006, |
|
"learning_rate": 4.533414623571637e-05, |
|
"loss": 0.5944, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.04631725578965697, |
|
"grad_norm": 0.7887142896652222, |
|
"learning_rate": 4.505927136780128e-05, |
|
"loss": 0.8546, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.04638690579836322, |
|
"grad_norm": 0.5718196034431458, |
|
"learning_rate": 4.478498980485405e-05, |
|
"loss": 0.7971, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.04645655580706948, |
|
"grad_norm": 0.4922311007976532, |
|
"learning_rate": 4.4511304508841544e-05, |
|
"loss": 0.4773, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.046526205815775726, |
|
"grad_norm": 0.5427528619766235, |
|
"learning_rate": 4.423821843529139e-05, |
|
"loss": 0.5889, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.04659585582448198, |
|
"grad_norm": 0.5341909527778625, |
|
"learning_rate": 4.396573453326037e-05, |
|
"loss": 0.7427, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.04666550583318823, |
|
"grad_norm": 0.7404798269271851, |
|
"learning_rate": 4.369385574530227e-05, |
|
"loss": 1.1909, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.04673515584189448, |
|
"grad_norm": 0.6806610226631165, |
|
"learning_rate": 4.342258500743638e-05, |
|
"loss": 0.9576, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.046804805850600734, |
|
"grad_norm": 0.6135253310203552, |
|
"learning_rate": 4.315192524911551e-05, |
|
"loss": 0.7204, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.04687445585930698, |
|
"grad_norm": 0.8514856100082397, |
|
"learning_rate": 4.288187939319465e-05, |
|
"loss": 0.9307, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.04694410586801323, |
|
"grad_norm": 0.6521239280700684, |
|
"learning_rate": 4.261245035589917e-05, |
|
"loss": 0.6885, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.047013755876719486, |
|
"grad_norm": 0.6027514338493347, |
|
"learning_rate": 4.234364104679347e-05, |
|
"loss": 0.9786, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.047083405885425735, |
|
"grad_norm": 0.6285941004753113, |
|
"learning_rate": 4.207545436874941e-05, |
|
"loss": 0.6983, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.04715305589413199, |
|
"grad_norm": 0.6285765767097473, |
|
"learning_rate": 4.1807893217915195e-05, |
|
"loss": 0.8987, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.04722270590283824, |
|
"grad_norm": 0.7090179324150085, |
|
"learning_rate": 4.15409604836838e-05, |
|
"loss": 1.0551, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.04729235591154449, |
|
"grad_norm": 0.6713972091674805, |
|
"learning_rate": 4.127465904866209e-05, |
|
"loss": 0.7779, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.04736200592025074, |
|
"grad_norm": 0.6123691201210022, |
|
"learning_rate": 4.1008991788639386e-05, |
|
"loss": 0.6502, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.04743165592895699, |
|
"grad_norm": 0.8065311312675476, |
|
"learning_rate": 4.0743961572556686e-05, |
|
"loss": 0.6814, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.04750130593766324, |
|
"grad_norm": 0.6417213082313538, |
|
"learning_rate": 4.047957126247541e-05, |
|
"loss": 0.8127, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.047570955946369495, |
|
"grad_norm": 0.7060418725013733, |
|
"learning_rate": 4.021582371354674e-05, |
|
"loss": 0.9657, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.04764060595507574, |
|
"grad_norm": 0.6365180015563965, |
|
"learning_rate": 3.99527217739807e-05, |
|
"loss": 0.8965, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.047710255963782, |
|
"grad_norm": 0.7569335103034973, |
|
"learning_rate": 3.969026828501523e-05, |
|
"loss": 0.9742, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.04777990597248825, |
|
"grad_norm": 0.6113385558128357, |
|
"learning_rate": 3.942846608088583e-05, |
|
"loss": 0.8562, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.047849555981194496, |
|
"grad_norm": 0.5718615651130676, |
|
"learning_rate": 3.916731798879462e-05, |
|
"loss": 0.6826, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.04791920598990075, |
|
"grad_norm": 0.718606173992157, |
|
"learning_rate": 3.8906826828880085e-05, |
|
"loss": 0.5029, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.047988855998607, |
|
"grad_norm": 0.745060384273529, |
|
"learning_rate": 3.8646995414186396e-05, |
|
"loss": 0.4777, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.04805850600731325, |
|
"grad_norm": 0.6253296136856079, |
|
"learning_rate": 3.838782655063325e-05, |
|
"loss": 0.4763, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.048128156016019503, |
|
"grad_norm": 0.7446655631065369, |
|
"learning_rate": 3.812932303698533e-05, |
|
"loss": 0.7823, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.04819780602472575, |
|
"grad_norm": 0.7678576111793518, |
|
"learning_rate": 3.7871487664822326e-05, |
|
"loss": 0.7656, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.04826745603343201, |
|
"grad_norm": 0.7170537710189819, |
|
"learning_rate": 3.7614323218508506e-05, |
|
"loss": 1.0093, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.048337106042138256, |
|
"grad_norm": 0.7178253531455994, |
|
"learning_rate": 3.7357832475163045e-05, |
|
"loss": 0.9605, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.048406756050844504, |
|
"grad_norm": 0.6666684746742249, |
|
"learning_rate": 3.710201820462956e-05, |
|
"loss": 0.9654, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.04847640605955076, |
|
"grad_norm": 0.6459413766860962, |
|
"learning_rate": 3.6846883169446625e-05, |
|
"loss": 0.6705, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.04854605606825701, |
|
"grad_norm": 0.6586235165596008, |
|
"learning_rate": 3.659243012481757e-05, |
|
"loss": 1.0915, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.04861570607696326, |
|
"grad_norm": 0.6067480444908142, |
|
"learning_rate": 3.63386618185811e-05, |
|
"loss": 0.8191, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.04868535608566951, |
|
"grad_norm": 0.7405864000320435, |
|
"learning_rate": 3.6085580991181256e-05, |
|
"loss": 0.9778, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.04875500609437576, |
|
"grad_norm": 0.6318597197532654, |
|
"learning_rate": 3.583319037563816e-05, |
|
"loss": 0.6675, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.04875500609437576, |
|
"eval_loss": 0.7419635653495789, |
|
"eval_runtime": 700.4042, |
|
"eval_samples_per_second": 7.139, |
|
"eval_steps_per_second": 1.785, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.048824656103082016, |
|
"grad_norm": 0.6579747200012207, |
|
"learning_rate": 3.558149269751816e-05, |
|
"loss": 0.64, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.048894306111788265, |
|
"grad_norm": 0.6741796731948853, |
|
"learning_rate": 3.5330490674904735e-05, |
|
"loss": 0.7894, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.04896395612049451, |
|
"grad_norm": 0.691154956817627, |
|
"learning_rate": 3.5080187018368846e-05, |
|
"loss": 0.8126, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.04903360612920077, |
|
"grad_norm": 0.5884422659873962, |
|
"learning_rate": 3.483058443093989e-05, |
|
"loss": 0.4997, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.04910325613790702, |
|
"grad_norm": 0.8021077513694763, |
|
"learning_rate": 3.458168560807643e-05, |
|
"loss": 0.9094, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.049172906146613266, |
|
"grad_norm": 0.6837207674980164, |
|
"learning_rate": 3.433349323763696e-05, |
|
"loss": 0.8385, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.04924255615531952, |
|
"grad_norm": 0.815160870552063, |
|
"learning_rate": 3.408600999985112e-05, |
|
"loss": 0.7504, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.04931220616402577, |
|
"grad_norm": 0.6362173557281494, |
|
"learning_rate": 3.383923856729052e-05, |
|
"loss": 0.962, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.049381856172732025, |
|
"grad_norm": 0.7275608777999878, |
|
"learning_rate": 3.359318160484011e-05, |
|
"loss": 1.1645, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.04945150618143827, |
|
"grad_norm": 0.7200846672058105, |
|
"learning_rate": 3.334784176966912e-05, |
|
"loss": 1.1489, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.04952115619014452, |
|
"grad_norm": 0.7058080434799194, |
|
"learning_rate": 3.310322171120267e-05, |
|
"loss": 0.7897, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.04959080619885078, |
|
"grad_norm": 0.6900257468223572, |
|
"learning_rate": 3.28593240710929e-05, |
|
"loss": 0.8203, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.049660456207557026, |
|
"grad_norm": 0.6234864592552185, |
|
"learning_rate": 3.261615148319063e-05, |
|
"loss": 0.8475, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.049730106216263274, |
|
"grad_norm": 0.7157082557678223, |
|
"learning_rate": 3.2373706573516794e-05, |
|
"loss": 1.1521, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.04979975622496953, |
|
"grad_norm": 0.6452792286872864, |
|
"learning_rate": 3.21319919602342e-05, |
|
"loss": 0.7429, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.04986940623367578, |
|
"grad_norm": 0.6651695966720581, |
|
"learning_rate": 3.189101025361905e-05, |
|
"loss": 0.7481, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.049939056242382034, |
|
"grad_norm": 0.5767229199409485, |
|
"learning_rate": 3.165076405603303e-05, |
|
"loss": 1.2513, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.05000870625108828, |
|
"grad_norm": 0.6223350763320923, |
|
"learning_rate": 3.141125596189494e-05, |
|
"loss": 1.0635, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.05007835625979453, |
|
"grad_norm": 0.6872287392616272, |
|
"learning_rate": 3.117248855765294e-05, |
|
"loss": 0.6846, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.050148006268500786, |
|
"grad_norm": 0.6780046224594116, |
|
"learning_rate": 3.093446442175631e-05, |
|
"loss": 0.7238, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.050217656277207035, |
|
"grad_norm": 0.5555802583694458, |
|
"learning_rate": 3.069718612462793e-05, |
|
"loss": 0.8503, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.05028730628591328, |
|
"grad_norm": 0.7299566268920898, |
|
"learning_rate": 3.0460656228636254e-05, |
|
"loss": 0.8579, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.05035695629461954, |
|
"grad_norm": 0.6805000305175781, |
|
"learning_rate": 3.022487728806783e-05, |
|
"loss": 0.8994, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.05042660630332579, |
|
"grad_norm": 0.5568419098854065, |
|
"learning_rate": 2.9989851849099594e-05, |
|
"loss": 0.9992, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.05049625631203204, |
|
"grad_norm": 0.7006337642669678, |
|
"learning_rate": 2.9755582449771457e-05, |
|
"loss": 0.9476, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.05056590632073829, |
|
"grad_norm": 0.7835425734519958, |
|
"learning_rate": 2.952207161995879e-05, |
|
"loss": 1.0143, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.05063555632944454, |
|
"grad_norm": 0.6196465492248535, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 0.7623, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.050705206338150795, |
|
"grad_norm": 0.7238385677337646, |
|
"learning_rate": 2.905733574739542e-05, |
|
"loss": 0.9173, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.05077485634685704, |
|
"grad_norm": 0.45640066266059875, |
|
"learning_rate": 2.8826115723327684e-05, |
|
"loss": 0.3747, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.05084450635556329, |
|
"grad_norm": 0.7860556840896606, |
|
"learning_rate": 2.8595664306087312e-05, |
|
"loss": 0.677, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.05091415636426955, |
|
"grad_norm": 0.7076509594917297, |
|
"learning_rate": 2.8365983984319254e-05, |
|
"loss": 0.6773, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.050983806372975796, |
|
"grad_norm": 0.5683595538139343, |
|
"learning_rate": 2.8137077238341525e-05, |
|
"loss": 0.7685, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.05105345638168205, |
|
"grad_norm": 0.6466002464294434, |
|
"learning_rate": 2.7908946540118208e-05, |
|
"loss": 0.6539, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.0511231063903883, |
|
"grad_norm": 0.7310590147972107, |
|
"learning_rate": 2.7681594353232932e-05, |
|
"loss": 0.6498, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.05119275639909455, |
|
"grad_norm": 0.6998217701911926, |
|
"learning_rate": 2.7455023132862044e-05, |
|
"loss": 0.827, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.051262406407800803, |
|
"grad_norm": 0.6120029091835022, |
|
"learning_rate": 2.7229235325748393e-05, |
|
"loss": 0.7574, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.05133205641650705, |
|
"grad_norm": 0.6969332695007324, |
|
"learning_rate": 2.7004233370174603e-05, |
|
"loss": 0.9495, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.0514017064252133, |
|
"grad_norm": 0.5970465540885925, |
|
"learning_rate": 2.6780019695937008e-05, |
|
"loss": 0.826, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.051471356433919556, |
|
"grad_norm": 0.5893230438232422, |
|
"learning_rate": 2.6556596724319193e-05, |
|
"loss": 0.5827, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.051541006442625804, |
|
"grad_norm": 0.6217379570007324, |
|
"learning_rate": 2.6333966868066042e-05, |
|
"loss": 0.833, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.05161065645133206, |
|
"grad_norm": 0.7289059162139893, |
|
"learning_rate": 2.6112132531357457e-05, |
|
"loss": 0.6796, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.05168030646003831, |
|
"grad_norm": 0.6685306429862976, |
|
"learning_rate": 2.5891096109782642e-05, |
|
"loss": 0.8579, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.05174995646874456, |
|
"grad_norm": 0.6785428524017334, |
|
"learning_rate": 2.567085999031408e-05, |
|
"loss": 1.1535, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.05181960647745081, |
|
"grad_norm": 0.5720734000205994, |
|
"learning_rate": 2.5451426551281798e-05, |
|
"loss": 0.8504, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.05188925648615706, |
|
"grad_norm": 0.8368062376976013, |
|
"learning_rate": 2.5232798162347604e-05, |
|
"loss": 0.866, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.05195890649486331, |
|
"grad_norm": 0.5373237133026123, |
|
"learning_rate": 2.5014977184479694e-05, |
|
"loss": 1.1392, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.052028556503569565, |
|
"grad_norm": 0.9247710704803467, |
|
"learning_rate": 2.4797965969926907e-05, |
|
"loss": 0.8317, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.05209820651227581, |
|
"grad_norm": 0.6235398650169373, |
|
"learning_rate": 2.4581766862193556e-05, |
|
"loss": 0.889, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.05216785652098207, |
|
"grad_norm": 0.5890073776245117, |
|
"learning_rate": 2.4366382196013892e-05, |
|
"loss": 1.0977, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.05223750652968832, |
|
"grad_norm": 0.5582912564277649, |
|
"learning_rate": 2.4151814297327158e-05, |
|
"loss": 0.6759, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.052307156538394566, |
|
"grad_norm": 0.6418405771255493, |
|
"learning_rate": 2.3938065483252183e-05, |
|
"loss": 0.5678, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.05237680654710082, |
|
"grad_norm": 0.5797872543334961, |
|
"learning_rate": 2.372513806206258e-05, |
|
"loss": 0.6385, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.05244645655580707, |
|
"grad_norm": 0.6586098074913025, |
|
"learning_rate": 2.3513034333161765e-05, |
|
"loss": 0.8608, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.05251610656451332, |
|
"grad_norm": 0.5528561472892761, |
|
"learning_rate": 2.3301756587057987e-05, |
|
"loss": 0.6811, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.05258575657321957, |
|
"grad_norm": 0.5883040428161621, |
|
"learning_rate": 2.3091307105339856e-05, |
|
"loss": 0.6142, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.05265540658192582, |
|
"grad_norm": 0.9445425271987915, |
|
"learning_rate": 2.2881688160651405e-05, |
|
"loss": 0.8142, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.05272505659063208, |
|
"grad_norm": 0.6835020184516907, |
|
"learning_rate": 2.267290201666782e-05, |
|
"loss": 0.8235, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.052794706599338326, |
|
"grad_norm": 0.6816075444221497, |
|
"learning_rate": 2.246495092807077e-05, |
|
"loss": 1.0772, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.052864356608044574, |
|
"grad_norm": 0.5880750417709351, |
|
"learning_rate": 2.2257837140524274e-05, |
|
"loss": 1.0342, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.05293400661675083, |
|
"grad_norm": 0.6749791502952576, |
|
"learning_rate": 2.20515628906502e-05, |
|
"loss": 0.6126, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.05300365662545708, |
|
"grad_norm": 0.7459970712661743, |
|
"learning_rate": 2.1846130406004396e-05, |
|
"loss": 0.6544, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.05307330663416333, |
|
"grad_norm": 0.5859512686729431, |
|
"learning_rate": 2.164154190505231e-05, |
|
"loss": 0.7144, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.05314295664286958, |
|
"grad_norm": 0.6339436173439026, |
|
"learning_rate": 2.1437799597145425e-05, |
|
"loss": 0.5725, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.05321260665157583, |
|
"grad_norm": 0.7248126268386841, |
|
"learning_rate": 2.1234905682496986e-05, |
|
"loss": 0.7997, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.05328225666028208, |
|
"grad_norm": 0.6739416718482971, |
|
"learning_rate": 2.103286235215859e-05, |
|
"loss": 0.7482, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.053351906668988335, |
|
"grad_norm": 0.7312667369842529, |
|
"learning_rate": 2.083167178799623e-05, |
|
"loss": 1.0439, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.05342155667769458, |
|
"grad_norm": 0.6655896902084351, |
|
"learning_rate": 2.0631336162667035e-05, |
|
"loss": 0.8695, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.05349120668640084, |
|
"grad_norm": 0.6517478823661804, |
|
"learning_rate": 2.0431857639595486e-05, |
|
"loss": 0.6283, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.05356085669510709, |
|
"grad_norm": 0.5833168029785156, |
|
"learning_rate": 2.023323837295037e-05, |
|
"loss": 1.2862, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.053630506703813335, |
|
"grad_norm": 0.45417115092277527, |
|
"learning_rate": 2.0035480507621218e-05, |
|
"loss": 0.4238, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.05370015671251959, |
|
"grad_norm": 0.6575907468795776, |
|
"learning_rate": 1.983858617919543e-05, |
|
"loss": 1.034, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.05376980672122584, |
|
"grad_norm": 0.606704831123352, |
|
"learning_rate": 1.9642557513934933e-05, |
|
"loss": 0.8014, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.05383945672993209, |
|
"grad_norm": 0.594321608543396, |
|
"learning_rate": 1.9447396628753467e-05, |
|
"loss": 0.5752, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.05390910673863834, |
|
"grad_norm": 0.7383103966712952, |
|
"learning_rate": 1.925310563119358e-05, |
|
"loss": 0.7493, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.05397875674734459, |
|
"grad_norm": 0.636978268623352, |
|
"learning_rate": 1.905968661940385e-05, |
|
"loss": 0.4319, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.05404840675605085, |
|
"grad_norm": 0.6960916519165039, |
|
"learning_rate": 1.8867141682116374e-05, |
|
"loss": 0.9924, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.054118056764757096, |
|
"grad_norm": 0.649654746055603, |
|
"learning_rate": 1.8675472898624014e-05, |
|
"loss": 0.7308, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.054187706773463344, |
|
"grad_norm": 0.6827317476272583, |
|
"learning_rate": 1.8484682338758152e-05, |
|
"loss": 0.7227, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.0542573567821696, |
|
"grad_norm": 0.6983030438423157, |
|
"learning_rate": 1.8294772062866138e-05, |
|
"loss": 0.8553, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.05432700679087585, |
|
"grad_norm": 0.5816463232040405, |
|
"learning_rate": 1.8105744121789225e-05, |
|
"loss": 0.7053, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.0543966567995821, |
|
"grad_norm": 0.8149849772453308, |
|
"learning_rate": 1.791760055684023e-05, |
|
"loss": 0.7378, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.05446630680828835, |
|
"grad_norm": 0.626234233379364, |
|
"learning_rate": 1.7730343399781668e-05, |
|
"loss": 0.8566, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.0545359568169946, |
|
"grad_norm": 0.7223556637763977, |
|
"learning_rate": 1.754397467280372e-05, |
|
"loss": 0.7798, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.054605606825700856, |
|
"grad_norm": 0.6546375155448914, |
|
"learning_rate": 1.735849638850242e-05, |
|
"loss": 1.0634, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.054675256834407104, |
|
"grad_norm": 0.6382943987846375, |
|
"learning_rate": 1.7173910549857854e-05, |
|
"loss": 0.7336, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.05474490684311335, |
|
"grad_norm": 0.592207133769989, |
|
"learning_rate": 1.699021915021266e-05, |
|
"loss": 0.5601, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.05481455685181961, |
|
"grad_norm": 0.6741936206817627, |
|
"learning_rate": 1.6807424173250354e-05, |
|
"loss": 0.9638, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.05488420686052586, |
|
"grad_norm": 0.5983725190162659, |
|
"learning_rate": 1.6625527592974077e-05, |
|
"loss": 0.7403, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.054953856869232105, |
|
"grad_norm": 0.5087631940841675, |
|
"learning_rate": 1.6444531373685078e-05, |
|
"loss": 0.9725, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.05502350687793836, |
|
"grad_norm": 0.7693138122558594, |
|
"learning_rate": 1.6264437469961703e-05, |
|
"loss": 0.6232, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.05509315688664461, |
|
"grad_norm": 0.9830653071403503, |
|
"learning_rate": 1.6085247826638093e-05, |
|
"loss": 0.7752, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.055162806895350865, |
|
"grad_norm": 0.6889302134513855, |
|
"learning_rate": 1.5906964378783373e-05, |
|
"loss": 0.6974, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.05523245690405711, |
|
"grad_norm": 0.6805455088615417, |
|
"learning_rate": 1.5729589051680647e-05, |
|
"loss": 0.9143, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.05530210691276336, |
|
"grad_norm": 0.6505549550056458, |
|
"learning_rate": 1.5553123760806143e-05, |
|
"loss": 0.6784, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.05537175692146962, |
|
"grad_norm": 0.6062676310539246, |
|
"learning_rate": 1.5377570411808718e-05, |
|
"loss": 0.8088, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.055441406930175866, |
|
"grad_norm": 0.5329009890556335, |
|
"learning_rate": 1.5202930900489054e-05, |
|
"loss": 0.4477, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.055511056938882114, |
|
"grad_norm": 0.6530266404151917, |
|
"learning_rate": 1.502920711277943e-05, |
|
"loss": 0.7462, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.05558070694758837, |
|
"grad_norm": 0.6333693861961365, |
|
"learning_rate": 1.4856400924723079e-05, |
|
"loss": 1.1035, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.05565035695629462, |
|
"grad_norm": 0.7612791061401367, |
|
"learning_rate": 1.4684514202454225e-05, |
|
"loss": 0.9053, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.05572000696500087, |
|
"grad_norm": 0.6711084842681885, |
|
"learning_rate": 1.4513548802177634e-05, |
|
"loss": 1.0815, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.05572000696500087, |
|
"eval_loss": 0.7279470562934875, |
|
"eval_runtime": 700.3911, |
|
"eval_samples_per_second": 7.139, |
|
"eval_steps_per_second": 1.785, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.05578965697370712, |
|
"grad_norm": 0.8243626356124878, |
|
"learning_rate": 1.4343506570148846e-05, |
|
"loss": 0.9067, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.05585930698241337, |
|
"grad_norm": 0.740206241607666, |
|
"learning_rate": 1.4174389342653971e-05, |
|
"loss": 1.0956, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.055928956991119626, |
|
"grad_norm": 0.6383155584335327, |
|
"learning_rate": 1.4006198945990168e-05, |
|
"loss": 0.9274, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.055998606999825874, |
|
"grad_norm": 0.7425148487091064, |
|
"learning_rate": 1.3838937196445579e-05, |
|
"loss": 1.083, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.05606825700853212, |
|
"grad_norm": 0.6034273505210876, |
|
"learning_rate": 1.367260590028e-05, |
|
"loss": 0.7125, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.05613790701723838, |
|
"grad_norm": 0.7047588229179382, |
|
"learning_rate": 1.3507206853705178e-05, |
|
"loss": 0.7749, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.05620755702594463, |
|
"grad_norm": 0.7387014627456665, |
|
"learning_rate": 1.334274184286558e-05, |
|
"loss": 0.7397, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.05627720703465088, |
|
"grad_norm": 0.6060226559638977, |
|
"learning_rate": 1.3179212643818929e-05, |
|
"loss": 0.5144, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.05634685704335713, |
|
"grad_norm": 0.7422417402267456, |
|
"learning_rate": 1.3016621022517206e-05, |
|
"loss": 0.7739, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.05641650705206338, |
|
"grad_norm": 0.6336711645126343, |
|
"learning_rate": 1.2854968734787398e-05, |
|
"loss": 0.471, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.056486157060769634, |
|
"grad_norm": 0.667668879032135, |
|
"learning_rate": 1.2694257526312725e-05, |
|
"loss": 0.4143, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.05655580706947588, |
|
"grad_norm": 0.6936927437782288, |
|
"learning_rate": 1.2534489132613603e-05, |
|
"loss": 0.8842, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.05662545707818213, |
|
"grad_norm": 0.6019664406776428, |
|
"learning_rate": 1.2375665279029048e-05, |
|
"loss": 0.7445, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.05669510708688839, |
|
"grad_norm": 0.7595625519752502, |
|
"learning_rate": 1.221778768069799e-05, |
|
"loss": 0.8676, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.056764757095594635, |
|
"grad_norm": 0.593315839767456, |
|
"learning_rate": 1.206085804254069e-05, |
|
"loss": 0.7546, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.05683440710430089, |
|
"grad_norm": 0.7907949090003967, |
|
"learning_rate": 1.1904878059240442e-05, |
|
"loss": 1.0131, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.05690405711300714, |
|
"grad_norm": 0.6472040414810181, |
|
"learning_rate": 1.174984941522519e-05, |
|
"loss": 0.6795, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.05697370712171339, |
|
"grad_norm": 0.6748494505882263, |
|
"learning_rate": 1.1595773784649389e-05, |
|
"loss": 0.9777, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.05704335713041964, |
|
"grad_norm": 0.7594382166862488, |
|
"learning_rate": 1.1442652831375855e-05, |
|
"loss": 0.8305, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.05711300713912589, |
|
"grad_norm": 0.5605437159538269, |
|
"learning_rate": 1.1290488208957895e-05, |
|
"loss": 0.9774, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.05718265714783214, |
|
"grad_norm": 0.7108663320541382, |
|
"learning_rate": 1.1139281560621362e-05, |
|
"loss": 1.1447, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.057252307156538396, |
|
"grad_norm": 0.7549561858177185, |
|
"learning_rate": 1.0989034519246954e-05, |
|
"loss": 1.0838, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.057321957165244644, |
|
"grad_norm": 0.5975289940834045, |
|
"learning_rate": 1.0839748707352603e-05, |
|
"loss": 1.0126, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.0573916071739509, |
|
"grad_norm": 0.6680442094802856, |
|
"learning_rate": 1.06914257370759e-05, |
|
"loss": 0.5809, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.05746125718265715, |
|
"grad_norm": 0.7288407683372498, |
|
"learning_rate": 1.0544067210156671e-05, |
|
"loss": 0.9369, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.0575309071913634, |
|
"grad_norm": 0.7064124345779419, |
|
"learning_rate": 1.0397674717919802e-05, |
|
"loss": 0.8142, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.05760055720006965, |
|
"grad_norm": 0.6422365307807922, |
|
"learning_rate": 1.0252249841257877e-05, |
|
"loss": 0.5993, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.0576702072087759, |
|
"grad_norm": 0.6080381870269775, |
|
"learning_rate": 1.0107794150614281e-05, |
|
"loss": 0.6939, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.05773985721748215, |
|
"grad_norm": 0.6256659030914307, |
|
"learning_rate": 9.964309205966083e-06, |
|
"loss": 0.4506, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.057809507226188404, |
|
"grad_norm": 0.6198416352272034, |
|
"learning_rate": 9.821796556807339e-06, |
|
"loss": 0.6324, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.05787915723489465, |
|
"grad_norm": 0.6347202658653259, |
|
"learning_rate": 9.680257742132215e-06, |
|
"loss": 0.6047, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.05794880724360091, |
|
"grad_norm": 0.60918128490448, |
|
"learning_rate": 9.539694290418488e-06, |
|
"loss": 0.9085, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.05801845725230716, |
|
"grad_norm": 0.6706361174583435, |
|
"learning_rate": 9.400107719610995e-06, |
|
"loss": 0.9078, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.058088107261013405, |
|
"grad_norm": 0.7337279915809631, |
|
"learning_rate": 9.261499537105177e-06, |
|
"loss": 1.0197, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.05815775726971966, |
|
"grad_norm": 0.5747254490852356, |
|
"learning_rate": 9.12387123973093e-06, |
|
"loss": 0.8288, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.05822740727842591, |
|
"grad_norm": 0.6484262347221375, |
|
"learning_rate": 8.98722431373631e-06, |
|
"loss": 1.1276, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.05829705728713216, |
|
"grad_norm": 0.6793870329856873, |
|
"learning_rate": 8.851560234771594e-06, |
|
"loss": 0.5941, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.05836670729583841, |
|
"grad_norm": 0.6910689473152161, |
|
"learning_rate": 8.716880467873234e-06, |
|
"loss": 0.9097, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.05843635730454466, |
|
"grad_norm": 0.7062430381774902, |
|
"learning_rate": 8.583186467448167e-06, |
|
"loss": 0.9619, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.05850600731325092, |
|
"grad_norm": 0.8270265460014343, |
|
"learning_rate": 8.45047967725796e-06, |
|
"loss": 1.0196, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.058575657321957166, |
|
"grad_norm": 0.6949748992919922, |
|
"learning_rate": 8.318761530403374e-06, |
|
"loss": 0.5329, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.058645307330663414, |
|
"grad_norm": 0.7285637855529785, |
|
"learning_rate": 8.188033449308719e-06, |
|
"loss": 0.6849, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.05871495733936967, |
|
"grad_norm": 0.5861655473709106, |
|
"learning_rate": 8.058296845706715e-06, |
|
"loss": 0.8638, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.05878460734807592, |
|
"grad_norm": 0.7448881268501282, |
|
"learning_rate": 7.929553120622968e-06, |
|
"loss": 0.8458, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.058854257356782166, |
|
"grad_norm": 0.5610641241073608, |
|
"learning_rate": 7.801803664361095e-06, |
|
"loss": 0.4706, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.05892390736548842, |
|
"grad_norm": 0.5610293745994568, |
|
"learning_rate": 7.675049856487549e-06, |
|
"loss": 0.5503, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.05899355737419467, |
|
"grad_norm": 0.6175963282585144, |
|
"learning_rate": 7.5492930658168096e-06, |
|
"loss": 0.6195, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.059063207382900926, |
|
"grad_norm": 0.6749705672264099, |
|
"learning_rate": 7.42453465039652e-06, |
|
"loss": 0.7353, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.059132857391607174, |
|
"grad_norm": 0.6812541484832764, |
|
"learning_rate": 7.300775957492923e-06, |
|
"loss": 0.6882, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.05920250740031342, |
|
"grad_norm": 0.6131837368011475, |
|
"learning_rate": 7.178018323576208e-06, |
|
"loss": 0.9945, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.05927215740901968, |
|
"grad_norm": 0.6159570217132568, |
|
"learning_rate": 7.056263074306191e-06, |
|
"loss": 0.7943, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.05934180741772593, |
|
"grad_norm": 0.7175585627555847, |
|
"learning_rate": 6.935511524517835e-06, |
|
"loss": 0.8498, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.059411457426432175, |
|
"grad_norm": 0.7083918452262878, |
|
"learning_rate": 6.815764978207284e-06, |
|
"loss": 0.9473, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.05948110743513843, |
|
"grad_norm": 0.7349149584770203, |
|
"learning_rate": 6.6970247285175315e-06, |
|
"loss": 0.9025, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.05955075744384468, |
|
"grad_norm": 0.6739192008972168, |
|
"learning_rate": 6.579292057724639e-06, |
|
"loss": 0.8435, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.059620407452550934, |
|
"grad_norm": 0.6588095426559448, |
|
"learning_rate": 6.4625682372237874e-06, |
|
"loss": 0.6966, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.05969005746125718, |
|
"grad_norm": 0.5185966491699219, |
|
"learning_rate": 6.346854527515622e-06, |
|
"loss": 0.6977, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.05975970746996343, |
|
"grad_norm": 0.5705149173736572, |
|
"learning_rate": 6.23215217819253e-06, |
|
"loss": 0.6574, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.05982935747866969, |
|
"grad_norm": 0.5465989112854004, |
|
"learning_rate": 6.11846242792532e-06, |
|
"loss": 0.5492, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.059899007487375935, |
|
"grad_norm": 0.7820805311203003, |
|
"learning_rate": 6.005786504449651e-06, |
|
"loss": 0.8664, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.059968657496082184, |
|
"grad_norm": 0.7436554431915283, |
|
"learning_rate": 5.894125624552915e-06, |
|
"loss": 0.9035, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.06003830750478844, |
|
"grad_norm": 0.7402638792991638, |
|
"learning_rate": 5.7834809940610195e-06, |
|
"loss": 0.7703, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.06010795751349469, |
|
"grad_norm": 0.6208961009979248, |
|
"learning_rate": 5.673853807825424e-06, |
|
"loss": 0.7226, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.06017760752220094, |
|
"grad_norm": 0.5884114503860474, |
|
"learning_rate": 5.565245249710194e-06, |
|
"loss": 1.0493, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.06024725753090719, |
|
"grad_norm": 0.7064511179924011, |
|
"learning_rate": 5.457656492579211e-06, |
|
"loss": 1.0538, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.06031690753961344, |
|
"grad_norm": 0.714733362197876, |
|
"learning_rate": 5.351088698283558e-06, |
|
"loss": 0.7942, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.060386557548319696, |
|
"grad_norm": 0.6394374966621399, |
|
"learning_rate": 5.2455430176489014e-06, |
|
"loss": 0.7437, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.060456207557025944, |
|
"grad_norm": 0.6636267900466919, |
|
"learning_rate": 5.1410205904631415e-06, |
|
"loss": 0.8204, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.06052585756573219, |
|
"grad_norm": 0.6036087274551392, |
|
"learning_rate": 5.037522545464024e-06, |
|
"loss": 0.7066, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.06059550757443845, |
|
"grad_norm": 0.6227147579193115, |
|
"learning_rate": 4.9350500003270465e-06, |
|
"loss": 0.7101, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.060665157583144697, |
|
"grad_norm": 0.5791090130805969, |
|
"learning_rate": 4.833604061653252e-06, |
|
"loss": 0.6439, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.06073480759185095, |
|
"grad_norm": 0.5661488771438599, |
|
"learning_rate": 4.73318582495742e-06, |
|
"loss": 0.5134, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.0608044576005572, |
|
"grad_norm": 0.7721818089485168, |
|
"learning_rate": 4.633796374656174e-06, |
|
"loss": 0.8566, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.06087410760926345, |
|
"grad_norm": 0.7348571419715881, |
|
"learning_rate": 4.535436784056269e-06, |
|
"loss": 0.6653, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.060943757617969704, |
|
"grad_norm": 0.6881682872772217, |
|
"learning_rate": 4.438108115342965e-06, |
|
"loss": 0.7876, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.06101340762667595, |
|
"grad_norm": 0.6156147718429565, |
|
"learning_rate": 4.3418114195686536e-06, |
|
"loss": 0.8429, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.0610830576353822, |
|
"grad_norm": 0.6420087218284607, |
|
"learning_rate": 4.246547736641382e-06, |
|
"loss": 0.7274, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.06115270764408846, |
|
"grad_norm": 0.5134680271148682, |
|
"learning_rate": 4.152318095313778e-06, |
|
"loss": 0.5185, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.061222357652794705, |
|
"grad_norm": 0.6913058757781982, |
|
"learning_rate": 4.05912351317177e-06, |
|
"loss": 0.9036, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.06129200766150096, |
|
"grad_norm": 0.5641781091690063, |
|
"learning_rate": 3.966964996623735e-06, |
|
"loss": 0.8567, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.06136165767020721, |
|
"grad_norm": 0.5682424306869507, |
|
"learning_rate": 3.875843540889546e-06, |
|
"loss": 0.7562, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.06143130767891346, |
|
"grad_norm": 0.5852996110916138, |
|
"learning_rate": 3.785760129989868e-06, |
|
"loss": 0.4581, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.06150095768761971, |
|
"grad_norm": 0.6625421047210693, |
|
"learning_rate": 3.6967157367355567e-06, |
|
"loss": 1.0613, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.06157060769632596, |
|
"grad_norm": 0.7365720868110657, |
|
"learning_rate": 3.6087113227170287e-06, |
|
"loss": 0.8548, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.06164025770503221, |
|
"grad_norm": 0.596820592880249, |
|
"learning_rate": 3.5217478382940426e-06, |
|
"loss": 0.7301, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.061709907713738466, |
|
"grad_norm": 0.7230522632598877, |
|
"learning_rate": 3.4358262225853254e-06, |
|
"loss": 1.0264, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.061779557722444714, |
|
"grad_norm": 0.550679087638855, |
|
"learning_rate": 3.3509474034584596e-06, |
|
"loss": 0.6914, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.06184920773115097, |
|
"grad_norm": 0.6080251932144165, |
|
"learning_rate": 3.267112297519881e-06, |
|
"loss": 0.8706, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.06191885773985722, |
|
"grad_norm": 0.6070705056190491, |
|
"learning_rate": 3.184321810104962e-06, |
|
"loss": 1.0111, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.061988507748563466, |
|
"grad_norm": 0.6949368715286255, |
|
"learning_rate": 3.102576835268212e-06, |
|
"loss": 0.9892, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.06205815775726972, |
|
"grad_norm": 0.7588335275650024, |
|
"learning_rate": 3.0218782557737136e-06, |
|
"loss": 0.8309, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.06212780776597597, |
|
"grad_norm": 0.5684018135070801, |
|
"learning_rate": 2.9422269430854245e-06, |
|
"loss": 0.8553, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.06219745777468222, |
|
"grad_norm": 0.554639995098114, |
|
"learning_rate": 2.863623757357992e-06, |
|
"loss": 0.7984, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.062267107783388474, |
|
"grad_norm": 0.653669536113739, |
|
"learning_rate": 2.7860695474272392e-06, |
|
"loss": 0.8296, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.06233675779209472, |
|
"grad_norm": 0.610150158405304, |
|
"learning_rate": 2.709565150801152e-06, |
|
"loss": 0.5203, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.06240640780080098, |
|
"grad_norm": 0.6130475401878357, |
|
"learning_rate": 2.634111393650751e-06, |
|
"loss": 0.8298, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.06247605780950723, |
|
"grad_norm": 0.5449431538581848, |
|
"learning_rate": 2.559709090801221e-06, |
|
"loss": 0.7497, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.06254570781821348, |
|
"grad_norm": 0.6247503757476807, |
|
"learning_rate": 2.4863590457230743e-06, |
|
"loss": 1.1263, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.06261535782691972, |
|
"grad_norm": 0.7267642617225647, |
|
"learning_rate": 2.4140620505235135e-06, |
|
"loss": 0.7873, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.06268500783562599, |
|
"grad_norm": 0.7534024119377136, |
|
"learning_rate": 2.342818885937825e-06, |
|
"loss": 1.0745, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.06268500783562599, |
|
"eval_loss": 0.723136305809021, |
|
"eval_runtime": 700.403, |
|
"eval_samples_per_second": 7.139, |
|
"eval_steps_per_second": 1.785, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.06275465784433223, |
|
"grad_norm": 0.7109830379486084, |
|
"learning_rate": 2.272630321321023e-06, |
|
"loss": 0.704, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.06282430785303848, |
|
"grad_norm": 0.4886980950832367, |
|
"learning_rate": 2.20349711463943e-06, |
|
"loss": 0.4915, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.06289395786174473, |
|
"grad_norm": 0.6534592509269714, |
|
"learning_rate": 2.135420012462619e-06, |
|
"loss": 0.6073, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.06296360787045098, |
|
"grad_norm": 0.5471417903900146, |
|
"learning_rate": 2.0683997499552632e-06, |
|
"loss": 0.6319, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.06303325787915723, |
|
"grad_norm": 0.765691876411438, |
|
"learning_rate": 2.0024370508692104e-06, |
|
"loss": 0.9544, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.06310290788786349, |
|
"grad_norm": 0.6834742426872253, |
|
"learning_rate": 1.9375326275357208e-06, |
|
"loss": 0.8162, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.06317255789656974, |
|
"grad_norm": 0.7233893871307373, |
|
"learning_rate": 1.8736871808576861e-06, |
|
"loss": 1.0311, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.06324220790527599, |
|
"grad_norm": 0.6150738000869751, |
|
"learning_rate": 1.8109014003021452e-06, |
|
"loss": 0.9241, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.06331185791398224, |
|
"grad_norm": 0.7470687031745911, |
|
"learning_rate": 1.7491759638927686e-06, |
|
"loss": 1.1686, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.06338150792268848, |
|
"grad_norm": 0.7098023295402527, |
|
"learning_rate": 1.6885115382026085e-06, |
|
"loss": 1.1531, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.06345115793139475, |
|
"grad_norm": 0.6397354006767273, |
|
"learning_rate": 1.628908778346827e-06, |
|
"loss": 0.9153, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.063520807940101, |
|
"grad_norm": 0.6609793305397034, |
|
"learning_rate": 1.5703683279756797e-06, |
|
"loss": 0.641, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.06359045794880724, |
|
"grad_norm": 0.7062059640884399, |
|
"learning_rate": 1.5128908192675318e-06, |
|
"loss": 0.7182, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.06366010795751349, |
|
"grad_norm": 0.6093196272850037, |
|
"learning_rate": 1.4564768729220412e-06, |
|
"loss": 0.6793, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.06372975796621974, |
|
"grad_norm": 0.6978054642677307, |
|
"learning_rate": 1.401127098153443e-06, |
|
"loss": 0.7592, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.063799407974926, |
|
"grad_norm": 0.5635403394699097, |
|
"learning_rate": 1.3468420926840197e-06, |
|
"loss": 0.869, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.06386905798363225, |
|
"grad_norm": 0.6903446912765503, |
|
"learning_rate": 1.2936224427375521e-06, |
|
"loss": 0.7401, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.0639387079923385, |
|
"grad_norm": 0.6210869550704956, |
|
"learning_rate": 1.2414687230331123e-06, |
|
"loss": 0.5908, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.06400835800104475, |
|
"grad_norm": 0.6113409399986267, |
|
"learning_rate": 1.1903814967787253e-06, |
|
"loss": 0.5493, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.064078008009751, |
|
"grad_norm": 0.9400643706321716, |
|
"learning_rate": 1.1403613156654059e-06, |
|
"loss": 1.0418, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.06414765801845725, |
|
"grad_norm": 0.683574378490448, |
|
"learning_rate": 1.091408719861109e-06, |
|
"loss": 0.9345, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.06421730802716351, |
|
"grad_norm": 0.7595987915992737, |
|
"learning_rate": 1.0435242380049559e-06, |
|
"loss": 0.8716, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.06428695803586976, |
|
"grad_norm": 0.6851724982261658, |
|
"learning_rate": 9.967083872015282e-07, |
|
"loss": 0.5158, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.064356608044576, |
|
"grad_norm": 0.6724770069122314, |
|
"learning_rate": 9.509616730151827e-07, |
|
"loss": 0.5133, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.06442625805328225, |
|
"grad_norm": 0.6596947312355042, |
|
"learning_rate": 9.062845894647676e-07, |
|
"loss": 0.6722, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.0644959080619885, |
|
"grad_norm": 0.5619158148765564, |
|
"learning_rate": 8.626776190181041e-07, |
|
"loss": 0.9499, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.06456555807069476, |
|
"grad_norm": 0.7573150992393494, |
|
"learning_rate": 8.20141232586924e-07, |
|
"loss": 0.7521, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.06463520807940101, |
|
"grad_norm": 0.6126770377159119, |
|
"learning_rate": 7.786758895216629e-07, |
|
"loss": 0.6616, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.06470485808810726, |
|
"grad_norm": 0.7481774687767029, |
|
"learning_rate": 7.382820376066302e-07, |
|
"loss": 0.8779, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.06477450809681351, |
|
"grad_norm": 0.7029200792312622, |
|
"learning_rate": 6.98960113055025e-07, |
|
"loss": 0.7685, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.06484415810551976, |
|
"grad_norm": 0.6455416679382324, |
|
"learning_rate": 6.607105405043612e-07, |
|
"loss": 1.0069, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.06491380811422602, |
|
"grad_norm": 0.7011751532554626, |
|
"learning_rate": 6.23533733011783e-07, |
|
"loss": 0.6548, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.06498345812293227, |
|
"grad_norm": 0.7533524036407471, |
|
"learning_rate": 5.8743009204969e-07, |
|
"loss": 0.7463, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.06505310813163852, |
|
"grad_norm": 0.5586950182914734, |
|
"learning_rate": 5.52400007501297e-07, |
|
"loss": 0.6125, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.06512275814034477, |
|
"grad_norm": 0.6539096832275391, |
|
"learning_rate": 5.184438576565253e-07, |
|
"loss": 0.8559, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.06519240814905101, |
|
"grad_norm": 0.7584323883056641, |
|
"learning_rate": 4.855620092078627e-07, |
|
"loss": 1.1142, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.06526205815775726, |
|
"grad_norm": 0.6609397530555725, |
|
"learning_rate": 4.537548172464101e-07, |
|
"loss": 0.8978, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.06533170816646353, |
|
"grad_norm": 0.6159988641738892, |
|
"learning_rate": 4.230226252580516e-07, |
|
"loss": 0.6993, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.06540135817516977, |
|
"grad_norm": 0.6153664588928223, |
|
"learning_rate": 3.9336576511976863e-07, |
|
"loss": 0.4574, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.06547100818387602, |
|
"grad_norm": 0.6489300727844238, |
|
"learning_rate": 3.6478455709598734e-07, |
|
"loss": 0.7568, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.06554065819258227, |
|
"grad_norm": 0.6248874664306641, |
|
"learning_rate": 3.372793098352256e-07, |
|
"loss": 0.6879, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.06561030820128852, |
|
"grad_norm": 0.5801978707313538, |
|
"learning_rate": 3.108503203666402e-07, |
|
"loss": 0.7331, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.06567995820999478, |
|
"grad_norm": 0.605501115322113, |
|
"learning_rate": 2.8549787409691833e-07, |
|
"loss": 0.6179, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.06574960821870103, |
|
"grad_norm": 0.5972608327865601, |
|
"learning_rate": 2.6122224480715775e-07, |
|
"loss": 0.6514, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.06581925822740728, |
|
"grad_norm": 0.7556172609329224, |
|
"learning_rate": 2.380236946498693e-07, |
|
"loss": 0.8719, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.06588890823611353, |
|
"grad_norm": 0.6486802101135254, |
|
"learning_rate": 2.1590247414624566e-07, |
|
"loss": 0.5719, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.06595855824481978, |
|
"grad_norm": 0.638469398021698, |
|
"learning_rate": 1.948588221833303e-07, |
|
"loss": 0.6393, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.06602820825352604, |
|
"grad_norm": 0.7082604765892029, |
|
"learning_rate": 1.7489296601156392e-07, |
|
"loss": 1.0018, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.06609785826223229, |
|
"grad_norm": 0.6530460119247437, |
|
"learning_rate": 1.5600512124221978e-07, |
|
"loss": 0.7418, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.06616750827093854, |
|
"grad_norm": 0.653685986995697, |
|
"learning_rate": 1.3819549184516112e-07, |
|
"loss": 0.9309, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.06623715827964478, |
|
"grad_norm": 0.5263675451278687, |
|
"learning_rate": 1.2146427014657625e-07, |
|
"loss": 0.7189, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.06630680828835103, |
|
"grad_norm": 0.6783672571182251, |
|
"learning_rate": 1.0581163682695793e-07, |
|
"loss": 0.5871, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.06637645829705728, |
|
"grad_norm": 0.4727168083190918, |
|
"learning_rate": 9.123776091908287e-08, |
|
"loss": 0.3484, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.06644610830576354, |
|
"grad_norm": 0.5385925769805908, |
|
"learning_rate": 7.774279980626853e-08, |
|
"loss": 0.5899, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.06651575831446979, |
|
"grad_norm": 0.6668855547904968, |
|
"learning_rate": 6.532689922059687e-08, |
|
"loss": 1.0131, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.06658540832317604, |
|
"grad_norm": 0.6244344115257263, |
|
"learning_rate": 5.3990193241393313e-08, |
|
"loss": 0.7458, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.06665505833188229, |
|
"grad_norm": 0.6702743768692017, |
|
"learning_rate": 4.373280429375015e-08, |
|
"loss": 0.8924, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.06672470834058854, |
|
"grad_norm": 0.6103947758674622, |
|
"learning_rate": 3.4554843147216464e-08, |
|
"loss": 1.0036, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.0667943583492948, |
|
"grad_norm": 0.622797966003418, |
|
"learning_rate": 2.6456408914599108e-08, |
|
"loss": 0.8497, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.06686400835800105, |
|
"grad_norm": 0.7076674699783325, |
|
"learning_rate": 1.9437589050907977e-08, |
|
"loss": 0.5629, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.0669336583667073, |
|
"grad_norm": 0.7682867050170898, |
|
"learning_rate": 1.3498459352367931e-08, |
|
"loss": 0.7463, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.06700330837541355, |
|
"grad_norm": 0.7987236380577087, |
|
"learning_rate": 8.639083955663818e-09, |
|
"loss": 1.1664, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.0670729583841198, |
|
"grad_norm": 0.7837391495704651, |
|
"learning_rate": 4.859515337174436e-09, |
|
"loss": 0.6505, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.06714260839282606, |
|
"grad_norm": 0.6566223502159119, |
|
"learning_rate": 2.1597943124729292e-09, |
|
"loss": 0.8524, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.0672122584015323, |
|
"grad_norm": 0.6998875737190247, |
|
"learning_rate": 5.399500358493903e-10, |
|
"loss": 0.8817, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.06728190841023855, |
|
"grad_norm": 0.6083624362945557, |
|
"learning_rate": 0.0, |
|
"loss": 0.8767, |
|
"step": 966 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 966, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 2, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.628352553502376e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|