diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3967 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999881495526456, + "eval_steps": 500, + "global_step": 56256, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017775671031581442, + "grad_norm": 1.7014890909194946, + "learning_rate": 2.5e-06, + "loss": 2.2156, + "step": 100 + }, + { + "epoch": 0.0035551342063162884, + "grad_norm": 1.0592787265777588, + "learning_rate": 5e-06, + "loss": 2.1147, + "step": 200 + }, + { + "epoch": 0.005332701309474433, + "grad_norm": 0.7293457388877869, + "learning_rate": 7.5e-06, + "loss": 1.9982, + "step": 300 + }, + { + "epoch": 0.007110268412632577, + "grad_norm": 0.5222712755203247, + "learning_rate": 1e-05, + "loss": 1.9311, + "step": 400 + }, + { + "epoch": 0.008887835515790721, + "grad_norm": 0.37243402004241943, + "learning_rate": 1.25e-05, + "loss": 1.8657, + "step": 500 + }, + { + "epoch": 0.010665402618948865, + "grad_norm": 0.47352147102355957, + "learning_rate": 1.5e-05, + "loss": 1.8218, + "step": 600 + }, + { + "epoch": 0.01244296972210701, + "grad_norm": 0.2529783248901367, + "learning_rate": 1.75e-05, + "loss": 1.7801, + "step": 700 + }, + { + "epoch": 0.014220536825265154, + "grad_norm": 0.2332669496536255, + "learning_rate": 2e-05, + "loss": 1.7856, + "step": 800 + }, + { + "epoch": 0.0159981039284233, + "grad_norm": 0.3814944326877594, + "learning_rate": 2.25e-05, + "loss": 1.7645, + "step": 900 + }, + { + "epoch": 0.017775671031581442, + "grad_norm": 0.17010626196861267, + "learning_rate": 2.5e-05, + "loss": 1.7822, + "step": 1000 + }, + { + "epoch": 0.019553238134739588, + "grad_norm": 0.1512402594089508, + "learning_rate": 2.7500000000000004e-05, + "loss": 1.763, + "step": 1100 + }, + { + "epoch": 0.02133080523789773, + "grad_norm": 0.18556953966617584, + "learning_rate": 3e-05, + "loss": 1.7465, + "step": 1200 + }, + { + "epoch": 0.023108372341055877, + "grad_norm": 0.16216696798801422, + "learning_rate": 3.2500000000000004e-05, + "loss": 1.752, + "step": 1300 + }, + { + "epoch": 0.02488593944421402, + "grad_norm": 0.21479512751102448, + "learning_rate": 3.5e-05, + "loss": 1.7313, + "step": 1400 + }, + { + "epoch": 0.026663506547372165, + "grad_norm": 0.1630880981683731, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.7159, + "step": 1500 + }, + { + "epoch": 0.028441073650530307, + "grad_norm": 0.152820885181427, + "learning_rate": 4e-05, + "loss": 1.7233, + "step": 1600 + }, + { + "epoch": 0.030218640753688453, + "grad_norm": 0.15548893809318542, + "learning_rate": 4.25e-05, + "loss": 1.7333, + "step": 1700 + }, + { + "epoch": 0.0319962078568466, + "grad_norm": 0.437898188829422, + "learning_rate": 4.5e-05, + "loss": 1.7188, + "step": 1800 + }, + { + "epoch": 0.03377377496000474, + "grad_norm": 0.18985818326473236, + "learning_rate": 4.75e-05, + "loss": 1.7269, + "step": 1900 + }, + { + "epoch": 0.035551342063162884, + "grad_norm": 0.21630148589611053, + "learning_rate": 5e-05, + "loss": 1.7373, + "step": 2000 + }, + { + "epoch": 0.03732890916632103, + "grad_norm": 0.14671586453914642, + "learning_rate": 4.9999580904497634e-05, + "loss": 1.7236, + "step": 2100 + }, + { + "epoch": 0.039106476269479176, + "grad_norm": 0.209241583943367, + "learning_rate": 4.99983236320418e-05, + "loss": 1.7082, + "step": 2200 + }, + { + "epoch": 0.04088404337263732, + "grad_norm": 0.20861445367336273, + "learning_rate": 4.9996228224785886e-05, + "loss": 1.7319, + "step": 2300 + }, + { + "epoch": 0.04266161047579546, + "grad_norm": 0.1577518880367279, + "learning_rate": 4.999329475298396e-05, + "loss": 1.7084, + "step": 2400 + }, + { + "epoch": 0.044439177578953604, + "grad_norm": 0.14321212470531464, + "learning_rate": 4.998952331498839e-05, + "loss": 1.715, + "step": 2500 + }, + { + "epoch": 0.04621674468211175, + "grad_norm": 0.16167956590652466, + "learning_rate": 4.99849140372466e-05, + "loss": 1.7072, + "step": 2600 + }, + { + "epoch": 0.047994311785269896, + "grad_norm": 0.14405596256256104, + "learning_rate": 4.9979467074296805e-05, + "loss": 1.7354, + "step": 2700 + }, + { + "epoch": 0.04977187888842804, + "grad_norm": 0.18818779289722443, + "learning_rate": 4.9973182608762805e-05, + "loss": 1.7246, + "step": 2800 + }, + { + "epoch": 0.05154944599158618, + "grad_norm": 0.14068296551704407, + "learning_rate": 4.996606085134791e-05, + "loss": 1.7138, + "step": 2900 + }, + { + "epoch": 0.05332701309474433, + "grad_norm": 0.1438419222831726, + "learning_rate": 4.995810204082784e-05, + "loss": 1.7085, + "step": 3000 + }, + { + "epoch": 0.05510458019790247, + "grad_norm": 0.1487807333469391, + "learning_rate": 4.994930644404272e-05, + "loss": 1.703, + "step": 3100 + }, + { + "epoch": 0.056882147301060615, + "grad_norm": 0.1583404541015625, + "learning_rate": 4.993967435588816e-05, + "loss": 1.7036, + "step": 3200 + }, + { + "epoch": 0.05865971440421876, + "grad_norm": 0.25886544585227966, + "learning_rate": 4.992920609930535e-05, + "loss": 1.6993, + "step": 3300 + }, + { + "epoch": 0.06043728150737691, + "grad_norm": 0.18560856580734253, + "learning_rate": 4.991790202527022e-05, + "loss": 1.7111, + "step": 3400 + }, + { + "epoch": 0.06221484861053505, + "grad_norm": 0.14303149282932281, + "learning_rate": 4.990576251278172e-05, + "loss": 1.7104, + "step": 3500 + }, + { + "epoch": 0.0639924157136932, + "grad_norm": 0.1497068852186203, + "learning_rate": 4.9892787968849033e-05, + "loss": 1.7038, + "step": 3600 + }, + { + "epoch": 0.06576998281685134, + "grad_norm": 0.17193421721458435, + "learning_rate": 4.987897882847801e-05, + "loss": 1.6955, + "step": 3700 + }, + { + "epoch": 0.06754754992000948, + "grad_norm": 0.19913499057292938, + "learning_rate": 4.9864335554656526e-05, + "loss": 1.7009, + "step": 3800 + }, + { + "epoch": 0.06932511702316763, + "grad_norm": 0.15236733853816986, + "learning_rate": 4.984885863833901e-05, + "loss": 1.6994, + "step": 3900 + }, + { + "epoch": 0.07110268412632577, + "grad_norm": 0.18027468025684357, + "learning_rate": 4.9832548598429955e-05, + "loss": 1.6974, + "step": 4000 + }, + { + "epoch": 0.07288025122948391, + "grad_norm": 0.19036361575126648, + "learning_rate": 4.981540598176649e-05, + "loss": 1.6957, + "step": 4100 + }, + { + "epoch": 0.07465781833264205, + "grad_norm": 0.18389485776424408, + "learning_rate": 4.979743136310011e-05, + "loss": 1.7133, + "step": 4200 + }, + { + "epoch": 0.0764353854358002, + "grad_norm": 0.16271525621414185, + "learning_rate": 4.977862534507735e-05, + "loss": 1.7093, + "step": 4300 + }, + { + "epoch": 0.07821295253895835, + "grad_norm": 0.15076510608196259, + "learning_rate": 4.975898855821964e-05, + "loss": 1.7007, + "step": 4400 + }, + { + "epoch": 0.0799905196421165, + "grad_norm": 0.6439979672431946, + "learning_rate": 4.9738521660902074e-05, + "loss": 1.7049, + "step": 4500 + }, + { + "epoch": 0.08176808674527464, + "grad_norm": 0.23859179019927979, + "learning_rate": 4.971722533933144e-05, + "loss": 1.7128, + "step": 4600 + }, + { + "epoch": 0.08354565384843278, + "grad_norm": 0.18766574561595917, + "learning_rate": 4.969510030752314e-05, + "loss": 1.6976, + "step": 4700 + }, + { + "epoch": 0.08532322095159092, + "grad_norm": 0.1395421326160431, + "learning_rate": 4.9672147307277285e-05, + "loss": 1.6957, + "step": 4800 + }, + { + "epoch": 0.08710078805474906, + "grad_norm": 0.15366794168949127, + "learning_rate": 4.9648367108153795e-05, + "loss": 1.6966, + "step": 4900 + }, + { + "epoch": 0.08887835515790721, + "grad_norm": 0.14610810577869415, + "learning_rate": 4.9623760507446646e-05, + "loss": 1.6964, + "step": 5000 + }, + { + "epoch": 0.09065592226106535, + "grad_norm": 0.19104835391044617, + "learning_rate": 4.9598328330157084e-05, + "loss": 1.697, + "step": 5100 + }, + { + "epoch": 0.0924334893642235, + "grad_norm": 0.15467491745948792, + "learning_rate": 4.957207142896599e-05, + "loss": 1.7051, + "step": 5200 + }, + { + "epoch": 0.09421105646738165, + "grad_norm": 0.15119831264019012, + "learning_rate": 4.9544990684205324e-05, + "loss": 1.6961, + "step": 5300 + }, + { + "epoch": 0.09598862357053979, + "grad_norm": 0.13215667009353638, + "learning_rate": 4.951708700382853e-05, + "loss": 1.6961, + "step": 5400 + }, + { + "epoch": 0.09776619067369793, + "grad_norm": 0.1464473158121109, + "learning_rate": 4.948836132338017e-05, + "loss": 1.6968, + "step": 5500 + }, + { + "epoch": 0.09954375777685608, + "grad_norm": 0.17705056071281433, + "learning_rate": 4.945881460596453e-05, + "loss": 1.7023, + "step": 5600 + }, + { + "epoch": 0.10132132488001422, + "grad_norm": 0.1780652105808258, + "learning_rate": 4.942844784221331e-05, + "loss": 1.7127, + "step": 5700 + }, + { + "epoch": 0.10309889198317236, + "grad_norm": 0.13343891501426697, + "learning_rate": 4.9397262050252444e-05, + "loss": 1.6882, + "step": 5800 + }, + { + "epoch": 0.1048764590863305, + "grad_norm": 0.14272858202457428, + "learning_rate": 4.9365258275667935e-05, + "loss": 1.7006, + "step": 5900 + }, + { + "epoch": 0.10665402618948866, + "grad_norm": 0.1572321206331253, + "learning_rate": 4.933243759147084e-05, + "loss": 1.6909, + "step": 6000 + }, + { + "epoch": 0.1084315932926468, + "grad_norm": 0.1878873109817505, + "learning_rate": 4.9298801098061234e-05, + "loss": 1.7001, + "step": 6100 + }, + { + "epoch": 0.11020916039580494, + "grad_norm": 0.15412138402462006, + "learning_rate": 4.926434992319137e-05, + "loss": 1.7009, + "step": 6200 + }, + { + "epoch": 0.11198672749896309, + "grad_norm": 0.16579018533229828, + "learning_rate": 4.922908522192785e-05, + "loss": 1.6903, + "step": 6300 + }, + { + "epoch": 0.11376429460212123, + "grad_norm": 0.176075279712677, + "learning_rate": 4.919300817661288e-05, + "loss": 1.6814, + "step": 6400 + }, + { + "epoch": 0.11554186170527937, + "grad_norm": 0.1489766240119934, + "learning_rate": 4.9156119996824646e-05, + "loss": 1.6834, + "step": 6500 + }, + { + "epoch": 0.11731942880843751, + "grad_norm": 0.14244747161865234, + "learning_rate": 4.911842191933679e-05, + "loss": 1.698, + "step": 6600 + }, + { + "epoch": 0.11909699591159566, + "grad_norm": 0.18538010120391846, + "learning_rate": 4.9079915208076874e-05, + "loss": 1.7075, + "step": 6700 + }, + { + "epoch": 0.12087456301475381, + "grad_norm": 0.13722339272499084, + "learning_rate": 4.9040601154084064e-05, + "loss": 1.6904, + "step": 6800 + }, + { + "epoch": 0.12265213011791196, + "grad_norm": 0.14853331446647644, + "learning_rate": 4.900048107546581e-05, + "loss": 1.7006, + "step": 6900 + }, + { + "epoch": 0.1244296972210701, + "grad_norm": 0.1475294679403305, + "learning_rate": 4.895955631735369e-05, + "loss": 1.7084, + "step": 7000 + }, + { + "epoch": 0.12620726432422824, + "grad_norm": 0.16500729322433472, + "learning_rate": 4.8917828251858245e-05, + "loss": 1.6824, + "step": 7100 + }, + { + "epoch": 0.1279848314273864, + "grad_norm": 0.14395256340503693, + "learning_rate": 4.8875727542547924e-05, + "loss": 1.6846, + "step": 7200 + }, + { + "epoch": 0.12976239853054453, + "grad_norm": 0.14854487776756287, + "learning_rate": 4.8832405083980224e-05, + "loss": 1.72, + "step": 7300 + }, + { + "epoch": 0.13153996563370268, + "grad_norm": 0.13804668188095093, + "learning_rate": 4.8788283581110025e-05, + "loss": 1.6994, + "step": 7400 + }, + { + "epoch": 0.1333175327368608, + "grad_norm": 0.19897769391536713, + "learning_rate": 4.874336451322718e-05, + "loss": 1.6748, + "step": 7500 + }, + { + "epoch": 0.13509509984001897, + "grad_norm": 0.18809333443641663, + "learning_rate": 4.869764938636205e-05, + "loss": 1.7039, + "step": 7600 + }, + { + "epoch": 0.1368726669431771, + "grad_norm": 0.15036119520664215, + "learning_rate": 4.865113973323494e-05, + "loss": 1.6873, + "step": 7700 + }, + { + "epoch": 0.13865023404633525, + "grad_norm": 0.24881285429000854, + "learning_rate": 4.8603837113204786e-05, + "loss": 1.7069, + "step": 7800 + }, + { + "epoch": 0.14042780114949338, + "grad_norm": 0.13876497745513916, + "learning_rate": 4.85557431122168e-05, + "loss": 1.6825, + "step": 7900 + }, + { + "epoch": 0.14220536825265154, + "grad_norm": 0.1649981439113617, + "learning_rate": 4.850685934274935e-05, + "loss": 1.6943, + "step": 8000 + }, + { + "epoch": 0.1439829353558097, + "grad_norm": 0.14828725159168243, + "learning_rate": 4.845718744375987e-05, + "loss": 1.6928, + "step": 8100 + }, + { + "epoch": 0.14576050245896782, + "grad_norm": 0.15515898168087006, + "learning_rate": 4.84067290806299e-05, + "loss": 1.6938, + "step": 8200 + }, + { + "epoch": 0.14753806956212598, + "grad_norm": 0.21222877502441406, + "learning_rate": 4.83554859451093e-05, + "loss": 1.6775, + "step": 8300 + }, + { + "epoch": 0.1493156366652841, + "grad_norm": 0.14965397119522095, + "learning_rate": 4.830345975525948e-05, + "loss": 1.6952, + "step": 8400 + }, + { + "epoch": 0.15109320376844226, + "grad_norm": 0.1583070456981659, + "learning_rate": 4.8250652255395806e-05, + "loss": 1.6856, + "step": 8500 + }, + { + "epoch": 0.1528707708716004, + "grad_norm": 0.1827002763748169, + "learning_rate": 4.819706521602914e-05, + "loss": 1.696, + "step": 8600 + }, + { + "epoch": 0.15464833797475855, + "grad_norm": 0.21312415599822998, + "learning_rate": 4.8142700433806456e-05, + "loss": 1.6839, + "step": 8700 + }, + { + "epoch": 0.1564259050779167, + "grad_norm": 0.14075049757957458, + "learning_rate": 4.80875597314506e-05, + "loss": 1.6846, + "step": 8800 + }, + { + "epoch": 0.15820347218107483, + "grad_norm": 0.15312770009040833, + "learning_rate": 4.8031644957699214e-05, + "loss": 1.6856, + "step": 8900 + }, + { + "epoch": 0.159981039284233, + "grad_norm": 0.16638757288455963, + "learning_rate": 4.797495798724271e-05, + "loss": 1.6922, + "step": 9000 + }, + { + "epoch": 0.16175860638739112, + "grad_norm": 0.13447363674640656, + "learning_rate": 4.791750072066143e-05, + "loss": 1.6845, + "step": 9100 + }, + { + "epoch": 0.16353617349054927, + "grad_norm": 0.1486334651708603, + "learning_rate": 4.785927508436194e-05, + "loss": 1.6966, + "step": 9200 + }, + { + "epoch": 0.1653137405937074, + "grad_norm": 0.1405581384897232, + "learning_rate": 4.780028303051243e-05, + "loss": 1.6883, + "step": 9300 + }, + { + "epoch": 0.16709130769686556, + "grad_norm": 0.1692507416009903, + "learning_rate": 4.774052653697725e-05, + "loss": 1.6829, + "step": 9400 + }, + { + "epoch": 0.1688688748000237, + "grad_norm": 0.17827360332012177, + "learning_rate": 4.76800076072506e-05, + "loss": 1.698, + "step": 9500 + }, + { + "epoch": 0.17064644190318184, + "grad_norm": 0.1813431978225708, + "learning_rate": 4.7618728270389405e-05, + "loss": 1.6936, + "step": 9600 + }, + { + "epoch": 0.17242400900634, + "grad_norm": 0.15732981264591217, + "learning_rate": 4.755669058094521e-05, + "loss": 1.6756, + "step": 9700 + }, + { + "epoch": 0.17420157610949813, + "grad_norm": 0.1365622580051422, + "learning_rate": 4.749389661889535e-05, + "loss": 1.6869, + "step": 9800 + }, + { + "epoch": 0.17597914321265629, + "grad_norm": 0.14390863478183746, + "learning_rate": 4.7430348489573175e-05, + "loss": 1.6986, + "step": 9900 + }, + { + "epoch": 0.17775671031581441, + "grad_norm": 0.17032405734062195, + "learning_rate": 4.7366048323597524e-05, + "loss": 1.6997, + "step": 10000 + }, + { + "epoch": 0.17953427741897257, + "grad_norm": 0.15666988492012024, + "learning_rate": 4.73009982768012e-05, + "loss": 1.6908, + "step": 10100 + }, + { + "epoch": 0.1813118445221307, + "grad_norm": 0.12964856624603271, + "learning_rate": 4.723520053015879e-05, + "loss": 1.676, + "step": 10200 + }, + { + "epoch": 0.18308941162528886, + "grad_norm": 0.15315160155296326, + "learning_rate": 4.716865728971346e-05, + "loss": 1.6899, + "step": 10300 + }, + { + "epoch": 0.184866978728447, + "grad_norm": 0.17329467833042145, + "learning_rate": 4.710137078650302e-05, + "loss": 1.6755, + "step": 10400 + }, + { + "epoch": 0.18664454583160514, + "grad_norm": 0.16102010011672974, + "learning_rate": 4.703334327648516e-05, + "loss": 1.6779, + "step": 10500 + }, + { + "epoch": 0.1884221129347633, + "grad_norm": 0.170249804854393, + "learning_rate": 4.6964577040461745e-05, + "loss": 1.7001, + "step": 10600 + }, + { + "epoch": 0.19019968003792143, + "grad_norm": 0.14801470935344696, + "learning_rate": 4.689507438400239e-05, + "loss": 1.6881, + "step": 10700 + }, + { + "epoch": 0.19197724714107958, + "grad_norm": 0.2009027749300003, + "learning_rate": 4.682483763736718e-05, + "loss": 1.6944, + "step": 10800 + }, + { + "epoch": 0.1937548142442377, + "grad_norm": 0.15776540338993073, + "learning_rate": 4.6753869155428454e-05, + "loss": 1.6849, + "step": 10900 + }, + { + "epoch": 0.19553238134739587, + "grad_norm": 0.1666073054075241, + "learning_rate": 4.6682171317591947e-05, + "loss": 1.6986, + "step": 11000 + }, + { + "epoch": 0.197309948450554, + "grad_norm": 0.204326793551445, + "learning_rate": 4.660974652771698e-05, + "loss": 1.6927, + "step": 11100 + }, + { + "epoch": 0.19908751555371215, + "grad_norm": 0.17319276928901672, + "learning_rate": 4.653659721403583e-05, + "loss": 1.6804, + "step": 11200 + }, + { + "epoch": 0.2008650826568703, + "grad_norm": 0.19199158251285553, + "learning_rate": 4.6462725829072386e-05, + "loss": 1.6692, + "step": 11300 + }, + { + "epoch": 0.20264264976002844, + "grad_norm": 0.15492092072963715, + "learning_rate": 4.638813484955985e-05, + "loss": 1.695, + "step": 11400 + }, + { + "epoch": 0.2044202168631866, + "grad_norm": 0.2306402027606964, + "learning_rate": 4.631282677635775e-05, + "loss": 1.7068, + "step": 11500 + }, + { + "epoch": 0.20619778396634472, + "grad_norm": 0.20894396305084229, + "learning_rate": 4.62375678895541e-05, + "loss": 1.7145, + "step": 11600 + }, + { + "epoch": 0.20797535106950288, + "grad_norm": 0.31019458174705505, + "learning_rate": 4.616084033514059e-05, + "loss": 1.688, + "step": 11700 + }, + { + "epoch": 0.209752918172661, + "grad_norm": 0.22205297648906708, + "learning_rate": 4.6083403307686204e-05, + "loss": 1.6989, + "step": 11800 + }, + { + "epoch": 0.21153048527581916, + "grad_norm": 0.15302753448486328, + "learning_rate": 4.600525940347174e-05, + "loss": 1.6929, + "step": 11900 + }, + { + "epoch": 0.21330805237897732, + "grad_norm": 0.1468563824892044, + "learning_rate": 4.5926411242477904e-05, + "loss": 1.6924, + "step": 12000 + }, + { + "epoch": 0.21508561948213545, + "grad_norm": 0.1425103396177292, + "learning_rate": 4.584686146829748e-05, + "loss": 1.6904, + "step": 12100 + }, + { + "epoch": 0.2168631865852936, + "grad_norm": 0.1582684963941574, + "learning_rate": 4.5766612748046654e-05, + "loss": 1.6804, + "step": 12200 + }, + { + "epoch": 0.21864075368845173, + "grad_norm": 0.16768227517604828, + "learning_rate": 4.5685667772275654e-05, + "loss": 1.6796, + "step": 12300 + }, + { + "epoch": 0.2204183207916099, + "grad_norm": 0.1611669808626175, + "learning_rate": 4.56040292548785e-05, + "loss": 1.6749, + "step": 12400 + }, + { + "epoch": 0.22219588789476802, + "grad_norm": 0.13350994884967804, + "learning_rate": 4.5521699933002026e-05, + "loss": 1.7013, + "step": 12500 + }, + { + "epoch": 0.22397345499792617, + "grad_norm": 0.14940309524536133, + "learning_rate": 4.5438682566954124e-05, + "loss": 1.6814, + "step": 12600 + }, + { + "epoch": 0.2257510221010843, + "grad_norm": 0.13618171215057373, + "learning_rate": 4.5354979940111166e-05, + "loss": 1.6852, + "step": 12700 + }, + { + "epoch": 0.22752858920424246, + "grad_norm": 0.13858729600906372, + "learning_rate": 4.52705948588247e-05, + "loss": 1.7117, + "step": 12800 + }, + { + "epoch": 0.22930615630740062, + "grad_norm": 0.1507061868906021, + "learning_rate": 4.518553015232737e-05, + "loss": 1.6789, + "step": 12900 + }, + { + "epoch": 0.23108372341055874, + "grad_norm": 0.17016680538654327, + "learning_rate": 4.5099788672638064e-05, + "loss": 1.6925, + "step": 13000 + }, + { + "epoch": 0.2328612905137169, + "grad_norm": 0.1454281359910965, + "learning_rate": 4.501337329446625e-05, + "loss": 1.6942, + "step": 13100 + }, + { + "epoch": 0.23463885761687503, + "grad_norm": 0.13199830055236816, + "learning_rate": 4.492628691511563e-05, + "loss": 1.6844, + "step": 13200 + }, + { + "epoch": 0.23641642472003319, + "grad_norm": 0.1504441648721695, + "learning_rate": 4.483853245438702e-05, + "loss": 1.6803, + "step": 13300 + }, + { + "epoch": 0.23819399182319131, + "grad_norm": 0.14603202044963837, + "learning_rate": 4.4750112854480376e-05, + "loss": 1.6776, + "step": 13400 + }, + { + "epoch": 0.23997155892634947, + "grad_norm": 0.20005132257938385, + "learning_rate": 4.466103107989624e-05, + "loss": 1.6995, + "step": 13500 + }, + { + "epoch": 0.24174912602950763, + "grad_norm": 0.20756611227989197, + "learning_rate": 4.457129011733629e-05, + "loss": 1.691, + "step": 13600 + }, + { + "epoch": 0.24352669313266576, + "grad_norm": 0.1558232605457306, + "learning_rate": 4.448089297560325e-05, + "loss": 1.6815, + "step": 13700 + }, + { + "epoch": 0.2453042602358239, + "grad_norm": 0.18202444911003113, + "learning_rate": 4.4389842685499944e-05, + "loss": 1.6758, + "step": 13800 + }, + { + "epoch": 0.24708182733898204, + "grad_norm": 0.1685715764760971, + "learning_rate": 4.429814229972775e-05, + "loss": 1.684, + "step": 13900 + }, + { + "epoch": 0.2488593944421402, + "grad_norm": 0.1511525958776474, + "learning_rate": 4.420579489278419e-05, + "loss": 1.672, + "step": 14000 + }, + { + "epoch": 0.25063696154529835, + "grad_norm": 0.13901682198047638, + "learning_rate": 4.411280356085991e-05, + "loss": 1.6787, + "step": 14100 + }, + { + "epoch": 0.2524145286484565, + "grad_norm": 0.15039555728435516, + "learning_rate": 4.4019171421734826e-05, + "loss": 1.6854, + "step": 14200 + }, + { + "epoch": 0.2541920957516146, + "grad_norm": 0.14443428814411163, + "learning_rate": 4.392490161467361e-05, + "loss": 1.692, + "step": 14300 + }, + { + "epoch": 0.2559696628547728, + "grad_norm": 0.1846003532409668, + "learning_rate": 4.382999730032042e-05, + "loss": 1.6828, + "step": 14400 + }, + { + "epoch": 0.2577472299579309, + "grad_norm": 0.1854531168937683, + "learning_rate": 4.3734461660592985e-05, + "loss": 1.687, + "step": 14500 + }, + { + "epoch": 0.25952479706108905, + "grad_norm": 0.21927309036254883, + "learning_rate": 4.363829789857584e-05, + "loss": 1.6873, + "step": 14600 + }, + { + "epoch": 0.2613023641642472, + "grad_norm": 0.22467108070850372, + "learning_rate": 4.3541509238413e-05, + "loss": 1.6893, + "step": 14700 + }, + { + "epoch": 0.26307993126740536, + "grad_norm": 0.20354901254177094, + "learning_rate": 4.344409892519985e-05, + "loss": 1.6937, + "step": 14800 + }, + { + "epoch": 0.2648574983705635, + "grad_norm": 0.15710541605949402, + "learning_rate": 4.3346070224874304e-05, + "loss": 1.6897, + "step": 14900 + }, + { + "epoch": 0.2666350654737216, + "grad_norm": 0.16541948914527893, + "learning_rate": 4.3247426424107364e-05, + "loss": 1.6786, + "step": 15000 + }, + { + "epoch": 0.26841263257687975, + "grad_norm": 0.2642144560813904, + "learning_rate": 4.314817083019289e-05, + "loss": 1.6734, + "step": 15100 + }, + { + "epoch": 0.27019019968003793, + "grad_norm": 0.15868282318115234, + "learning_rate": 4.3048306770936716e-05, + "loss": 1.6839, + "step": 15200 + }, + { + "epoch": 0.27196776678319606, + "grad_norm": 0.20308874547481537, + "learning_rate": 4.2947837594545094e-05, + "loss": 1.6897, + "step": 15300 + }, + { + "epoch": 0.2737453338863542, + "grad_norm": 0.1677379161119461, + "learning_rate": 4.2847780346308484e-05, + "loss": 1.6795, + "step": 15400 + }, + { + "epoch": 0.2755229009895124, + "grad_norm": 0.14271363615989685, + "learning_rate": 4.27461170280642e-05, + "loss": 1.6788, + "step": 15500 + }, + { + "epoch": 0.2773004680926705, + "grad_norm": 0.16974543035030365, + "learning_rate": 4.2643858724393424e-05, + "loss": 1.6868, + "step": 15600 + }, + { + "epoch": 0.27907803519582863, + "grad_norm": 0.15350034832954407, + "learning_rate": 4.254100886377579e-05, + "loss": 1.6737, + "step": 15700 + }, + { + "epoch": 0.28085560229898676, + "grad_norm": 0.18880531191825867, + "learning_rate": 4.2437570894524404e-05, + "loss": 1.6816, + "step": 15800 + }, + { + "epoch": 0.28263316940214495, + "grad_norm": 0.14773619174957275, + "learning_rate": 4.233354828467028e-05, + "loss": 1.6799, + "step": 15900 + }, + { + "epoch": 0.2844107365053031, + "grad_norm": 0.1591775268316269, + "learning_rate": 4.2228944521846054e-05, + "loss": 1.6704, + "step": 16000 + }, + { + "epoch": 0.2861883036084612, + "grad_norm": 0.1422175019979477, + "learning_rate": 4.2123763113169053e-05, + "loss": 1.6882, + "step": 16100 + }, + { + "epoch": 0.2879658707116194, + "grad_norm": 0.1634337157011032, + "learning_rate": 4.2018007585123695e-05, + "loss": 1.6716, + "step": 16200 + }, + { + "epoch": 0.2897434378147775, + "grad_norm": 0.1616571843624115, + "learning_rate": 4.1911681483443284e-05, + "loss": 1.6814, + "step": 16300 + }, + { + "epoch": 0.29152100491793564, + "grad_norm": 0.1432926207780838, + "learning_rate": 4.180478837299109e-05, + "loss": 1.6781, + "step": 16400 + }, + { + "epoch": 0.2932985720210938, + "grad_norm": 0.14793144166469574, + "learning_rate": 4.1697331837640866e-05, + "loss": 1.675, + "step": 16500 + }, + { + "epoch": 0.29507613912425196, + "grad_norm": 0.14463911950588226, + "learning_rate": 4.158931548015665e-05, + "loss": 1.6866, + "step": 16600 + }, + { + "epoch": 0.2968537062274101, + "grad_norm": 0.14069664478302002, + "learning_rate": 4.148074292207203e-05, + "loss": 1.6848, + "step": 16700 + }, + { + "epoch": 0.2986312733305682, + "grad_norm": 0.16380813717842102, + "learning_rate": 4.137161780356866e-05, + "loss": 1.6676, + "step": 16800 + }, + { + "epoch": 0.3004088404337264, + "grad_norm": 0.16407877206802368, + "learning_rate": 4.126304322856126e-05, + "loss": 1.6757, + "step": 16900 + }, + { + "epoch": 0.3021864075368845, + "grad_norm": 0.1595907211303711, + "learning_rate": 4.1152829417731065e-05, + "loss": 1.6894, + "step": 17000 + }, + { + "epoch": 0.30396397464004266, + "grad_norm": 0.1606622189283371, + "learning_rate": 4.104207404064811e-05, + "loss": 1.675, + "step": 17100 + }, + { + "epoch": 0.3057415417432008, + "grad_norm": 0.1525093913078308, + "learning_rate": 4.093078081067882e-05, + "loss": 1.6864, + "step": 17200 + }, + { + "epoch": 0.30751910884635897, + "grad_norm": 0.18236620724201202, + "learning_rate": 4.081895345922257e-05, + "loss": 1.6756, + "step": 17300 + }, + { + "epoch": 0.3092966759495171, + "grad_norm": 0.1441909819841385, + "learning_rate": 4.070659573558656e-05, + "loss": 1.6889, + "step": 17400 + }, + { + "epoch": 0.3110742430526752, + "grad_norm": 0.182451993227005, + "learning_rate": 4.059371140686013e-05, + "loss": 1.6873, + "step": 17500 + }, + { + "epoch": 0.3128518101558334, + "grad_norm": 0.17770905792713165, + "learning_rate": 4.048030425778841e-05, + "loss": 1.6881, + "step": 17600 + }, + { + "epoch": 0.31462937725899154, + "grad_norm": 0.14115692675113678, + "learning_rate": 4.0366378090645516e-05, + "loss": 1.6789, + "step": 17700 + }, + { + "epoch": 0.31640694436214967, + "grad_norm": 0.1899385303258896, + "learning_rate": 4.0251936725106985e-05, + "loss": 1.6796, + "step": 17800 + }, + { + "epoch": 0.3181845114653078, + "grad_norm": 0.15735557675361633, + "learning_rate": 4.013698399812173e-05, + "loss": 1.6774, + "step": 17900 + }, + { + "epoch": 0.319962078568466, + "grad_norm": 0.19576773047447205, + "learning_rate": 4.002152376378343e-05, + "loss": 1.6815, + "step": 18000 + }, + { + "epoch": 0.3217396456716241, + "grad_norm": 0.17470435798168182, + "learning_rate": 3.9905559893201285e-05, + "loss": 1.6879, + "step": 18100 + }, + { + "epoch": 0.32351721277478224, + "grad_norm": 0.2007114738225937, + "learning_rate": 3.9789096274370205e-05, + "loss": 1.6728, + "step": 18200 + }, + { + "epoch": 0.32529477987794037, + "grad_norm": 0.13873660564422607, + "learning_rate": 3.967213681204051e-05, + "loss": 1.6911, + "step": 18300 + }, + { + "epoch": 0.32707234698109855, + "grad_norm": 0.15716473758220673, + "learning_rate": 3.955468542758697e-05, + "loss": 1.6881, + "step": 18400 + }, + { + "epoch": 0.3288499140842567, + "grad_norm": 0.15948426723480225, + "learning_rate": 3.9436746058877335e-05, + "loss": 1.7005, + "step": 18500 + }, + { + "epoch": 0.3306274811874148, + "grad_norm": 0.15321232378482819, + "learning_rate": 3.9318322660140324e-05, + "loss": 1.6858, + "step": 18600 + }, + { + "epoch": 0.332405048290573, + "grad_norm": 0.16375650465488434, + "learning_rate": 3.919941920183305e-05, + "loss": 1.6702, + "step": 18700 + }, + { + "epoch": 0.3341826153937311, + "grad_norm": 0.14579662680625916, + "learning_rate": 3.908003967050787e-05, + "loss": 1.6779, + "step": 18800 + }, + { + "epoch": 0.33596018249688925, + "grad_norm": 0.19252930581569672, + "learning_rate": 3.896018806867876e-05, + "loss": 1.6847, + "step": 18900 + }, + { + "epoch": 0.3377377496000474, + "grad_norm": 0.1748981475830078, + "learning_rate": 3.88398684146871e-05, + "loss": 1.6524, + "step": 19000 + }, + { + "epoch": 0.33951531670320556, + "grad_norm": 0.14768213033676147, + "learning_rate": 3.871908474256696e-05, + "loss": 1.6621, + "step": 19100 + }, + { + "epoch": 0.3412928838063637, + "grad_norm": 0.18400093913078308, + "learning_rate": 3.859784110190985e-05, + "loss": 1.6792, + "step": 19200 + }, + { + "epoch": 0.3430704509095218, + "grad_norm": 0.1892794668674469, + "learning_rate": 3.8476141557728906e-05, + "loss": 1.6883, + "step": 19300 + }, + { + "epoch": 0.34484801801268, + "grad_norm": 0.13941031694412231, + "learning_rate": 3.835399019032268e-05, + "loss": 1.6685, + "step": 19400 + }, + { + "epoch": 0.34662558511583813, + "grad_norm": 0.13327963650226593, + "learning_rate": 3.8231391095138236e-05, + "loss": 1.6791, + "step": 19500 + }, + { + "epoch": 0.34840315221899626, + "grad_norm": 0.14174780249595642, + "learning_rate": 3.810834838263396e-05, + "loss": 1.6789, + "step": 19600 + }, + { + "epoch": 0.3501807193221544, + "grad_norm": 0.2639550268650055, + "learning_rate": 3.798486617814162e-05, + "loss": 1.6694, + "step": 19700 + }, + { + "epoch": 0.35195828642531257, + "grad_norm": 0.14735499024391174, + "learning_rate": 3.786094862172816e-05, + "loss": 1.6751, + "step": 19800 + }, + { + "epoch": 0.3537358535284707, + "grad_norm": 0.1680241823196411, + "learning_rate": 3.7736599868056804e-05, + "loss": 1.6791, + "step": 19900 + }, + { + "epoch": 0.35551342063162883, + "grad_norm": 0.15196190774440765, + "learning_rate": 3.761182408624783e-05, + "loss": 1.6741, + "step": 20000 + }, + { + "epoch": 0.357290987734787, + "grad_norm": 0.14523537456989288, + "learning_rate": 3.748662545973876e-05, + "loss": 1.6732, + "step": 20100 + }, + { + "epoch": 0.35906855483794514, + "grad_norm": 0.1658225953578949, + "learning_rate": 3.7361008186144095e-05, + "loss": 1.6842, + "step": 20200 + }, + { + "epoch": 0.36084612194110327, + "grad_norm": 0.2060202807188034, + "learning_rate": 3.723497647711458e-05, + "loss": 1.6757, + "step": 20300 + }, + { + "epoch": 0.3626236890442614, + "grad_norm": 0.15790830552577972, + "learning_rate": 3.7108534558196005e-05, + "loss": 1.6613, + "step": 20400 + }, + { + "epoch": 0.3644012561474196, + "grad_norm": 0.15922047197818756, + "learning_rate": 3.6981686668687545e-05, + "loss": 1.6623, + "step": 20500 + }, + { + "epoch": 0.3661788232505777, + "grad_norm": 0.17766642570495605, + "learning_rate": 3.685443706149958e-05, + "loss": 1.6847, + "step": 20600 + }, + { + "epoch": 0.36795639035373584, + "grad_norm": 0.1501617580652237, + "learning_rate": 3.672679000301118e-05, + "loss": 1.6717, + "step": 20700 + }, + { + "epoch": 0.369733957456894, + "grad_norm": 0.1573089063167572, + "learning_rate": 3.659874977292696e-05, + "loss": 1.6723, + "step": 20800 + }, + { + "epoch": 0.37151152456005215, + "grad_norm": 0.15815529227256775, + "learning_rate": 3.647032066413372e-05, + "loss": 1.6782, + "step": 20900 + }, + { + "epoch": 0.3732890916632103, + "grad_norm": 0.16356757283210754, + "learning_rate": 3.634150698255639e-05, + "loss": 1.6694, + "step": 21000 + }, + { + "epoch": 0.3750666587663684, + "grad_norm": 0.14859165251255035, + "learning_rate": 3.6213606854414085e-05, + "loss": 1.6686, + "step": 21100 + }, + { + "epoch": 0.3768442258695266, + "grad_norm": 0.15533782541751862, + "learning_rate": 3.608404073421511e-05, + "loss": 1.6675, + "step": 21200 + }, + { + "epoch": 0.3786217929726847, + "grad_norm": 0.1758899837732315, + "learning_rate": 3.595410299228654e-05, + "loss": 1.6786, + "step": 21300 + }, + { + "epoch": 0.38039936007584285, + "grad_norm": 0.15762227773666382, + "learning_rate": 3.582379798513425e-05, + "loss": 1.6662, + "step": 21400 + }, + { + "epoch": 0.382176927179001, + "grad_norm": 0.1720816045999527, + "learning_rate": 3.569313008157762e-05, + "loss": 1.6942, + "step": 21500 + }, + { + "epoch": 0.38395449428215916, + "grad_norm": 0.17334651947021484, + "learning_rate": 3.556210366260312e-05, + "loss": 1.6791, + "step": 21600 + }, + { + "epoch": 0.3857320613853173, + "grad_norm": 0.15751953423023224, + "learning_rate": 3.5430723121217376e-05, + "loss": 1.6784, + "step": 21700 + }, + { + "epoch": 0.3875096284884754, + "grad_norm": 0.16097095608711243, + "learning_rate": 3.529899286229991e-05, + "loss": 1.6689, + "step": 21800 + }, + { + "epoch": 0.3892871955916336, + "grad_norm": 0.15864881873130798, + "learning_rate": 3.5166917302455425e-05, + "loss": 1.6738, + "step": 21900 + }, + { + "epoch": 0.39106476269479173, + "grad_norm": 0.1525215208530426, + "learning_rate": 3.5034500869865796e-05, + "loss": 1.6887, + "step": 22000 + }, + { + "epoch": 0.39284232979794986, + "grad_norm": 0.14002804458141327, + "learning_rate": 3.490174800414151e-05, + "loss": 1.6745, + "step": 22100 + }, + { + "epoch": 0.394619896901108, + "grad_norm": 0.17650793492794037, + "learning_rate": 3.47686631561729e-05, + "loss": 1.6713, + "step": 22200 + }, + { + "epoch": 0.3963974640042662, + "grad_norm": 0.16852478682994843, + "learning_rate": 3.463525078798085e-05, + "loss": 1.6872, + "step": 22300 + }, + { + "epoch": 0.3981750311074243, + "grad_norm": 0.16134943068027496, + "learning_rate": 3.450151537256725e-05, + "loss": 1.677, + "step": 22400 + }, + { + "epoch": 0.39995259821058243, + "grad_norm": 0.15445928275585175, + "learning_rate": 3.4367461393764976e-05, + "loss": 1.673, + "step": 22500 + }, + { + "epoch": 0.4017301653137406, + "grad_norm": 0.15707698464393616, + "learning_rate": 3.42330933460876e-05, + "loss": 1.6687, + "step": 22600 + }, + { + "epoch": 0.40350773241689875, + "grad_norm": 0.13525037467479706, + "learning_rate": 3.4098415734578684e-05, + "loss": 1.6729, + "step": 22700 + }, + { + "epoch": 0.4052852995200569, + "grad_norm": 0.15618863701820374, + "learning_rate": 3.3963433074660714e-05, + "loss": 1.684, + "step": 22800 + }, + { + "epoch": 0.407062866623215, + "grad_norm": 0.18125438690185547, + "learning_rate": 3.382814989198375e-05, + "loss": 1.6793, + "step": 22900 + }, + { + "epoch": 0.4088404337263732, + "grad_norm": 0.1549660563468933, + "learning_rate": 3.3692570722273676e-05, + "loss": 1.6848, + "step": 23000 + }, + { + "epoch": 0.4106180008295313, + "grad_norm": 0.18558810651302338, + "learning_rate": 3.35567001111801e-05, + "loss": 1.6687, + "step": 23100 + }, + { + "epoch": 0.41239556793268944, + "grad_norm": 0.18007346987724304, + "learning_rate": 3.3420542614123984e-05, + "loss": 1.6714, + "step": 23200 + }, + { + "epoch": 0.41417313503584763, + "grad_norm": 0.15658414363861084, + "learning_rate": 3.328683432967708e-05, + "loss": 1.6799, + "step": 23300 + }, + { + "epoch": 0.41595070213900576, + "grad_norm": 0.18134590983390808, + "learning_rate": 3.3150122275317875e-05, + "loss": 1.6743, + "step": 23400 + }, + { + "epoch": 0.4177282692421639, + "grad_norm": 0.15867780148983002, + "learning_rate": 3.3013136966591515e-05, + "loss": 1.6683, + "step": 23500 + }, + { + "epoch": 0.419505836345322, + "grad_norm": 0.17692945897579193, + "learning_rate": 3.287588299629216e-05, + "loss": 1.6685, + "step": 23600 + }, + { + "epoch": 0.4212834034484802, + "grad_norm": 0.13905645906925201, + "learning_rate": 3.273836496622152e-05, + "loss": 1.6715, + "step": 23700 + }, + { + "epoch": 0.4230609705516383, + "grad_norm": 0.1454002857208252, + "learning_rate": 3.260058748703464e-05, + "loss": 1.6773, + "step": 23800 + }, + { + "epoch": 0.42483853765479646, + "grad_norm": 0.13487789034843445, + "learning_rate": 3.2462555178085255e-05, + "loss": 1.655, + "step": 23900 + }, + { + "epoch": 0.42661610475795464, + "grad_norm": 0.1867651492357254, + "learning_rate": 3.2324272667270975e-05, + "loss": 1.6725, + "step": 24000 + }, + { + "epoch": 0.42839367186111277, + "grad_norm": 0.14305393397808075, + "learning_rate": 3.218574459087805e-05, + "loss": 1.6717, + "step": 24100 + }, + { + "epoch": 0.4301712389642709, + "grad_norm": 0.14234061539173126, + "learning_rate": 3.2046975593425975e-05, + "loss": 1.6917, + "step": 24200 + }, + { + "epoch": 0.431948806067429, + "grad_norm": 0.15563951432704926, + "learning_rate": 3.1907970327511786e-05, + "loss": 1.6725, + "step": 24300 + }, + { + "epoch": 0.4337263731705872, + "grad_norm": 0.14877410233020782, + "learning_rate": 3.176873345365402e-05, + "loss": 1.6802, + "step": 24400 + }, + { + "epoch": 0.43550394027374534, + "grad_norm": 0.16491292417049408, + "learning_rate": 3.162926964013648e-05, + "loss": 1.6671, + "step": 24500 + }, + { + "epoch": 0.43728150737690347, + "grad_norm": 0.1698901653289795, + "learning_rate": 3.1489583562851724e-05, + "loss": 1.6782, + "step": 24600 + }, + { + "epoch": 0.4390590744800616, + "grad_norm": 0.18841049075126648, + "learning_rate": 3.1349679905144285e-05, + "loss": 1.6671, + "step": 24700 + }, + { + "epoch": 0.4408366415832198, + "grad_norm": 0.14066390693187714, + "learning_rate": 3.120956335765367e-05, + "loss": 1.6597, + "step": 24800 + }, + { + "epoch": 0.4426142086863779, + "grad_norm": 0.14379048347473145, + "learning_rate": 3.1069238618157064e-05, + "loss": 1.6696, + "step": 24900 + }, + { + "epoch": 0.44439177578953604, + "grad_norm": 0.17776834964752197, + "learning_rate": 3.092871039141184e-05, + "loss": 1.6769, + "step": 25000 + }, + { + "epoch": 0.4461693428926942, + "grad_norm": 0.1451658457517624, + "learning_rate": 3.078798338899784e-05, + "loss": 1.6727, + "step": 25100 + }, + { + "epoch": 0.44794690999585235, + "grad_norm": 0.14523907005786896, + "learning_rate": 3.064706232915933e-05, + "loss": 1.6858, + "step": 25200 + }, + { + "epoch": 0.4497244770990105, + "grad_norm": 0.17594589293003082, + "learning_rate": 3.050595193664693e-05, + "loss": 1.6599, + "step": 25300 + }, + { + "epoch": 0.4515020442021686, + "grad_norm": 0.14906199276447296, + "learning_rate": 3.0364656942559087e-05, + "loss": 1.666, + "step": 25400 + }, + { + "epoch": 0.4532796113053268, + "grad_norm": 0.20227928459644318, + "learning_rate": 3.0223182084183545e-05, + "loss": 1.6799, + "step": 25500 + }, + { + "epoch": 0.4550571784084849, + "grad_norm": 0.15447662770748138, + "learning_rate": 3.0081532104838424e-05, + "loss": 1.6709, + "step": 25600 + }, + { + "epoch": 0.45683474551164305, + "grad_norm": 0.1919887661933899, + "learning_rate": 2.9939711753713285e-05, + "loss": 1.6863, + "step": 25700 + }, + { + "epoch": 0.45861231261480123, + "grad_norm": 0.23030731081962585, + "learning_rate": 2.9797725785709828e-05, + "loss": 1.68, + "step": 25800 + }, + { + "epoch": 0.46038987971795936, + "grad_norm": 0.14024241268634796, + "learning_rate": 2.9655578961282497e-05, + "loss": 1.6705, + "step": 25900 + }, + { + "epoch": 0.4621674468211175, + "grad_norm": 0.14363612234592438, + "learning_rate": 2.951327604627888e-05, + "loss": 1.6695, + "step": 26000 + }, + { + "epoch": 0.4639450139242756, + "grad_norm": 0.15318314731121063, + "learning_rate": 2.9370821811779908e-05, + "loss": 1.6665, + "step": 26100 + }, + { + "epoch": 0.4657225810274338, + "grad_norm": 0.16767314076423645, + "learning_rate": 2.9228221033939895e-05, + "loss": 1.6627, + "step": 26200 + }, + { + "epoch": 0.46750014813059193, + "grad_norm": 0.18546494841575623, + "learning_rate": 2.9085478493826413e-05, + "loss": 1.6892, + "step": 26300 + }, + { + "epoch": 0.46927771523375006, + "grad_norm": 0.14965227246284485, + "learning_rate": 2.8942598977259995e-05, + "loss": 1.6681, + "step": 26400 + }, + { + "epoch": 0.47105528233690824, + "grad_norm": 0.1423717737197876, + "learning_rate": 2.879958727465365e-05, + "loss": 1.662, + "step": 26500 + }, + { + "epoch": 0.47283284944006637, + "grad_norm": 0.154624342918396, + "learning_rate": 2.8656448180852285e-05, + "loss": 1.683, + "step": 26600 + }, + { + "epoch": 0.4746104165432245, + "grad_norm": 0.14358487725257874, + "learning_rate": 2.8513186494971944e-05, + "loss": 1.6704, + "step": 26700 + }, + { + "epoch": 0.47638798364638263, + "grad_norm": 0.14034679532051086, + "learning_rate": 2.836980702023888e-05, + "loss": 1.672, + "step": 26800 + }, + { + "epoch": 0.4781655507495408, + "grad_norm": 0.14413665235042572, + "learning_rate": 2.822631456382853e-05, + "loss": 1.6645, + "step": 26900 + }, + { + "epoch": 0.47994311785269894, + "grad_norm": 0.14380885660648346, + "learning_rate": 2.8082713936704348e-05, + "loss": 1.6671, + "step": 27000 + }, + { + "epoch": 0.48172068495585707, + "grad_norm": 0.15115170180797577, + "learning_rate": 2.7939009953456487e-05, + "loss": 1.6714, + "step": 27100 + }, + { + "epoch": 0.48349825205901525, + "grad_norm": 0.16167448461055756, + "learning_rate": 2.779520743214039e-05, + "loss": 1.6691, + "step": 27200 + }, + { + "epoch": 0.4852758191621734, + "grad_norm": 0.14348022639751434, + "learning_rate": 2.765131119411526e-05, + "loss": 1.6723, + "step": 27300 + }, + { + "epoch": 0.4870533862653315, + "grad_norm": 0.14456488192081451, + "learning_rate": 2.7507326063882376e-05, + "loss": 1.6724, + "step": 27400 + }, + { + "epoch": 0.48883095336848964, + "grad_norm": 0.15705521404743195, + "learning_rate": 2.7363256868923388e-05, + "loss": 1.6699, + "step": 27500 + }, + { + "epoch": 0.4906085204716478, + "grad_norm": 0.1625920534133911, + "learning_rate": 2.721910843953842e-05, + "loss": 1.6644, + "step": 27600 + }, + { + "epoch": 0.49238608757480595, + "grad_norm": 0.13969144225120544, + "learning_rate": 2.7074885608684154e-05, + "loss": 1.679, + "step": 27700 + }, + { + "epoch": 0.4941636546779641, + "grad_norm": 0.1736816018819809, + "learning_rate": 2.6930593211811763e-05, + "loss": 1.672, + "step": 27800 + }, + { + "epoch": 0.4959412217811222, + "grad_norm": 0.19090887904167175, + "learning_rate": 2.678767996247037e-05, + "loss": 1.6787, + "step": 27900 + }, + { + "epoch": 0.4977187888842804, + "grad_norm": 0.17863860726356506, + "learning_rate": 2.6643263524000922e-05, + "loss": 1.6841, + "step": 28000 + }, + { + "epoch": 0.4994963559874385, + "grad_norm": 0.1446855366230011, + "learning_rate": 2.64987919907833e-05, + "loss": 1.6655, + "step": 28100 + }, + { + "epoch": 0.5012739230905967, + "grad_norm": 0.1714792400598526, + "learning_rate": 2.6354270206607095e-05, + "loss": 1.6716, + "step": 28200 + }, + { + "epoch": 0.5030514901937548, + "grad_norm": 0.1719600409269333, + "learning_rate": 2.6209703016946675e-05, + "loss": 1.6551, + "step": 28300 + }, + { + "epoch": 0.504829057296913, + "grad_norm": 0.18065394461154938, + "learning_rate": 2.6065095268798772e-05, + "loss": 1.6647, + "step": 28400 + }, + { + "epoch": 0.5066066244000711, + "grad_norm": 0.14981447160243988, + "learning_rate": 2.5920451810519935e-05, + "loss": 1.6666, + "step": 28500 + }, + { + "epoch": 0.5083841915032292, + "grad_norm": 0.14168864488601685, + "learning_rate": 2.5775777491663976e-05, + "loss": 1.6619, + "step": 28600 + }, + { + "epoch": 0.5101617586063873, + "grad_norm": 0.1458740234375, + "learning_rate": 2.563107716281941e-05, + "loss": 1.6616, + "step": 28700 + }, + { + "epoch": 0.5119393257095456, + "grad_norm": 0.15625467896461487, + "learning_rate": 2.5486355675446804e-05, + "loss": 1.6606, + "step": 28800 + }, + { + "epoch": 0.5137168928127037, + "grad_norm": 0.1387881189584732, + "learning_rate": 2.5341617881716105e-05, + "loss": 1.6714, + "step": 28900 + }, + { + "epoch": 0.5154944599158618, + "grad_norm": 0.16523011028766632, + "learning_rate": 2.5196868634343986e-05, + "loss": 1.6772, + "step": 29000 + }, + { + "epoch": 0.51727202701902, + "grad_norm": 0.16577035188674927, + "learning_rate": 2.505211278643112e-05, + "loss": 1.6662, + "step": 29100 + }, + { + "epoch": 0.5190495941221781, + "grad_norm": 0.20573437213897705, + "learning_rate": 2.490735519129951e-05, + "loss": 1.6763, + "step": 29200 + }, + { + "epoch": 0.5208271612253362, + "grad_norm": 0.1646687239408493, + "learning_rate": 2.4762600702329707e-05, + "loss": 1.6713, + "step": 29300 + }, + { + "epoch": 0.5226047283284944, + "grad_norm": 0.15127506852149963, + "learning_rate": 2.461785417279814e-05, + "loss": 1.6754, + "step": 29400 + }, + { + "epoch": 0.5243822954316525, + "grad_norm": 0.13983801007270813, + "learning_rate": 2.4473120455714367e-05, + "loss": 1.6683, + "step": 29500 + }, + { + "epoch": 0.5261598625348107, + "grad_norm": 0.14342284202575684, + "learning_rate": 2.4328404403658382e-05, + "loss": 1.6817, + "step": 29600 + }, + { + "epoch": 0.5279374296379689, + "grad_norm": 0.16623562574386597, + "learning_rate": 2.41837108686179e-05, + "loss": 1.6819, + "step": 29700 + }, + { + "epoch": 0.529714996741127, + "grad_norm": 0.14822550117969513, + "learning_rate": 2.4039044701825705e-05, + "loss": 1.6691, + "step": 29800 + }, + { + "epoch": 0.5314925638442851, + "grad_norm": 0.28164225816726685, + "learning_rate": 2.3894410753596987e-05, + "loss": 1.6736, + "step": 29900 + }, + { + "epoch": 0.5332701309474432, + "grad_norm": 0.20413027703762054, + "learning_rate": 2.3751259642565925e-05, + "loss": 1.6758, + "step": 30000 + }, + { + "epoch": 0.5350476980506014, + "grad_norm": 0.15408293902873993, + "learning_rate": 2.36067042347753e-05, + "loss": 1.6683, + "step": 30100 + }, + { + "epoch": 0.5368252651537595, + "grad_norm": 0.14037184417247772, + "learning_rate": 2.346219554090377e-05, + "loss": 1.6692, + "step": 30200 + }, + { + "epoch": 0.5386028322569177, + "grad_norm": 0.15781673789024353, + "learning_rate": 2.3317738405986828e-05, + "loss": 1.6627, + "step": 30300 + }, + { + "epoch": 0.5403803993600759, + "grad_norm": 0.1486879140138626, + "learning_rate": 2.3173337673331313e-05, + "loss": 1.6728, + "step": 30400 + }, + { + "epoch": 0.542157966463234, + "grad_norm": 0.20719771087169647, + "learning_rate": 2.302899818435304e-05, + "loss": 1.665, + "step": 30500 + }, + { + "epoch": 0.5439355335663921, + "grad_norm": 0.16389068961143494, + "learning_rate": 2.288472477841445e-05, + "loss": 1.671, + "step": 30600 + }, + { + "epoch": 0.5457131006695503, + "grad_norm": 0.17496538162231445, + "learning_rate": 2.274052229266239e-05, + "loss": 1.6686, + "step": 30700 + }, + { + "epoch": 0.5474906677727084, + "grad_norm": 0.32813844084739685, + "learning_rate": 2.259639556186592e-05, + "loss": 1.6752, + "step": 30800 + }, + { + "epoch": 0.5492682348758665, + "grad_norm": 0.5157455205917358, + "learning_rate": 2.2452349418254213e-05, + "loss": 1.6838, + "step": 30900 + }, + { + "epoch": 0.5510458019790248, + "grad_norm": 0.14271293580532074, + "learning_rate": 2.2308388691354538e-05, + "loss": 1.6753, + "step": 31000 + }, + { + "epoch": 0.5528233690821829, + "grad_norm": 0.28928157687187195, + "learning_rate": 2.216451820783035e-05, + "loss": 1.6693, + "step": 31100 + }, + { + "epoch": 0.554600936185341, + "grad_norm": 0.144424170255661, + "learning_rate": 2.2020742791319452e-05, + "loss": 1.6622, + "step": 31200 + }, + { + "epoch": 0.5563785032884991, + "grad_norm": 0.16234175860881805, + "learning_rate": 2.1877067262272284e-05, + "loss": 1.6818, + "step": 31300 + }, + { + "epoch": 0.5581560703916573, + "grad_norm": 0.19830650091171265, + "learning_rate": 2.173349643779028e-05, + "loss": 1.6768, + "step": 31400 + }, + { + "epoch": 0.5599336374948154, + "grad_norm": Infinity, + "learning_rate": 2.159146918661628e-05, + "loss": 1.6824, + "step": 31500 + }, + { + "epoch": 0.5617112045979735, + "grad_norm": 0.16732582449913025, + "learning_rate": 2.144812104128816e-05, + "loss": 1.6745, + "step": 31600 + }, + { + "epoch": 0.5634887717011318, + "grad_norm": 0.1458720713853836, + "learning_rate": 2.130489198207977e-05, + "loss": 1.6559, + "step": 31700 + }, + { + "epoch": 0.5652663388042899, + "grad_norm": 0.1879144012928009, + "learning_rate": 2.1161786811123463e-05, + "loss": 1.6582, + "step": 31800 + }, + { + "epoch": 0.567043905907448, + "grad_norm": 0.15326225757598877, + "learning_rate": 2.1018810326397926e-05, + "loss": 1.6632, + "step": 31900 + }, + { + "epoch": 0.5688214730106061, + "grad_norm": 0.14871954917907715, + "learning_rate": 2.087596732156729e-05, + "loss": 1.6683, + "step": 32000 + }, + { + "epoch": 0.5705990401137643, + "grad_norm": 0.138087660074234, + "learning_rate": 2.073326258582043e-05, + "loss": 1.6687, + "step": 32100 + }, + { + "epoch": 0.5723766072169224, + "grad_norm": 0.1453862488269806, + "learning_rate": 2.0592125796717588e-05, + "loss": 1.6658, + "step": 32200 + }, + { + "epoch": 0.5741541743200805, + "grad_norm": 0.14493419229984283, + "learning_rate": 2.044971044602353e-05, + "loss": 1.6667, + "step": 32300 + }, + { + "epoch": 0.5759317414232388, + "grad_norm": 0.22620221972465515, + "learning_rate": 2.0307447655800402e-05, + "loss": 1.6818, + "step": 32400 + }, + { + "epoch": 0.5777093085263969, + "grad_norm": 0.14831425249576569, + "learning_rate": 2.016534219578384e-05, + "loss": 1.6518, + "step": 32500 + }, + { + "epoch": 0.579486875629555, + "grad_norm": 0.15642555058002472, + "learning_rate": 2.0023398830434578e-05, + "loss": 1.6578, + "step": 32600 + }, + { + "epoch": 0.5812644427327132, + "grad_norm": 0.18855425715446472, + "learning_rate": 1.9881622318778698e-05, + "loss": 1.6719, + "step": 32700 + }, + { + "epoch": 0.5830420098358713, + "grad_norm": 0.14109855890274048, + "learning_rate": 1.974001741424807e-05, + "loss": 1.6672, + "step": 32800 + }, + { + "epoch": 0.5848195769390294, + "grad_norm": 0.16638268530368805, + "learning_rate": 1.959858886452098e-05, + "loss": 1.6732, + "step": 32900 + }, + { + "epoch": 0.5865971440421875, + "grad_norm": 0.15555280447006226, + "learning_rate": 1.9457341411362953e-05, + "loss": 1.6738, + "step": 33000 + }, + { + "epoch": 0.5883747111453458, + "grad_norm": 0.19915728271007538, + "learning_rate": 1.9316279790467785e-05, + "loss": 1.6828, + "step": 33100 + }, + { + "epoch": 0.5901522782485039, + "grad_norm": 0.14633417129516602, + "learning_rate": 1.9175408731298737e-05, + "loss": 1.6582, + "step": 33200 + }, + { + "epoch": 0.591929845351662, + "grad_norm": 0.15195755660533905, + "learning_rate": 1.9034732956930004e-05, + "loss": 1.6722, + "step": 33300 + }, + { + "epoch": 0.5937074124548202, + "grad_norm": 0.1364789605140686, + "learning_rate": 1.8894257183888324e-05, + "loss": 1.6797, + "step": 33400 + }, + { + "epoch": 0.5954849795579783, + "grad_norm": 0.1638212352991104, + "learning_rate": 1.8753986121994874e-05, + "loss": 1.6607, + "step": 33500 + }, + { + "epoch": 0.5972625466611364, + "grad_norm": 0.16077277064323425, + "learning_rate": 1.8613924474207344e-05, + "loss": 1.6731, + "step": 33600 + }, + { + "epoch": 0.5990401137642946, + "grad_norm": 0.16311664879322052, + "learning_rate": 1.8474076936462277e-05, + "loss": 1.6604, + "step": 33700 + }, + { + "epoch": 0.6008176808674528, + "grad_norm": 0.16206273436546326, + "learning_rate": 1.833444819751758e-05, + "loss": 1.6769, + "step": 33800 + }, + { + "epoch": 0.6025952479706109, + "grad_norm": 0.1508202999830246, + "learning_rate": 1.8195042938795387e-05, + "loss": 1.6739, + "step": 33900 + }, + { + "epoch": 0.604372815073769, + "grad_norm": 0.16781938076019287, + "learning_rate": 1.8055865834225045e-05, + "loss": 1.6665, + "step": 34000 + }, + { + "epoch": 0.6061503821769272, + "grad_norm": 0.1427120417356491, + "learning_rate": 1.7916921550086444e-05, + "loss": 1.68, + "step": 34100 + }, + { + "epoch": 0.6079279492800853, + "grad_norm": 0.15128500759601593, + "learning_rate": 1.7778214744853537e-05, + "loss": 1.6666, + "step": 34200 + }, + { + "epoch": 0.6097055163832434, + "grad_norm": 0.19677314162254333, + "learning_rate": 1.76397500690382e-05, + "loss": 1.673, + "step": 34300 + }, + { + "epoch": 0.6114830834864016, + "grad_norm": 0.1646704375743866, + "learning_rate": 1.7501532165034255e-05, + "loss": 1.6748, + "step": 34400 + }, + { + "epoch": 0.6132606505895598, + "grad_norm": 0.15692903101444244, + "learning_rate": 1.736356566696186e-05, + "loss": 1.6713, + "step": 34500 + }, + { + "epoch": 0.6150382176927179, + "grad_norm": 0.15300609171390533, + "learning_rate": 1.7225855200512113e-05, + "loss": 1.6673, + "step": 34600 + }, + { + "epoch": 0.6168157847958761, + "grad_norm": 0.18005123734474182, + "learning_rate": 1.7088405382791988e-05, + "loss": 1.6638, + "step": 34700 + }, + { + "epoch": 0.6185933518990342, + "grad_norm": 0.13681212067604065, + "learning_rate": 1.6951220822169514e-05, + "loss": 1.6638, + "step": 34800 + }, + { + "epoch": 0.6203709190021923, + "grad_norm": 0.15197409689426422, + "learning_rate": 1.681430611811928e-05, + "loss": 1.6831, + "step": 34900 + }, + { + "epoch": 0.6221484861053505, + "grad_norm": 0.14276647567749023, + "learning_rate": 1.667766586106822e-05, + "loss": 1.6715, + "step": 35000 + }, + { + "epoch": 0.6239260532085086, + "grad_norm": 0.1536717563867569, + "learning_rate": 1.654130463224171e-05, + "loss": 1.6608, + "step": 35100 + }, + { + "epoch": 0.6257036203116668, + "grad_norm": 0.14087150990962982, + "learning_rate": 1.6405227003509966e-05, + "loss": 1.671, + "step": 35200 + }, + { + "epoch": 0.627481187414825, + "grad_norm": 0.1598573476076126, + "learning_rate": 1.6269437537234758e-05, + "loss": 1.6824, + "step": 35300 + }, + { + "epoch": 0.6292587545179831, + "grad_norm": 0.14146994054317474, + "learning_rate": 1.613394078611646e-05, + "loss": 1.6736, + "step": 35400 + }, + { + "epoch": 0.6310363216211412, + "grad_norm": 0.1572994738817215, + "learning_rate": 1.599874129304138e-05, + "loss": 1.6656, + "step": 35500 + }, + { + "epoch": 0.6328138887242993, + "grad_norm": 0.1504960060119629, + "learning_rate": 1.5863843590929483e-05, + "loss": 1.6639, + "step": 35600 + }, + { + "epoch": 0.6345914558274575, + "grad_norm": 0.17305798828601837, + "learning_rate": 1.572925220258239e-05, + "loss": 1.6563, + "step": 35700 + }, + { + "epoch": 0.6363690229306156, + "grad_norm": 0.19001583755016327, + "learning_rate": 1.5594971640531735e-05, + "loss": 1.6694, + "step": 35800 + }, + { + "epoch": 0.6381465900337737, + "grad_norm": 0.16306428611278534, + "learning_rate": 1.5461006406887892e-05, + "loss": 1.6597, + "step": 35900 + }, + { + "epoch": 0.639924157136932, + "grad_norm": 0.1538590043783188, + "learning_rate": 1.532736099318901e-05, + "loss": 1.6573, + "step": 36000 + }, + { + "epoch": 0.6417017242400901, + "grad_norm": 0.14714497327804565, + "learning_rate": 1.5194039880250432e-05, + "loss": 1.6647, + "step": 36100 + }, + { + "epoch": 0.6434792913432482, + "grad_norm": 0.1425635814666748, + "learning_rate": 1.5061047538014466e-05, + "loss": 1.67, + "step": 36200 + }, + { + "epoch": 0.6452568584464063, + "grad_norm": 0.14618100225925446, + "learning_rate": 1.4928388425400514e-05, + "loss": 1.6456, + "step": 36300 + }, + { + "epoch": 0.6470344255495645, + "grad_norm": 0.13993218541145325, + "learning_rate": 1.479606699015556e-05, + "loss": 1.6603, + "step": 36400 + }, + { + "epoch": 0.6488119926527226, + "grad_norm": 0.1687382161617279, + "learning_rate": 1.4664087668705082e-05, + "loss": 1.6684, + "step": 36500 + }, + { + "epoch": 0.6505895597558807, + "grad_norm": 0.1678340882062912, + "learning_rate": 1.453245488600427e-05, + "loss": 1.6762, + "step": 36600 + }, + { + "epoch": 0.652367126859039, + "grad_norm": 0.15924805402755737, + "learning_rate": 1.4401173055389722e-05, + "loss": 1.6775, + "step": 36700 + }, + { + "epoch": 0.6541446939621971, + "grad_norm": 0.149272158741951, + "learning_rate": 1.42702465784314e-05, + "loss": 1.6607, + "step": 36800 + }, + { + "epoch": 0.6559222610653552, + "grad_norm": 0.16075880825519562, + "learning_rate": 1.4139679844785125e-05, + "loss": 1.6526, + "step": 36900 + }, + { + "epoch": 0.6576998281685134, + "grad_norm": 0.15068253874778748, + "learning_rate": 1.4009477232045356e-05, + "loss": 1.6502, + "step": 37000 + }, + { + "epoch": 0.6594773952716715, + "grad_norm": 0.15616253018379211, + "learning_rate": 1.387964310559845e-05, + "loss": 1.6681, + "step": 37100 + }, + { + "epoch": 0.6612549623748296, + "grad_norm": 0.16503843665122986, + "learning_rate": 1.3750181818476282e-05, + "loss": 1.6583, + "step": 37200 + }, + { + "epoch": 0.6630325294779877, + "grad_norm": 0.1866072565317154, + "learning_rate": 1.36210977112103e-05, + "loss": 1.662, + "step": 37300 + }, + { + "epoch": 0.664810096581146, + "grad_norm": 0.14251679182052612, + "learning_rate": 1.3492395111686013e-05, + "loss": 1.6654, + "step": 37400 + }, + { + "epoch": 0.6665876636843041, + "grad_norm": 0.3741336464881897, + "learning_rate": 1.3364078334997871e-05, + "loss": 1.6506, + "step": 37500 + }, + { + "epoch": 0.6683652307874622, + "grad_norm": 0.15886838734149933, + "learning_rate": 1.3236151683304582e-05, + "loss": 1.6593, + "step": 37600 + }, + { + "epoch": 0.6701427978906204, + "grad_norm": 0.1486154943704605, + "learning_rate": 1.3108619445684905e-05, + "loss": 1.6752, + "step": 37700 + }, + { + "epoch": 0.6719203649937785, + "grad_norm": 0.14979256689548492, + "learning_rate": 1.2981485897993812e-05, + "loss": 1.6614, + "step": 37800 + }, + { + "epoch": 0.6736979320969366, + "grad_norm": 0.14133110642433167, + "learning_rate": 1.2854755302719146e-05, + "loss": 1.6715, + "step": 37900 + }, + { + "epoch": 0.6754754992000948, + "grad_norm": 0.16530480980873108, + "learning_rate": 1.2728431908838707e-05, + "loss": 1.6732, + "step": 38000 + }, + { + "epoch": 0.677253066303253, + "grad_norm": 0.13394702970981598, + "learning_rate": 1.2602519951677793e-05, + "loss": 1.6682, + "step": 38100 + }, + { + "epoch": 0.6790306334064111, + "grad_norm": 0.1656082421541214, + "learning_rate": 1.2477023652767197e-05, + "loss": 1.6655, + "step": 38200 + }, + { + "epoch": 0.6808082005095693, + "grad_norm": 0.18027155101299286, + "learning_rate": 1.2351947219701676e-05, + "loss": 1.6624, + "step": 38300 + }, + { + "epoch": 0.6825857676127274, + "grad_norm": 0.1868833750486374, + "learning_rate": 1.2227294845998873e-05, + "loss": 1.6671, + "step": 38400 + }, + { + "epoch": 0.6843633347158855, + "grad_norm": 0.14410994946956635, + "learning_rate": 1.2103070710958724e-05, + "loss": 1.6698, + "step": 38500 + }, + { + "epoch": 0.6861409018190436, + "grad_norm": 0.16073070466518402, + "learning_rate": 1.1980514742799768e-05, + "loss": 1.6697, + "step": 38600 + }, + { + "epoch": 0.6879184689222018, + "grad_norm": 0.13548509776592255, + "learning_rate": 1.1857155179376509e-05, + "loss": 1.6619, + "step": 38700 + }, + { + "epoch": 0.68969603602536, + "grad_norm": 0.20386448502540588, + "learning_rate": 1.1734236264525464e-05, + "loss": 1.6802, + "step": 38800 + }, + { + "epoch": 0.6914736031285181, + "grad_norm": 0.13660947978496552, + "learning_rate": 1.1611762119427785e-05, + "loss": 1.6718, + "step": 38900 + }, + { + "epoch": 0.6932511702316763, + "grad_norm": 0.16920311748981476, + "learning_rate": 1.1489736850352542e-05, + "loss": 1.6529, + "step": 39000 + }, + { + "epoch": 0.6950287373348344, + "grad_norm": 0.1494123786687851, + "learning_rate": 1.1368164548519047e-05, + "loss": 1.6876, + "step": 39100 + }, + { + "epoch": 0.6968063044379925, + "grad_norm": 0.16994184255599976, + "learning_rate": 1.1247049289959693e-05, + "loss": 1.6635, + "step": 39200 + }, + { + "epoch": 0.6985838715411506, + "grad_norm": 0.16828663647174835, + "learning_rate": 1.1126395135383297e-05, + "loss": 1.6505, + "step": 39300 + }, + { + "epoch": 0.7003614386443088, + "grad_norm": 0.1685681790113449, + "learning_rate": 1.1006206130038932e-05, + "loss": 1.6674, + "step": 39400 + }, + { + "epoch": 0.702139005747467, + "grad_norm": 0.14324048161506653, + "learning_rate": 1.0886486303580332e-05, + "loss": 1.6712, + "step": 39500 + }, + { + "epoch": 0.7039165728506251, + "grad_norm": 0.14247146248817444, + "learning_rate": 1.0767239669930756e-05, + "loss": 1.6638, + "step": 39600 + }, + { + "epoch": 0.7056941399537833, + "grad_norm": 0.18177978694438934, + "learning_rate": 1.0648470227148434e-05, + "loss": 1.6489, + "step": 39700 + }, + { + "epoch": 0.7074717070569414, + "grad_norm": 0.13986016809940338, + "learning_rate": 1.0530181957292506e-05, + "loss": 1.6603, + "step": 39800 + }, + { + "epoch": 0.7092492741600995, + "grad_norm": 0.22386716306209564, + "learning_rate": 1.0412378826289529e-05, + "loss": 1.6701, + "step": 39900 + }, + { + "epoch": 0.7110268412632577, + "grad_norm": 0.1482144594192505, + "learning_rate": 1.0295064783800485e-05, + "loss": 1.6711, + "step": 40000 + }, + { + "epoch": 0.7128044083664158, + "grad_norm": 0.1389176994562149, + "learning_rate": 1.0178243763088382e-05, + "loss": 1.6643, + "step": 40100 + }, + { + "epoch": 0.714581975469574, + "grad_norm": 0.16256819665431976, + "learning_rate": 1.0061919680886375e-05, + "loss": 1.6737, + "step": 40200 + }, + { + "epoch": 0.7163595425727322, + "grad_norm": 0.1738821119070053, + "learning_rate": 9.946096437266427e-06, + "loss": 1.6854, + "step": 40300 + }, + { + "epoch": 0.7181371096758903, + "grad_norm": 0.1526288390159607, + "learning_rate": 9.830777915508584e-06, + "loss": 1.6535, + "step": 40400 + }, + { + "epoch": 0.7199146767790484, + "grad_norm": 0.13686831295490265, + "learning_rate": 9.71596798197075e-06, + "loss": 1.6807, + "step": 40500 + }, + { + "epoch": 0.7216922438822065, + "grad_norm": 0.1469413936138153, + "learning_rate": 9.602810911756332e-06, + "loss": 1.675, + "step": 40600 + }, + { + "epoch": 0.7234698109853647, + "grad_norm": 0.14766348898410797, + "learning_rate": 9.489024503787308e-06, + "loss": 1.6554, + "step": 40700 + }, + { + "epoch": 0.7252473780885228, + "grad_norm": 0.15909671783447266, + "learning_rate": 9.375758142223712e-06, + "loss": 1.6665, + "step": 40800 + }, + { + "epoch": 0.727024945191681, + "grad_norm": 0.17711246013641357, + "learning_rate": 9.263015624619362e-06, + "loss": 1.66, + "step": 40900 + }, + { + "epoch": 0.7288025122948392, + "grad_norm": 0.15886163711547852, + "learning_rate": 9.150800730964821e-06, + "loss": 1.6639, + "step": 41000 + }, + { + "epoch": 0.7305800793979973, + "grad_norm": 0.14557142555713654, + "learning_rate": 9.039117223560666e-06, + "loss": 1.677, + "step": 41100 + }, + { + "epoch": 0.7323576465011554, + "grad_norm": 0.15475749969482422, + "learning_rate": 8.927968846891351e-06, + "loss": 1.6677, + "step": 41200 + }, + { + "epoch": 0.7341352136043136, + "grad_norm": 0.1995362639427185, + "learning_rate": 8.817359327499659e-06, + "loss": 1.6635, + "step": 41300 + }, + { + "epoch": 0.7359127807074717, + "grad_norm": 0.16061349213123322, + "learning_rate": 8.70729237386175e-06, + "loss": 1.6696, + "step": 41400 + }, + { + "epoch": 0.7376903478106298, + "grad_norm": 0.1676475554704666, + "learning_rate": 8.597771676262848e-06, + "loss": 1.6609, + "step": 41500 + }, + { + "epoch": 0.739467914913788, + "grad_norm": 0.28265243768692017, + "learning_rate": 8.488800906673493e-06, + "loss": 1.657, + "step": 41600 + }, + { + "epoch": 0.7412454820169462, + "grad_norm": 0.15924739837646484, + "learning_rate": 8.380383718626441e-06, + "loss": 1.661, + "step": 41700 + }, + { + "epoch": 0.7430230491201043, + "grad_norm": 0.15197895467281342, + "learning_rate": 8.27252374709416e-06, + "loss": 1.6663, + "step": 41800 + }, + { + "epoch": 0.7448006162232624, + "grad_norm": 0.15987786650657654, + "learning_rate": 8.165224608366981e-06, + "loss": 1.6657, + "step": 41900 + }, + { + "epoch": 0.7465781833264206, + "grad_norm": 0.14073099195957184, + "learning_rate": 8.058489899931795e-06, + "loss": 1.6596, + "step": 42000 + }, + { + "epoch": 0.7483557504295787, + "grad_norm": 0.148057222366333, + "learning_rate": 7.95232320035152e-06, + "loss": 1.6642, + "step": 42100 + }, + { + "epoch": 0.7501333175327368, + "grad_norm": 0.17104440927505493, + "learning_rate": 7.846728069145052e-06, + "loss": 1.6587, + "step": 42200 + }, + { + "epoch": 0.7519108846358951, + "grad_norm": 0.1572682410478592, + "learning_rate": 7.741708046667947e-06, + "loss": 1.6748, + "step": 42300 + }, + { + "epoch": 0.7536884517390532, + "grad_norm": 0.17384777963161469, + "learning_rate": 7.637266653993755e-06, + "loss": 1.6731, + "step": 42400 + }, + { + "epoch": 0.7554660188422113, + "grad_norm": 0.15057361125946045, + "learning_rate": 7.533407392795896e-06, + "loss": 1.6753, + "step": 42500 + }, + { + "epoch": 0.7572435859453694, + "grad_norm": 0.17582474648952484, + "learning_rate": 7.431163571532962e-06, + "loss": 1.6614, + "step": 42600 + }, + { + "epoch": 0.7590211530485276, + "grad_norm": 0.16274411976337433, + "learning_rate": 7.328473092285082e-06, + "loss": 1.6622, + "step": 42700 + }, + { + "epoch": 0.7607987201516857, + "grad_norm": 0.14647479355335236, + "learning_rate": 7.226375097632967e-06, + "loss": 1.6672, + "step": 42800 + }, + { + "epoch": 0.7625762872548438, + "grad_norm": 0.14855672419071198, + "learning_rate": 7.124873010681446e-06, + "loss": 1.6734, + "step": 42900 + }, + { + "epoch": 0.764353854358002, + "grad_norm": 0.14453125, + "learning_rate": 7.0239702345559766e-06, + "loss": 1.6629, + "step": 43000 + }, + { + "epoch": 0.7661314214611602, + "grad_norm": 0.15817640721797943, + "learning_rate": 6.923670152288514e-06, + "loss": 1.6726, + "step": 43100 + }, + { + "epoch": 0.7679089885643183, + "grad_norm": 0.14252114295959473, + "learning_rate": 6.823976126704137e-06, + "loss": 1.6561, + "step": 43200 + }, + { + "epoch": 0.7696865556674765, + "grad_norm": 0.16966678202152252, + "learning_rate": 6.724891500308264e-06, + "loss": 1.6703, + "step": 43300 + }, + { + "epoch": 0.7714641227706346, + "grad_norm": 0.1773873120546341, + "learning_rate": 6.626419595174596e-06, + "loss": 1.6519, + "step": 43400 + }, + { + "epoch": 0.7732416898737927, + "grad_norm": 0.15773586928844452, + "learning_rate": 6.528563712833738e-06, + "loss": 1.6652, + "step": 43500 + }, + { + "epoch": 0.7750192569769508, + "grad_norm": 0.13970991969108582, + "learning_rate": 6.431327134162498e-06, + "loss": 1.6402, + "step": 43600 + }, + { + "epoch": 0.776796824080109, + "grad_norm": 0.20692099630832672, + "learning_rate": 6.3347131192739105e-06, + "loss": 1.6667, + "step": 43700 + }, + { + "epoch": 0.7785743911832672, + "grad_norm": 0.14494654536247253, + "learning_rate": 6.238724907407897e-06, + "loss": 1.6598, + "step": 43800 + }, + { + "epoch": 0.7803519582864253, + "grad_norm": 0.15285950899124146, + "learning_rate": 6.143365716822691e-06, + "loss": 1.6675, + "step": 43900 + }, + { + "epoch": 0.7821295253895835, + "grad_norm": 0.1541031152009964, + "learning_rate": 6.048638744686922e-06, + "loss": 1.6609, + "step": 44000 + }, + { + "epoch": 0.7839070924927416, + "grad_norm": 0.16380751132965088, + "learning_rate": 5.954547166972424e-06, + "loss": 1.6751, + "step": 44100 + }, + { + "epoch": 0.7856846595958997, + "grad_norm": 0.1482234001159668, + "learning_rate": 5.8610941383477615e-06, + "loss": 1.6584, + "step": 44200 + }, + { + "epoch": 0.7874622266990579, + "grad_norm": 0.15207888185977936, + "learning_rate": 5.768282792072455e-06, + "loss": 1.6495, + "step": 44300 + }, + { + "epoch": 0.789239793802216, + "grad_norm": 0.14984245598316193, + "learning_rate": 5.6761162398919264e-06, + "loss": 1.6837, + "step": 44400 + }, + { + "epoch": 0.7910173609053742, + "grad_norm": 0.18983450531959534, + "learning_rate": 5.584597571933176e-06, + "loss": 1.6604, + "step": 44500 + }, + { + "epoch": 0.7927949280085324, + "grad_norm": 0.14986811578273773, + "learning_rate": 5.493729856601171e-06, + "loss": 1.6734, + "step": 44600 + }, + { + "epoch": 0.7945724951116905, + "grad_norm": 0.16871729493141174, + "learning_rate": 5.4035161404759755e-06, + "loss": 1.6535, + "step": 44700 + }, + { + "epoch": 0.7963500622148486, + "grad_norm": 0.1682252287864685, + "learning_rate": 5.313959448210609e-06, + "loss": 1.654, + "step": 44800 + }, + { + "epoch": 0.7981276293180067, + "grad_norm": 0.16398753225803375, + "learning_rate": 5.225062782429624e-06, + "loss": 1.6665, + "step": 44900 + }, + { + "epoch": 0.7999051964211649, + "grad_norm": 0.16041302680969238, + "learning_rate": 5.136829123628442e-06, + "loss": 1.6668, + "step": 45000 + }, + { + "epoch": 0.801682763524323, + "grad_norm": 0.14892232418060303, + "learning_rate": 5.049261430073432e-06, + "loss": 1.6698, + "step": 45100 + }, + { + "epoch": 0.8034603306274812, + "grad_norm": 0.15795736014842987, + "learning_rate": 4.962362637702711e-06, + "loss": 1.6449, + "step": 45200 + }, + { + "epoch": 0.8052378977306394, + "grad_norm": 0.14031356573104858, + "learning_rate": 4.8761356600277284e-06, + "loss": 1.6736, + "step": 45300 + }, + { + "epoch": 0.8070154648337975, + "grad_norm": 0.14954744279384613, + "learning_rate": 4.790583388035561e-06, + "loss": 1.6595, + "step": 45400 + }, + { + "epoch": 0.8087930319369556, + "grad_norm": 0.1448160856962204, + "learning_rate": 4.705708690092006e-06, + "loss": 1.6697, + "step": 45500 + }, + { + "epoch": 0.8105705990401137, + "grad_norm": 0.1565515398979187, + "learning_rate": 4.621514411845399e-06, + "loss": 1.6642, + "step": 45600 + }, + { + "epoch": 0.8123481661432719, + "grad_norm": 0.17093238234519958, + "learning_rate": 4.5380033761312e-06, + "loss": 1.6697, + "step": 45700 + }, + { + "epoch": 0.81412573324643, + "grad_norm": 0.14756350219249725, + "learning_rate": 4.4560032277625644e-06, + "loss": 1.6726, + "step": 45800 + }, + { + "epoch": 0.8159033003495882, + "grad_norm": 0.15235283970832825, + "learning_rate": 4.373860152031772e-06, + "loss": 1.6617, + "step": 45900 + }, + { + "epoch": 0.8176808674527464, + "grad_norm": 0.16607356071472168, + "learning_rate": 4.292408622096306e-06, + "loss": 1.6648, + "step": 46000 + }, + { + "epoch": 0.8194584345559045, + "grad_norm": 0.1486140936613083, + "learning_rate": 4.211651368833752e-06, + "loss": 1.6664, + "step": 46100 + }, + { + "epoch": 0.8212360016590626, + "grad_norm": 0.1572180539369583, + "learning_rate": 4.131591099844242e-06, + "loss": 1.667, + "step": 46200 + }, + { + "epoch": 0.8230135687622208, + "grad_norm": 0.1429453045129776, + "learning_rate": 4.052230499359672e-06, + "loss": 1.6601, + "step": 46300 + }, + { + "epoch": 0.8247911358653789, + "grad_norm": 0.2002544403076172, + "learning_rate": 3.973572228153693e-06, + "loss": 1.6767, + "step": 46400 + }, + { + "epoch": 0.826568702968537, + "grad_norm": 0.2108883261680603, + "learning_rate": 3.895618923452526e-06, + "loss": 1.6682, + "step": 46500 + }, + { + "epoch": 0.8283462700716953, + "grad_norm": 0.16697534918785095, + "learning_rate": 3.818373198846526e-06, + "loss": 1.6739, + "step": 46600 + }, + { + "epoch": 0.8301238371748534, + "grad_norm": 0.21570728719234467, + "learning_rate": 3.741837644202542e-06, + "loss": 1.6602, + "step": 46700 + }, + { + "epoch": 0.8319014042780115, + "grad_norm": 0.14422467350959778, + "learning_rate": 3.6660148255771187e-06, + "loss": 1.6702, + "step": 46800 + }, + { + "epoch": 0.8336789713811696, + "grad_norm": 0.19726932048797607, + "learning_rate": 3.590907285130435e-06, + "loss": 1.6776, + "step": 46900 + }, + { + "epoch": 0.8354565384843278, + "grad_norm": 0.15185341238975525, + "learning_rate": 3.5165175410410838e-06, + "loss": 1.6568, + "step": 47000 + }, + { + "epoch": 0.8372341055874859, + "grad_norm": 0.13279466331005096, + "learning_rate": 3.4428480874216407e-06, + "loss": 1.6847, + "step": 47100 + }, + { + "epoch": 0.839011672690644, + "grad_norm": 0.15089605748653412, + "learning_rate": 3.3699013942350367e-06, + "loss": 1.6742, + "step": 47200 + }, + { + "epoch": 0.8407892397938023, + "grad_norm": 0.17505663633346558, + "learning_rate": 3.2976799072117564e-06, + "loss": 1.6749, + "step": 47300 + }, + { + "epoch": 0.8425668068969604, + "grad_norm": 0.1557130366563797, + "learning_rate": 3.226186047767829e-06, + "loss": 1.6768, + "step": 47400 + }, + { + "epoch": 0.8443443740001185, + "grad_norm": 0.17593321204185486, + "learning_rate": 3.1554222129236505e-06, + "loss": 1.6651, + "step": 47500 + }, + { + "epoch": 0.8461219411032767, + "grad_norm": 0.14146077632904053, + "learning_rate": 3.0853907752236123e-06, + "loss": 1.6463, + "step": 47600 + }, + { + "epoch": 0.8478995082064348, + "grad_norm": 0.16316835582256317, + "learning_rate": 3.0160940826565566e-06, + "loss": 1.665, + "step": 47700 + }, + { + "epoch": 0.8496770753095929, + "grad_norm": 0.15772958099842072, + "learning_rate": 2.947534458577067e-06, + "loss": 1.6691, + "step": 47800 + }, + { + "epoch": 0.851454642412751, + "grad_norm": 0.15084761381149292, + "learning_rate": 2.879714201627548e-06, + "loss": 1.6562, + "step": 47900 + }, + { + "epoch": 0.8532322095159093, + "grad_norm": 0.1439027339220047, + "learning_rate": 2.812635585661169e-06, + "loss": 1.6736, + "step": 48000 + }, + { + "epoch": 0.8550097766190674, + "grad_norm": 0.14739972352981567, + "learning_rate": 2.746300859665632e-06, + "loss": 1.656, + "step": 48100 + }, + { + "epoch": 0.8567873437222255, + "grad_norm": 0.14110083878040314, + "learning_rate": 2.6807122476877637e-06, + "loss": 1.6568, + "step": 48200 + }, + { + "epoch": 0.8585649108253837, + "grad_norm": 0.17874117195606232, + "learning_rate": 2.6158719487589467e-06, + "loss": 1.6855, + "step": 48300 + }, + { + "epoch": 0.8603424779285418, + "grad_norm": 0.20830568671226501, + "learning_rate": 2.5517821368213927e-06, + "loss": 1.6613, + "step": 48400 + }, + { + "epoch": 0.8621200450316999, + "grad_norm": 0.24184918403625488, + "learning_rate": 2.4884449606552564e-06, + "loss": 1.6665, + "step": 48500 + }, + { + "epoch": 0.863897612134858, + "grad_norm": 0.18940667808055878, + "learning_rate": 2.4258625438065898e-06, + "loss": 1.6668, + "step": 48600 + }, + { + "epoch": 0.8656751792380163, + "grad_norm": 0.1848708689212799, + "learning_rate": 2.3640369845161464e-06, + "loss": 1.6668, + "step": 48700 + }, + { + "epoch": 0.8674527463411744, + "grad_norm": 0.14313864707946777, + "learning_rate": 2.302970355649034e-06, + "loss": 1.6648, + "step": 48800 + }, + { + "epoch": 0.8692303134443325, + "grad_norm": 0.144060418009758, + "learning_rate": 2.242664704625216e-06, + "loss": 1.6684, + "step": 48900 + }, + { + "epoch": 0.8710078805474907, + "grad_norm": 0.1597507745027542, + "learning_rate": 2.1831220533508556e-06, + "loss": 1.6655, + "step": 49000 + }, + { + "epoch": 0.8727854476506488, + "grad_norm": 0.1441722959280014, + "learning_rate": 2.124344398150546e-06, + "loss": 1.6778, + "step": 49100 + }, + { + "epoch": 0.8745630147538069, + "grad_norm": 0.16209940612316132, + "learning_rate": 2.0663337097003576e-06, + "loss": 1.6608, + "step": 49200 + }, + { + "epoch": 0.8763405818569651, + "grad_norm": 0.33218371868133545, + "learning_rate": 2.0090919329617876e-06, + "loss": 1.6411, + "step": 49300 + }, + { + "epoch": 0.8781181489601232, + "grad_norm": 0.1453067809343338, + "learning_rate": 1.9526209871165184e-06, + "loss": 1.6652, + "step": 49400 + }, + { + "epoch": 0.8798957160632814, + "grad_norm": 0.18067798018455505, + "learning_rate": 1.8969227655021098e-06, + "loss": 1.6777, + "step": 49500 + }, + { + "epoch": 0.8816732831664396, + "grad_norm": 0.14852333068847656, + "learning_rate": 1.8419991355484945e-06, + "loss": 1.6616, + "step": 49600 + }, + { + "epoch": 0.8834508502695977, + "grad_norm": 0.17207376658916473, + "learning_rate": 1.7878519387153763e-06, + "loss": 1.6693, + "step": 49700 + }, + { + "epoch": 0.8852284173727558, + "grad_norm": Infinity, + "learning_rate": 1.7350128216860744e-06, + "loss": 1.6699, + "step": 49800 + }, + { + "epoch": 0.8870059844759139, + "grad_norm": 0.1677992343902588, + "learning_rate": 1.6824161021340963e-06, + "loss": 1.662, + "step": 49900 + }, + { + "epoch": 0.8887835515790721, + "grad_norm": 0.1469300240278244, + "learning_rate": 1.6306011661451375e-06, + "loss": 1.6804, + "step": 50000 + }, + { + "epoch": 0.8905611186822302, + "grad_norm": 0.17192143201828003, + "learning_rate": 1.5795697509517316e-06, + "loss": 1.6682, + "step": 50100 + }, + { + "epoch": 0.8923386857853884, + "grad_norm": 0.16549953818321228, + "learning_rate": 1.529323567516805e-06, + "loss": 1.6442, + "step": 50200 + }, + { + "epoch": 0.8941162528885466, + "grad_norm": 0.21391679346561432, + "learning_rate": 1.4803549924437943e-06, + "loss": 1.6649, + "step": 50300 + }, + { + "epoch": 0.8958938199917047, + "grad_norm": 0.14870049059391022, + "learning_rate": 1.4316764061822001e-06, + "loss": 1.6526, + "step": 50400 + }, + { + "epoch": 0.8976713870948628, + "grad_norm": 0.14293646812438965, + "learning_rate": 1.3837880101939342e-06, + "loss": 1.6585, + "step": 50500 + }, + { + "epoch": 0.899448954198021, + "grad_norm": 0.19570215046405792, + "learning_rate": 1.3366914100639061e-06, + "loss": 1.6568, + "step": 50600 + }, + { + "epoch": 0.9012265213011791, + "grad_norm": 0.13882361352443695, + "learning_rate": 1.2903881848299797e-06, + "loss": 1.6541, + "step": 50700 + }, + { + "epoch": 0.9030040884043372, + "grad_norm": 0.1370488852262497, + "learning_rate": 1.244879886930031e-06, + "loss": 1.6625, + "step": 50800 + }, + { + "epoch": 0.9047816555074955, + "grad_norm": 0.18005253374576569, + "learning_rate": 1.200168042149899e-06, + "loss": 1.6708, + "step": 50900 + }, + { + "epoch": 0.9065592226106536, + "grad_norm": 0.18435104191303253, + "learning_rate": 1.156254149572225e-06, + "loss": 1.6642, + "step": 51000 + }, + { + "epoch": 0.9083367897138117, + "grad_norm": 0.1873762458562851, + "learning_rate": 1.1131396815261985e-06, + "loss": 1.6561, + "step": 51100 + }, + { + "epoch": 0.9101143568169698, + "grad_norm": 0.1366182565689087, + "learning_rate": 1.0708260835381927e-06, + "loss": 1.6456, + "step": 51200 + }, + { + "epoch": 0.911891923920128, + "grad_norm": 0.16981613636016846, + "learning_rate": 1.0293147742832966e-06, + "loss": 1.6732, + "step": 51300 + }, + { + "epoch": 0.9136694910232861, + "grad_norm": 0.18389706313610077, + "learning_rate": 9.88607145537751e-07, + "loss": 1.6679, + "step": 51400 + }, + { + "epoch": 0.9154470581264442, + "grad_norm": 0.15548266470432281, + "learning_rate": 9.487045621322799e-07, + "loss": 1.6619, + "step": 51500 + }, + { + "epoch": 0.9172246252296025, + "grad_norm": 0.18912291526794434, + "learning_rate": 9.096083619063473e-07, + "loss": 1.6736, + "step": 51600 + }, + { + "epoch": 0.9190021923327606, + "grad_norm": 0.16964443027973175, + "learning_rate": 8.713198556632885e-07, + "loss": 1.6748, + "step": 51700 + }, + { + "epoch": 0.9207797594359187, + "grad_norm": 0.14487318694591522, + "learning_rate": 8.338403271263589e-07, + "loss": 1.6692, + "step": 51800 + }, + { + "epoch": 0.9225573265390768, + "grad_norm": 0.15898752212524414, + "learning_rate": 7.971710328957132e-07, + "loss": 1.6646, + "step": 51900 + }, + { + "epoch": 0.924334893642235, + "grad_norm": 0.15839669108390808, + "learning_rate": 7.613132024062469e-07, + "loss": 1.662, + "step": 52000 + }, + { + "epoch": 0.9261124607453931, + "grad_norm": 0.15475721657276154, + "learning_rate": 7.262680378864017e-07, + "loss": 1.6671, + "step": 52100 + }, + { + "epoch": 0.9278900278485512, + "grad_norm": 0.1529570072889328, + "learning_rate": 6.920367143178452e-07, + "loss": 1.6726, + "step": 52200 + }, + { + "epoch": 0.9296675949517095, + "grad_norm": 0.1656341701745987, + "learning_rate": 6.586203793960771e-07, + "loss": 1.6623, + "step": 52300 + }, + { + "epoch": 0.9314451620548676, + "grad_norm": 0.1463242918252945, + "learning_rate": 6.260201534919491e-07, + "loss": 1.6662, + "step": 52400 + }, + { + "epoch": 0.9332227291580257, + "grad_norm": 0.13823845982551575, + "learning_rate": 5.942371296141058e-07, + "loss": 1.6453, + "step": 52500 + }, + { + "epoch": 0.9350002962611839, + "grad_norm": 0.14740724861621857, + "learning_rate": 5.632723733723366e-07, + "loss": 1.6574, + "step": 52600 + }, + { + "epoch": 0.936777863364342, + "grad_norm": 0.13956210017204285, + "learning_rate": 5.331269229418484e-07, + "loss": 1.6577, + "step": 52700 + }, + { + "epoch": 0.9385554304675001, + "grad_norm": 0.14831486344337463, + "learning_rate": 5.038017890284547e-07, + "loss": 1.6491, + "step": 52800 + }, + { + "epoch": 0.9403329975706582, + "grad_norm": 0.17318527400493622, + "learning_rate": 4.75297954834697e-07, + "loss": 1.6704, + "step": 52900 + }, + { + "epoch": 0.9421105646738165, + "grad_norm": 0.15295392274856567, + "learning_rate": 4.476163760268659e-07, + "loss": 1.6386, + "step": 53000 + }, + { + "epoch": 0.9438881317769746, + "grad_norm": 0.1569519191980362, + "learning_rate": 4.207579807029821e-07, + "loss": 1.6618, + "step": 53100 + }, + { + "epoch": 0.9456656988801327, + "grad_norm": 0.14801190793514252, + "learning_rate": 3.947236693616574e-07, + "loss": 1.6625, + "step": 53200 + }, + { + "epoch": 0.9474432659832909, + "grad_norm": 0.16047972440719604, + "learning_rate": 3.697623220822066e-07, + "loss": 1.6702, + "step": 53300 + }, + { + "epoch": 0.949220833086449, + "grad_norm": 0.17775191366672516, + "learning_rate": 3.453705075406932e-07, + "loss": 1.6671, + "step": 53400 + }, + { + "epoch": 0.9509984001896071, + "grad_norm": 0.15378819406032562, + "learning_rate": 3.218053045458136e-07, + "loss": 1.6644, + "step": 53500 + }, + { + "epoch": 0.9527759672927653, + "grad_norm": 0.14244551956653595, + "learning_rate": 2.990675031832174e-07, + "loss": 1.6791, + "step": 53600 + }, + { + "epoch": 0.9545535343959235, + "grad_norm": 0.1715448796749115, + "learning_rate": 2.7715786579772527e-07, + "loss": 1.6614, + "step": 53700 + }, + { + "epoch": 0.9563311014990816, + "grad_norm": 0.15638667345046997, + "learning_rate": 2.560771269677742e-07, + "loss": 1.6782, + "step": 53800 + }, + { + "epoch": 0.9581086686022398, + "grad_norm": 0.18975552916526794, + "learning_rate": 2.358259934807927e-07, + "loss": 1.6656, + "step": 53900 + }, + { + "epoch": 0.9598862357053979, + "grad_norm": 0.14271163940429688, + "learning_rate": 2.1640514430950055e-07, + "loss": 1.6574, + "step": 54000 + }, + { + "epoch": 0.961663802808556, + "grad_norm": 0.20691907405853271, + "learning_rate": 1.978152305891351e-07, + "loss": 1.6523, + "step": 54100 + }, + { + "epoch": 0.9634413699117141, + "grad_norm": 0.15907599031925201, + "learning_rate": 1.8005687559563834e-07, + "loss": 1.6763, + "step": 54200 + }, + { + "epoch": 0.9652189370148723, + "grad_norm": 0.13600093126296997, + "learning_rate": 1.6313067472474576e-07, + "loss": 1.6771, + "step": 54300 + }, + { + "epoch": 0.9669965041180305, + "grad_norm": 0.157552108168602, + "learning_rate": 1.470371954720301e-07, + "loss": 1.6601, + "step": 54400 + }, + { + "epoch": 0.9687740712211886, + "grad_norm": 0.14566664397716522, + "learning_rate": 1.3177697741387218e-07, + "loss": 1.6758, + "step": 54500 + }, + { + "epoch": 0.9705516383243468, + "grad_norm": 0.16759639978408813, + "learning_rate": 1.1735053218937808e-07, + "loss": 1.6591, + "step": 54600 + }, + { + "epoch": 0.9723292054275049, + "grad_norm": 0.16849561035633087, + "learning_rate": 1.0375834348320401e-07, + "loss": 1.6756, + "step": 54700 + }, + { + "epoch": 0.974106772530663, + "grad_norm": 0.17653703689575195, + "learning_rate": 9.100086700936649e-08, + "loss": 1.6621, + "step": 54800 + }, + { + "epoch": 0.9758843396338212, + "grad_norm": 0.17266112565994263, + "learning_rate": 7.907853049594905e-08, + "loss": 1.673, + "step": 54900 + }, + { + "epoch": 0.9776619067369793, + "grad_norm": 0.1719112992286682, + "learning_rate": 6.799173367075528e-08, + "loss": 1.6574, + "step": 55000 + }, + { + "epoch": 0.9794394738401375, + "grad_norm": 0.1552852988243103, + "learning_rate": 5.774084824792247e-08, + "loss": 1.6673, + "step": 55100 + }, + { + "epoch": 0.9812170409432956, + "grad_norm": 0.1883048564195633, + "learning_rate": 4.8326217915448114e-08, + "loss": 1.6688, + "step": 55200 + }, + { + "epoch": 0.9829946080464538, + "grad_norm": 0.16423700749874115, + "learning_rate": 3.97481583236714e-08, + "loss": 1.6706, + "step": 55300 + }, + { + "epoch": 0.9847721751496119, + "grad_norm": 0.14842714369297028, + "learning_rate": 3.2006957074690035e-08, + "loss": 1.6586, + "step": 55400 + }, + { + "epoch": 0.98654974225277, + "grad_norm": 0.13827410340309143, + "learning_rate": 2.510287371270681e-08, + "loss": 1.6537, + "step": 55500 + }, + { + "epoch": 0.9883273093559282, + "grad_norm": 0.14455120265483856, + "learning_rate": 1.903613971535323e-08, + "loss": 1.6689, + "step": 55600 + }, + { + "epoch": 0.9901048764590863, + "grad_norm": 0.14416253566741943, + "learning_rate": 1.385510381303745e-08, + "loss": 1.6618, + "step": 55700 + }, + { + "epoch": 0.9918824435622444, + "grad_norm": 0.14247511327266693, + "learning_rate": 9.455272617062139e-09, + "loss": 1.6753, + "step": 55800 + }, + { + "epoch": 0.9936600106654027, + "grad_norm": 0.15325996279716492, + "learning_rate": 5.893315412855427e-09, + "loss": 1.6689, + "step": 55900 + }, + { + "epoch": 0.9954375777685608, + "grad_norm": 0.17999306321144104, + "learning_rate": 3.169351624432437e-09, + "loss": 1.6623, + "step": 56000 + }, + { + "epoch": 0.9972151448717189, + "grad_norm": 0.27596476674079895, + "learning_rate": 1.283472579871603e-09, + "loss": 1.67, + "step": 56100 + }, + { + "epoch": 0.998992711974877, + "grad_norm": 0.14970412850379944, + "learning_rate": 2.3574150824490215e-10, + "loss": 1.6558, + "step": 56200 + } + ], + "logging_steps": 100, + "max_steps": 56256, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6446037739742167e+18, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}