{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8261617900172117, "eval_steps": 375, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005507745266781412, "grad_norm": 73.5024642944336, "learning_rate": 2e-05, "loss": 0.7928, "step": 1 }, { "epoch": 0.0011015490533562823, "grad_norm": 133.29066467285156, "learning_rate": 4e-05, "loss": 1.3315, "step": 2 }, { "epoch": 0.0016523235800344235, "grad_norm": 145.70004272460938, "learning_rate": 6e-05, "loss": 1.3951, "step": 3 }, { "epoch": 0.0022030981067125647, "grad_norm": 114.30488586425781, "learning_rate": 8e-05, "loss": 1.3063, "step": 4 }, { "epoch": 0.002753872633390706, "grad_norm": 95.38715362548828, "learning_rate": 0.0001, "loss": 0.9302, "step": 5 }, { "epoch": 0.003304647160068847, "grad_norm": 77.89411926269531, "learning_rate": 0.00012, "loss": 0.5725, "step": 6 }, { "epoch": 0.0038554216867469878, "grad_norm": 74.4919662475586, "learning_rate": 0.00014, "loss": 0.2181, "step": 7 }, { "epoch": 0.004406196213425129, "grad_norm": 22.369192123413086, "learning_rate": 0.00016, "loss": 0.0416, "step": 8 }, { "epoch": 0.00495697074010327, "grad_norm": 16.90326499938965, "learning_rate": 0.00018, "loss": 0.0181, "step": 9 }, { "epoch": 0.005507745266781412, "grad_norm": 26.334524154663086, "learning_rate": 0.0002, "loss": 0.0573, "step": 10 }, { "epoch": 0.0060585197934595525, "grad_norm": 14.402909278869629, "learning_rate": 0.00019999977772170748, "loss": 0.0428, "step": 11 }, { "epoch": 0.006609294320137694, "grad_norm": 32.174686431884766, "learning_rate": 0.00019999911088781805, "loss": 0.0081, "step": 12 }, { "epoch": 0.007160068846815835, "grad_norm": 3.5905063152313232, "learning_rate": 0.0001999979995012962, "loss": 0.0027, "step": 13 }, { "epoch": 0.0077108433734939755, "grad_norm": 32.16322708129883, "learning_rate": 0.00019999644356708261, "loss": 0.0368, "step": 14 }, { "epoch": 0.008261617900172116, "grad_norm": 1.719008445739746, "learning_rate": 0.00019999444309209432, "loss": 0.0012, "step": 15 }, { "epoch": 0.008812392426850259, "grad_norm": 37.352081298828125, "learning_rate": 0.0001999919980852246, "loss": 0.098, "step": 16 }, { "epoch": 0.0093631669535284, "grad_norm": 100.51689910888672, "learning_rate": 0.00019998910855734288, "loss": 0.0998, "step": 17 }, { "epoch": 0.00991394148020654, "grad_norm": 26.180370330810547, "learning_rate": 0.0001999857745212947, "loss": 0.1002, "step": 18 }, { "epoch": 0.010464716006884681, "grad_norm": 7.088527679443359, "learning_rate": 0.00019998199599190178, "loss": 0.0041, "step": 19 }, { "epoch": 0.011015490533562823, "grad_norm": 16.436443328857422, "learning_rate": 0.0001999777729859618, "loss": 0.0974, "step": 20 }, { "epoch": 0.011566265060240964, "grad_norm": 27.93756103515625, "learning_rate": 0.00019997310552224846, "loss": 0.0986, "step": 21 }, { "epoch": 0.012117039586919105, "grad_norm": 41.224544525146484, "learning_rate": 0.00019996799362151122, "loss": 0.0394, "step": 22 }, { "epoch": 0.012667814113597246, "grad_norm": 9.353221893310547, "learning_rate": 0.00019996243730647538, "loss": 0.0263, "step": 23 }, { "epoch": 0.013218588640275388, "grad_norm": 10.83764362335205, "learning_rate": 0.00019995643660184191, "loss": 0.0392, "step": 24 }, { "epoch": 0.013769363166953529, "grad_norm": 14.599753379821777, "learning_rate": 0.00019994999153428737, "loss": 0.0732, "step": 25 }, { "epoch": 0.01432013769363167, "grad_norm": 13.742327690124512, "learning_rate": 0.00019994310213246368, "loss": 0.0984, "step": 26 }, { "epoch": 0.01487091222030981, "grad_norm": 1.8473610877990723, "learning_rate": 0.00019993576842699816, "loss": 0.0026, "step": 27 }, { "epoch": 0.015421686746987951, "grad_norm": 9.659041404724121, "learning_rate": 0.0001999279904504933, "loss": 0.0355, "step": 28 }, { "epoch": 0.015972461273666094, "grad_norm": 7.979834079742432, "learning_rate": 0.00019991976823752653, "loss": 0.013, "step": 29 }, { "epoch": 0.016523235800344233, "grad_norm": 6.332828044891357, "learning_rate": 0.00019991110182465032, "loss": 0.0078, "step": 30 }, { "epoch": 0.017074010327022375, "grad_norm": 15.081528663635254, "learning_rate": 0.00019990199125039174, "loss": 0.0472, "step": 31 }, { "epoch": 0.017624784853700518, "grad_norm": 1.6818047761917114, "learning_rate": 0.00019989243655525247, "loss": 0.0011, "step": 32 }, { "epoch": 0.018175559380378657, "grad_norm": 16.635372161865234, "learning_rate": 0.00019988243778170853, "loss": 0.101, "step": 33 }, { "epoch": 0.0187263339070568, "grad_norm": 15.169713973999023, "learning_rate": 0.0001998719949742101, "loss": 0.0786, "step": 34 }, { "epoch": 0.01927710843373494, "grad_norm": 19.847829818725586, "learning_rate": 0.0001998611081791814, "loss": 0.0138, "step": 35 }, { "epoch": 0.01982788296041308, "grad_norm": 9.9878568649292, "learning_rate": 0.00019984977744502038, "loss": 0.0151, "step": 36 }, { "epoch": 0.020378657487091223, "grad_norm": 8.012694358825684, "learning_rate": 0.00019983800282209857, "loss": 0.0237, "step": 37 }, { "epoch": 0.020929432013769362, "grad_norm": 18.2310848236084, "learning_rate": 0.00019982578436276082, "loss": 0.0546, "step": 38 }, { "epoch": 0.021480206540447504, "grad_norm": 12.813060760498047, "learning_rate": 0.00019981312212132512, "loss": 0.0115, "step": 39 }, { "epoch": 0.022030981067125647, "grad_norm": 13.290501594543457, "learning_rate": 0.00019980001615408228, "loss": 0.0362, "step": 40 }, { "epoch": 0.022581755593803786, "grad_norm": 10.766707420349121, "learning_rate": 0.00019978646651929572, "loss": 0.0074, "step": 41 }, { "epoch": 0.02313253012048193, "grad_norm": 4.9491753578186035, "learning_rate": 0.00019977247327720128, "loss": 0.0239, "step": 42 }, { "epoch": 0.023683304647160067, "grad_norm": 16.821706771850586, "learning_rate": 0.0001997580364900068, "loss": 0.0492, "step": 43 }, { "epoch": 0.02423407917383821, "grad_norm": 6.35998010635376, "learning_rate": 0.000199743156221892, "loss": 0.0128, "step": 44 }, { "epoch": 0.024784853700516352, "grad_norm": 0.9552410244941711, "learning_rate": 0.00019972783253900808, "loss": 0.002, "step": 45 }, { "epoch": 0.02533562822719449, "grad_norm": 9.008210182189941, "learning_rate": 0.00019971206550947748, "loss": 0.044, "step": 46 }, { "epoch": 0.025886402753872634, "grad_norm": 0.973410427570343, "learning_rate": 0.00019969585520339354, "loss": 0.001, "step": 47 }, { "epoch": 0.026437177280550776, "grad_norm": 6.15267276763916, "learning_rate": 0.0001996792016928203, "loss": 0.0056, "step": 48 }, { "epoch": 0.026987951807228915, "grad_norm": 28.659374237060547, "learning_rate": 0.00019966210505179197, "loss": 0.0666, "step": 49 }, { "epoch": 0.027538726333907058, "grad_norm": 3.20442533493042, "learning_rate": 0.00019964456535631286, "loss": 0.0024, "step": 50 }, { "epoch": 0.028089500860585197, "grad_norm": 20.461469650268555, "learning_rate": 0.0001996265826843568, "loss": 0.2034, "step": 51 }, { "epoch": 0.02864027538726334, "grad_norm": 4.691360950469971, "learning_rate": 0.00019960815711586696, "loss": 0.0152, "step": 52 }, { "epoch": 0.02919104991394148, "grad_norm": 2.52117919921875, "learning_rate": 0.00019958928873275539, "loss": 0.0084, "step": 53 }, { "epoch": 0.02974182444061962, "grad_norm": 1.440590262413025, "learning_rate": 0.00019956997761890277, "loss": 0.0015, "step": 54 }, { "epoch": 0.030292598967297763, "grad_norm": 0.34622547030448914, "learning_rate": 0.00019955022386015792, "loss": 0.0002, "step": 55 }, { "epoch": 0.030843373493975902, "grad_norm": 7.247194290161133, "learning_rate": 0.00019953002754433743, "loss": 0.0351, "step": 56 }, { "epoch": 0.031394148020654045, "grad_norm": 9.206347465515137, "learning_rate": 0.00019950938876122542, "loss": 0.0613, "step": 57 }, { "epoch": 0.03194492254733219, "grad_norm": 4.78291130065918, "learning_rate": 0.00019948830760257291, "loss": 0.0099, "step": 58 }, { "epoch": 0.03249569707401033, "grad_norm": 9.625162124633789, "learning_rate": 0.0001994667841620976, "loss": 0.0162, "step": 59 }, { "epoch": 0.033046471600688465, "grad_norm": 15.956674575805664, "learning_rate": 0.00019944481853548335, "loss": 0.0573, "step": 60 }, { "epoch": 0.03359724612736661, "grad_norm": 3.3552818298339844, "learning_rate": 0.00019942241082037982, "loss": 0.0143, "step": 61 }, { "epoch": 0.03414802065404475, "grad_norm": 0.20101390779018402, "learning_rate": 0.00019939956111640197, "loss": 0.0001, "step": 62 }, { "epoch": 0.03469879518072289, "grad_norm": 15.361065864562988, "learning_rate": 0.00019937626952512964, "loss": 0.0097, "step": 63 }, { "epoch": 0.035249569707401035, "grad_norm": 0.7548312544822693, "learning_rate": 0.0001993525361501072, "loss": 0.0005, "step": 64 }, { "epoch": 0.03580034423407917, "grad_norm": 11.565154075622559, "learning_rate": 0.00019932836109684286, "loss": 0.0308, "step": 65 }, { "epoch": 0.03635111876075731, "grad_norm": 44.96464920043945, "learning_rate": 0.00019930374447280845, "loss": 0.0886, "step": 66 }, { "epoch": 0.036901893287435455, "grad_norm": 15.662043571472168, "learning_rate": 0.00019927868638743875, "loss": 0.0833, "step": 67 }, { "epoch": 0.0374526678141136, "grad_norm": 9.843422889709473, "learning_rate": 0.0001992531869521312, "loss": 0.0314, "step": 68 }, { "epoch": 0.03800344234079174, "grad_norm": 2.948150157928467, "learning_rate": 0.00019922724628024515, "loss": 0.0029, "step": 69 }, { "epoch": 0.03855421686746988, "grad_norm": 12.844285011291504, "learning_rate": 0.0001992008644871016, "loss": 0.0111, "step": 70 }, { "epoch": 0.03910499139414802, "grad_norm": 4.494997024536133, "learning_rate": 0.00019917404168998256, "loss": 0.0142, "step": 71 }, { "epoch": 0.03965576592082616, "grad_norm": 5.065650939941406, "learning_rate": 0.0001991467780081305, "loss": 0.0203, "step": 72 }, { "epoch": 0.0402065404475043, "grad_norm": 3.4543423652648926, "learning_rate": 0.00019911907356274795, "loss": 0.0145, "step": 73 }, { "epoch": 0.040757314974182446, "grad_norm": 4.724408149719238, "learning_rate": 0.00019909092847699683, "loss": 0.0178, "step": 74 }, { "epoch": 0.04130808950086059, "grad_norm": 16.997365951538086, "learning_rate": 0.00019906234287599798, "loss": 0.0757, "step": 75 }, { "epoch": 0.041858864027538724, "grad_norm": 5.861988544464111, "learning_rate": 0.00019903331688683057, "loss": 0.0123, "step": 76 }, { "epoch": 0.042409638554216866, "grad_norm": 6.995002746582031, "learning_rate": 0.00019900385063853154, "loss": 0.0073, "step": 77 }, { "epoch": 0.04296041308089501, "grad_norm": 20.438507080078125, "learning_rate": 0.00019897394426209505, "loss": 0.0754, "step": 78 }, { "epoch": 0.04351118760757315, "grad_norm": 0.08117744326591492, "learning_rate": 0.00019894359789047187, "loss": 0.0001, "step": 79 }, { "epoch": 0.044061962134251294, "grad_norm": 27.27115821838379, "learning_rate": 0.00019891281165856873, "loss": 0.0794, "step": 80 }, { "epoch": 0.04461273666092943, "grad_norm": 2.5560953617095947, "learning_rate": 0.00019888158570324795, "loss": 0.0016, "step": 81 }, { "epoch": 0.04516351118760757, "grad_norm": 6.3870697021484375, "learning_rate": 0.0001988499201633265, "loss": 0.0192, "step": 82 }, { "epoch": 0.045714285714285714, "grad_norm": 0.5458478927612305, "learning_rate": 0.00019881781517957562, "loss": 0.0003, "step": 83 }, { "epoch": 0.04626506024096386, "grad_norm": 2.0009050369262695, "learning_rate": 0.0001987852708947202, "loss": 0.0014, "step": 84 }, { "epoch": 0.046815834767642, "grad_norm": 7.568767547607422, "learning_rate": 0.00019875228745343794, "loss": 0.0483, "step": 85 }, { "epoch": 0.047366609294320135, "grad_norm": 9.756795883178711, "learning_rate": 0.0001987188650023589, "loss": 0.0379, "step": 86 }, { "epoch": 0.04791738382099828, "grad_norm": 6.3170905113220215, "learning_rate": 0.0001986850036900648, "loss": 0.0014, "step": 87 }, { "epoch": 0.04846815834767642, "grad_norm": 1.1404838562011719, "learning_rate": 0.00019865070366708836, "loss": 0.0004, "step": 88 }, { "epoch": 0.04901893287435456, "grad_norm": 4.913083076477051, "learning_rate": 0.00019861596508591255, "loss": 0.009, "step": 89 }, { "epoch": 0.049569707401032705, "grad_norm": 1.6249858140945435, "learning_rate": 0.00019858078810097002, "loss": 0.0034, "step": 90 }, { "epoch": 0.05012048192771084, "grad_norm": 14.43410587310791, "learning_rate": 0.00019854517286864245, "loss": 0.0524, "step": 91 }, { "epoch": 0.05067125645438898, "grad_norm": 12.45235824584961, "learning_rate": 0.0001985091195472596, "loss": 0.0044, "step": 92 }, { "epoch": 0.051222030981067125, "grad_norm": 6.194559574127197, "learning_rate": 0.0001984726282970989, "loss": 0.0207, "step": 93 }, { "epoch": 0.05177280550774527, "grad_norm": 29.69363021850586, "learning_rate": 0.0001984356992803847, "loss": 0.0117, "step": 94 }, { "epoch": 0.05232358003442341, "grad_norm": 13.536998748779297, "learning_rate": 0.00019839833266128724, "loss": 0.0559, "step": 95 }, { "epoch": 0.05287435456110155, "grad_norm": 3.107300281524658, "learning_rate": 0.00019836052860592237, "loss": 0.0142, "step": 96 }, { "epoch": 0.05342512908777969, "grad_norm": 0.012410641647875309, "learning_rate": 0.0001983222872823505, "loss": 0.0, "step": 97 }, { "epoch": 0.05397590361445783, "grad_norm": 0.7991882562637329, "learning_rate": 0.00019828360886057594, "loss": 0.0006, "step": 98 }, { "epoch": 0.05452667814113597, "grad_norm": 21.360546112060547, "learning_rate": 0.00019824449351254616, "loss": 0.0475, "step": 99 }, { "epoch": 0.055077452667814115, "grad_norm": 4.854896545410156, "learning_rate": 0.00019820494141215104, "loss": 0.0026, "step": 100 }, { "epoch": 0.05562822719449226, "grad_norm": 20.94474220275879, "learning_rate": 0.000198164952735222, "loss": 0.0792, "step": 101 }, { "epoch": 0.056179001721170393, "grad_norm": 22.7581787109375, "learning_rate": 0.00019812452765953135, "loss": 0.0946, "step": 102 }, { "epoch": 0.056729776247848536, "grad_norm": 0.20172899961471558, "learning_rate": 0.00019808366636479147, "loss": 0.0001, "step": 103 }, { "epoch": 0.05728055077452668, "grad_norm": 0.8026621341705322, "learning_rate": 0.00019804236903265388, "loss": 0.0006, "step": 104 }, { "epoch": 0.05783132530120482, "grad_norm": 38.03317642211914, "learning_rate": 0.00019800063584670863, "loss": 0.0276, "step": 105 }, { "epoch": 0.05838209982788296, "grad_norm": 7.372140407562256, "learning_rate": 0.00019795846699248332, "loss": 0.0125, "step": 106 }, { "epoch": 0.0589328743545611, "grad_norm": 3.519967794418335, "learning_rate": 0.00019791586265744237, "loss": 0.0014, "step": 107 }, { "epoch": 0.05948364888123924, "grad_norm": 0.10014593601226807, "learning_rate": 0.00019787282303098617, "loss": 0.0001, "step": 108 }, { "epoch": 0.060034423407917384, "grad_norm": 2.4528684616088867, "learning_rate": 0.0001978293483044502, "loss": 0.0018, "step": 109 }, { "epoch": 0.060585197934595526, "grad_norm": 6.558525085449219, "learning_rate": 0.00019778543867110426, "loss": 0.0175, "step": 110 }, { "epoch": 0.06113597246127367, "grad_norm": 15.120335578918457, "learning_rate": 0.00019774109432615147, "loss": 0.0641, "step": 111 }, { "epoch": 0.061686746987951804, "grad_norm": 10.05320930480957, "learning_rate": 0.00019769631546672756, "loss": 0.0237, "step": 112 }, { "epoch": 0.06223752151462995, "grad_norm": 0.3913431763648987, "learning_rate": 0.00019765110229189988, "loss": 0.0003, "step": 113 }, { "epoch": 0.06278829604130809, "grad_norm": 0.2732701301574707, "learning_rate": 0.00019760545500266657, "loss": 0.0002, "step": 114 }, { "epoch": 0.06333907056798622, "grad_norm": 11.495320320129395, "learning_rate": 0.00019755937380195568, "loss": 0.0227, "step": 115 }, { "epoch": 0.06388984509466437, "grad_norm": 0.1443290114402771, "learning_rate": 0.00019751285889462423, "loss": 0.0001, "step": 116 }, { "epoch": 0.06444061962134251, "grad_norm": 43.460514068603516, "learning_rate": 0.0001974659104874573, "loss": 0.0037, "step": 117 }, { "epoch": 0.06499139414802066, "grad_norm": 25.649385452270508, "learning_rate": 0.0001974185287891671, "loss": 0.0317, "step": 118 }, { "epoch": 0.0655421686746988, "grad_norm": 27.432331085205078, "learning_rate": 0.0001973707140103921, "loss": 0.0866, "step": 119 }, { "epoch": 0.06609294320137693, "grad_norm": 12.296560287475586, "learning_rate": 0.00019732246636369605, "loss": 0.0143, "step": 120 }, { "epoch": 0.06664371772805508, "grad_norm": 9.799930572509766, "learning_rate": 0.00019727378606356703, "loss": 0.0716, "step": 121 }, { "epoch": 0.06719449225473322, "grad_norm": 2.1945130825042725, "learning_rate": 0.00019722467332641656, "loss": 0.0072, "step": 122 }, { "epoch": 0.06774526678141136, "grad_norm": 8.398616790771484, "learning_rate": 0.00019717512837057855, "loss": 0.0416, "step": 123 }, { "epoch": 0.0682960413080895, "grad_norm": 1.7101330757141113, "learning_rate": 0.0001971251514163083, "loss": 0.004, "step": 124 }, { "epoch": 0.06884681583476764, "grad_norm": 21.397602081298828, "learning_rate": 0.0001970747426857817, "loss": 0.0126, "step": 125 }, { "epoch": 0.06939759036144579, "grad_norm": 2.502991199493408, "learning_rate": 0.00019702390240309404, "loss": 0.0077, "step": 126 }, { "epoch": 0.06994836488812392, "grad_norm": 0.6189124584197998, "learning_rate": 0.0001969726307942592, "loss": 0.0005, "step": 127 }, { "epoch": 0.07049913941480207, "grad_norm": 19.822641372680664, "learning_rate": 0.00019692092808720846, "loss": 0.1037, "step": 128 }, { "epoch": 0.0710499139414802, "grad_norm": 28.530107498168945, "learning_rate": 0.0001968687945117896, "loss": 0.0812, "step": 129 }, { "epoch": 0.07160068846815834, "grad_norm": 14.628978729248047, "learning_rate": 0.00019681623029976588, "loss": 0.0176, "step": 130 }, { "epoch": 0.07215146299483649, "grad_norm": 5.735737323760986, "learning_rate": 0.00019676323568481498, "loss": 0.0017, "step": 131 }, { "epoch": 0.07270223752151463, "grad_norm": 6.343621730804443, "learning_rate": 0.00019670981090252792, "loss": 0.0034, "step": 132 }, { "epoch": 0.07325301204819278, "grad_norm": 3.1931121349334717, "learning_rate": 0.00019665595619040808, "loss": 0.0063, "step": 133 }, { "epoch": 0.07380378657487091, "grad_norm": 16.363962173461914, "learning_rate": 0.0001966016717878702, "loss": 0.0849, "step": 134 }, { "epoch": 0.07435456110154905, "grad_norm": 4.27025032043457, "learning_rate": 0.00019654695793623907, "loss": 0.0047, "step": 135 }, { "epoch": 0.0749053356282272, "grad_norm": 1.3693689107894897, "learning_rate": 0.0001964918148787488, "loss": 0.0012, "step": 136 }, { "epoch": 0.07545611015490533, "grad_norm": 9.333032608032227, "learning_rate": 0.00019643624286054144, "loss": 0.0284, "step": 137 }, { "epoch": 0.07600688468158348, "grad_norm": 4.581555366516113, "learning_rate": 0.00019638024212866606, "loss": 0.0108, "step": 138 }, { "epoch": 0.07655765920826162, "grad_norm": 5.115914344787598, "learning_rate": 0.0001963238129320776, "loss": 0.0105, "step": 139 }, { "epoch": 0.07710843373493977, "grad_norm": 13.050433158874512, "learning_rate": 0.00019626695552163578, "loss": 0.0038, "step": 140 }, { "epoch": 0.0776592082616179, "grad_norm": 0.23759493231773376, "learning_rate": 0.00019620967015010395, "loss": 0.0002, "step": 141 }, { "epoch": 0.07820998278829604, "grad_norm": 6.179422855377197, "learning_rate": 0.00019615195707214803, "loss": 0.0072, "step": 142 }, { "epoch": 0.07876075731497419, "grad_norm": 77.84343719482422, "learning_rate": 0.0001960938165443353, "loss": 0.0319, "step": 143 }, { "epoch": 0.07931153184165232, "grad_norm": 7.489847660064697, "learning_rate": 0.00019603524882513327, "loss": 0.0123, "step": 144 }, { "epoch": 0.07986230636833047, "grad_norm": 5.89699649810791, "learning_rate": 0.0001959762541749086, "loss": 0.0063, "step": 145 }, { "epoch": 0.0804130808950086, "grad_norm": 0.14980897307395935, "learning_rate": 0.00019591683285592593, "loss": 0.0002, "step": 146 }, { "epoch": 0.08096385542168674, "grad_norm": 3.6770894527435303, "learning_rate": 0.00019585698513234663, "loss": 0.0032, "step": 147 }, { "epoch": 0.08151462994836489, "grad_norm": 5.460299491882324, "learning_rate": 0.0001957967112702277, "loss": 0.0037, "step": 148 }, { "epoch": 0.08206540447504303, "grad_norm": 11.682272911071777, "learning_rate": 0.00019573601153752052, "loss": 0.0125, "step": 149 }, { "epoch": 0.08261617900172118, "grad_norm": 0.0344335101544857, "learning_rate": 0.00019567488620406983, "loss": 0.0001, "step": 150 }, { "epoch": 0.08316695352839931, "grad_norm": 19.28710174560547, "learning_rate": 0.00019561333554161224, "loss": 0.07, "step": 151 }, { "epoch": 0.08371772805507745, "grad_norm": 5.555969715118408, "learning_rate": 0.0001955513598237753, "loss": 0.0211, "step": 152 }, { "epoch": 0.0842685025817556, "grad_norm": 2.036860466003418, "learning_rate": 0.00019548895932607621, "loss": 0.0081, "step": 153 }, { "epoch": 0.08481927710843373, "grad_norm": 5.751703262329102, "learning_rate": 0.00019542613432592038, "loss": 0.0379, "step": 154 }, { "epoch": 0.08537005163511188, "grad_norm": 12.363378524780273, "learning_rate": 0.00019536288510260056, "loss": 0.0714, "step": 155 }, { "epoch": 0.08592082616179002, "grad_norm": 5.051399230957031, "learning_rate": 0.00019529921193729534, "loss": 0.0225, "step": 156 }, { "epoch": 0.08647160068846815, "grad_norm": 28.904964447021484, "learning_rate": 0.00019523511511306793, "loss": 0.046, "step": 157 }, { "epoch": 0.0870223752151463, "grad_norm": 1.6140872240066528, "learning_rate": 0.000195170594914865, "loss": 0.0026, "step": 158 }, { "epoch": 0.08757314974182444, "grad_norm": 3.2958765029907227, "learning_rate": 0.00019510565162951537, "loss": 0.0191, "step": 159 }, { "epoch": 0.08812392426850259, "grad_norm": 6.248198509216309, "learning_rate": 0.00019504028554572864, "loss": 0.0365, "step": 160 }, { "epoch": 0.08867469879518072, "grad_norm": 5.809995651245117, "learning_rate": 0.00019497449695409408, "loss": 0.0242, "step": 161 }, { "epoch": 0.08922547332185886, "grad_norm": 0.7355067133903503, "learning_rate": 0.00019490828614707916, "loss": 0.0013, "step": 162 }, { "epoch": 0.08977624784853701, "grad_norm": 0.6112813353538513, "learning_rate": 0.00019484165341902845, "loss": 0.0007, "step": 163 }, { "epoch": 0.09032702237521514, "grad_norm": 0.046819303184747696, "learning_rate": 0.00019477459906616206, "loss": 0.0001, "step": 164 }, { "epoch": 0.09087779690189329, "grad_norm": 0.04794855788350105, "learning_rate": 0.00019470712338657458, "loss": 0.0001, "step": 165 }, { "epoch": 0.09142857142857143, "grad_norm": 0.05958491191267967, "learning_rate": 0.0001946392266802336, "loss": 0.0, "step": 166 }, { "epoch": 0.09197934595524956, "grad_norm": 0.010006679221987724, "learning_rate": 0.0001945709092489783, "loss": 0.0, "step": 167 }, { "epoch": 0.09253012048192771, "grad_norm": 9.560103416442871, "learning_rate": 0.00019450217139651844, "loss": 0.0014, "step": 168 }, { "epoch": 0.09308089500860585, "grad_norm": 30.910526275634766, "learning_rate": 0.0001944330134284326, "loss": 0.0006, "step": 169 }, { "epoch": 0.093631669535284, "grad_norm": 20.69938087463379, "learning_rate": 0.00019436343565216711, "loss": 0.0324, "step": 170 }, { "epoch": 0.09418244406196213, "grad_norm": 28.434446334838867, "learning_rate": 0.00019429343837703455, "loss": 0.0312, "step": 171 }, { "epoch": 0.09473321858864027, "grad_norm": 43.733787536621094, "learning_rate": 0.0001942230219142124, "loss": 0.0128, "step": 172 }, { "epoch": 0.09528399311531842, "grad_norm": 24.354318618774414, "learning_rate": 0.0001941521865767417, "loss": 0.0519, "step": 173 }, { "epoch": 0.09583476764199655, "grad_norm": 20.031837463378906, "learning_rate": 0.0001940809326795256, "loss": 0.0488, "step": 174 }, { "epoch": 0.0963855421686747, "grad_norm": 6.543106555938721, "learning_rate": 0.000194009260539328, "loss": 0.0145, "step": 175 }, { "epoch": 0.09693631669535284, "grad_norm": 0.05378224328160286, "learning_rate": 0.0001939371704747721, "loss": 0.0, "step": 176 }, { "epoch": 0.09748709122203097, "grad_norm": 7.194676876068115, "learning_rate": 0.00019386466280633906, "loss": 0.0068, "step": 177 }, { "epoch": 0.09803786574870912, "grad_norm": 5.09645414352417, "learning_rate": 0.00019379173785636646, "loss": 0.0356, "step": 178 }, { "epoch": 0.09858864027538726, "grad_norm": 7.9466776847839355, "learning_rate": 0.000193718395949047, "loss": 0.016, "step": 179 }, { "epoch": 0.09913941480206541, "grad_norm": 14.834365844726562, "learning_rate": 0.00019364463741042694, "loss": 0.005, "step": 180 }, { "epoch": 0.09969018932874354, "grad_norm": 8.876616477966309, "learning_rate": 0.00019357046256840473, "loss": 0.0323, "step": 181 }, { "epoch": 0.10024096385542168, "grad_norm": 93.1019058227539, "learning_rate": 0.00019349587175272948, "loss": 0.0269, "step": 182 }, { "epoch": 0.10079173838209983, "grad_norm": 7.017251491546631, "learning_rate": 0.0001934208652949996, "loss": 0.0025, "step": 183 }, { "epoch": 0.10134251290877797, "grad_norm": 1.8236428499221802, "learning_rate": 0.00019334544352866127, "loss": 0.0044, "step": 184 }, { "epoch": 0.10189328743545611, "grad_norm": 2.512834310531616, "learning_rate": 0.00019326960678900688, "loss": 0.0118, "step": 185 }, { "epoch": 0.10244406196213425, "grad_norm": 2.117576837539673, "learning_rate": 0.00019319335541317361, "loss": 0.0035, "step": 186 }, { "epoch": 0.10299483648881239, "grad_norm": 3.1964597702026367, "learning_rate": 0.00019311668974014208, "loss": 0.0038, "step": 187 }, { "epoch": 0.10354561101549054, "grad_norm": 10.971129417419434, "learning_rate": 0.00019303961011073447, "loss": 0.0237, "step": 188 }, { "epoch": 0.10409638554216867, "grad_norm": 0.10692563652992249, "learning_rate": 0.00019296211686761346, "loss": 0.0001, "step": 189 }, { "epoch": 0.10464716006884682, "grad_norm": 1.3508963584899902, "learning_rate": 0.00019288421035528028, "loss": 0.001, "step": 190 }, { "epoch": 0.10519793459552496, "grad_norm": 5.2927680015563965, "learning_rate": 0.00019280589092007352, "loss": 0.0164, "step": 191 }, { "epoch": 0.1057487091222031, "grad_norm": 11.33450698852539, "learning_rate": 0.00019272715891016735, "loss": 0.0367, "step": 192 }, { "epoch": 0.10629948364888124, "grad_norm": 0.010012628510594368, "learning_rate": 0.00019264801467557007, "loss": 0.0, "step": 193 }, { "epoch": 0.10685025817555938, "grad_norm": 0.09973425418138504, "learning_rate": 0.00019256845856812266, "loss": 0.0, "step": 194 }, { "epoch": 0.10740103270223753, "grad_norm": 14.699050903320312, "learning_rate": 0.000192488490941497, "loss": 0.0205, "step": 195 }, { "epoch": 0.10795180722891566, "grad_norm": 12.911165237426758, "learning_rate": 0.00019240811215119448, "loss": 0.0204, "step": 196 }, { "epoch": 0.10850258175559381, "grad_norm": 10.890249252319336, "learning_rate": 0.00019232732255454422, "loss": 0.0439, "step": 197 }, { "epoch": 0.10905335628227195, "grad_norm": 35.77629089355469, "learning_rate": 0.00019224612251070175, "loss": 0.1668, "step": 198 }, { "epoch": 0.10960413080895008, "grad_norm": 0.2508499324321747, "learning_rate": 0.0001921645123806472, "loss": 0.0002, "step": 199 }, { "epoch": 0.11015490533562823, "grad_norm": 3.9498987197875977, "learning_rate": 0.0001920824925271838, "loss": 0.0058, "step": 200 }, { "epoch": 0.11070567986230637, "grad_norm": 28.809978485107422, "learning_rate": 0.0001920000633149362, "loss": 0.149, "step": 201 }, { "epoch": 0.11125645438898452, "grad_norm": 18.837404251098633, "learning_rate": 0.00019191722511034884, "loss": 0.084, "step": 202 }, { "epoch": 0.11180722891566265, "grad_norm": 2.4062259197235107, "learning_rate": 0.00019183397828168448, "loss": 0.0033, "step": 203 }, { "epoch": 0.11235800344234079, "grad_norm": 4.724879741668701, "learning_rate": 0.00019175032319902234, "loss": 0.0102, "step": 204 }, { "epoch": 0.11290877796901894, "grad_norm": 3.095519781112671, "learning_rate": 0.00019166626023425662, "loss": 0.0238, "step": 205 }, { "epoch": 0.11345955249569707, "grad_norm": 14.588111877441406, "learning_rate": 0.00019158178976109476, "loss": 0.0741, "step": 206 }, { "epoch": 0.11401032702237522, "grad_norm": 3.231032609939575, "learning_rate": 0.0001914969121550558, "loss": 0.0155, "step": 207 }, { "epoch": 0.11456110154905336, "grad_norm": 3.5190517902374268, "learning_rate": 0.00019141162779346874, "loss": 0.0056, "step": 208 }, { "epoch": 0.11511187607573149, "grad_norm": 9.027364730834961, "learning_rate": 0.00019132593705547082, "loss": 0.0183, "step": 209 }, { "epoch": 0.11566265060240964, "grad_norm": 4.2814836502075195, "learning_rate": 0.00019123984032200586, "loss": 0.0102, "step": 210 }, { "epoch": 0.11621342512908778, "grad_norm": 1.7414846420288086, "learning_rate": 0.00019115333797582254, "loss": 0.002, "step": 211 }, { "epoch": 0.11676419965576593, "grad_norm": 0.32831230759620667, "learning_rate": 0.00019106643040147278, "loss": 0.0006, "step": 212 }, { "epoch": 0.11731497418244406, "grad_norm": 14.78460693359375, "learning_rate": 0.00019097911798530987, "loss": 0.0321, "step": 213 }, { "epoch": 0.1178657487091222, "grad_norm": 0.08234310150146484, "learning_rate": 0.00019089140111548696, "loss": 0.0001, "step": 214 }, { "epoch": 0.11841652323580035, "grad_norm": 0.2181520015001297, "learning_rate": 0.00019080328018195513, "loss": 0.0003, "step": 215 }, { "epoch": 0.11896729776247848, "grad_norm": 0.07363662868738174, "learning_rate": 0.0001907147555764618, "loss": 0.0001, "step": 216 }, { "epoch": 0.11951807228915663, "grad_norm": 17.70883560180664, "learning_rate": 0.00019062582769254895, "loss": 0.0497, "step": 217 }, { "epoch": 0.12006884681583477, "grad_norm": 15.310802459716797, "learning_rate": 0.00019053649692555135, "loss": 0.0804, "step": 218 }, { "epoch": 0.1206196213425129, "grad_norm": 14.525551795959473, "learning_rate": 0.00019044676367259476, "loss": 0.0418, "step": 219 }, { "epoch": 0.12117039586919105, "grad_norm": 6.240030288696289, "learning_rate": 0.00019035662833259432, "loss": 0.0085, "step": 220 }, { "epoch": 0.12172117039586919, "grad_norm": 1.3535479307174683, "learning_rate": 0.00019026609130625257, "loss": 0.0011, "step": 221 }, { "epoch": 0.12227194492254734, "grad_norm": 2.8791394233703613, "learning_rate": 0.00019017515299605788, "loss": 0.0075, "step": 222 }, { "epoch": 0.12282271944922547, "grad_norm": 19.29328727722168, "learning_rate": 0.00019008381380628247, "loss": 0.0106, "step": 223 }, { "epoch": 0.12337349397590361, "grad_norm": 11.366170883178711, "learning_rate": 0.00018999207414298067, "loss": 0.1071, "step": 224 }, { "epoch": 0.12392426850258176, "grad_norm": 0.0661618560552597, "learning_rate": 0.00018989993441398726, "loss": 0.0001, "step": 225 }, { "epoch": 0.1244750430292599, "grad_norm": 0.5174265503883362, "learning_rate": 0.00018980739502891546, "loss": 0.0001, "step": 226 }, { "epoch": 0.12502581755593803, "grad_norm": 16.169435501098633, "learning_rate": 0.0001897144563991552, "loss": 0.0177, "step": 227 }, { "epoch": 0.12557659208261618, "grad_norm": 4.683776378631592, "learning_rate": 0.00018962111893787128, "loss": 0.0103, "step": 228 }, { "epoch": 0.12612736660929433, "grad_norm": 0.2907809317111969, "learning_rate": 0.00018952738306000151, "loss": 0.0005, "step": 229 }, { "epoch": 0.12667814113597245, "grad_norm": 11.669445991516113, "learning_rate": 0.00018943324918225494, "loss": 0.0233, "step": 230 }, { "epoch": 0.1272289156626506, "grad_norm": 12.24880313873291, "learning_rate": 0.0001893387177231099, "loss": 0.023, "step": 231 }, { "epoch": 0.12777969018932875, "grad_norm": 2.313523769378662, "learning_rate": 0.0001892437891028122, "loss": 0.0039, "step": 232 }, { "epoch": 0.1283304647160069, "grad_norm": 12.641063690185547, "learning_rate": 0.0001891484637433733, "loss": 0.056, "step": 233 }, { "epoch": 0.12888123924268502, "grad_norm": 7.852517127990723, "learning_rate": 0.00018905274206856837, "loss": 0.0205, "step": 234 }, { "epoch": 0.12943201376936317, "grad_norm": 8.71448802947998, "learning_rate": 0.00018895662450393438, "loss": 0.0056, "step": 235 }, { "epoch": 0.12998278829604132, "grad_norm": 7.275470733642578, "learning_rate": 0.00018886011147676833, "loss": 0.036, "step": 236 }, { "epoch": 0.13053356282271944, "grad_norm": 8.140000343322754, "learning_rate": 0.00018876320341612522, "loss": 0.0122, "step": 237 }, { "epoch": 0.1310843373493976, "grad_norm": 8.014253616333008, "learning_rate": 0.00018866590075281624, "loss": 0.0209, "step": 238 }, { "epoch": 0.13163511187607574, "grad_norm": 2.029446840286255, "learning_rate": 0.00018856820391940674, "loss": 0.0028, "step": 239 }, { "epoch": 0.13218588640275386, "grad_norm": 2.125037431716919, "learning_rate": 0.00018847011335021449, "loss": 0.0018, "step": 240 }, { "epoch": 0.132736660929432, "grad_norm": 29.78271484375, "learning_rate": 0.00018837162948130752, "loss": 0.0455, "step": 241 }, { "epoch": 0.13328743545611016, "grad_norm": 0.5051661133766174, "learning_rate": 0.00018827275275050233, "loss": 0.0003, "step": 242 }, { "epoch": 0.1338382099827883, "grad_norm": 15.133818626403809, "learning_rate": 0.00018817348359736203, "loss": 0.036, "step": 243 }, { "epoch": 0.13438898450946643, "grad_norm": 1.845505952835083, "learning_rate": 0.00018807382246319412, "loss": 0.0018, "step": 244 }, { "epoch": 0.13493975903614458, "grad_norm": 7.715407848358154, "learning_rate": 0.00018797376979104872, "loss": 0.0094, "step": 245 }, { "epoch": 0.13549053356282273, "grad_norm": 1.1177228689193726, "learning_rate": 0.00018787332602571662, "loss": 0.0008, "step": 246 }, { "epoch": 0.13604130808950085, "grad_norm": 5.975366115570068, "learning_rate": 0.00018777249161372713, "loss": 0.0371, "step": 247 }, { "epoch": 0.136592082616179, "grad_norm": 13.159732818603516, "learning_rate": 0.00018767126700334634, "loss": 0.0393, "step": 248 }, { "epoch": 0.13714285714285715, "grad_norm": 1.592596173286438, "learning_rate": 0.0001875696526445749, "loss": 0.0018, "step": 249 }, { "epoch": 0.13769363166953527, "grad_norm": 26.09734344482422, "learning_rate": 0.0001874676489891461, "loss": 0.0552, "step": 250 }, { "epoch": 0.13824440619621342, "grad_norm": 19.239826202392578, "learning_rate": 0.00018736525649052394, "loss": 0.1379, "step": 251 }, { "epoch": 0.13879518072289157, "grad_norm": 8.794814109802246, "learning_rate": 0.00018726247560390099, "loss": 0.0559, "step": 252 }, { "epoch": 0.13934595524956972, "grad_norm": 5.115240097045898, "learning_rate": 0.00018715930678619644, "loss": 0.0232, "step": 253 }, { "epoch": 0.13989672977624784, "grad_norm": 5.476420879364014, "learning_rate": 0.00018705575049605413, "loss": 0.0277, "step": 254 }, { "epoch": 0.140447504302926, "grad_norm": 12.339875221252441, "learning_rate": 0.00018695180719384029, "loss": 0.0291, "step": 255 }, { "epoch": 0.14099827882960414, "grad_norm": 19.775014877319336, "learning_rate": 0.00018684747734164177, "loss": 0.0517, "step": 256 }, { "epoch": 0.14154905335628226, "grad_norm": 4.344018936157227, "learning_rate": 0.00018674276140326376, "loss": 0.013, "step": 257 }, { "epoch": 0.1420998278829604, "grad_norm": 4.897819519042969, "learning_rate": 0.00018663765984422786, "loss": 0.0156, "step": 258 }, { "epoch": 0.14265060240963856, "grad_norm": 6.578986167907715, "learning_rate": 0.00018653217313177004, "loss": 0.0082, "step": 259 }, { "epoch": 0.14320137693631668, "grad_norm": 10.146879196166992, "learning_rate": 0.00018642630173483832, "loss": 0.0275, "step": 260 }, { "epoch": 0.14375215146299483, "grad_norm": 3.6826372146606445, "learning_rate": 0.00018632004612409103, "loss": 0.0021, "step": 261 }, { "epoch": 0.14430292598967298, "grad_norm": 0.7568719983100891, "learning_rate": 0.00018621340677189453, "loss": 0.0011, "step": 262 }, { "epoch": 0.14485370051635113, "grad_norm": 0.9699116945266724, "learning_rate": 0.00018610638415232097, "loss": 0.0011, "step": 263 }, { "epoch": 0.14540447504302925, "grad_norm": 3.2024614810943604, "learning_rate": 0.00018599897874114652, "loss": 0.0013, "step": 264 }, { "epoch": 0.1459552495697074, "grad_norm": 1.1160845756530762, "learning_rate": 0.00018589119101584898, "loss": 0.0004, "step": 265 }, { "epoch": 0.14650602409638555, "grad_norm": 22.40152931213379, "learning_rate": 0.00018578302145560584, "loss": 0.0043, "step": 266 }, { "epoch": 0.14705679862306367, "grad_norm": 12.429545402526855, "learning_rate": 0.00018567447054129195, "loss": 0.006, "step": 267 }, { "epoch": 0.14760757314974182, "grad_norm": 0.18238234519958496, "learning_rate": 0.00018556553875547754, "loss": 0.0002, "step": 268 }, { "epoch": 0.14815834767641997, "grad_norm": 12.409387588500977, "learning_rate": 0.00018545622658242607, "loss": 0.0465, "step": 269 }, { "epoch": 0.1487091222030981, "grad_norm": 26.479427337646484, "learning_rate": 0.00018534653450809197, "loss": 0.1398, "step": 270 }, { "epoch": 0.14925989672977624, "grad_norm": 0.027319632470607758, "learning_rate": 0.00018523646302011867, "loss": 0.0, "step": 271 }, { "epoch": 0.1498106712564544, "grad_norm": 0.36630889773368835, "learning_rate": 0.00018512601260783606, "loss": 0.0003, "step": 272 }, { "epoch": 0.15036144578313254, "grad_norm": 3.364025115966797, "learning_rate": 0.00018501518376225887, "loss": 0.0165, "step": 273 }, { "epoch": 0.15091222030981066, "grad_norm": 3.449206829071045, "learning_rate": 0.00018490397697608395, "loss": 0.0075, "step": 274 }, { "epoch": 0.1514629948364888, "grad_norm": 8.42161750793457, "learning_rate": 0.0001847923927436884, "loss": 0.0375, "step": 275 }, { "epoch": 0.15201376936316696, "grad_norm": 18.621793746948242, "learning_rate": 0.00018468043156112728, "loss": 0.0261, "step": 276 }, { "epoch": 0.15256454388984508, "grad_norm": 8.895025253295898, "learning_rate": 0.0001845680939261314, "loss": 0.0337, "step": 277 }, { "epoch": 0.15311531841652323, "grad_norm": 3.8389241695404053, "learning_rate": 0.00018445538033810515, "loss": 0.0056, "step": 278 }, { "epoch": 0.15366609294320138, "grad_norm": 12.102470397949219, "learning_rate": 0.00018434229129812418, "loss": 0.0537, "step": 279 }, { "epoch": 0.15421686746987953, "grad_norm": 15.383883476257324, "learning_rate": 0.0001842288273089332, "loss": 0.0074, "step": 280 }, { "epoch": 0.15476764199655765, "grad_norm": 7.8810296058654785, "learning_rate": 0.00018411498887494396, "loss": 0.0236, "step": 281 }, { "epoch": 0.1553184165232358, "grad_norm": 8.538419723510742, "learning_rate": 0.00018400077650223263, "loss": 0.0099, "step": 282 }, { "epoch": 0.15586919104991395, "grad_norm": 0.7942506670951843, "learning_rate": 0.0001838861906985379, "loss": 0.0011, "step": 283 }, { "epoch": 0.15641996557659207, "grad_norm": 5.657203674316406, "learning_rate": 0.00018377123197325842, "loss": 0.006, "step": 284 }, { "epoch": 0.15697074010327022, "grad_norm": 22.057641983032227, "learning_rate": 0.00018365590083745085, "loss": 0.0073, "step": 285 }, { "epoch": 0.15752151462994837, "grad_norm": 28.677671432495117, "learning_rate": 0.00018354019780382735, "loss": 0.0476, "step": 286 }, { "epoch": 0.1580722891566265, "grad_norm": 5.358160495758057, "learning_rate": 0.0001834241233867533, "loss": 0.0079, "step": 287 }, { "epoch": 0.15862306368330464, "grad_norm": 9.355661392211914, "learning_rate": 0.00018330767810224524, "loss": 0.0452, "step": 288 }, { "epoch": 0.1591738382099828, "grad_norm": 4.246338844299316, "learning_rate": 0.0001831908624679683, "loss": 0.0147, "step": 289 }, { "epoch": 0.15972461273666094, "grad_norm": 13.707676887512207, "learning_rate": 0.0001830736770032341, "loss": 0.011, "step": 290 }, { "epoch": 0.16027538726333906, "grad_norm": 22.739002227783203, "learning_rate": 0.0001829561222289984, "loss": 0.0507, "step": 291 }, { "epoch": 0.1608261617900172, "grad_norm": 3.4950594902038574, "learning_rate": 0.00018283819866785853, "loss": 0.0009, "step": 292 }, { "epoch": 0.16137693631669536, "grad_norm": 107.92597198486328, "learning_rate": 0.0001827199068440516, "loss": 0.0122, "step": 293 }, { "epoch": 0.16192771084337348, "grad_norm": 6.7393999099731445, "learning_rate": 0.00018260124728345162, "loss": 0.0053, "step": 294 }, { "epoch": 0.16247848537005163, "grad_norm": 5.110424518585205, "learning_rate": 0.00018248222051356754, "loss": 0.0056, "step": 295 }, { "epoch": 0.16302925989672978, "grad_norm": 9.672557830810547, "learning_rate": 0.00018236282706354063, "loss": 0.0125, "step": 296 }, { "epoch": 0.1635800344234079, "grad_norm": 13.510973930358887, "learning_rate": 0.00018224306746414238, "loss": 0.0473, "step": 297 }, { "epoch": 0.16413080895008605, "grad_norm": 6.835041046142578, "learning_rate": 0.00018212294224777197, "loss": 0.023, "step": 298 }, { "epoch": 0.1646815834767642, "grad_norm": 16.499286651611328, "learning_rate": 0.00018200245194845399, "loss": 0.0113, "step": 299 }, { "epoch": 0.16523235800344235, "grad_norm": 2.4769785404205322, "learning_rate": 0.00018188159710183594, "loss": 0.0022, "step": 300 }, { "epoch": 0.16578313253012048, "grad_norm": 13.840828895568848, "learning_rate": 0.000181760378245186, "loss": 0.0566, "step": 301 }, { "epoch": 0.16633390705679862, "grad_norm": 12.780588150024414, "learning_rate": 0.00018163879591739067, "loss": 0.0358, "step": 302 }, { "epoch": 0.16688468158347677, "grad_norm": 4.507590293884277, "learning_rate": 0.0001815168506589521, "loss": 0.0034, "step": 303 }, { "epoch": 0.1674354561101549, "grad_norm": 0.1847132444381714, "learning_rate": 0.000181394543011986, "loss": 0.0004, "step": 304 }, { "epoch": 0.16798623063683304, "grad_norm": 52.670936584472656, "learning_rate": 0.00018127187352021907, "loss": 0.0515, "step": 305 }, { "epoch": 0.1685370051635112, "grad_norm": 13.024288177490234, "learning_rate": 0.0001811488427289866, "loss": 0.0332, "step": 306 }, { "epoch": 0.16908777969018932, "grad_norm": 25.725215911865234, "learning_rate": 0.00018102545118523007, "loss": 0.0119, "step": 307 }, { "epoch": 0.16963855421686747, "grad_norm": 8.522557258605957, "learning_rate": 0.00018090169943749476, "loss": 0.0136, "step": 308 }, { "epoch": 0.17018932874354561, "grad_norm": 9.241713523864746, "learning_rate": 0.00018077758803592718, "loss": 0.0128, "step": 309 }, { "epoch": 0.17074010327022376, "grad_norm": 4.13601541519165, "learning_rate": 0.00018065311753227273, "loss": 0.0046, "step": 310 }, { "epoch": 0.17129087779690189, "grad_norm": 12.283329963684082, "learning_rate": 0.0001805282884798732, "loss": 0.0018, "step": 311 }, { "epoch": 0.17184165232358004, "grad_norm": 12.496675491333008, "learning_rate": 0.00018040310143366446, "loss": 0.012, "step": 312 }, { "epoch": 0.17239242685025818, "grad_norm": 0.1850021779537201, "learning_rate": 0.00018027755695017368, "loss": 0.0002, "step": 313 }, { "epoch": 0.1729432013769363, "grad_norm": 0.06477334350347519, "learning_rate": 0.00018015165558751717, "loss": 0.0001, "step": 314 }, { "epoch": 0.17349397590361446, "grad_norm": 0.039351146668195724, "learning_rate": 0.00018002539790539773, "loss": 0.0001, "step": 315 }, { "epoch": 0.1740447504302926, "grad_norm": 34.130149841308594, "learning_rate": 0.00017989878446510215, "loss": 0.0395, "step": 316 }, { "epoch": 0.17459552495697073, "grad_norm": 0.10393693298101425, "learning_rate": 0.00017977181582949888, "loss": 0.0001, "step": 317 }, { "epoch": 0.17514629948364888, "grad_norm": 13.766066551208496, "learning_rate": 0.0001796444925630353, "loss": 0.0411, "step": 318 }, { "epoch": 0.17569707401032703, "grad_norm": 0.19885025918483734, "learning_rate": 0.00017951681523173542, "loss": 0.0002, "step": 319 }, { "epoch": 0.17624784853700518, "grad_norm": 29.224424362182617, "learning_rate": 0.0001793887844031972, "loss": 0.0924, "step": 320 }, { "epoch": 0.1767986230636833, "grad_norm": 47.07408905029297, "learning_rate": 0.00017926040064659014, "loss": 0.0257, "step": 321 }, { "epoch": 0.17734939759036145, "grad_norm": 23.150787353515625, "learning_rate": 0.0001791316645326526, "loss": 0.0059, "step": 322 }, { "epoch": 0.1779001721170396, "grad_norm": 0.34272682666778564, "learning_rate": 0.00017900257663368963, "loss": 0.0002, "step": 323 }, { "epoch": 0.17845094664371772, "grad_norm": 13.793720245361328, "learning_rate": 0.0001788731375235698, "loss": 0.0285, "step": 324 }, { "epoch": 0.17900172117039587, "grad_norm": 3.910888433456421, "learning_rate": 0.00017874334777772327, "loss": 0.0063, "step": 325 }, { "epoch": 0.17955249569707402, "grad_norm": 17.40552520751953, "learning_rate": 0.00017861320797313892, "loss": 0.0341, "step": 326 }, { "epoch": 0.18010327022375214, "grad_norm": 19.947919845581055, "learning_rate": 0.0001784827186883618, "loss": 0.0326, "step": 327 }, { "epoch": 0.1806540447504303, "grad_norm": 16.762727737426758, "learning_rate": 0.00017835188050349064, "loss": 0.0406, "step": 328 }, { "epoch": 0.18120481927710844, "grad_norm": 0.9610361456871033, "learning_rate": 0.00017822069400017516, "loss": 0.0004, "step": 329 }, { "epoch": 0.18175559380378659, "grad_norm": 9.873037338256836, "learning_rate": 0.00017808915976161362, "loss": 0.0374, "step": 330 }, { "epoch": 0.1823063683304647, "grad_norm": 4.067539215087891, "learning_rate": 0.00017795727837255015, "loss": 0.0079, "step": 331 }, { "epoch": 0.18285714285714286, "grad_norm": 24.27317237854004, "learning_rate": 0.00017782505041927216, "loss": 0.0025, "step": 332 }, { "epoch": 0.183407917383821, "grad_norm": 4.535031795501709, "learning_rate": 0.00017769247648960774, "loss": 0.0096, "step": 333 }, { "epoch": 0.18395869191049913, "grad_norm": 4.228193283081055, "learning_rate": 0.00017755955717292296, "loss": 0.0004, "step": 334 }, { "epoch": 0.18450946643717728, "grad_norm": 23.3373966217041, "learning_rate": 0.00017742629306011944, "loss": 0.0382, "step": 335 }, { "epoch": 0.18506024096385543, "grad_norm": 0.5856701731681824, "learning_rate": 0.00017729268474363154, "loss": 0.0005, "step": 336 }, { "epoch": 0.18561101549053358, "grad_norm": 0.09005908668041229, "learning_rate": 0.0001771587328174239, "loss": 0.0001, "step": 337 }, { "epoch": 0.1861617900172117, "grad_norm": 0.14111103117465973, "learning_rate": 0.0001770244378769885, "loss": 0.0002, "step": 338 }, { "epoch": 0.18671256454388985, "grad_norm": 0.32232967019081116, "learning_rate": 0.0001768898005193425, "loss": 0.0002, "step": 339 }, { "epoch": 0.187263339070568, "grad_norm": 5.739742755889893, "learning_rate": 0.000176754821343025, "loss": 0.0071, "step": 340 }, { "epoch": 0.18781411359724612, "grad_norm": 20.651893615722656, "learning_rate": 0.0001766195009480949, "loss": 0.0147, "step": 341 }, { "epoch": 0.18836488812392427, "grad_norm": 10.77313232421875, "learning_rate": 0.0001764838399361279, "loss": 0.0058, "step": 342 }, { "epoch": 0.18891566265060242, "grad_norm": 8.245923042297363, "learning_rate": 0.00017634783891021393, "loss": 0.0346, "step": 343 }, { "epoch": 0.18946643717728054, "grad_norm": 4.982606410980225, "learning_rate": 0.00017621149847495458, "loss": 0.0064, "step": 344 }, { "epoch": 0.1900172117039587, "grad_norm": 2.163201332092285, "learning_rate": 0.00017607481923646016, "loss": 0.0007, "step": 345 }, { "epoch": 0.19056798623063684, "grad_norm": 1.1414597034454346, "learning_rate": 0.0001759378018023473, "loss": 0.0009, "step": 346 }, { "epoch": 0.191118760757315, "grad_norm": 12.274886131286621, "learning_rate": 0.00017580044678173592, "loss": 0.0231, "step": 347 }, { "epoch": 0.1916695352839931, "grad_norm": 0.8529264330863953, "learning_rate": 0.00017566275478524693, "loss": 0.0006, "step": 348 }, { "epoch": 0.19222030981067126, "grad_norm": 4.924040794372559, "learning_rate": 0.0001755247264249991, "loss": 0.0046, "step": 349 }, { "epoch": 0.1927710843373494, "grad_norm": 3.737917423248291, "learning_rate": 0.0001753863623146066, "loss": 0.0031, "step": 350 }, { "epoch": 0.19332185886402753, "grad_norm": 16.35202407836914, "learning_rate": 0.00017524766306917618, "loss": 0.0532, "step": 351 }, { "epoch": 0.19387263339070568, "grad_norm": 48.342002868652344, "learning_rate": 0.0001751086293053045, "loss": 0.0626, "step": 352 }, { "epoch": 0.19442340791738383, "grad_norm": 9.88490104675293, "learning_rate": 0.0001749692616410753, "loss": 0.0126, "step": 353 }, { "epoch": 0.19497418244406195, "grad_norm": 2.726943016052246, "learning_rate": 0.00017482956069605668, "loss": 0.0033, "step": 354 }, { "epoch": 0.1955249569707401, "grad_norm": 29.867555618286133, "learning_rate": 0.00017468952709129846, "loss": 0.0487, "step": 355 }, { "epoch": 0.19607573149741825, "grad_norm": 0.0770748034119606, "learning_rate": 0.00017454916144932922, "loss": 0.0001, "step": 356 }, { "epoch": 0.1966265060240964, "grad_norm": 29.8394775390625, "learning_rate": 0.0001744084643941536, "loss": 0.0155, "step": 357 }, { "epoch": 0.19717728055077452, "grad_norm": 2.80316162109375, "learning_rate": 0.00017426743655124974, "loss": 0.0215, "step": 358 }, { "epoch": 0.19772805507745267, "grad_norm": 7.068621635437012, "learning_rate": 0.0001741260785475661, "loss": 0.0042, "step": 359 }, { "epoch": 0.19827882960413082, "grad_norm": 1.4078574180603027, "learning_rate": 0.00017398439101151905, "loss": 0.0012, "step": 360 }, { "epoch": 0.19882960413080894, "grad_norm": 2.0016562938690186, "learning_rate": 0.00017384237457298987, "loss": 0.0133, "step": 361 }, { "epoch": 0.1993803786574871, "grad_norm": 14.658370018005371, "learning_rate": 0.00017370002986332193, "loss": 0.0217, "step": 362 }, { "epoch": 0.19993115318416524, "grad_norm": 8.145126342773438, "learning_rate": 0.00017355735751531807, "loss": 0.0095, "step": 363 }, { "epoch": 0.20048192771084336, "grad_norm": 10.614785194396973, "learning_rate": 0.00017341435816323756, "loss": 0.0416, "step": 364 }, { "epoch": 0.2010327022375215, "grad_norm": 1.0934712886810303, "learning_rate": 0.00017327103244279348, "loss": 0.0018, "step": 365 }, { "epoch": 0.20158347676419966, "grad_norm": 0.18148697912693024, "learning_rate": 0.00017312738099114973, "loss": 0.0003, "step": 366 }, { "epoch": 0.2021342512908778, "grad_norm": 5.038586616516113, "learning_rate": 0.00017298340444691835, "loss": 0.0148, "step": 367 }, { "epoch": 0.20268502581755593, "grad_norm": 2.103548288345337, "learning_rate": 0.00017283910345015647, "loss": 0.0007, "step": 368 }, { "epoch": 0.20323580034423408, "grad_norm": 8.434141159057617, "learning_rate": 0.0001726944786423637, "loss": 0.0074, "step": 369 }, { "epoch": 0.20378657487091223, "grad_norm": 12.97700023651123, "learning_rate": 0.00017254953066647913, "loss": 0.0407, "step": 370 }, { "epoch": 0.20433734939759035, "grad_norm": 8.532750129699707, "learning_rate": 0.00017240426016687863, "loss": 0.0181, "step": 371 }, { "epoch": 0.2048881239242685, "grad_norm": 0.8986814022064209, "learning_rate": 0.00017225866778937165, "loss": 0.0008, "step": 372 }, { "epoch": 0.20543889845094665, "grad_norm": 9.588179588317871, "learning_rate": 0.00017211275418119876, "loss": 0.0105, "step": 373 }, { "epoch": 0.20598967297762477, "grad_norm": 7.790585994720459, "learning_rate": 0.0001719665199910285, "loss": 0.0175, "step": 374 }, { "epoch": 0.20654044750430292, "grad_norm": 11.345576286315918, "learning_rate": 0.00017181996586895454, "loss": 0.0036, "step": 375 }, { "epoch": 0.20654044750430292, "eval_loss": 0.01565716229379177, "eval_runtime": 233.3468, "eval_samples_per_second": 13.105, "eval_steps_per_second": 6.552, "step": 375 }, { "epoch": 0.20709122203098107, "grad_norm": 9.130244255065918, "learning_rate": 0.00017167309246649297, "loss": 0.0522, "step": 376 }, { "epoch": 0.20764199655765922, "grad_norm": 8.900921821594238, "learning_rate": 0.0001715259004365791, "loss": 0.0118, "step": 377 }, { "epoch": 0.20819277108433734, "grad_norm": 0.20976708829402924, "learning_rate": 0.00017137839043356484, "loss": 0.0002, "step": 378 }, { "epoch": 0.2087435456110155, "grad_norm": 33.07256317138672, "learning_rate": 0.00017123056311321562, "loss": 0.0298, "step": 379 }, { "epoch": 0.20929432013769364, "grad_norm": 6.082475185394287, "learning_rate": 0.0001710824191327075, "loss": 0.01, "step": 380 }, { "epoch": 0.20984509466437176, "grad_norm": 59.199100494384766, "learning_rate": 0.00017093395915062428, "loss": 0.0452, "step": 381 }, { "epoch": 0.2103958691910499, "grad_norm": 7.921163558959961, "learning_rate": 0.00017078518382695465, "loss": 0.0068, "step": 382 }, { "epoch": 0.21094664371772806, "grad_norm": 0.03215315565466881, "learning_rate": 0.00017063609382308908, "loss": 0.0001, "step": 383 }, { "epoch": 0.2114974182444062, "grad_norm": 11.410179138183594, "learning_rate": 0.00017048668980181698, "loss": 0.0038, "step": 384 }, { "epoch": 0.21204819277108433, "grad_norm": 6.6274261474609375, "learning_rate": 0.00017033697242732377, "loss": 0.0015, "step": 385 }, { "epoch": 0.21259896729776248, "grad_norm": 9.298541069030762, "learning_rate": 0.0001701869423651879, "loss": 0.0083, "step": 386 }, { "epoch": 0.21314974182444063, "grad_norm": 5.331132888793945, "learning_rate": 0.00017003660028237793, "loss": 0.0055, "step": 387 }, { "epoch": 0.21370051635111875, "grad_norm": 14.713404655456543, "learning_rate": 0.00016988594684724947, "loss": 0.0153, "step": 388 }, { "epoch": 0.2142512908777969, "grad_norm": 8.523561477661133, "learning_rate": 0.00016973498272954222, "loss": 0.0175, "step": 389 }, { "epoch": 0.21480206540447505, "grad_norm": 1.6441515684127808, "learning_rate": 0.00016958370860037717, "loss": 0.0008, "step": 390 }, { "epoch": 0.21535283993115317, "grad_norm": 0.8777960538864136, "learning_rate": 0.00016943212513225345, "loss": 0.0007, "step": 391 }, { "epoch": 0.21590361445783132, "grad_norm": 3.6094653606414795, "learning_rate": 0.00016928023299904533, "loss": 0.0255, "step": 392 }, { "epoch": 0.21645438898450947, "grad_norm": 9.559914588928223, "learning_rate": 0.0001691280328759992, "loss": 0.0584, "step": 393 }, { "epoch": 0.21700516351118762, "grad_norm": 11.264703750610352, "learning_rate": 0.00016897552543973084, "loss": 0.0088, "step": 394 }, { "epoch": 0.21755593803786574, "grad_norm": 12.385393142700195, "learning_rate": 0.00016882271136822206, "loss": 0.024, "step": 395 }, { "epoch": 0.2181067125645439, "grad_norm": 10.668384552001953, "learning_rate": 0.0001686695913408179, "loss": 0.0372, "step": 396 }, { "epoch": 0.21865748709122204, "grad_norm": 1.1692023277282715, "learning_rate": 0.0001685161660382235, "loss": 0.0014, "step": 397 }, { "epoch": 0.21920826161790016, "grad_norm": 14.253864288330078, "learning_rate": 0.00016836243614250113, "loss": 0.0098, "step": 398 }, { "epoch": 0.2197590361445783, "grad_norm": 0.03801527991890907, "learning_rate": 0.00016820840233706719, "loss": 0.0001, "step": 399 }, { "epoch": 0.22030981067125646, "grad_norm": 10.528767585754395, "learning_rate": 0.0001680540653066891, "loss": 0.0473, "step": 400 }, { "epoch": 0.22086058519793458, "grad_norm": 24.44769859313965, "learning_rate": 0.00016789942573748232, "loss": 0.0699, "step": 401 }, { "epoch": 0.22141135972461273, "grad_norm": 10.922078132629395, "learning_rate": 0.0001677444843169072, "loss": 0.0328, "step": 402 }, { "epoch": 0.22196213425129088, "grad_norm": 7.12520694732666, "learning_rate": 0.00016758924173376603, "loss": 0.018, "step": 403 }, { "epoch": 0.22251290877796903, "grad_norm": 60.143558502197266, "learning_rate": 0.0001674336986781999, "loss": 0.0187, "step": 404 }, { "epoch": 0.22306368330464715, "grad_norm": 1.9462488889694214, "learning_rate": 0.00016727785584168581, "loss": 0.0212, "step": 405 }, { "epoch": 0.2236144578313253, "grad_norm": 5.15855073928833, "learning_rate": 0.0001671217139170333, "loss": 0.0126, "step": 406 }, { "epoch": 0.22416523235800345, "grad_norm": 0.5160714983940125, "learning_rate": 0.00016696527359838154, "loss": 0.0006, "step": 407 }, { "epoch": 0.22471600688468157, "grad_norm": 0.2832033336162567, "learning_rate": 0.00016680853558119632, "loss": 0.0004, "step": 408 }, { "epoch": 0.22526678141135972, "grad_norm": 8.852500915527344, "learning_rate": 0.0001666515005622668, "loss": 0.0276, "step": 409 }, { "epoch": 0.22581755593803787, "grad_norm": 13.498091697692871, "learning_rate": 0.0001664941692397025, "loss": 0.0423, "step": 410 }, { "epoch": 0.226368330464716, "grad_norm": 6.0677337646484375, "learning_rate": 0.00016633654231293013, "loss": 0.044, "step": 411 }, { "epoch": 0.22691910499139414, "grad_norm": 0.19278694689273834, "learning_rate": 0.00016617862048269065, "loss": 0.0004, "step": 412 }, { "epoch": 0.2274698795180723, "grad_norm": 3.3796088695526123, "learning_rate": 0.00016602040445103588, "loss": 0.0059, "step": 413 }, { "epoch": 0.22802065404475044, "grad_norm": 0.4880172312259674, "learning_rate": 0.00016586189492132566, "loss": 0.0008, "step": 414 }, { "epoch": 0.22857142857142856, "grad_norm": 0.16374406218528748, "learning_rate": 0.00016570309259822453, "loss": 0.0003, "step": 415 }, { "epoch": 0.2291222030981067, "grad_norm": 0.14378395676612854, "learning_rate": 0.0001655439981876987, "loss": 0.0004, "step": 416 }, { "epoch": 0.22967297762478486, "grad_norm": 5.660022258758545, "learning_rate": 0.00016538461239701277, "loss": 0.0095, "step": 417 }, { "epoch": 0.23022375215146298, "grad_norm": 9.309393882751465, "learning_rate": 0.00016522493593472683, "loss": 0.0149, "step": 418 }, { "epoch": 0.23077452667814113, "grad_norm": 4.814165115356445, "learning_rate": 0.0001650649695106931, "loss": 0.0174, "step": 419 }, { "epoch": 0.23132530120481928, "grad_norm": 0.06857752054929733, "learning_rate": 0.00016490471383605288, "loss": 0.0001, "step": 420 }, { "epoch": 0.2318760757314974, "grad_norm": 7.606141567230225, "learning_rate": 0.00016474416962323325, "loss": 0.0168, "step": 421 }, { "epoch": 0.23242685025817555, "grad_norm": 1.7405983209609985, "learning_rate": 0.00016458333758594414, "loss": 0.0189, "step": 422 }, { "epoch": 0.2329776247848537, "grad_norm": 11.488520622253418, "learning_rate": 0.00016442221843917496, "loss": 0.021, "step": 423 }, { "epoch": 0.23352839931153185, "grad_norm": 6.9766082763671875, "learning_rate": 0.00016426081289919143, "loss": 0.0169, "step": 424 }, { "epoch": 0.23407917383820998, "grad_norm": 3.3361473083496094, "learning_rate": 0.0001640991216835326, "loss": 0.0145, "step": 425 }, { "epoch": 0.23462994836488812, "grad_norm": 15.612713813781738, "learning_rate": 0.00016393714551100734, "loss": 0.0043, "step": 426 }, { "epoch": 0.23518072289156627, "grad_norm": 79.22930908203125, "learning_rate": 0.0001637748851016914, "loss": 0.0522, "step": 427 }, { "epoch": 0.2357314974182444, "grad_norm": 13.727797508239746, "learning_rate": 0.00016361234117692413, "loss": 0.0092, "step": 428 }, { "epoch": 0.23628227194492255, "grad_norm": 1.2949936389923096, "learning_rate": 0.00016344951445930526, "loss": 0.0024, "step": 429 }, { "epoch": 0.2368330464716007, "grad_norm": 4.6701273918151855, "learning_rate": 0.0001632864056726917, "loss": 0.0065, "step": 430 }, { "epoch": 0.23738382099827882, "grad_norm": 18.505714416503906, "learning_rate": 0.00016312301554219426, "loss": 0.0286, "step": 431 }, { "epoch": 0.23793459552495697, "grad_norm": 0.48056456446647644, "learning_rate": 0.00016295934479417453, "loss": 0.001, "step": 432 }, { "epoch": 0.23848537005163511, "grad_norm": 7.874746799468994, "learning_rate": 0.00016279539415624164, "loss": 0.0329, "step": 433 }, { "epoch": 0.23903614457831326, "grad_norm": 5.7578511238098145, "learning_rate": 0.0001626311643572489, "loss": 0.023, "step": 434 }, { "epoch": 0.23958691910499139, "grad_norm": 1.9401549100875854, "learning_rate": 0.00016246665612729074, "loss": 0.0021, "step": 435 }, { "epoch": 0.24013769363166954, "grad_norm": 0.5277252793312073, "learning_rate": 0.00016230187019769928, "loss": 0.001, "step": 436 }, { "epoch": 0.24068846815834768, "grad_norm": 7.538656234741211, "learning_rate": 0.00016213680730104124, "loss": 0.01, "step": 437 }, { "epoch": 0.2412392426850258, "grad_norm": 7.437373161315918, "learning_rate": 0.0001619714681711146, "loss": 0.0117, "step": 438 }, { "epoch": 0.24179001721170396, "grad_norm": 11.190563201904297, "learning_rate": 0.00016180585354294536, "loss": 0.0088, "step": 439 }, { "epoch": 0.2423407917383821, "grad_norm": 10.174369812011719, "learning_rate": 0.00016163996415278424, "loss": 0.0092, "step": 440 }, { "epoch": 0.24289156626506025, "grad_norm": 14.066507339477539, "learning_rate": 0.00016147380073810346, "loss": 0.016, "step": 441 }, { "epoch": 0.24344234079173838, "grad_norm": 0.6655990481376648, "learning_rate": 0.0001613073640375934, "loss": 0.0006, "step": 442 }, { "epoch": 0.24399311531841653, "grad_norm": 6.541067123413086, "learning_rate": 0.00016114065479115946, "loss": 0.0329, "step": 443 }, { "epoch": 0.24454388984509468, "grad_norm": 1.1429013013839722, "learning_rate": 0.00016097367373991842, "loss": 0.0012, "step": 444 }, { "epoch": 0.2450946643717728, "grad_norm": 12.869746208190918, "learning_rate": 0.00016080642162619565, "loss": 0.0228, "step": 445 }, { "epoch": 0.24564543889845095, "grad_norm": 4.195258140563965, "learning_rate": 0.0001606388991935214, "loss": 0.0057, "step": 446 }, { "epoch": 0.2461962134251291, "grad_norm": 2.13649845123291, "learning_rate": 0.0001604711071866277, "loss": 0.0007, "step": 447 }, { "epoch": 0.24674698795180722, "grad_norm": 1.2508876323699951, "learning_rate": 0.00016030304635144494, "loss": 0.0014, "step": 448 }, { "epoch": 0.24729776247848537, "grad_norm": 8.783740997314453, "learning_rate": 0.00016013471743509862, "loss": 0.0342, "step": 449 }, { "epoch": 0.24784853700516352, "grad_norm": 2.1866512298583984, "learning_rate": 0.00015996612118590603, "loss": 0.0015, "step": 450 }, { "epoch": 0.24839931153184167, "grad_norm": 35.168540954589844, "learning_rate": 0.00015979725835337294, "loss": 0.0848, "step": 451 }, { "epoch": 0.2489500860585198, "grad_norm": 69.21905517578125, "learning_rate": 0.00015962812968819016, "loss": 0.0529, "step": 452 }, { "epoch": 0.24950086058519794, "grad_norm": 5.425115585327148, "learning_rate": 0.0001594587359422303, "loss": 0.0155, "step": 453 }, { "epoch": 0.25005163511187606, "grad_norm": 8.447904586791992, "learning_rate": 0.0001592890778685444, "loss": 0.0102, "step": 454 }, { "epoch": 0.25060240963855424, "grad_norm": 14.610235214233398, "learning_rate": 0.00015911915622135862, "loss": 0.0319, "step": 455 }, { "epoch": 0.25115318416523236, "grad_norm": 23.367507934570312, "learning_rate": 0.00015894897175607086, "loss": 0.0133, "step": 456 }, { "epoch": 0.2517039586919105, "grad_norm": 7.306400299072266, "learning_rate": 0.00015877852522924732, "loss": 0.0278, "step": 457 }, { "epoch": 0.25225473321858866, "grad_norm": 17.951114654541016, "learning_rate": 0.00015860781739861928, "loss": 0.0486, "step": 458 }, { "epoch": 0.2528055077452668, "grad_norm": 8.91996955871582, "learning_rate": 0.00015843684902307962, "loss": 0.0057, "step": 459 }, { "epoch": 0.2533562822719449, "grad_norm": 5.62592077255249, "learning_rate": 0.00015826562086267956, "loss": 0.0047, "step": 460 }, { "epoch": 0.2539070567986231, "grad_norm": 0.9356241822242737, "learning_rate": 0.00015809413367862512, "loss": 0.001, "step": 461 }, { "epoch": 0.2544578313253012, "grad_norm": 0.14365902543067932, "learning_rate": 0.00015792238823327388, "loss": 0.0003, "step": 462 }, { "epoch": 0.2550086058519793, "grad_norm": 1.1473212242126465, "learning_rate": 0.00015775038529013152, "loss": 0.0012, "step": 463 }, { "epoch": 0.2555593803786575, "grad_norm": 0.07937091588973999, "learning_rate": 0.0001575781256138485, "loss": 0.0001, "step": 464 }, { "epoch": 0.2561101549053356, "grad_norm": 9.727457046508789, "learning_rate": 0.00015740560997021648, "loss": 0.0087, "step": 465 }, { "epoch": 0.2566609294320138, "grad_norm": 3.2500722408294678, "learning_rate": 0.00015723283912616513, "loss": 0.0024, "step": 466 }, { "epoch": 0.2572117039586919, "grad_norm": 0.03407001867890358, "learning_rate": 0.00015705981384975866, "loss": 0.0, "step": 467 }, { "epoch": 0.25776247848537004, "grad_norm": 281.6208190917969, "learning_rate": 0.0001568865349101923, "loss": 0.1144, "step": 468 }, { "epoch": 0.2583132530120482, "grad_norm": 24.475414276123047, "learning_rate": 0.00015671300307778898, "loss": 0.044, "step": 469 }, { "epoch": 0.25886402753872634, "grad_norm": 38.64067077636719, "learning_rate": 0.00015653921912399589, "loss": 0.0169, "step": 470 }, { "epoch": 0.25941480206540446, "grad_norm": 13.259678840637207, "learning_rate": 0.00015636518382138107, "loss": 0.0497, "step": 471 }, { "epoch": 0.25996557659208264, "grad_norm": 0.1407472789287567, "learning_rate": 0.0001561908979436299, "loss": 0.0001, "step": 472 }, { "epoch": 0.26051635111876076, "grad_norm": 0.06780751049518585, "learning_rate": 0.00015601636226554168, "loss": 0.0, "step": 473 }, { "epoch": 0.2610671256454389, "grad_norm": 0.016142461448907852, "learning_rate": 0.00015584157756302634, "loss": 0.0, "step": 474 }, { "epoch": 0.26161790017211706, "grad_norm": 3.1208982467651367, "learning_rate": 0.0001556665446131007, "loss": 0.0005, "step": 475 }, { "epoch": 0.2621686746987952, "grad_norm": 1.6359968185424805, "learning_rate": 0.00015549126419388536, "loss": 0.0008, "step": 476 }, { "epoch": 0.2627194492254733, "grad_norm": 12.597987174987793, "learning_rate": 0.0001553157370846009, "loss": 0.0084, "step": 477 }, { "epoch": 0.2632702237521515, "grad_norm": 2.9279329776763916, "learning_rate": 0.00015513996406556465, "loss": 0.0026, "step": 478 }, { "epoch": 0.2638209982788296, "grad_norm": 9.96490478515625, "learning_rate": 0.00015496394591818716, "loss": 0.0151, "step": 479 }, { "epoch": 0.2643717728055077, "grad_norm": 3.191298484802246, "learning_rate": 0.0001547876834249687, "loss": 0.0049, "step": 480 }, { "epoch": 0.2649225473321859, "grad_norm": 97.6749496459961, "learning_rate": 0.00015461117736949577, "loss": 0.0144, "step": 481 }, { "epoch": 0.265473321858864, "grad_norm": 19.6650333404541, "learning_rate": 0.00015443442853643762, "loss": 0.0493, "step": 482 }, { "epoch": 0.26602409638554214, "grad_norm": 11.947951316833496, "learning_rate": 0.00015425743771154294, "loss": 0.0208, "step": 483 }, { "epoch": 0.2665748709122203, "grad_norm": 2.060878038406372, "learning_rate": 0.00015408020568163602, "loss": 0.0021, "step": 484 }, { "epoch": 0.26712564543889844, "grad_norm": 18.24596405029297, "learning_rate": 0.00015390273323461352, "loss": 0.0458, "step": 485 }, { "epoch": 0.2676764199655766, "grad_norm": 3.655306100845337, "learning_rate": 0.0001537250211594409, "loss": 0.0149, "step": 486 }, { "epoch": 0.26822719449225474, "grad_norm": 5.758177757263184, "learning_rate": 0.0001535470702461489, "loss": 0.0027, "step": 487 }, { "epoch": 0.26877796901893286, "grad_norm": 0.024112781509757042, "learning_rate": 0.00015336888128583, "loss": 0.0, "step": 488 }, { "epoch": 0.26932874354561104, "grad_norm": 10.2376708984375, "learning_rate": 0.000153190455070635, "loss": 0.0163, "step": 489 }, { "epoch": 0.26987951807228916, "grad_norm": 0.0794905498623848, "learning_rate": 0.00015301179239376938, "loss": 0.0001, "step": 490 }, { "epoch": 0.2704302925989673, "grad_norm": 23.45412254333496, "learning_rate": 0.00015283289404948976, "loss": 0.0243, "step": 491 }, { "epoch": 0.27098106712564546, "grad_norm": 5.440088272094727, "learning_rate": 0.0001526537608331006, "loss": 0.0141, "step": 492 }, { "epoch": 0.2715318416523236, "grad_norm": 3.6664321422576904, "learning_rate": 0.00015247439354095041, "loss": 0.007, "step": 493 }, { "epoch": 0.2720826161790017, "grad_norm": 1.5260258913040161, "learning_rate": 0.00015229479297042823, "loss": 0.0008, "step": 494 }, { "epoch": 0.2726333907056799, "grad_norm": 0.25935593247413635, "learning_rate": 0.00015211495991996027, "loss": 0.0004, "step": 495 }, { "epoch": 0.273184165232358, "grad_norm": 2.942270278930664, "learning_rate": 0.0001519348951890062, "loss": 0.0008, "step": 496 }, { "epoch": 0.2737349397590361, "grad_norm": 20.422147750854492, "learning_rate": 0.0001517545995780556, "loss": 0.0026, "step": 497 }, { "epoch": 0.2742857142857143, "grad_norm": 0.3070022761821747, "learning_rate": 0.00015157407388862452, "loss": 0.0003, "step": 498 }, { "epoch": 0.2748364888123924, "grad_norm": 0.10984133183956146, "learning_rate": 0.00015139331892325179, "loss": 0.0001, "step": 499 }, { "epoch": 0.27538726333907054, "grad_norm": 0.3376227021217346, "learning_rate": 0.0001512123354854955, "loss": 0.0002, "step": 500 }, { "epoch": 0.2759380378657487, "grad_norm": 19.875431060791016, "learning_rate": 0.0001510311243799295, "loss": 0.0341, "step": 501 }, { "epoch": 0.27648881239242684, "grad_norm": 8.290825843811035, "learning_rate": 0.00015084968641213958, "loss": 0.0386, "step": 502 }, { "epoch": 0.277039586919105, "grad_norm": 8.833995819091797, "learning_rate": 0.00015066802238872023, "loss": 0.0204, "step": 503 }, { "epoch": 0.27759036144578314, "grad_norm": 4.0433878898620605, "learning_rate": 0.0001504861331172709, "loss": 0.0018, "step": 504 }, { "epoch": 0.27814113597246126, "grad_norm": 2.896510601043701, "learning_rate": 0.0001503040194063922, "loss": 0.0091, "step": 505 }, { "epoch": 0.27869191049913944, "grad_norm": 0.11457529664039612, "learning_rate": 0.00015012168206568268, "loss": 0.0002, "step": 506 }, { "epoch": 0.27924268502581756, "grad_norm": 4.713685035705566, "learning_rate": 0.00014993912190573505, "loss": 0.0099, "step": 507 }, { "epoch": 0.2797934595524957, "grad_norm": 0.14999738335609436, "learning_rate": 0.00014975633973813242, "loss": 0.0001, "step": 508 }, { "epoch": 0.28034423407917386, "grad_norm": 3.3715879917144775, "learning_rate": 0.00014957333637544503, "loss": 0.0045, "step": 509 }, { "epoch": 0.280895008605852, "grad_norm": 7.687290668487549, "learning_rate": 0.00014939011263122634, "loss": 0.0176, "step": 510 }, { "epoch": 0.2814457831325301, "grad_norm": 2.6134235858917236, "learning_rate": 0.0001492066693200096, "loss": 0.0016, "step": 511 }, { "epoch": 0.2819965576592083, "grad_norm": 0.03317194804549217, "learning_rate": 0.00014902300725730413, "loss": 0.0, "step": 512 }, { "epoch": 0.2825473321858864, "grad_norm": 6.015658378601074, "learning_rate": 0.00014883912725959167, "loss": 0.0035, "step": 513 }, { "epoch": 0.2830981067125645, "grad_norm": 0.8223051428794861, "learning_rate": 0.00014865503014432292, "loss": 0.0007, "step": 514 }, { "epoch": 0.2836488812392427, "grad_norm": 12.35944938659668, "learning_rate": 0.00014847071672991367, "loss": 0.0769, "step": 515 }, { "epoch": 0.2841996557659208, "grad_norm": 0.006626329850405455, "learning_rate": 0.0001482861878357414, "loss": 0.0, "step": 516 }, { "epoch": 0.28475043029259894, "grad_norm": 0.2557137906551361, "learning_rate": 0.00014810144428214144, "loss": 0.0001, "step": 517 }, { "epoch": 0.2853012048192771, "grad_norm": 12.677950859069824, "learning_rate": 0.0001479164868904034, "loss": 0.0122, "step": 518 }, { "epoch": 0.28585197934595524, "grad_norm": 3.604584217071533, "learning_rate": 0.00014773131648276758, "loss": 0.003, "step": 519 }, { "epoch": 0.28640275387263336, "grad_norm": 10.6746826171875, "learning_rate": 0.00014754593388242117, "loss": 0.0165, "step": 520 }, { "epoch": 0.28695352839931154, "grad_norm": 2.4383175373077393, "learning_rate": 0.0001473603399134948, "loss": 0.0016, "step": 521 }, { "epoch": 0.28750430292598966, "grad_norm": 3.8901569843292236, "learning_rate": 0.0001471745354010586, "loss": 0.0025, "step": 522 }, { "epoch": 0.28805507745266784, "grad_norm": 757.5812377929688, "learning_rate": 0.00014698852117111884, "loss": 0.1207, "step": 523 }, { "epoch": 0.28860585197934596, "grad_norm": 2.0592381954193115, "learning_rate": 0.000146802298050614, "loss": 0.0009, "step": 524 }, { "epoch": 0.2891566265060241, "grad_norm": 0.02510848268866539, "learning_rate": 0.0001466158668674112, "loss": 0.0, "step": 525 }, { "epoch": 0.28970740103270226, "grad_norm": 4.278772830963135, "learning_rate": 0.00014642922845030257, "loss": 0.0075, "step": 526 }, { "epoch": 0.2902581755593804, "grad_norm": 4.709321022033691, "learning_rate": 0.0001462423836290015, "loss": 0.0104, "step": 527 }, { "epoch": 0.2908089500860585, "grad_norm": 8.028677940368652, "learning_rate": 0.00014605533323413887, "loss": 0.0153, "step": 528 }, { "epoch": 0.2913597246127367, "grad_norm": 0.46918410062789917, "learning_rate": 0.00014586807809725962, "loss": 0.0002, "step": 529 }, { "epoch": 0.2919104991394148, "grad_norm": 16.962570190429688, "learning_rate": 0.00014568061905081875, "loss": 0.0112, "step": 530 }, { "epoch": 0.2924612736660929, "grad_norm": 0.0195084810256958, "learning_rate": 0.00014549295692817778, "loss": 0.0, "step": 531 }, { "epoch": 0.2930120481927711, "grad_norm": 2.102169990539551, "learning_rate": 0.00014530509256360102, "loss": 0.0046, "step": 532 }, { "epoch": 0.2935628227194492, "grad_norm": 2.9148740768432617, "learning_rate": 0.00014511702679225193, "loss": 0.0015, "step": 533 }, { "epoch": 0.29411359724612735, "grad_norm": 1.496279239654541, "learning_rate": 0.0001449287604501893, "loss": 0.0006, "step": 534 }, { "epoch": 0.2946643717728055, "grad_norm": 7.43492317199707, "learning_rate": 0.00014474029437436348, "loss": 0.02, "step": 535 }, { "epoch": 0.29521514629948364, "grad_norm": 19.925079345703125, "learning_rate": 0.00014455162940261285, "loss": 0.0361, "step": 536 }, { "epoch": 0.29576592082616177, "grad_norm": 0.7218440771102905, "learning_rate": 0.0001443627663736599, "loss": 0.0002, "step": 537 }, { "epoch": 0.29631669535283994, "grad_norm": 19.624330520629883, "learning_rate": 0.00014417370612710778, "loss": 0.0417, "step": 538 }, { "epoch": 0.29686746987951806, "grad_norm": 0.15112939476966858, "learning_rate": 0.00014398444950343623, "loss": 0.0001, "step": 539 }, { "epoch": 0.2974182444061962, "grad_norm": 7.219048976898193, "learning_rate": 0.00014379499734399798, "loss": 0.0191, "step": 540 }, { "epoch": 0.29796901893287436, "grad_norm": 0.18765327334403992, "learning_rate": 0.0001436053504910151, "loss": 0.0001, "step": 541 }, { "epoch": 0.2985197934595525, "grad_norm": 0.3138192892074585, "learning_rate": 0.0001434155097875752, "loss": 0.0, "step": 542 }, { "epoch": 0.29907056798623066, "grad_norm": 0.8192682862281799, "learning_rate": 0.00014322547607762762, "loss": 0.0003, "step": 543 }, { "epoch": 0.2996213425129088, "grad_norm": 0.06458425521850586, "learning_rate": 0.0001430352502059797, "loss": 0.0, "step": 544 }, { "epoch": 0.3001721170395869, "grad_norm": 17.617156982421875, "learning_rate": 0.0001428448330182931, "loss": 0.0032, "step": 545 }, { "epoch": 0.3007228915662651, "grad_norm": 10.481761932373047, "learning_rate": 0.00014265422536107993, "loss": 0.0178, "step": 546 }, { "epoch": 0.3012736660929432, "grad_norm": 0.011122568510472775, "learning_rate": 0.00014246342808169914, "loss": 0.0, "step": 547 }, { "epoch": 0.3018244406196213, "grad_norm": 0.05860462784767151, "learning_rate": 0.00014227244202835257, "loss": 0.0, "step": 548 }, { "epoch": 0.3023752151462995, "grad_norm": 15.055013656616211, "learning_rate": 0.0001420812680500813, "loss": 0.0069, "step": 549 }, { "epoch": 0.3029259896729776, "grad_norm": 136.0084228515625, "learning_rate": 0.00014188990699676184, "loss": 0.0187, "step": 550 }, { "epoch": 0.30347676419965575, "grad_norm": 44.47858810424805, "learning_rate": 0.00014169835971910238, "loss": 0.1126, "step": 551 }, { "epoch": 0.3040275387263339, "grad_norm": 16.49810791015625, "learning_rate": 0.0001415066270686389, "loss": 0.0142, "step": 552 }, { "epoch": 0.30457831325301205, "grad_norm": 1.3499763011932373, "learning_rate": 0.00014131470989773158, "loss": 0.0007, "step": 553 }, { "epoch": 0.30512908777969017, "grad_norm": 1.1182827949523926, "learning_rate": 0.0001411226090595608, "loss": 0.0008, "step": 554 }, { "epoch": 0.30567986230636834, "grad_norm": 2.274522542953491, "learning_rate": 0.00014093032540812348, "loss": 0.0012, "step": 555 }, { "epoch": 0.30623063683304647, "grad_norm": 0.14136746525764465, "learning_rate": 0.0001407378597982293, "loss": 0.0002, "step": 556 }, { "epoch": 0.3067814113597246, "grad_norm": 28.30078887939453, "learning_rate": 0.00014054521308549673, "loss": 0.011, "step": 557 }, { "epoch": 0.30733218588640276, "grad_norm": 0.12664511799812317, "learning_rate": 0.0001403523861263495, "loss": 0.0001, "step": 558 }, { "epoch": 0.3078829604130809, "grad_norm": 5.084848880767822, "learning_rate": 0.00014015937977801256, "loss": 0.0129, "step": 559 }, { "epoch": 0.30843373493975906, "grad_norm": 0.7848260998725891, "learning_rate": 0.00013996619489850822, "loss": 0.0005, "step": 560 }, { "epoch": 0.3089845094664372, "grad_norm": 0.38129305839538574, "learning_rate": 0.00013977283234665273, "loss": 0.0003, "step": 561 }, { "epoch": 0.3095352839931153, "grad_norm": 0.009055509231984615, "learning_rate": 0.00013957929298205195, "loss": 0.0, "step": 562 }, { "epoch": 0.3100860585197935, "grad_norm": 0.02733941748738289, "learning_rate": 0.00013938557766509792, "loss": 0.0, "step": 563 }, { "epoch": 0.3106368330464716, "grad_norm": 0.014881683513522148, "learning_rate": 0.0001391916872569648, "loss": 0.0, "step": 564 }, { "epoch": 0.3111876075731497, "grad_norm": 0.01753387600183487, "learning_rate": 0.00013899762261960518, "loss": 0.0, "step": 565 }, { "epoch": 0.3117383820998279, "grad_norm": 0.013407194055616856, "learning_rate": 0.0001388033846157462, "loss": 0.0, "step": 566 }, { "epoch": 0.312289156626506, "grad_norm": 0.007663563825190067, "learning_rate": 0.0001386089741088857, "loss": 0.0, "step": 567 }, { "epoch": 0.31283993115318415, "grad_norm": 4.376173973083496, "learning_rate": 0.00013841439196328836, "loss": 0.0092, "step": 568 }, { "epoch": 0.3133907056798623, "grad_norm": 39.19396209716797, "learning_rate": 0.00013821963904398193, "loss": 0.059, "step": 569 }, { "epoch": 0.31394148020654045, "grad_norm": 12.558430671691895, "learning_rate": 0.00013802471621675338, "loss": 0.0471, "step": 570 }, { "epoch": 0.31449225473321857, "grad_norm": 14.614710807800293, "learning_rate": 0.00013782962434814492, "loss": 0.0247, "step": 571 }, { "epoch": 0.31504302925989675, "grad_norm": 19.138317108154297, "learning_rate": 0.00013763436430545034, "loss": 0.0376, "step": 572 }, { "epoch": 0.31559380378657487, "grad_norm": 23.286149978637695, "learning_rate": 0.00013743893695671096, "loss": 0.0247, "step": 573 }, { "epoch": 0.316144578313253, "grad_norm": 8.172500610351562, "learning_rate": 0.00013724334317071198, "loss": 0.0158, "step": 574 }, { "epoch": 0.31669535283993117, "grad_norm": 5.787549018859863, "learning_rate": 0.00013704758381697844, "loss": 0.0124, "step": 575 }, { "epoch": 0.3172461273666093, "grad_norm": 37.85810852050781, "learning_rate": 0.00013685165976577146, "loss": 0.0562, "step": 576 }, { "epoch": 0.3177969018932874, "grad_norm": 13.53454875946045, "learning_rate": 0.0001366555718880843, "loss": 0.0095, "step": 577 }, { "epoch": 0.3183476764199656, "grad_norm": 4.110681056976318, "learning_rate": 0.00013645932105563844, "loss": 0.0065, "step": 578 }, { "epoch": 0.3188984509466437, "grad_norm": 19.304405212402344, "learning_rate": 0.00013626290814088005, "loss": 0.0211, "step": 579 }, { "epoch": 0.3194492254733219, "grad_norm": 9.172099113464355, "learning_rate": 0.00013606633401697557, "loss": 0.0133, "step": 580 }, { "epoch": 0.32, "grad_norm": 3.5083987712860107, "learning_rate": 0.00013586959955780824, "loss": 0.0068, "step": 581 }, { "epoch": 0.32055077452667813, "grad_norm": 6.262914657592773, "learning_rate": 0.00013567270563797398, "loss": 0.0041, "step": 582 }, { "epoch": 0.3211015490533563, "grad_norm": 13.267187118530273, "learning_rate": 0.00013547565313277776, "loss": 0.0048, "step": 583 }, { "epoch": 0.3216523235800344, "grad_norm": 15.806163787841797, "learning_rate": 0.00013527844291822948, "loss": 0.0154, "step": 584 }, { "epoch": 0.32220309810671255, "grad_norm": 1.3934779167175293, "learning_rate": 0.0001350810758710401, "loss": 0.0007, "step": 585 }, { "epoch": 0.3227538726333907, "grad_norm": 0.1373090147972107, "learning_rate": 0.00013488355286861783, "loss": 0.0002, "step": 586 }, { "epoch": 0.32330464716006885, "grad_norm": 0.957940399646759, "learning_rate": 0.0001346858747890642, "loss": 0.0006, "step": 587 }, { "epoch": 0.32385542168674697, "grad_norm": 0.6639079451560974, "learning_rate": 0.00013448804251117003, "loss": 0.0005, "step": 588 }, { "epoch": 0.32440619621342515, "grad_norm": 3.942610025405884, "learning_rate": 0.0001342900569144119, "loss": 0.0182, "step": 589 }, { "epoch": 0.32495697074010327, "grad_norm": 1.2919187545776367, "learning_rate": 0.0001340919188789477, "loss": 0.0011, "step": 590 }, { "epoch": 0.3255077452667814, "grad_norm": 0.3621211647987366, "learning_rate": 0.00013389362928561317, "loss": 0.0002, "step": 591 }, { "epoch": 0.32605851979345957, "grad_norm": 2.909912347793579, "learning_rate": 0.00013369518901591772, "loss": 0.003, "step": 592 }, { "epoch": 0.3266092943201377, "grad_norm": 0.028715912252664566, "learning_rate": 0.00013349659895204067, "loss": 0.0, "step": 593 }, { "epoch": 0.3271600688468158, "grad_norm": 0.5734847187995911, "learning_rate": 0.0001332978599768272, "loss": 0.0001, "step": 594 }, { "epoch": 0.327710843373494, "grad_norm": 20.375886917114258, "learning_rate": 0.00013309897297378455, "loss": 0.0366, "step": 595 }, { "epoch": 0.3282616179001721, "grad_norm": 2.9586613178253174, "learning_rate": 0.00013289993882707797, "loss": 0.0009, "step": 596 }, { "epoch": 0.32881239242685023, "grad_norm": 2.027022123336792, "learning_rate": 0.00013270075842152678, "loss": 0.0007, "step": 597 }, { "epoch": 0.3293631669535284, "grad_norm": 0.678037703037262, "learning_rate": 0.00013250143264260074, "loss": 0.0002, "step": 598 }, { "epoch": 0.32991394148020653, "grad_norm": 6.7018561363220215, "learning_rate": 0.0001323019623764156, "loss": 0.0036, "step": 599 }, { "epoch": 0.3304647160068847, "grad_norm": 0.2971855401992798, "learning_rate": 0.00013210234850972964, "loss": 0.0002, "step": 600 }, { "epoch": 0.33101549053356283, "grad_norm": 16.413806915283203, "learning_rate": 0.0001319025919299394, "loss": 0.0261, "step": 601 }, { "epoch": 0.33156626506024095, "grad_norm": 14.29276180267334, "learning_rate": 0.00013170269352507597, "loss": 0.0314, "step": 602 }, { "epoch": 0.3321170395869191, "grad_norm": 0.8844265341758728, "learning_rate": 0.0001315026541838008, "loss": 0.0007, "step": 603 }, { "epoch": 0.33266781411359725, "grad_norm": 0.10956772416830063, "learning_rate": 0.00013130247479540202, "loss": 0.0, "step": 604 }, { "epoch": 0.33321858864027537, "grad_norm": 0.04536151513457298, "learning_rate": 0.00013110215624979025, "loss": 0.0, "step": 605 }, { "epoch": 0.33376936316695355, "grad_norm": 8.255972862243652, "learning_rate": 0.00013090169943749476, "loss": 0.022, "step": 606 }, { "epoch": 0.33432013769363167, "grad_norm": 2.008922576904297, "learning_rate": 0.00013070110524965954, "loss": 0.0017, "step": 607 }, { "epoch": 0.3348709122203098, "grad_norm": 1.0998557806015015, "learning_rate": 0.00013050037457803924, "loss": 0.0006, "step": 608 }, { "epoch": 0.33542168674698797, "grad_norm": 3.046152114868164, "learning_rate": 0.0001302995083149953, "loss": 0.001, "step": 609 }, { "epoch": 0.3359724612736661, "grad_norm": 0.0050095426850020885, "learning_rate": 0.0001300985073534919, "loss": 0.0, "step": 610 }, { "epoch": 0.3365232358003442, "grad_norm": 0.022635338827967644, "learning_rate": 0.00012989737258709203, "loss": 0.0, "step": 611 }, { "epoch": 0.3370740103270224, "grad_norm": 0.28050509095191956, "learning_rate": 0.00012969610490995358, "loss": 0.0, "step": 612 }, { "epoch": 0.3376247848537005, "grad_norm": 0.011976325884461403, "learning_rate": 0.00012949470521682528, "loss": 0.0, "step": 613 }, { "epoch": 0.33817555938037863, "grad_norm": 0.05875038355588913, "learning_rate": 0.0001292931744030427, "loss": 0.0, "step": 614 }, { "epoch": 0.3387263339070568, "grad_norm": 4.9109039306640625, "learning_rate": 0.0001290915133645243, "loss": 0.0004, "step": 615 }, { "epoch": 0.33927710843373493, "grad_norm": 29.8415470123291, "learning_rate": 0.00012888972299776754, "loss": 0.0403, "step": 616 }, { "epoch": 0.3398278829604131, "grad_norm": 0.3943260610103607, "learning_rate": 0.00012868780419984482, "loss": 0.0003, "step": 617 }, { "epoch": 0.34037865748709123, "grad_norm": 17.573259353637695, "learning_rate": 0.00012848575786839943, "loss": 0.0186, "step": 618 }, { "epoch": 0.34092943201376935, "grad_norm": 21.970373153686523, "learning_rate": 0.0001282835849016416, "loss": 0.0436, "step": 619 }, { "epoch": 0.34148020654044753, "grad_norm": 3.752251625061035, "learning_rate": 0.00012808128619834461, "loss": 0.0181, "step": 620 }, { "epoch": 0.34203098106712565, "grad_norm": 16.67305564880371, "learning_rate": 0.0001278788626578407, "loss": 0.0067, "step": 621 }, { "epoch": 0.34258175559380377, "grad_norm": 12.117171287536621, "learning_rate": 0.00012767631518001698, "loss": 0.0111, "step": 622 }, { "epoch": 0.34313253012048195, "grad_norm": 9.554963111877441, "learning_rate": 0.00012747364466531163, "loss": 0.0089, "step": 623 }, { "epoch": 0.34368330464716007, "grad_norm": 0.09575698524713516, "learning_rate": 0.00012727085201470973, "loss": 0.0001, "step": 624 }, { "epoch": 0.3442340791738382, "grad_norm": 2.1685287952423096, "learning_rate": 0.00012706793812973941, "loss": 0.0097, "step": 625 }, { "epoch": 0.34478485370051637, "grad_norm": 3.880866289138794, "learning_rate": 0.0001268649039124677, "loss": 0.0025, "step": 626 }, { "epoch": 0.3453356282271945, "grad_norm": 0.03563214838504791, "learning_rate": 0.00012666175026549662, "loss": 0.0, "step": 627 }, { "epoch": 0.3458864027538726, "grad_norm": 0.47583332657814026, "learning_rate": 0.000126458478091959, "loss": 0.0002, "step": 628 }, { "epoch": 0.3464371772805508, "grad_norm": 6.501410484313965, "learning_rate": 0.00012625508829551473, "loss": 0.0149, "step": 629 }, { "epoch": 0.3469879518072289, "grad_norm": 3.5434982776641846, "learning_rate": 0.00012605158178034654, "loss": 0.0088, "step": 630 }, { "epoch": 0.34753872633390703, "grad_norm": 11.453561782836914, "learning_rate": 0.00012584795945115603, "loss": 0.0159, "step": 631 }, { "epoch": 0.3480895008605852, "grad_norm": 4.885062217712402, "learning_rate": 0.0001256442222131597, "loss": 0.0082, "step": 632 }, { "epoch": 0.34864027538726333, "grad_norm": 9.836543083190918, "learning_rate": 0.0001254403709720848, "loss": 0.0108, "step": 633 }, { "epoch": 0.34919104991394145, "grad_norm": 0.013475382700562477, "learning_rate": 0.0001252364066341655, "loss": 0.0, "step": 634 }, { "epoch": 0.34974182444061963, "grad_norm": 21.27243995666504, "learning_rate": 0.00012503233010613865, "loss": 0.0276, "step": 635 }, { "epoch": 0.35029259896729775, "grad_norm": 7.823695659637451, "learning_rate": 0.00012482814229523997, "loss": 0.0189, "step": 636 }, { "epoch": 0.35084337349397593, "grad_norm": 0.056137245148420334, "learning_rate": 0.00012462384410919975, "loss": 0.0001, "step": 637 }, { "epoch": 0.35139414802065405, "grad_norm": 10.407916069030762, "learning_rate": 0.00012441943645623903, "loss": 0.0118, "step": 638 }, { "epoch": 0.3519449225473322, "grad_norm": 4.3115739822387695, "learning_rate": 0.00012421492024506555, "loss": 0.0082, "step": 639 }, { "epoch": 0.35249569707401035, "grad_norm": 7.684454917907715, "learning_rate": 0.00012401029638486953, "loss": 0.0039, "step": 640 }, { "epoch": 0.35304647160068847, "grad_norm": 1.7815595865249634, "learning_rate": 0.0001238055657853198, "loss": 0.0013, "step": 641 }, { "epoch": 0.3535972461273666, "grad_norm": 0.33550453186035156, "learning_rate": 0.00012360072935655982, "loss": 0.0004, "step": 642 }, { "epoch": 0.35414802065404477, "grad_norm": 4.237879753112793, "learning_rate": 0.00012339578800920332, "loss": 0.009, "step": 643 }, { "epoch": 0.3546987951807229, "grad_norm": 6.094169616699219, "learning_rate": 0.00012319074265433063, "loss": 0.0076, "step": 644 }, { "epoch": 0.355249569707401, "grad_norm": 10.927846908569336, "learning_rate": 0.00012298559420348437, "loss": 0.0065, "step": 645 }, { "epoch": 0.3558003442340792, "grad_norm": 10.031198501586914, "learning_rate": 0.00012278034356866545, "loss": 0.0048, "step": 646 }, { "epoch": 0.3563511187607573, "grad_norm": 0.4392476975917816, "learning_rate": 0.00012257499166232907, "loss": 0.0003, "step": 647 }, { "epoch": 0.35690189328743543, "grad_norm": 0.02974950335919857, "learning_rate": 0.0001223695393973807, "loss": 0.0001, "step": 648 }, { "epoch": 0.3574526678141136, "grad_norm": 2.786081075668335, "learning_rate": 0.0001221639876871719, "loss": 0.0013, "step": 649 }, { "epoch": 0.35800344234079173, "grad_norm": 1.3992780447006226, "learning_rate": 0.0001219583374454963, "loss": 0.0004, "step": 650 }, { "epoch": 0.35855421686746985, "grad_norm": 24.265626907348633, "learning_rate": 0.00012175258958658564, "loss": 0.1209, "step": 651 }, { "epoch": 0.35910499139414803, "grad_norm": 7.308268070220947, "learning_rate": 0.00012154674502510555, "loss": 0.0306, "step": 652 }, { "epoch": 0.35965576592082615, "grad_norm": 1.407171368598938, "learning_rate": 0.00012134080467615159, "loss": 0.0023, "step": 653 }, { "epoch": 0.3602065404475043, "grad_norm": 3.039745807647705, "learning_rate": 0.00012113476945524513, "loss": 0.0015, "step": 654 }, { "epoch": 0.36075731497418245, "grad_norm": 0.01705438829958439, "learning_rate": 0.00012092864027832933, "loss": 0.0, "step": 655 }, { "epoch": 0.3613080895008606, "grad_norm": 6.047449588775635, "learning_rate": 0.000120722418061765, "loss": 0.0117, "step": 656 }, { "epoch": 0.36185886402753875, "grad_norm": 48.94442367553711, "learning_rate": 0.0001205161037223266, "loss": 0.0098, "step": 657 }, { "epoch": 0.3624096385542169, "grad_norm": 3.99149227142334, "learning_rate": 0.00012030969817719808, "loss": 0.0124, "step": 658 }, { "epoch": 0.362960413080895, "grad_norm": 0.8626866340637207, "learning_rate": 0.00012010320234396894, "loss": 0.0007, "step": 659 }, { "epoch": 0.36351118760757317, "grad_norm": 1.8950494527816772, "learning_rate": 0.00011989661714062999, "loss": 0.0012, "step": 660 }, { "epoch": 0.3640619621342513, "grad_norm": 0.027771569788455963, "learning_rate": 0.0001196899434855693, "loss": 0.0, "step": 661 }, { "epoch": 0.3646127366609294, "grad_norm": 0.07818129658699036, "learning_rate": 0.00011948318229756827, "loss": 0.0001, "step": 662 }, { "epoch": 0.3651635111876076, "grad_norm": 0.005358471069484949, "learning_rate": 0.00011927633449579735, "loss": 0.0, "step": 663 }, { "epoch": 0.3657142857142857, "grad_norm": 0.01550602912902832, "learning_rate": 0.0001190694009998121, "loss": 0.0, "step": 664 }, { "epoch": 0.36626506024096384, "grad_norm": 12.277764320373535, "learning_rate": 0.00011886238272954897, "loss": 0.003, "step": 665 }, { "epoch": 0.366815834767642, "grad_norm": 36.647640228271484, "learning_rate": 0.00011865528060532127, "loss": 0.006, "step": 666 }, { "epoch": 0.36736660929432013, "grad_norm": 2.3219637870788574, "learning_rate": 0.0001184480955478152, "loss": 0.0081, "step": 667 }, { "epoch": 0.36791738382099826, "grad_norm": 1.2723623514175415, "learning_rate": 0.00011824082847808558, "loss": 0.0022, "step": 668 }, { "epoch": 0.36846815834767643, "grad_norm": 0.7096296548843384, "learning_rate": 0.00011803348031755179, "loss": 0.0002, "step": 669 }, { "epoch": 0.36901893287435455, "grad_norm": 1.3105905055999756, "learning_rate": 0.0001178260519879937, "loss": 0.0007, "step": 670 }, { "epoch": 0.3695697074010327, "grad_norm": 0.07750259339809418, "learning_rate": 0.00011761854441154767, "loss": 0.0001, "step": 671 }, { "epoch": 0.37012048192771085, "grad_norm": 0.006007653195410967, "learning_rate": 0.00011741095851070228, "loss": 0.0, "step": 672 }, { "epoch": 0.370671256454389, "grad_norm": 5.658960342407227, "learning_rate": 0.00011720329520829429, "loss": 0.0075, "step": 673 }, { "epoch": 0.37122203098106715, "grad_norm": 8.77633285522461, "learning_rate": 0.0001169955554275046, "loss": 0.006, "step": 674 }, { "epoch": 0.3717728055077453, "grad_norm": 0.028751753270626068, "learning_rate": 0.0001167877400918541, "loss": 0.0, "step": 675 }, { "epoch": 0.3723235800344234, "grad_norm": 2.9458200931549072, "learning_rate": 0.00011657985012519952, "loss": 0.0213, "step": 676 }, { "epoch": 0.3728743545611016, "grad_norm": 0.07287055999040604, "learning_rate": 0.00011637188645172944, "loss": 0.0, "step": 677 }, { "epoch": 0.3734251290877797, "grad_norm": 0.0063529410399496555, "learning_rate": 0.00011616384999596006, "loss": 0.0, "step": 678 }, { "epoch": 0.3739759036144578, "grad_norm": 8.3136568069458, "learning_rate": 0.00011595574168273111, "loss": 0.0223, "step": 679 }, { "epoch": 0.374526678141136, "grad_norm": 0.2632562816143036, "learning_rate": 0.0001157475624372018, "loss": 0.0001, "step": 680 }, { "epoch": 0.3750774526678141, "grad_norm": 0.021285418421030045, "learning_rate": 0.0001155393131848467, "loss": 0.0, "step": 681 }, { "epoch": 0.37562822719449224, "grad_norm": 14.85163688659668, "learning_rate": 0.00011533099485145155, "loss": 0.0047, "step": 682 }, { "epoch": 0.3761790017211704, "grad_norm": 28.53243064880371, "learning_rate": 0.00011512260836310924, "loss": 0.0201, "step": 683 }, { "epoch": 0.37672977624784854, "grad_norm": 0.0720270574092865, "learning_rate": 0.00011491415464621562, "loss": 0.0001, "step": 684 }, { "epoch": 0.37728055077452666, "grad_norm": 11.400739669799805, "learning_rate": 0.00011470563462746541, "loss": 0.002, "step": 685 }, { "epoch": 0.37783132530120483, "grad_norm": 13.433859825134277, "learning_rate": 0.00011449704923384812, "loss": 0.0475, "step": 686 }, { "epoch": 0.37838209982788296, "grad_norm": 0.032436806708574295, "learning_rate": 0.00011428839939264382, "loss": 0.0, "step": 687 }, { "epoch": 0.3789328743545611, "grad_norm": 0.35929447412490845, "learning_rate": 0.0001140796860314191, "loss": 0.0001, "step": 688 }, { "epoch": 0.37948364888123925, "grad_norm": 0.1945551633834839, "learning_rate": 0.00011387091007802297, "loss": 0.0002, "step": 689 }, { "epoch": 0.3800344234079174, "grad_norm": 2.29357647895813, "learning_rate": 0.0001136620724605827, "loss": 0.0014, "step": 690 }, { "epoch": 0.3805851979345955, "grad_norm": 10.327256202697754, "learning_rate": 0.00011345317410749964, "loss": 0.0054, "step": 691 }, { "epoch": 0.3811359724612737, "grad_norm": 0.07525800168514252, "learning_rate": 0.00011324421594744516, "loss": 0.0001, "step": 692 }, { "epoch": 0.3816867469879518, "grad_norm": 0.24219144880771637, "learning_rate": 0.00011303519890935656, "loss": 0.0002, "step": 693 }, { "epoch": 0.38223752151463, "grad_norm": 0.5817039608955383, "learning_rate": 0.00011282612392243286, "loss": 0.0002, "step": 694 }, { "epoch": 0.3827882960413081, "grad_norm": 0.02477632835507393, "learning_rate": 0.00011261699191613066, "loss": 0.0, "step": 695 }, { "epoch": 0.3833390705679862, "grad_norm": 8.5928316116333, "learning_rate": 0.00011240780382016005, "loss": 0.0056, "step": 696 }, { "epoch": 0.3838898450946644, "grad_norm": 3.734281301498413, "learning_rate": 0.00011219856056448051, "loss": 0.0036, "step": 697 }, { "epoch": 0.3844406196213425, "grad_norm": 0.012047311291098595, "learning_rate": 0.00011198926307929664, "loss": 0.0, "step": 698 }, { "epoch": 0.38499139414802064, "grad_norm": 5.915680885314941, "learning_rate": 0.00011177991229505431, "loss": 0.0195, "step": 699 }, { "epoch": 0.3855421686746988, "grad_norm": 5.875691890716553, "learning_rate": 0.00011157050914243614, "loss": 0.0023, "step": 700 }, { "epoch": 0.38609294320137694, "grad_norm": 7.3662495613098145, "learning_rate": 0.00011136105455235766, "loss": 0.0387, "step": 701 }, { "epoch": 0.38664371772805506, "grad_norm": 25.111196517944336, "learning_rate": 0.00011115154945596305, "loss": 0.0186, "step": 702 }, { "epoch": 0.38719449225473324, "grad_norm": 0.17061816155910492, "learning_rate": 0.00011094199478462095, "loss": 0.0001, "step": 703 }, { "epoch": 0.38774526678141136, "grad_norm": 0.12905219197273254, "learning_rate": 0.00011073239146992054, "loss": 0.0001, "step": 704 }, { "epoch": 0.3882960413080895, "grad_norm": 0.16951178014278412, "learning_rate": 0.00011052274044366711, "loss": 0.0002, "step": 705 }, { "epoch": 0.38884681583476766, "grad_norm": 0.01369217224419117, "learning_rate": 0.00011031304263787812, "loss": 0.0, "step": 706 }, { "epoch": 0.3893975903614458, "grad_norm": 25.56741714477539, "learning_rate": 0.00011010329898477891, "loss": 0.0062, "step": 707 }, { "epoch": 0.3899483648881239, "grad_norm": 0.25825047492980957, "learning_rate": 0.0001098935104167988, "loss": 0.0001, "step": 708 }, { "epoch": 0.3904991394148021, "grad_norm": 2.790830135345459, "learning_rate": 0.00010968367786656663, "loss": 0.0008, "step": 709 }, { "epoch": 0.3910499139414802, "grad_norm": 0.02756306901574135, "learning_rate": 0.00010947380226690684, "loss": 0.0, "step": 710 }, { "epoch": 0.3916006884681583, "grad_norm": 0.11872462928295135, "learning_rate": 0.00010926388455083522, "loss": 0.0001, "step": 711 }, { "epoch": 0.3921514629948365, "grad_norm": 0.12274815887212753, "learning_rate": 0.00010905392565155477, "loss": 0.0, "step": 712 }, { "epoch": 0.3927022375215146, "grad_norm": 29.329174041748047, "learning_rate": 0.00010884392650245165, "loss": 0.0127, "step": 713 }, { "epoch": 0.3932530120481928, "grad_norm": 7.097370624542236, "learning_rate": 0.00010863388803709089, "loss": 0.0007, "step": 714 }, { "epoch": 0.3938037865748709, "grad_norm": 0.07541192322969437, "learning_rate": 0.00010842381118921232, "loss": 0.0001, "step": 715 }, { "epoch": 0.39435456110154904, "grad_norm": 0.006260357331484556, "learning_rate": 0.00010821369689272638, "loss": 0.0, "step": 716 }, { "epoch": 0.3949053356282272, "grad_norm": 0.006641521118581295, "learning_rate": 0.00010800354608171003, "loss": 0.0, "step": 717 }, { "epoch": 0.39545611015490534, "grad_norm": 0.01294923759996891, "learning_rate": 0.00010779335969040252, "loss": 0.0, "step": 718 }, { "epoch": 0.39600688468158346, "grad_norm": 0.004326496738940477, "learning_rate": 0.00010758313865320134, "loss": 0.0, "step": 719 }, { "epoch": 0.39655765920826164, "grad_norm": 20.02821922302246, "learning_rate": 0.00010737288390465792, "loss": 0.0325, "step": 720 }, { "epoch": 0.39710843373493976, "grad_norm": 0.019133757799863815, "learning_rate": 0.00010716259637947357, "loss": 0.0, "step": 721 }, { "epoch": 0.3976592082616179, "grad_norm": 0.00470879627391696, "learning_rate": 0.00010695227701249537, "loss": 0.0, "step": 722 }, { "epoch": 0.39820998278829606, "grad_norm": 2.680922508239746, "learning_rate": 0.00010674192673871191, "loss": 0.0008, "step": 723 }, { "epoch": 0.3987607573149742, "grad_norm": 0.11107369512319565, "learning_rate": 0.00010653154649324917, "loss": 0.0001, "step": 724 }, { "epoch": 0.3993115318416523, "grad_norm": 1.2510350942611694, "learning_rate": 0.00010632113721136636, "loss": 0.0007, "step": 725 }, { "epoch": 0.3998623063683305, "grad_norm": 5.508364200592041, "learning_rate": 0.00010611069982845183, "loss": 0.0143, "step": 726 }, { "epoch": 0.4004130808950086, "grad_norm": 0.026287969201803207, "learning_rate": 0.00010590023528001884, "loss": 0.0, "step": 727 }, { "epoch": 0.4009638554216867, "grad_norm": 0.003707467345520854, "learning_rate": 0.00010568974450170139, "loss": 0.0, "step": 728 }, { "epoch": 0.4015146299483649, "grad_norm": 0.014163332991302013, "learning_rate": 0.00010547922842925008, "loss": 0.0, "step": 729 }, { "epoch": 0.402065404475043, "grad_norm": 0.20458070933818817, "learning_rate": 0.00010526868799852796, "loss": 0.0001, "step": 730 }, { "epoch": 0.4026161790017212, "grad_norm": 9.005853652954102, "learning_rate": 0.0001050581241455064, "loss": 0.0193, "step": 731 }, { "epoch": 0.4031669535283993, "grad_norm": 4.9345502853393555, "learning_rate": 0.00010484753780626089, "loss": 0.002, "step": 732 }, { "epoch": 0.40371772805507744, "grad_norm": 3.4388484954833984, "learning_rate": 0.00010463692991696685, "loss": 0.001, "step": 733 }, { "epoch": 0.4042685025817556, "grad_norm": 3.456713914871216, "learning_rate": 0.00010442630141389549, "loss": 0.0092, "step": 734 }, { "epoch": 0.40481927710843374, "grad_norm": 3.68304705619812, "learning_rate": 0.00010421565323340971, "loss": 0.0007, "step": 735 }, { "epoch": 0.40537005163511186, "grad_norm": 0.07865356653928757, "learning_rate": 0.00010400498631195992, "loss": 0.0, "step": 736 }, { "epoch": 0.40592082616179004, "grad_norm": 16.42683219909668, "learning_rate": 0.00010379430158607975, "loss": 0.0124, "step": 737 }, { "epoch": 0.40647160068846816, "grad_norm": 0.24729914963245392, "learning_rate": 0.000103583599992382, "loss": 0.0001, "step": 738 }, { "epoch": 0.4070223752151463, "grad_norm": 0.7038357257843018, "learning_rate": 0.0001033728824675545, "loss": 0.0003, "step": 739 }, { "epoch": 0.40757314974182446, "grad_norm": 7.950600624084473, "learning_rate": 0.0001031621499483559, "loss": 0.0028, "step": 740 }, { "epoch": 0.4081239242685026, "grad_norm": 5.538748264312744, "learning_rate": 0.00010295140337161146, "loss": 0.0038, "step": 741 }, { "epoch": 0.4086746987951807, "grad_norm": 0.1777314990758896, "learning_rate": 0.00010274064367420897, "loss": 0.0001, "step": 742 }, { "epoch": 0.4092254733218589, "grad_norm": 3.247906446456909, "learning_rate": 0.00010252987179309459, "loss": 0.0007, "step": 743 }, { "epoch": 0.409776247848537, "grad_norm": 2.847930908203125, "learning_rate": 0.00010231908866526851, "loss": 0.0062, "step": 744 }, { "epoch": 0.4103270223752151, "grad_norm": 2.769158363342285, "learning_rate": 0.00010210829522778111, "loss": 0.0016, "step": 745 }, { "epoch": 0.4108777969018933, "grad_norm": 0.0015942445024847984, "learning_rate": 0.00010189749241772844, "loss": 0.0, "step": 746 }, { "epoch": 0.4114285714285714, "grad_norm": 0.001644248841330409, "learning_rate": 0.00010168668117224825, "loss": 0.0, "step": 747 }, { "epoch": 0.41197934595524954, "grad_norm": 0.004610294476151466, "learning_rate": 0.00010147586242851585, "loss": 0.0, "step": 748 }, { "epoch": 0.4125301204819277, "grad_norm": 0.17768514156341553, "learning_rate": 0.00010126503712373982, "loss": 0.0001, "step": 749 }, { "epoch": 0.41308089500860584, "grad_norm": 1.0455394983291626, "learning_rate": 0.00010105420619515798, "loss": 0.0003, "step": 750 }, { "epoch": 0.41308089500860584, "eval_loss": 0.010985496453940868, "eval_runtime": 232.9212, "eval_samples_per_second": 13.129, "eval_steps_per_second": 6.564, "step": 750 }, { "epoch": 0.413631669535284, "grad_norm": 21.729881286621094, "learning_rate": 0.00010084337058003303, "loss": 0.0526, "step": 751 }, { "epoch": 0.41418244406196214, "grad_norm": 4.894376277923584, "learning_rate": 0.00010063253121564868, "loss": 0.0084, "step": 752 }, { "epoch": 0.41473321858864026, "grad_norm": 0.08202194422483444, "learning_rate": 0.00010042168903930514, "loss": 0.0, "step": 753 }, { "epoch": 0.41528399311531844, "grad_norm": 0.005352225620299578, "learning_rate": 0.00010021084498831522, "loss": 0.0, "step": 754 }, { "epoch": 0.41583476764199656, "grad_norm": 1.0854874849319458, "learning_rate": 0.0001, "loss": 0.0003, "step": 755 }, { "epoch": 0.4163855421686747, "grad_norm": 7.41090726852417, "learning_rate": 9.97891550116848e-05, "loss": 0.0084, "step": 756 }, { "epoch": 0.41693631669535286, "grad_norm": 0.566426694393158, "learning_rate": 9.957831096069488e-05, "loss": 0.0001, "step": 757 }, { "epoch": 0.417487091222031, "grad_norm": 0.024814387783408165, "learning_rate": 9.936746878435136e-05, "loss": 0.0, "step": 758 }, { "epoch": 0.4180378657487091, "grad_norm": 1.1818994283676147, "learning_rate": 9.915662941996699e-05, "loss": 0.0011, "step": 759 }, { "epoch": 0.4185886402753873, "grad_norm": 0.02017657645046711, "learning_rate": 9.894579380484204e-05, "loss": 0.0, "step": 760 }, { "epoch": 0.4191394148020654, "grad_norm": 0.0026490064337849617, "learning_rate": 9.873496287626019e-05, "loss": 0.0, "step": 761 }, { "epoch": 0.4196901893287435, "grad_norm": 7.569090843200684, "learning_rate": 9.852413757148417e-05, "loss": 0.0192, "step": 762 }, { "epoch": 0.4202409638554217, "grad_norm": 0.0010120115475729108, "learning_rate": 9.831331882775178e-05, "loss": 0.0, "step": 763 }, { "epoch": 0.4207917383820998, "grad_norm": 0.0010434604482725263, "learning_rate": 9.81025075822716e-05, "loss": 0.0, "step": 764 }, { "epoch": 0.42134251290877794, "grad_norm": 0.0014836654299870133, "learning_rate": 9.789170477221891e-05, "loss": 0.0, "step": 765 }, { "epoch": 0.4218932874354561, "grad_norm": 0.0024362194817513227, "learning_rate": 9.76809113347315e-05, "loss": 0.0, "step": 766 }, { "epoch": 0.42244406196213424, "grad_norm": 0.003682615701109171, "learning_rate": 9.747012820690543e-05, "loss": 0.0, "step": 767 }, { "epoch": 0.4229948364888124, "grad_norm": 0.5902315378189087, "learning_rate": 9.725935632579104e-05, "loss": 0.0002, "step": 768 }, { "epoch": 0.42354561101549054, "grad_norm": 0.014872108586132526, "learning_rate": 9.704859662838855e-05, "loss": 0.0, "step": 769 }, { "epoch": 0.42409638554216866, "grad_norm": 0.20517182350158691, "learning_rate": 9.683785005164411e-05, "loss": 0.0001, "step": 770 }, { "epoch": 0.42464716006884684, "grad_norm": 17.052011489868164, "learning_rate": 9.662711753244551e-05, "loss": 0.0482, "step": 771 }, { "epoch": 0.42519793459552496, "grad_norm": 0.002969280816614628, "learning_rate": 9.641640000761802e-05, "loss": 0.0, "step": 772 }, { "epoch": 0.4257487091222031, "grad_norm": 16.601808547973633, "learning_rate": 9.620569841392029e-05, "loss": 0.0149, "step": 773 }, { "epoch": 0.42629948364888126, "grad_norm": 3.007627248764038, "learning_rate": 9.59950136880401e-05, "loss": 0.0008, "step": 774 }, { "epoch": 0.4268502581755594, "grad_norm": 58.51079177856445, "learning_rate": 9.57843467665903e-05, "loss": 0.0306, "step": 775 }, { "epoch": 0.4274010327022375, "grad_norm": 0.3650367856025696, "learning_rate": 9.557369858610453e-05, "loss": 0.0002, "step": 776 }, { "epoch": 0.4279518072289157, "grad_norm": 0.04099801555275917, "learning_rate": 9.53630700830332e-05, "loss": 0.0, "step": 777 }, { "epoch": 0.4285025817555938, "grad_norm": 7.434444904327393, "learning_rate": 9.51524621937391e-05, "loss": 0.0026, "step": 778 }, { "epoch": 0.4290533562822719, "grad_norm": 0.0014230706728994846, "learning_rate": 9.494187585449358e-05, "loss": 0.0, "step": 779 }, { "epoch": 0.4296041308089501, "grad_norm": 0.010296106338500977, "learning_rate": 9.473131200147205e-05, "loss": 0.0, "step": 780 }, { "epoch": 0.4301549053356282, "grad_norm": 0.10973358154296875, "learning_rate": 9.452077157074994e-05, "loss": 0.0001, "step": 781 }, { "epoch": 0.43070567986230635, "grad_norm": 1.2571378946304321, "learning_rate": 9.431025549829862e-05, "loss": 0.0005, "step": 782 }, { "epoch": 0.4312564543889845, "grad_norm": 0.00297401356510818, "learning_rate": 9.409976471998118e-05, "loss": 0.0, "step": 783 }, { "epoch": 0.43180722891566264, "grad_norm": 14.645354270935059, "learning_rate": 9.388930017154819e-05, "loss": 0.0146, "step": 784 }, { "epoch": 0.43235800344234077, "grad_norm": 12.415586471557617, "learning_rate": 9.367886278863366e-05, "loss": 0.0056, "step": 785 }, { "epoch": 0.43290877796901894, "grad_norm": 0.0028187285643070936, "learning_rate": 9.346845350675088e-05, "loss": 0.0, "step": 786 }, { "epoch": 0.43345955249569706, "grad_norm": 12.749964714050293, "learning_rate": 9.325807326128814e-05, "loss": 0.0053, "step": 787 }, { "epoch": 0.43401032702237524, "grad_norm": 0.022418567910790443, "learning_rate": 9.304772298750463e-05, "loss": 0.0, "step": 788 }, { "epoch": 0.43456110154905336, "grad_norm": 4.886881351470947, "learning_rate": 9.283740362052642e-05, "loss": 0.039, "step": 789 }, { "epoch": 0.4351118760757315, "grad_norm": 0.5270808935165405, "learning_rate": 9.26271160953421e-05, "loss": 0.0003, "step": 790 }, { "epoch": 0.43566265060240966, "grad_norm": 0.29282739758491516, "learning_rate": 9.241686134679867e-05, "loss": 0.0002, "step": 791 }, { "epoch": 0.4362134251290878, "grad_norm": 5.848938465118408, "learning_rate": 9.220664030959749e-05, "loss": 0.0164, "step": 792 }, { "epoch": 0.4367641996557659, "grad_norm": 149.54795837402344, "learning_rate": 9.199645391828999e-05, "loss": 0.0029, "step": 793 }, { "epoch": 0.4373149741824441, "grad_norm": 1.0582444667816162, "learning_rate": 9.178630310727365e-05, "loss": 0.0004, "step": 794 }, { "epoch": 0.4378657487091222, "grad_norm": 20.871862411499023, "learning_rate": 9.157618881078772e-05, "loss": 0.025, "step": 795 }, { "epoch": 0.4384165232358003, "grad_norm": 0.01747993379831314, "learning_rate": 9.136611196290915e-05, "loss": 0.0, "step": 796 }, { "epoch": 0.4389672977624785, "grad_norm": 0.03277825936675072, "learning_rate": 9.115607349754834e-05, "loss": 0.0, "step": 797 }, { "epoch": 0.4395180722891566, "grad_norm": 0.04785889759659767, "learning_rate": 9.094607434844523e-05, "loss": 0.0, "step": 798 }, { "epoch": 0.44006884681583475, "grad_norm": 0.08755006641149521, "learning_rate": 9.07361154491648e-05, "loss": 0.0001, "step": 799 }, { "epoch": 0.4406196213425129, "grad_norm": 16.84794044494629, "learning_rate": 9.052619773309317e-05, "loss": 0.0202, "step": 800 }, { "epoch": 0.44117039586919105, "grad_norm": 13.666803359985352, "learning_rate": 9.031632213343339e-05, "loss": 0.0514, "step": 801 }, { "epoch": 0.44172117039586917, "grad_norm": 7.759776592254639, "learning_rate": 9.01064895832012e-05, "loss": 0.0114, "step": 802 }, { "epoch": 0.44227194492254734, "grad_norm": 1.7166695594787598, "learning_rate": 8.98967010152211e-05, "loss": 0.0012, "step": 803 }, { "epoch": 0.44282271944922547, "grad_norm": 13.207154273986816, "learning_rate": 8.968695736212193e-05, "loss": 0.0277, "step": 804 }, { "epoch": 0.4433734939759036, "grad_norm": 1.7127803564071655, "learning_rate": 8.947725955633294e-05, "loss": 0.0013, "step": 805 }, { "epoch": 0.44392426850258176, "grad_norm": 8.172250747680664, "learning_rate": 8.926760853007946e-05, "loss": 0.0043, "step": 806 }, { "epoch": 0.4444750430292599, "grad_norm": 0.008811322040855885, "learning_rate": 8.905800521537905e-05, "loss": 0.0, "step": 807 }, { "epoch": 0.44502581755593806, "grad_norm": 0.03438156843185425, "learning_rate": 8.884845054403699e-05, "loss": 0.0, "step": 808 }, { "epoch": 0.4455765920826162, "grad_norm": 0.003290075110271573, "learning_rate": 8.863894544764236e-05, "loss": 0.0, "step": 809 }, { "epoch": 0.4461273666092943, "grad_norm": 3.513845920562744, "learning_rate": 8.84294908575639e-05, "loss": 0.0017, "step": 810 }, { "epoch": 0.4466781411359725, "grad_norm": 0.15585871040821075, "learning_rate": 8.822008770494572e-05, "loss": 0.0001, "step": 811 }, { "epoch": 0.4472289156626506, "grad_norm": 0.006181429140269756, "learning_rate": 8.801073692070337e-05, "loss": 0.0, "step": 812 }, { "epoch": 0.4477796901893287, "grad_norm": 0.0036190818063914776, "learning_rate": 8.780143943551954e-05, "loss": 0.0, "step": 813 }, { "epoch": 0.4483304647160069, "grad_norm": 0.6933053731918335, "learning_rate": 8.759219617983999e-05, "loss": 0.0007, "step": 814 }, { "epoch": 0.448881239242685, "grad_norm": 37.291080474853516, "learning_rate": 8.738300808386935e-05, "loss": 0.0248, "step": 815 }, { "epoch": 0.44943201376936315, "grad_norm": 18.005563735961914, "learning_rate": 8.717387607756713e-05, "loss": 0.0123, "step": 816 }, { "epoch": 0.4499827882960413, "grad_norm": 0.004367250949144363, "learning_rate": 8.696480109064342e-05, "loss": 0.0, "step": 817 }, { "epoch": 0.45053356282271945, "grad_norm": 0.12219999730587006, "learning_rate": 8.675578405255485e-05, "loss": 0.0001, "step": 818 }, { "epoch": 0.45108433734939757, "grad_norm": 1.7600035667419434, "learning_rate": 8.654682589250038e-05, "loss": 0.0042, "step": 819 }, { "epoch": 0.45163511187607575, "grad_norm": 0.3002900183200836, "learning_rate": 8.633792753941733e-05, "loss": 0.0002, "step": 820 }, { "epoch": 0.45218588640275387, "grad_norm": 0.3823472261428833, "learning_rate": 8.612908992197705e-05, "loss": 0.0002, "step": 821 }, { "epoch": 0.452736660929432, "grad_norm": 15.623300552368164, "learning_rate": 8.592031396858093e-05, "loss": 0.0117, "step": 822 }, { "epoch": 0.45328743545611017, "grad_norm": 23.941953659057617, "learning_rate": 8.571160060735624e-05, "loss": 0.0679, "step": 823 }, { "epoch": 0.4538382099827883, "grad_norm": 0.04101578891277313, "learning_rate": 8.550295076615188e-05, "loss": 0.0, "step": 824 }, { "epoch": 0.45438898450946646, "grad_norm": 0.010554364882409573, "learning_rate": 8.529436537253458e-05, "loss": 0.0, "step": 825 }, { "epoch": 0.4549397590361446, "grad_norm": 1.0968074798583984, "learning_rate": 8.508584535378439e-05, "loss": 0.0009, "step": 826 }, { "epoch": 0.4554905335628227, "grad_norm": 101.60618591308594, "learning_rate": 8.487739163689079e-05, "loss": 0.0217, "step": 827 }, { "epoch": 0.4560413080895009, "grad_norm": 0.19474026560783386, "learning_rate": 8.466900514854847e-05, "loss": 0.0001, "step": 828 }, { "epoch": 0.456592082616179, "grad_norm": 1.0491626262664795, "learning_rate": 8.446068681515334e-05, "loss": 0.0004, "step": 829 }, { "epoch": 0.45714285714285713, "grad_norm": 0.41485464572906494, "learning_rate": 8.425243756279824e-05, "loss": 0.0007, "step": 830 }, { "epoch": 0.4576936316695353, "grad_norm": 0.05498143285512924, "learning_rate": 8.404425831726894e-05, "loss": 0.0, "step": 831 }, { "epoch": 0.4582444061962134, "grad_norm": 2.534912586212158, "learning_rate": 8.383615000404e-05, "loss": 0.0082, "step": 832 }, { "epoch": 0.45879518072289155, "grad_norm": 0.035978011786937714, "learning_rate": 8.362811354827059e-05, "loss": 0.0, "step": 833 }, { "epoch": 0.4593459552495697, "grad_norm": 0.008016057312488556, "learning_rate": 8.342014987480047e-05, "loss": 0.0, "step": 834 }, { "epoch": 0.45989672977624785, "grad_norm": 17.5090274810791, "learning_rate": 8.321225990814591e-05, "loss": 0.0182, "step": 835 }, { "epoch": 0.46044750430292597, "grad_norm": 0.01992870680987835, "learning_rate": 8.300444457249543e-05, "loss": 0.0, "step": 836 }, { "epoch": 0.46099827882960415, "grad_norm": 1.8790115118026733, "learning_rate": 8.279670479170573e-05, "loss": 0.0009, "step": 837 }, { "epoch": 0.46154905335628227, "grad_norm": 0.005685754586011171, "learning_rate": 8.258904148929775e-05, "loss": 0.0, "step": 838 }, { "epoch": 0.4620998278829604, "grad_norm": 5.560945987701416, "learning_rate": 8.238145558845235e-05, "loss": 0.0022, "step": 839 }, { "epoch": 0.46265060240963857, "grad_norm": 0.06590569019317627, "learning_rate": 8.217394801200631e-05, "loss": 0.0, "step": 840 }, { "epoch": 0.4632013769363167, "grad_norm": 7.009140491485596, "learning_rate": 8.196651968244826e-05, "loss": 0.0017, "step": 841 }, { "epoch": 0.4637521514629948, "grad_norm": 0.25225764513015747, "learning_rate": 8.175917152191447e-05, "loss": 0.0003, "step": 842 }, { "epoch": 0.464302925989673, "grad_norm": 15.349773406982422, "learning_rate": 8.15519044521848e-05, "loss": 0.0262, "step": 843 }, { "epoch": 0.4648537005163511, "grad_norm": 0.019111350178718567, "learning_rate": 8.134471939467874e-05, "loss": 0.0, "step": 844 }, { "epoch": 0.4654044750430293, "grad_norm": 14.229382514953613, "learning_rate": 8.113761727045105e-05, "loss": 0.0388, "step": 845 }, { "epoch": 0.4659552495697074, "grad_norm": 0.017653990536928177, "learning_rate": 8.093059900018792e-05, "loss": 0.0, "step": 846 }, { "epoch": 0.46650602409638553, "grad_norm": 0.0023088185116648674, "learning_rate": 8.072366550420266e-05, "loss": 0.0, "step": 847 }, { "epoch": 0.4670567986230637, "grad_norm": 11.615833282470703, "learning_rate": 8.051681770243175e-05, "loss": 0.0323, "step": 848 }, { "epoch": 0.46760757314974183, "grad_norm": 0.062436215579509735, "learning_rate": 8.031005651443073e-05, "loss": 0.0, "step": 849 }, { "epoch": 0.46815834767641995, "grad_norm": 20.127883911132812, "learning_rate": 8.010338285937006e-05, "loss": 0.0285, "step": 850 }, { "epoch": 0.4687091222030981, "grad_norm": 10.490528106689453, "learning_rate": 7.989679765603108e-05, "loss": 0.0302, "step": 851 }, { "epoch": 0.46925989672977625, "grad_norm": 4.987813949584961, "learning_rate": 7.969030182280192e-05, "loss": 0.012, "step": 852 }, { "epoch": 0.46981067125645437, "grad_norm": 1.9365439414978027, "learning_rate": 7.948389627767343e-05, "loss": 0.0023, "step": 853 }, { "epoch": 0.47036144578313255, "grad_norm": 0.011486521922051907, "learning_rate": 7.927758193823501e-05, "loss": 0.0, "step": 854 }, { "epoch": 0.47091222030981067, "grad_norm": 6.997819423675537, "learning_rate": 7.907135972167069e-05, "loss": 0.0082, "step": 855 }, { "epoch": 0.4714629948364888, "grad_norm": 11.875788688659668, "learning_rate": 7.88652305447549e-05, "loss": 0.0214, "step": 856 }, { "epoch": 0.47201376936316697, "grad_norm": 0.9945202469825745, "learning_rate": 7.865919532384844e-05, "loss": 0.0003, "step": 857 }, { "epoch": 0.4725645438898451, "grad_norm": 0.0036436058580875397, "learning_rate": 7.845325497489449e-05, "loss": 0.0, "step": 858 }, { "epoch": 0.4731153184165232, "grad_norm": 0.04759114235639572, "learning_rate": 7.82474104134144e-05, "loss": 0.0, "step": 859 }, { "epoch": 0.4736660929432014, "grad_norm": 0.6421709656715393, "learning_rate": 7.804166255450373e-05, "loss": 0.0002, "step": 860 }, { "epoch": 0.4742168674698795, "grad_norm": 0.006554386578500271, "learning_rate": 7.783601231282812e-05, "loss": 0.0, "step": 861 }, { "epoch": 0.47476764199655763, "grad_norm": 0.009047183208167553, "learning_rate": 7.763046060261932e-05, "loss": 0.0, "step": 862 }, { "epoch": 0.4753184165232358, "grad_norm": 0.11660318076610565, "learning_rate": 7.742500833767094e-05, "loss": 0.0001, "step": 863 }, { "epoch": 0.47586919104991393, "grad_norm": 0.0022875091526657343, "learning_rate": 7.721965643133458e-05, "loss": 0.0, "step": 864 }, { "epoch": 0.4764199655765921, "grad_norm": 0.018028393387794495, "learning_rate": 7.701440579651566e-05, "loss": 0.0, "step": 865 }, { "epoch": 0.47697074010327023, "grad_norm": 0.0035408425610512495, "learning_rate": 7.680925734566937e-05, "loss": 0.0, "step": 866 }, { "epoch": 0.47752151462994835, "grad_norm": 0.04633285105228424, "learning_rate": 7.660421199079669e-05, "loss": 0.0, "step": 867 }, { "epoch": 0.47807228915662653, "grad_norm": 6.6373419761657715, "learning_rate": 7.639927064344022e-05, "loss": 0.0135, "step": 868 }, { "epoch": 0.47862306368330465, "grad_norm": 0.04508458450436592, "learning_rate": 7.619443421468021e-05, "loss": 0.0001, "step": 869 }, { "epoch": 0.47917383820998277, "grad_norm": 3.6620137691497803, "learning_rate": 7.598970361513051e-05, "loss": 0.0009, "step": 870 }, { "epoch": 0.47972461273666095, "grad_norm": 0.018993297591805458, "learning_rate": 7.578507975493448e-05, "loss": 0.0, "step": 871 }, { "epoch": 0.48027538726333907, "grad_norm": 0.058901507407426834, "learning_rate": 7.558056354376098e-05, "loss": 0.0, "step": 872 }, { "epoch": 0.4808261617900172, "grad_norm": 7.850873947143555, "learning_rate": 7.537615589080027e-05, "loss": 0.006, "step": 873 }, { "epoch": 0.48137693631669537, "grad_norm": 0.1622982621192932, "learning_rate": 7.517185770476006e-05, "loss": 0.0001, "step": 874 }, { "epoch": 0.4819277108433735, "grad_norm": 8.512648582458496, "learning_rate": 7.496766989386136e-05, "loss": 0.0031, "step": 875 }, { "epoch": 0.4824784853700516, "grad_norm": 0.6003686785697937, "learning_rate": 7.476359336583454e-05, "loss": 0.0004, "step": 876 }, { "epoch": 0.4830292598967298, "grad_norm": 1.6147948503494263, "learning_rate": 7.455962902791522e-05, "loss": 0.0062, "step": 877 }, { "epoch": 0.4835800344234079, "grad_norm": 10.153653144836426, "learning_rate": 7.435577778684033e-05, "loss": 0.0125, "step": 878 }, { "epoch": 0.48413080895008603, "grad_norm": 1.5219645500183105, "learning_rate": 7.415204054884399e-05, "loss": 0.0005, "step": 879 }, { "epoch": 0.4846815834767642, "grad_norm": 0.15091396868228912, "learning_rate": 7.394841821965345e-05, "loss": 0.0001, "step": 880 }, { "epoch": 0.48523235800344233, "grad_norm": 0.005259205121546984, "learning_rate": 7.374491170448525e-05, "loss": 0.0, "step": 881 }, { "epoch": 0.4857831325301205, "grad_norm": 11.563203811645508, "learning_rate": 7.3541521908041e-05, "loss": 0.0411, "step": 882 }, { "epoch": 0.48633390705679863, "grad_norm": 0.25063633918762207, "learning_rate": 7.33382497345034e-05, "loss": 0.0002, "step": 883 }, { "epoch": 0.48688468158347675, "grad_norm": 6.4719929695129395, "learning_rate": 7.313509608753231e-05, "loss": 0.0142, "step": 884 }, { "epoch": 0.48743545611015493, "grad_norm": 2.5145390033721924, "learning_rate": 7.293206187026061e-05, "loss": 0.0015, "step": 885 }, { "epoch": 0.48798623063683305, "grad_norm": 6.3872294425964355, "learning_rate": 7.27291479852903e-05, "loss": 0.0088, "step": 886 }, { "epoch": 0.4885370051635112, "grad_norm": 0.060577619820833206, "learning_rate": 7.252635533468843e-05, "loss": 0.0, "step": 887 }, { "epoch": 0.48908777969018935, "grad_norm": 3.8343629837036133, "learning_rate": 7.232368481998309e-05, "loss": 0.0008, "step": 888 }, { "epoch": 0.48963855421686747, "grad_norm": 0.5144193768501282, "learning_rate": 7.212113734215932e-05, "loss": 0.0005, "step": 889 }, { "epoch": 0.4901893287435456, "grad_norm": 0.9012476205825806, "learning_rate": 7.191871380165538e-05, "loss": 0.0003, "step": 890 }, { "epoch": 0.49074010327022377, "grad_norm": 6.458563804626465, "learning_rate": 7.17164150983584e-05, "loss": 0.0074, "step": 891 }, { "epoch": 0.4912908777969019, "grad_norm": 0.10740283131599426, "learning_rate": 7.151424213160061e-05, "loss": 0.0001, "step": 892 }, { "epoch": 0.49184165232358, "grad_norm": 7.138601303100586, "learning_rate": 7.131219580015521e-05, "loss": 0.0037, "step": 893 }, { "epoch": 0.4923924268502582, "grad_norm": 0.006446048151701689, "learning_rate": 7.11102770022325e-05, "loss": 0.0, "step": 894 }, { "epoch": 0.4929432013769363, "grad_norm": 0.0800614282488823, "learning_rate": 7.090848663547574e-05, "loss": 0.0, "step": 895 }, { "epoch": 0.49349397590361443, "grad_norm": 0.4230630695819855, "learning_rate": 7.070682559695736e-05, "loss": 0.0003, "step": 896 }, { "epoch": 0.4940447504302926, "grad_norm": 0.02704252116382122, "learning_rate": 7.050529478317476e-05, "loss": 0.0, "step": 897 }, { "epoch": 0.49459552495697073, "grad_norm": 2.851123094558716, "learning_rate": 7.03038950900464e-05, "loss": 0.0023, "step": 898 }, { "epoch": 0.49514629948364886, "grad_norm": 1.6615166664123535, "learning_rate": 7.010262741290798e-05, "loss": 0.0019, "step": 899 }, { "epoch": 0.49569707401032703, "grad_norm": 3.7547733783721924, "learning_rate": 6.990149264650814e-05, "loss": 0.002, "step": 900 }, { "epoch": 0.49624784853700515, "grad_norm": 15.791815757751465, "learning_rate": 6.970049168500474e-05, "loss": 0.0299, "step": 901 }, { "epoch": 0.49679862306368333, "grad_norm": 6.185478210449219, "learning_rate": 6.94996254219608e-05, "loss": 0.0086, "step": 902 }, { "epoch": 0.49734939759036145, "grad_norm": 0.9614514112472534, "learning_rate": 6.929889475034048e-05, "loss": 0.0004, "step": 903 }, { "epoch": 0.4979001721170396, "grad_norm": 3.863112449645996, "learning_rate": 6.909830056250527e-05, "loss": 0.0099, "step": 904 }, { "epoch": 0.49845094664371775, "grad_norm": 5.418091297149658, "learning_rate": 6.889784375020978e-05, "loss": 0.0044, "step": 905 }, { "epoch": 0.4990017211703959, "grad_norm": 0.003445209702476859, "learning_rate": 6.869752520459803e-05, "loss": 0.0, "step": 906 }, { "epoch": 0.499552495697074, "grad_norm": 0.16882912814617157, "learning_rate": 6.849734581619918e-05, "loss": 0.0002, "step": 907 }, { "epoch": 0.5001032702237521, "grad_norm": 0.027079444378614426, "learning_rate": 6.829730647492404e-05, "loss": 0.0, "step": 908 }, { "epoch": 0.5006540447504303, "grad_norm": 4.950252056121826, "learning_rate": 6.80974080700606e-05, "loss": 0.0008, "step": 909 }, { "epoch": 0.5012048192771085, "grad_norm": 0.02952880598604679, "learning_rate": 6.789765149027039e-05, "loss": 0.0, "step": 910 }, { "epoch": 0.5017555938037865, "grad_norm": 0.002348650014027953, "learning_rate": 6.769803762358443e-05, "loss": 0.0, "step": 911 }, { "epoch": 0.5023063683304647, "grad_norm": 0.007956968620419502, "learning_rate": 6.749856735739928e-05, "loss": 0.0, "step": 912 }, { "epoch": 0.5028571428571429, "grad_norm": 0.07710185647010803, "learning_rate": 6.729924157847323e-05, "loss": 0.0001, "step": 913 }, { "epoch": 0.503407917383821, "grad_norm": 0.0024748938158154488, "learning_rate": 6.710006117292209e-05, "loss": 0.0, "step": 914 }, { "epoch": 0.5039586919104991, "grad_norm": 0.013895099982619286, "learning_rate": 6.690102702621548e-05, "loss": 0.0, "step": 915 }, { "epoch": 0.5045094664371773, "grad_norm": 8.888647079467773, "learning_rate": 6.670214002317278e-05, "loss": 0.0132, "step": 916 }, { "epoch": 0.5050602409638554, "grad_norm": 0.1327354609966278, "learning_rate": 6.650340104795932e-05, "loss": 0.0001, "step": 917 }, { "epoch": 0.5056110154905336, "grad_norm": 0.4979362189769745, "learning_rate": 6.630481098408228e-05, "loss": 0.0003, "step": 918 }, { "epoch": 0.5061617900172117, "grad_norm": 0.13994409143924713, "learning_rate": 6.610637071438686e-05, "loss": 0.0001, "step": 919 }, { "epoch": 0.5067125645438898, "grad_norm": 4.102688312530518, "learning_rate": 6.590808112105232e-05, "loss": 0.0053, "step": 920 }, { "epoch": 0.507263339070568, "grad_norm": 0.13280542194843292, "learning_rate": 6.570994308558812e-05, "loss": 0.0001, "step": 921 }, { "epoch": 0.5078141135972462, "grad_norm": 4.462156295776367, "learning_rate": 6.551195748882997e-05, "loss": 0.0101, "step": 922 }, { "epoch": 0.5083648881239242, "grad_norm": 4.197547912597656, "learning_rate": 6.531412521093586e-05, "loss": 0.0014, "step": 923 }, { "epoch": 0.5089156626506024, "grad_norm": 12.819262504577637, "learning_rate": 6.51164471313822e-05, "loss": 0.0273, "step": 924 }, { "epoch": 0.5094664371772806, "grad_norm": 8.425509452819824, "learning_rate": 6.491892412895995e-05, "loss": 0.0211, "step": 925 }, { "epoch": 0.5100172117039586, "grad_norm": 0.007090330123901367, "learning_rate": 6.472155708177052e-05, "loss": 0.0, "step": 926 }, { "epoch": 0.5105679862306368, "grad_norm": 0.013635394163429737, "learning_rate": 6.452434686722224e-05, "loss": 0.0, "step": 927 }, { "epoch": 0.511118760757315, "grad_norm": 0.0706426352262497, "learning_rate": 6.432729436202604e-05, "loss": 0.0, "step": 928 }, { "epoch": 0.5116695352839931, "grad_norm": 1.1313623189926147, "learning_rate": 6.41304004421918e-05, "loss": 0.0025, "step": 929 }, { "epoch": 0.5122203098106712, "grad_norm": 10.614190101623535, "learning_rate": 6.393366598302446e-05, "loss": 0.0054, "step": 930 }, { "epoch": 0.5127710843373494, "grad_norm": 0.8380641937255859, "learning_rate": 6.373709185911998e-05, "loss": 0.0016, "step": 931 }, { "epoch": 0.5133218588640276, "grad_norm": 10.021825790405273, "learning_rate": 6.354067894436155e-05, "loss": 0.002, "step": 932 }, { "epoch": 0.5138726333907057, "grad_norm": 0.0030106378253549337, "learning_rate": 6.334442811191576e-05, "loss": 0.0, "step": 933 }, { "epoch": 0.5144234079173838, "grad_norm": 3.062901020050049, "learning_rate": 6.314834023422858e-05, "loss": 0.0107, "step": 934 }, { "epoch": 0.514974182444062, "grad_norm": 0.058365944772958755, "learning_rate": 6.295241618302156e-05, "loss": 0.0, "step": 935 }, { "epoch": 0.5155249569707401, "grad_norm": 0.7592312097549438, "learning_rate": 6.275665682928803e-05, "loss": 0.0002, "step": 936 }, { "epoch": 0.5160757314974183, "grad_norm": 0.5309937596321106, "learning_rate": 6.256106304328905e-05, "loss": 0.0001, "step": 937 }, { "epoch": 0.5166265060240964, "grad_norm": 0.00404881127178669, "learning_rate": 6.23656356945497e-05, "loss": 0.0, "step": 938 }, { "epoch": 0.5171772805507745, "grad_norm": 2.8146097660064697, "learning_rate": 6.21703756518551e-05, "loss": 0.0028, "step": 939 }, { "epoch": 0.5177280550774527, "grad_norm": 0.6000826954841614, "learning_rate": 6.197528378324665e-05, "loss": 0.0005, "step": 940 }, { "epoch": 0.5182788296041309, "grad_norm": 0.0061622909270226955, "learning_rate": 6.17803609560181e-05, "loss": 0.0, "step": 941 }, { "epoch": 0.5188296041308089, "grad_norm": 0.14080527424812317, "learning_rate": 6.158560803671168e-05, "loss": 0.0001, "step": 942 }, { "epoch": 0.5193803786574871, "grad_norm": 0.20560882985591888, "learning_rate": 6.139102589111435e-05, "loss": 0.0002, "step": 943 }, { "epoch": 0.5199311531841653, "grad_norm": 0.9022922515869141, "learning_rate": 6.119661538425381e-05, "loss": 0.0002, "step": 944 }, { "epoch": 0.5204819277108433, "grad_norm": 1.646817922592163, "learning_rate": 6.100237738039484e-05, "loss": 0.0009, "step": 945 }, { "epoch": 0.5210327022375215, "grad_norm": 8.438623428344727, "learning_rate": 6.0808312743035236e-05, "loss": 0.0062, "step": 946 }, { "epoch": 0.5215834767641997, "grad_norm": 0.03131546825170517, "learning_rate": 6.061442233490211e-05, "loss": 0.0, "step": 947 }, { "epoch": 0.5221342512908778, "grad_norm": 3.2325186729431152, "learning_rate": 6.042070701794806e-05, "loss": 0.0135, "step": 948 }, { "epoch": 0.5226850258175559, "grad_norm": 1.5477123260498047, "learning_rate": 6.0227167653347305e-05, "loss": 0.0005, "step": 949 }, { "epoch": 0.5232358003442341, "grad_norm": 7.821549892425537, "learning_rate": 6.0033805101491794e-05, "loss": 0.014, "step": 950 }, { "epoch": 0.5237865748709122, "grad_norm": 9.65178394317627, "learning_rate": 5.98406202219875e-05, "loss": 0.0423, "step": 951 }, { "epoch": 0.5243373493975904, "grad_norm": 2.107517719268799, "learning_rate": 5.964761387365052e-05, "loss": 0.0031, "step": 952 }, { "epoch": 0.5248881239242685, "grad_norm": 3.902543544769287, "learning_rate": 5.9454786914503255e-05, "loss": 0.001, "step": 953 }, { "epoch": 0.5254388984509466, "grad_norm": 12.97108268737793, "learning_rate": 5.926214020177074e-05, "loss": 0.0011, "step": 954 }, { "epoch": 0.5259896729776248, "grad_norm": 0.015660088509321213, "learning_rate": 5.9069674591876534e-05, "loss": 0.0, "step": 955 }, { "epoch": 0.526540447504303, "grad_norm": 0.042610302567481995, "learning_rate": 5.887739094043923e-05, "loss": 0.0, "step": 956 }, { "epoch": 0.527091222030981, "grad_norm": 0.0024744123220443726, "learning_rate": 5.868529010226845e-05, "loss": 0.0, "step": 957 }, { "epoch": 0.5276419965576592, "grad_norm": 0.13216692209243774, "learning_rate": 5.849337293136112e-05, "loss": 0.0, "step": 958 }, { "epoch": 0.5281927710843374, "grad_norm": 6.042928695678711, "learning_rate": 5.830164028089766e-05, "loss": 0.0202, "step": 959 }, { "epoch": 0.5287435456110154, "grad_norm": 5.890335559844971, "learning_rate": 5.811009300323818e-05, "loss": 0.0007, "step": 960 }, { "epoch": 0.5292943201376936, "grad_norm": 0.009286030195653439, "learning_rate": 5.791873194991872e-05, "loss": 0.0, "step": 961 }, { "epoch": 0.5298450946643718, "grad_norm": 0.005374649539589882, "learning_rate": 5.7727557971647427e-05, "loss": 0.0, "step": 962 }, { "epoch": 0.5303958691910499, "grad_norm": 8.952714920043945, "learning_rate": 5.7536571918300864e-05, "loss": 0.0118, "step": 963 }, { "epoch": 0.530946643717728, "grad_norm": 0.005943234544247389, "learning_rate": 5.734577463892008e-05, "loss": 0.0, "step": 964 }, { "epoch": 0.5314974182444062, "grad_norm": 0.0027692588046193123, "learning_rate": 5.7155166981706956e-05, "loss": 0.0, "step": 965 }, { "epoch": 0.5320481927710843, "grad_norm": 0.001263860729523003, "learning_rate": 5.6964749794020354e-05, "loss": 0.0, "step": 966 }, { "epoch": 0.5325989672977625, "grad_norm": 0.012995108962059021, "learning_rate": 5.6774523922372394e-05, "loss": 0.0, "step": 967 }, { "epoch": 0.5331497418244406, "grad_norm": 1.759899616241455, "learning_rate": 5.6584490212424804e-05, "loss": 0.0005, "step": 968 }, { "epoch": 0.5337005163511188, "grad_norm": 2.710604190826416, "learning_rate": 5.639464950898491e-05, "loss": 0.0089, "step": 969 }, { "epoch": 0.5342512908777969, "grad_norm": 3.7784769535064697, "learning_rate": 5.620500265600206e-05, "loss": 0.0027, "step": 970 }, { "epoch": 0.5348020654044751, "grad_norm": 3.894794225692749, "learning_rate": 5.601555049656382e-05, "loss": 0.0124, "step": 971 }, { "epoch": 0.5353528399311532, "grad_norm": 6.060196399688721, "learning_rate": 5.58262938728922e-05, "loss": 0.0024, "step": 972 }, { "epoch": 0.5359036144578313, "grad_norm": 5.4104461669921875, "learning_rate": 5.563723362634008e-05, "loss": 0.0134, "step": 973 }, { "epoch": 0.5364543889845095, "grad_norm": 0.010672791860997677, "learning_rate": 5.544837059738719e-05, "loss": 0.0, "step": 974 }, { "epoch": 0.5370051635111877, "grad_norm": 0.03070840612053871, "learning_rate": 5.525970562563656e-05, "loss": 0.0, "step": 975 }, { "epoch": 0.5375559380378657, "grad_norm": 0.38995373249053955, "learning_rate": 5.507123954981073e-05, "loss": 0.0007, "step": 976 }, { "epoch": 0.5381067125645439, "grad_norm": 0.19271329045295715, "learning_rate": 5.488297320774807e-05, "loss": 0.0001, "step": 977 }, { "epoch": 0.5386574870912221, "grad_norm": 0.06045440956950188, "learning_rate": 5.4694907436399e-05, "loss": 0.0, "step": 978 }, { "epoch": 0.5392082616179001, "grad_norm": 0.012985218316316605, "learning_rate": 5.4507043071822284e-05, "loss": 0.0, "step": 979 }, { "epoch": 0.5397590361445783, "grad_norm": 4.008856773376465, "learning_rate": 5.431938094918132e-05, "loss": 0.012, "step": 980 }, { "epoch": 0.5403098106712565, "grad_norm": 0.17044109106063843, "learning_rate": 5.41319219027404e-05, "loss": 0.0002, "step": 981 }, { "epoch": 0.5408605851979346, "grad_norm": 0.012877269648015499, "learning_rate": 5.394466676586114e-05, "loss": 0.0, "step": 982 }, { "epoch": 0.5414113597246127, "grad_norm": 15.690069198608398, "learning_rate": 5.375761637099854e-05, "loss": 0.0313, "step": 983 }, { "epoch": 0.5419621342512909, "grad_norm": 0.021764356642961502, "learning_rate": 5.357077154969742e-05, "loss": 0.0, "step": 984 }, { "epoch": 0.542512908777969, "grad_norm": 0.8998143672943115, "learning_rate": 5.3384133132588784e-05, "loss": 0.0006, "step": 985 }, { "epoch": 0.5430636833046472, "grad_norm": 1.1006726026535034, "learning_rate": 5.3197701949386e-05, "loss": 0.0008, "step": 986 }, { "epoch": 0.5436144578313253, "grad_norm": 0.04761703312397003, "learning_rate": 5.301147882888116e-05, "loss": 0.0, "step": 987 }, { "epoch": 0.5441652323580034, "grad_norm": 0.0064292424358427525, "learning_rate": 5.28254645989414e-05, "loss": 0.0, "step": 988 }, { "epoch": 0.5447160068846816, "grad_norm": 1.401316523551941, "learning_rate": 5.2639660086505226e-05, "loss": 0.006, "step": 989 }, { "epoch": 0.5452667814113598, "grad_norm": 3.4560980796813965, "learning_rate": 5.2454066117578815e-05, "loss": 0.0033, "step": 990 }, { "epoch": 0.5458175559380378, "grad_norm": 0.09853185713291168, "learning_rate": 5.226868351723244e-05, "loss": 0.0001, "step": 991 }, { "epoch": 0.546368330464716, "grad_norm": 0.6609016060829163, "learning_rate": 5.2083513109596616e-05, "loss": 0.0006, "step": 992 }, { "epoch": 0.5469191049913942, "grad_norm": 4.3532257080078125, "learning_rate": 5.189855571785859e-05, "loss": 0.0018, "step": 993 }, { "epoch": 0.5474698795180722, "grad_norm": 0.00882643461227417, "learning_rate": 5.171381216425863e-05, "loss": 0.0, "step": 994 }, { "epoch": 0.5480206540447504, "grad_norm": 0.013490208424627781, "learning_rate": 5.152928327008635e-05, "loss": 0.0, "step": 995 }, { "epoch": 0.5485714285714286, "grad_norm": 0.002792400773614645, "learning_rate": 5.134496985567714e-05, "loss": 0.0, "step": 996 }, { "epoch": 0.5491222030981067, "grad_norm": 6.08452844619751, "learning_rate": 5.116087274040837e-05, "loss": 0.0042, "step": 997 }, { "epoch": 0.5496729776247848, "grad_norm": 0.005321510136127472, "learning_rate": 5.0976992742695925e-05, "loss": 0.0, "step": 998 }, { "epoch": 0.550223752151463, "grad_norm": 6.796543121337891, "learning_rate": 5.07933306799904e-05, "loss": 0.0083, "step": 999 }, { "epoch": 0.5507745266781411, "grad_norm": 0.19201387465000153, "learning_rate": 5.060988736877366e-05, "loss": 0.0001, "step": 1000 }, { "epoch": 0.5513253012048193, "grad_norm": 4.590329170227051, "learning_rate": 5.042666362455498e-05, "loss": 0.0077, "step": 1001 }, { "epoch": 0.5518760757314974, "grad_norm": 3.0635335445404053, "learning_rate": 5.024366026186755e-05, "loss": 0.0067, "step": 1002 }, { "epoch": 0.5524268502581755, "grad_norm": 0.3797217309474945, "learning_rate": 5.006087809426496e-05, "loss": 0.0004, "step": 1003 }, { "epoch": 0.5529776247848537, "grad_norm": 7.616761207580566, "learning_rate": 4.987831793431731e-05, "loss": 0.0376, "step": 1004 }, { "epoch": 0.5535283993115319, "grad_norm": 2.884629964828491, "learning_rate": 4.9695980593607817e-05, "loss": 0.0009, "step": 1005 }, { "epoch": 0.55407917383821, "grad_norm": 2.6406657695770264, "learning_rate": 4.9513866882729146e-05, "loss": 0.0043, "step": 1006 }, { "epoch": 0.5546299483648881, "grad_norm": 0.1903446763753891, "learning_rate": 4.9331977611279777e-05, "loss": 0.0002, "step": 1007 }, { "epoch": 0.5551807228915663, "grad_norm": 0.19499507546424866, "learning_rate": 4.9150313587860433e-05, "loss": 0.0002, "step": 1008 }, { "epoch": 0.5557314974182445, "grad_norm": 0.08927839249372482, "learning_rate": 4.896887562007054e-05, "loss": 0.0001, "step": 1009 }, { "epoch": 0.5562822719449225, "grad_norm": 0.0022910397965461016, "learning_rate": 4.8787664514504504e-05, "loss": 0.0, "step": 1010 }, { "epoch": 0.5568330464716007, "grad_norm": 3.508301258087158, "learning_rate": 4.860668107674823e-05, "loss": 0.0025, "step": 1011 }, { "epoch": 0.5573838209982789, "grad_norm": 0.012703589163720608, "learning_rate": 4.8425926111375506e-05, "loss": 0.0, "step": 1012 }, { "epoch": 0.5579345955249569, "grad_norm": 0.0075357710011303425, "learning_rate": 4.824540042194443e-05, "loss": 0.0, "step": 1013 }, { "epoch": 0.5584853700516351, "grad_norm": 0.002030513249337673, "learning_rate": 4.8065104810993856e-05, "loss": 0.0, "step": 1014 }, { "epoch": 0.5590361445783133, "grad_norm": 0.008482123725116253, "learning_rate": 4.788504008003978e-05, "loss": 0.0, "step": 1015 }, { "epoch": 0.5595869191049914, "grad_norm": 0.020104877650737762, "learning_rate": 4.770520702957182e-05, "loss": 0.0, "step": 1016 }, { "epoch": 0.5601376936316695, "grad_norm": 1.7687091827392578, "learning_rate": 4.752560645904962e-05, "loss": 0.0016, "step": 1017 }, { "epoch": 0.5606884681583477, "grad_norm": 0.10008379817008972, "learning_rate": 4.734623916689941e-05, "loss": 0.0001, "step": 1018 }, { "epoch": 0.5612392426850258, "grad_norm": 0.7421602606773376, "learning_rate": 4.716710595051022e-05, "loss": 0.0003, "step": 1019 }, { "epoch": 0.561790017211704, "grad_norm": 5.485063552856445, "learning_rate": 4.698820760623064e-05, "loss": 0.0086, "step": 1020 }, { "epoch": 0.5623407917383821, "grad_norm": 2.5806753635406494, "learning_rate": 4.6809544929365004e-05, "loss": 0.0021, "step": 1021 }, { "epoch": 0.5628915662650602, "grad_norm": 1.5442454814910889, "learning_rate": 4.663111871417e-05, "loss": 0.0012, "step": 1022 }, { "epoch": 0.5634423407917384, "grad_norm": 0.011233295314013958, "learning_rate": 4.645292975385111e-05, "loss": 0.0, "step": 1023 }, { "epoch": 0.5639931153184166, "grad_norm": 3.3653011322021484, "learning_rate": 4.627497884055912e-05, "loss": 0.0013, "step": 1024 }, { "epoch": 0.5645438898450946, "grad_norm": 0.1083044707775116, "learning_rate": 4.609726676538652e-05, "loss": 0.0001, "step": 1025 }, { "epoch": 0.5650946643717728, "grad_norm": 0.018117021769285202, "learning_rate": 4.591979431836402e-05, "loss": 0.0, "step": 1026 }, { "epoch": 0.565645438898451, "grad_norm": 16.228464126586914, "learning_rate": 4.574256228845706e-05, "loss": 0.0214, "step": 1027 }, { "epoch": 0.566196213425129, "grad_norm": 0.355914831161499, "learning_rate": 4.5565571463562365e-05, "loss": 0.0002, "step": 1028 }, { "epoch": 0.5667469879518072, "grad_norm": 0.004606591537594795, "learning_rate": 4.5388822630504256e-05, "loss": 0.0, "step": 1029 }, { "epoch": 0.5672977624784854, "grad_norm": 18.283254623413086, "learning_rate": 4.521231657503132e-05, "loss": 0.0185, "step": 1030 }, { "epoch": 0.5678485370051635, "grad_norm": 0.013465874828398228, "learning_rate": 4.503605408181286e-05, "loss": 0.0, "step": 1031 }, { "epoch": 0.5683993115318416, "grad_norm": 10.572543144226074, "learning_rate": 4.486003593443537e-05, "loss": 0.0179, "step": 1032 }, { "epoch": 0.5689500860585198, "grad_norm": 1.8724266290664673, "learning_rate": 4.468426291539914e-05, "loss": 0.0005, "step": 1033 }, { "epoch": 0.5695008605851979, "grad_norm": 0.7917636036872864, "learning_rate": 4.4508735806114654e-05, "loss": 0.0004, "step": 1034 }, { "epoch": 0.5700516351118761, "grad_norm": 0.16629016399383545, "learning_rate": 4.433345538689929e-05, "loss": 0.0001, "step": 1035 }, { "epoch": 0.5706024096385542, "grad_norm": 2.3158931732177734, "learning_rate": 4.415842243697369e-05, "loss": 0.0009, "step": 1036 }, { "epoch": 0.5711531841652323, "grad_norm": 0.27926522493362427, "learning_rate": 4.39836377344583e-05, "loss": 0.0001, "step": 1037 }, { "epoch": 0.5717039586919105, "grad_norm": 4.536098003387451, "learning_rate": 4.380910205637012e-05, "loss": 0.0183, "step": 1038 }, { "epoch": 0.5722547332185887, "grad_norm": 0.01662346161901951, "learning_rate": 4.363481617861893e-05, "loss": 0.0, "step": 1039 }, { "epoch": 0.5728055077452667, "grad_norm": 0.054134905338287354, "learning_rate": 4.346078087600412e-05, "loss": 0.0, "step": 1040 }, { "epoch": 0.5733562822719449, "grad_norm": 0.010733433067798615, "learning_rate": 4.3286996922211034e-05, "loss": 0.0, "step": 1041 }, { "epoch": 0.5739070567986231, "grad_norm": 0.00935351848602295, "learning_rate": 4.311346508980772e-05, "loss": 0.0, "step": 1042 }, { "epoch": 0.5744578313253013, "grad_norm": 0.0055184029042720795, "learning_rate": 4.2940186150241365e-05, "loss": 0.0, "step": 1043 }, { "epoch": 0.5750086058519793, "grad_norm": 0.036371879279613495, "learning_rate": 4.27671608738349e-05, "loss": 0.0, "step": 1044 }, { "epoch": 0.5755593803786575, "grad_norm": 7.273275852203369, "learning_rate": 4.2594390029783534e-05, "loss": 0.0141, "step": 1045 }, { "epoch": 0.5761101549053357, "grad_norm": 3.321528673171997, "learning_rate": 4.242187438615153e-05, "loss": 0.016, "step": 1046 }, { "epoch": 0.5766609294320137, "grad_norm": 0.006214479450136423, "learning_rate": 4.224961470986849e-05, "loss": 0.0, "step": 1047 }, { "epoch": 0.5772117039586919, "grad_norm": 0.002388172782957554, "learning_rate": 4.207761176672614e-05, "loss": 0.0, "step": 1048 }, { "epoch": 0.5777624784853701, "grad_norm": 0.005175253376364708, "learning_rate": 4.190586632137491e-05, "loss": 0.0, "step": 1049 }, { "epoch": 0.5783132530120482, "grad_norm": 0.0655880868434906, "learning_rate": 4.173437913732048e-05, "loss": 0.0001, "step": 1050 }, { "epoch": 0.5788640275387263, "grad_norm": 5.999275207519531, "learning_rate": 4.156315097692037e-05, "loss": 0.0111, "step": 1051 }, { "epoch": 0.5794148020654045, "grad_norm": 16.064970016479492, "learning_rate": 4.139218260138074e-05, "loss": 0.0585, "step": 1052 }, { "epoch": 0.5799655765920826, "grad_norm": 0.05536043271422386, "learning_rate": 4.12214747707527e-05, "loss": 0.0001, "step": 1053 }, { "epoch": 0.5805163511187608, "grad_norm": 0.003760656574741006, "learning_rate": 4.1051028243929125e-05, "loss": 0.0, "step": 1054 }, { "epoch": 0.5810671256454389, "grad_norm": 3.240367889404297, "learning_rate": 4.088084377864135e-05, "loss": 0.0038, "step": 1055 }, { "epoch": 0.581617900172117, "grad_norm": 7.815759658813477, "learning_rate": 4.07109221314556e-05, "loss": 0.0131, "step": 1056 }, { "epoch": 0.5821686746987952, "grad_norm": 0.007338278461247683, "learning_rate": 4.054126405776971e-05, "loss": 0.0, "step": 1057 }, { "epoch": 0.5827194492254734, "grad_norm": 3.292480707168579, "learning_rate": 4.037187031180985e-05, "loss": 0.0038, "step": 1058 }, { "epoch": 0.5832702237521514, "grad_norm": 0.008572899736464024, "learning_rate": 4.020274164662707e-05, "loss": 0.0, "step": 1059 }, { "epoch": 0.5838209982788296, "grad_norm": 0.011731223203241825, "learning_rate": 4.003387881409397e-05, "loss": 0.0, "step": 1060 }, { "epoch": 0.5843717728055078, "grad_norm": 0.0048867035657167435, "learning_rate": 3.986528256490141e-05, "loss": 0.0, "step": 1061 }, { "epoch": 0.5849225473321858, "grad_norm": 0.001119846710935235, "learning_rate": 3.969695364855511e-05, "loss": 0.0, "step": 1062 }, { "epoch": 0.585473321858864, "grad_norm": 0.18532831966876984, "learning_rate": 3.952889281337235e-05, "loss": 0.0001, "step": 1063 }, { "epoch": 0.5860240963855422, "grad_norm": 0.6665564179420471, "learning_rate": 3.93611008064786e-05, "loss": 0.0003, "step": 1064 }, { "epoch": 0.5865748709122203, "grad_norm": 0.6930995583534241, "learning_rate": 3.9193578373804364e-05, "loss": 0.0004, "step": 1065 }, { "epoch": 0.5871256454388984, "grad_norm": 0.7324560284614563, "learning_rate": 3.90263262600816e-05, "loss": 0.0001, "step": 1066 }, { "epoch": 0.5876764199655766, "grad_norm": 0.006854007486253977, "learning_rate": 3.88593452088406e-05, "loss": 0.0, "step": 1067 }, { "epoch": 0.5882271944922547, "grad_norm": 2.600399971008301, "learning_rate": 3.869263596240661e-05, "loss": 0.0008, "step": 1068 }, { "epoch": 0.5887779690189329, "grad_norm": 35.40540313720703, "learning_rate": 3.8526199261896544e-05, "loss": 0.0253, "step": 1069 }, { "epoch": 0.589328743545611, "grad_norm": 0.011782824993133545, "learning_rate": 3.836003584721577e-05, "loss": 0.0, "step": 1070 }, { "epoch": 0.5898795180722891, "grad_norm": 0.05736062675714493, "learning_rate": 3.8194146457054655e-05, "loss": 0.0001, "step": 1071 }, { "epoch": 0.5904302925989673, "grad_norm": 0.005979849025607109, "learning_rate": 3.802853182888543e-05, "loss": 0.0, "step": 1072 }, { "epoch": 0.5909810671256455, "grad_norm": 0.046467263251543045, "learning_rate": 3.786319269895877e-05, "loss": 0.0001, "step": 1073 }, { "epoch": 0.5915318416523235, "grad_norm": 3.021083116531372, "learning_rate": 3.769812980230074e-05, "loss": 0.007, "step": 1074 }, { "epoch": 0.5920826161790017, "grad_norm": 3.062448501586914, "learning_rate": 3.7533343872709294e-05, "loss": 0.0012, "step": 1075 }, { "epoch": 0.5926333907056799, "grad_norm": 0.021482253447175026, "learning_rate": 3.736883564275112e-05, "loss": 0.0, "step": 1076 }, { "epoch": 0.593184165232358, "grad_norm": 0.0058983745984733105, "learning_rate": 3.7204605843758386e-05, "loss": 0.0, "step": 1077 }, { "epoch": 0.5937349397590361, "grad_norm": 0.06852951645851135, "learning_rate": 3.704065520582549e-05, "loss": 0.0001, "step": 1078 }, { "epoch": 0.5942857142857143, "grad_norm": 2.337948799133301, "learning_rate": 3.6876984457805786e-05, "loss": 0.0011, "step": 1079 }, { "epoch": 0.5948364888123924, "grad_norm": 0.016796719282865524, "learning_rate": 3.671359432730834e-05, "loss": 0.0, "step": 1080 }, { "epoch": 0.5953872633390705, "grad_norm": 1.0604562759399414, "learning_rate": 3.655048554069478e-05, "loss": 0.0003, "step": 1081 }, { "epoch": 0.5959380378657487, "grad_norm": 2.7649543285369873, "learning_rate": 3.638765882307589e-05, "loss": 0.0045, "step": 1082 }, { "epoch": 0.5964888123924269, "grad_norm": 4.96885347366333, "learning_rate": 3.6225114898308634e-05, "loss": 0.0129, "step": 1083 }, { "epoch": 0.597039586919105, "grad_norm": 0.034742552787065506, "learning_rate": 3.6062854488992714e-05, "loss": 0.0, "step": 1084 }, { "epoch": 0.5975903614457831, "grad_norm": 0.1056838408112526, "learning_rate": 3.5900878316467454e-05, "loss": 0.0001, "step": 1085 }, { "epoch": 0.5981411359724613, "grad_norm": 4.890119552612305, "learning_rate": 3.573918710080857e-05, "loss": 0.0059, "step": 1086 }, { "epoch": 0.5986919104991394, "grad_norm": 1.738047480583191, "learning_rate": 3.5577781560825066e-05, "loss": 0.0017, "step": 1087 }, { "epoch": 0.5992426850258176, "grad_norm": 0.003094714367762208, "learning_rate": 3.541666241405588e-05, "loss": 0.0, "step": 1088 }, { "epoch": 0.5997934595524957, "grad_norm": 0.13502635061740875, "learning_rate": 3.5255830376766764e-05, "loss": 0.0001, "step": 1089 }, { "epoch": 0.6003442340791738, "grad_norm": 7.647158145904541, "learning_rate": 3.509528616394716e-05, "loss": 0.0208, "step": 1090 }, { "epoch": 0.600895008605852, "grad_norm": 0.02719770185649395, "learning_rate": 3.4935030489306883e-05, "loss": 0.0, "step": 1091 }, { "epoch": 0.6014457831325302, "grad_norm": 2.0539026260375977, "learning_rate": 3.4775064065273165e-05, "loss": 0.0019, "step": 1092 }, { "epoch": 0.6019965576592082, "grad_norm": 0.02227661944925785, "learning_rate": 3.4615387602987236e-05, "loss": 0.0, "step": 1093 }, { "epoch": 0.6025473321858864, "grad_norm": 0.234940305352211, "learning_rate": 3.445600181230134e-05, "loss": 0.0002, "step": 1094 }, { "epoch": 0.6030981067125646, "grad_norm": 0.4507715702056885, "learning_rate": 3.429690740177549e-05, "loss": 0.0004, "step": 1095 }, { "epoch": 0.6036488812392427, "grad_norm": 0.009895412251353264, "learning_rate": 3.413810507867436e-05, "loss": 0.0, "step": 1096 }, { "epoch": 0.6041996557659208, "grad_norm": 0.0052743651904165745, "learning_rate": 3.397959554896415e-05, "loss": 0.0, "step": 1097 }, { "epoch": 0.604750430292599, "grad_norm": 0.14304792881011963, "learning_rate": 3.3821379517309405e-05, "loss": 0.0001, "step": 1098 }, { "epoch": 0.6053012048192771, "grad_norm": 2.7031238079071045, "learning_rate": 3.3663457687069924e-05, "loss": 0.0127, "step": 1099 }, { "epoch": 0.6058519793459552, "grad_norm": 0.003596097696572542, "learning_rate": 3.350583076029754e-05, "loss": 0.0, "step": 1100 }, { "epoch": 0.6064027538726334, "grad_norm": 7.82961368560791, "learning_rate": 3.334849943773323e-05, "loss": 0.0211, "step": 1101 }, { "epoch": 0.6069535283993115, "grad_norm": 0.9673002362251282, "learning_rate": 3.319146441880371e-05, "loss": 0.0023, "step": 1102 }, { "epoch": 0.6075043029259897, "grad_norm": 0.00502818962559104, "learning_rate": 3.3034726401618444e-05, "loss": 0.0, "step": 1103 }, { "epoch": 0.6080550774526678, "grad_norm": 5.485194683074951, "learning_rate": 3.28782860829667e-05, "loss": 0.0026, "step": 1104 }, { "epoch": 0.6086058519793459, "grad_norm": 7.251112937927246, "learning_rate": 3.272214415831418e-05, "loss": 0.0038, "step": 1105 }, { "epoch": 0.6091566265060241, "grad_norm": 0.15072965621948242, "learning_rate": 3.2566301321800085e-05, "loss": 0.0001, "step": 1106 }, { "epoch": 0.6097074010327023, "grad_norm": 6.833738327026367, "learning_rate": 3.241075826623401e-05, "loss": 0.0007, "step": 1107 }, { "epoch": 0.6102581755593803, "grad_norm": 4.482696056365967, "learning_rate": 3.225551568309284e-05, "loss": 0.0011, "step": 1108 }, { "epoch": 0.6108089500860585, "grad_norm": 0.003364129923284054, "learning_rate": 3.210057426251773e-05, "loss": 0.0, "step": 1109 }, { "epoch": 0.6113597246127367, "grad_norm": 0.008832336403429508, "learning_rate": 3.1945934693310896e-05, "loss": 0.0, "step": 1110 }, { "epoch": 0.6119104991394148, "grad_norm": 0.0035265255719423294, "learning_rate": 3.179159766293282e-05, "loss": 0.0, "step": 1111 }, { "epoch": 0.6124612736660929, "grad_norm": 0.022767003625631332, "learning_rate": 3.163756385749889e-05, "loss": 0.0, "step": 1112 }, { "epoch": 0.6130120481927711, "grad_norm": 0.011694950051605701, "learning_rate": 3.148383396177653e-05, "loss": 0.0, "step": 1113 }, { "epoch": 0.6135628227194492, "grad_norm": 0.00783373974263668, "learning_rate": 3.133040865918213e-05, "loss": 0.0, "step": 1114 }, { "epoch": 0.6141135972461274, "grad_norm": 0.002150429179891944, "learning_rate": 3.117728863177796e-05, "loss": 0.0, "step": 1115 }, { "epoch": 0.6146643717728055, "grad_norm": 0.038459453731775284, "learning_rate": 3.102447456026919e-05, "loss": 0.0, "step": 1116 }, { "epoch": 0.6152151462994836, "grad_norm": 0.5452999472618103, "learning_rate": 3.0871967124000834e-05, "loss": 0.0003, "step": 1117 }, { "epoch": 0.6157659208261618, "grad_norm": 0.2770096957683563, "learning_rate": 3.0719767000954714e-05, "loss": 0.0003, "step": 1118 }, { "epoch": 0.61631669535284, "grad_norm": 0.024996010586619377, "learning_rate": 3.056787486774656e-05, "loss": 0.0, "step": 1119 }, { "epoch": 0.6168674698795181, "grad_norm": 10.510160446166992, "learning_rate": 3.041629139962283e-05, "loss": 0.0043, "step": 1120 }, { "epoch": 0.6174182444061962, "grad_norm": 0.002503847237676382, "learning_rate": 3.0265017270457775e-05, "loss": 0.0, "step": 1121 }, { "epoch": 0.6179690189328744, "grad_norm": 0.2628730237483978, "learning_rate": 3.0114053152750556e-05, "loss": 0.0002, "step": 1122 }, { "epoch": 0.6185197934595525, "grad_norm": 0.1967441588640213, "learning_rate": 2.9963399717622077e-05, "loss": 0.0002, "step": 1123 }, { "epoch": 0.6190705679862306, "grad_norm": 0.15400297939777374, "learning_rate": 2.98130576348121e-05, "loss": 0.0001, "step": 1124 }, { "epoch": 0.6196213425129088, "grad_norm": 1.809255838394165, "learning_rate": 2.966302757267625e-05, "loss": 0.0021, "step": 1125 }, { "epoch": 0.6196213425129088, "eval_loss": 0.004392336588352919, "eval_runtime": 232.638, "eval_samples_per_second": 13.145, "eval_steps_per_second": 6.572, "step": 1125 }, { "epoch": 0.620172117039587, "grad_norm": 0.4748697578907013, "learning_rate": 2.9513310198183065e-05, "loss": 0.0001, "step": 1126 }, { "epoch": 0.620722891566265, "grad_norm": 0.01868477463722229, "learning_rate": 2.936390617691097e-05, "loss": 0.0, "step": 1127 }, { "epoch": 0.6212736660929432, "grad_norm": 1.4908583164215088, "learning_rate": 2.9214816173045356e-05, "loss": 0.0004, "step": 1128 }, { "epoch": 0.6218244406196214, "grad_norm": 0.2228550761938095, "learning_rate": 2.906604084937572e-05, "loss": 0.0001, "step": 1129 }, { "epoch": 0.6223752151462995, "grad_norm": 8.267536163330078, "learning_rate": 2.8917580867292526e-05, "loss": 0.1006, "step": 1130 }, { "epoch": 0.6229259896729776, "grad_norm": 10.493623733520508, "learning_rate": 2.8769436886784408e-05, "loss": 0.0136, "step": 1131 }, { "epoch": 0.6234767641996558, "grad_norm": 1.9761072397232056, "learning_rate": 2.862160956643517e-05, "loss": 0.0044, "step": 1132 }, { "epoch": 0.6240275387263339, "grad_norm": 1.1715402603149414, "learning_rate": 2.847409956342092e-05, "loss": 0.0003, "step": 1133 }, { "epoch": 0.624578313253012, "grad_norm": 10.6393461227417, "learning_rate": 2.8326907533507074e-05, "loss": 0.0805, "step": 1134 }, { "epoch": 0.6251290877796902, "grad_norm": 0.04610615596175194, "learning_rate": 2.8180034131045464e-05, "loss": 0.0, "step": 1135 }, { "epoch": 0.6256798623063683, "grad_norm": 6.594550132751465, "learning_rate": 2.8033480008971546e-05, "loss": 0.0194, "step": 1136 }, { "epoch": 0.6262306368330465, "grad_norm": 1.6887128353118896, "learning_rate": 2.7887245818801277e-05, "loss": 0.0025, "step": 1137 }, { "epoch": 0.6267814113597246, "grad_norm": 0.597827672958374, "learning_rate": 2.7741332210628345e-05, "loss": 0.0002, "step": 1138 }, { "epoch": 0.6273321858864027, "grad_norm": 0.035196468234062195, "learning_rate": 2.759573983312138e-05, "loss": 0.0, "step": 1139 }, { "epoch": 0.6278829604130809, "grad_norm": 0.010384023189544678, "learning_rate": 2.7450469333520855e-05, "loss": 0.0, "step": 1140 }, { "epoch": 0.6284337349397591, "grad_norm": 0.013360747136175632, "learning_rate": 2.730552135763632e-05, "loss": 0.0, "step": 1141 }, { "epoch": 0.6289845094664371, "grad_norm": 0.004259203560650349, "learning_rate": 2.7160896549843562e-05, "loss": 0.0, "step": 1142 }, { "epoch": 0.6295352839931153, "grad_norm": 0.05798590928316116, "learning_rate": 2.701659555308169e-05, "loss": 0.0001, "step": 1143 }, { "epoch": 0.6300860585197935, "grad_norm": 0.006525599863380194, "learning_rate": 2.6872619008850274e-05, "loss": 0.0, "step": 1144 }, { "epoch": 0.6306368330464716, "grad_norm": 0.00698586693033576, "learning_rate": 2.672896755720654e-05, "loss": 0.0, "step": 1145 }, { "epoch": 0.6311876075731497, "grad_norm": 1.2709800004959106, "learning_rate": 2.6585641836762433e-05, "loss": 0.0037, "step": 1146 }, { "epoch": 0.6317383820998279, "grad_norm": 0.020473510026931763, "learning_rate": 2.6442642484681944e-05, "loss": 0.0, "step": 1147 }, { "epoch": 0.632289156626506, "grad_norm": 11.559391975402832, "learning_rate": 2.6299970136678077e-05, "loss": 0.0057, "step": 1148 }, { "epoch": 0.6328399311531842, "grad_norm": 0.002915577497333288, "learning_rate": 2.6157625427010156e-05, "loss": 0.0, "step": 1149 }, { "epoch": 0.6333907056798623, "grad_norm": 0.011440039612352848, "learning_rate": 2.6015608988480955e-05, "loss": 0.0, "step": 1150 }, { "epoch": 0.6339414802065404, "grad_norm": 8.000061988830566, "learning_rate": 2.5873921452433915e-05, "loss": 0.0119, "step": 1151 }, { "epoch": 0.6344922547332186, "grad_norm": 4.93593168258667, "learning_rate": 2.57325634487503e-05, "loss": 0.019, "step": 1152 }, { "epoch": 0.6350430292598968, "grad_norm": 5.69904899597168, "learning_rate": 2.5591535605846383e-05, "loss": 0.0061, "step": 1153 }, { "epoch": 0.6355938037865748, "grad_norm": 0.19449971616268158, "learning_rate": 2.5450838550670808e-05, "loss": 0.0001, "step": 1154 }, { "epoch": 0.636144578313253, "grad_norm": 0.024679578840732574, "learning_rate": 2.5310472908701555e-05, "loss": 0.0, "step": 1155 }, { "epoch": 0.6366953528399312, "grad_norm": 2.1368253231048584, "learning_rate": 2.5170439303943294e-05, "loss": 0.0014, "step": 1156 }, { "epoch": 0.6372461273666093, "grad_norm": 0.06892140954732895, "learning_rate": 2.503073835892471e-05, "loss": 0.0001, "step": 1157 }, { "epoch": 0.6377969018932874, "grad_norm": 0.017484420910477638, "learning_rate": 2.4891370694695517e-05, "loss": 0.0, "step": 1158 }, { "epoch": 0.6383476764199656, "grad_norm": 0.01218375377357006, "learning_rate": 2.4752336930823837e-05, "loss": 0.0, "step": 1159 }, { "epoch": 0.6388984509466438, "grad_norm": 0.03714378550648689, "learning_rate": 2.4613637685393432e-05, "loss": 0.0, "step": 1160 }, { "epoch": 0.6394492254733218, "grad_norm": 2.92740535736084, "learning_rate": 2.4475273575000936e-05, "loss": 0.0021, "step": 1161 }, { "epoch": 0.64, "grad_norm": 0.014792831614613533, "learning_rate": 2.4337245214753103e-05, "loss": 0.0, "step": 1162 }, { "epoch": 0.6405507745266782, "grad_norm": 0.0039852154441177845, "learning_rate": 2.4199553218264093e-05, "loss": 0.0, "step": 1163 }, { "epoch": 0.6411015490533563, "grad_norm": 0.0019080197671428323, "learning_rate": 2.4062198197652752e-05, "loss": 0.0, "step": 1164 }, { "epoch": 0.6416523235800344, "grad_norm": 0.0016980243381112814, "learning_rate": 2.3925180763539844e-05, "loss": 0.0, "step": 1165 }, { "epoch": 0.6422030981067126, "grad_norm": 0.06215986609458923, "learning_rate": 2.3788501525045438e-05, "loss": 0.0001, "step": 1166 }, { "epoch": 0.6427538726333907, "grad_norm": 0.52988201379776, "learning_rate": 2.3652161089786086e-05, "loss": 0.0003, "step": 1167 }, { "epoch": 0.6433046471600689, "grad_norm": 0.004748248960822821, "learning_rate": 2.351616006387214e-05, "loss": 0.0, "step": 1168 }, { "epoch": 0.643855421686747, "grad_norm": 0.09933594614267349, "learning_rate": 2.3380499051905137e-05, "loss": 0.0001, "step": 1169 }, { "epoch": 0.6444061962134251, "grad_norm": 0.3680607080459595, "learning_rate": 2.324517865697501e-05, "loss": 0.0004, "step": 1170 }, { "epoch": 0.6449569707401033, "grad_norm": 0.1050693616271019, "learning_rate": 2.3110199480657525e-05, "loss": 0.0001, "step": 1171 }, { "epoch": 0.6455077452667815, "grad_norm": 5.546016693115234, "learning_rate": 2.2975562123011495e-05, "loss": 0.0256, "step": 1172 }, { "epoch": 0.6460585197934595, "grad_norm": 0.056336987763643265, "learning_rate": 2.2841267182576143e-05, "loss": 0.0, "step": 1173 }, { "epoch": 0.6466092943201377, "grad_norm": 0.28463220596313477, "learning_rate": 2.2707315256368433e-05, "loss": 0.0002, "step": 1174 }, { "epoch": 0.6471600688468159, "grad_norm": 9.667635917663574, "learning_rate": 2.2573706939880555e-05, "loss": 0.0052, "step": 1175 }, { "epoch": 0.6477108433734939, "grad_norm": 6.7510199546813965, "learning_rate": 2.2440442827077045e-05, "loss": 0.0404, "step": 1176 }, { "epoch": 0.6482616179001721, "grad_norm": 12.501592636108398, "learning_rate": 2.230752351039228e-05, "loss": 0.0146, "step": 1177 }, { "epoch": 0.6488123924268503, "grad_norm": 0.45063453912734985, "learning_rate": 2.2174949580727832e-05, "loss": 0.0003, "step": 1178 }, { "epoch": 0.6493631669535284, "grad_norm": 1.936355471611023, "learning_rate": 2.2042721627449846e-05, "loss": 0.0051, "step": 1179 }, { "epoch": 0.6499139414802065, "grad_norm": 0.007956241257488728, "learning_rate": 2.1910840238386398e-05, "loss": 0.0, "step": 1180 }, { "epoch": 0.6504647160068847, "grad_norm": 0.06485851854085922, "learning_rate": 2.1779305999824884e-05, "loss": 0.0001, "step": 1181 }, { "epoch": 0.6510154905335628, "grad_norm": 0.12809894979000092, "learning_rate": 2.164811949650942e-05, "loss": 0.0002, "step": 1182 }, { "epoch": 0.651566265060241, "grad_norm": 5.802713394165039, "learning_rate": 2.1517281311638217e-05, "loss": 0.0118, "step": 1183 }, { "epoch": 0.6521170395869191, "grad_norm": 9.11218547821045, "learning_rate": 2.1386792026861103e-05, "loss": 0.0081, "step": 1184 }, { "epoch": 0.6526678141135972, "grad_norm": 0.009693088009953499, "learning_rate": 2.125665222227675e-05, "loss": 0.0, "step": 1185 }, { "epoch": 0.6532185886402754, "grad_norm": 0.013427584432065487, "learning_rate": 2.112686247643024e-05, "loss": 0.0, "step": 1186 }, { "epoch": 0.6537693631669536, "grad_norm": 0.10089477896690369, "learning_rate": 2.09974233663104e-05, "loss": 0.0001, "step": 1187 }, { "epoch": 0.6543201376936316, "grad_norm": 0.009399957023561, "learning_rate": 2.0868335467347366e-05, "loss": 0.0, "step": 1188 }, { "epoch": 0.6548709122203098, "grad_norm": 22.31793212890625, "learning_rate": 2.073959935340988e-05, "loss": 0.0098, "step": 1189 }, { "epoch": 0.655421686746988, "grad_norm": 0.08975204080343246, "learning_rate": 2.06112155968028e-05, "loss": 0.0001, "step": 1190 }, { "epoch": 0.655972461273666, "grad_norm": 0.023852644488215446, "learning_rate": 2.0483184768264596e-05, "loss": 0.0, "step": 1191 }, { "epoch": 0.6565232358003442, "grad_norm": 0.014178517274558544, "learning_rate": 2.035550743696468e-05, "loss": 0.0, "step": 1192 }, { "epoch": 0.6570740103270224, "grad_norm": 0.21900485455989838, "learning_rate": 2.022818417050113e-05, "loss": 0.0001, "step": 1193 }, { "epoch": 0.6576247848537005, "grad_norm": 2.1252355575561523, "learning_rate": 2.0101215534897855e-05, "loss": 0.0014, "step": 1194 }, { "epoch": 0.6581755593803786, "grad_norm": 2.31019926071167, "learning_rate": 1.99746020946023e-05, "loss": 0.0026, "step": 1195 }, { "epoch": 0.6587263339070568, "grad_norm": 0.013259191997349262, "learning_rate": 1.9848344412482854e-05, "loss": 0.0, "step": 1196 }, { "epoch": 0.659277108433735, "grad_norm": 4.49750280380249, "learning_rate": 1.9722443049826344e-05, "loss": 0.0142, "step": 1197 }, { "epoch": 0.6598278829604131, "grad_norm": 0.0020921523682773113, "learning_rate": 1.9596898566335576e-05, "loss": 0.0, "step": 1198 }, { "epoch": 0.6603786574870912, "grad_norm": 0.006069159600883722, "learning_rate": 1.9471711520126824e-05, "loss": 0.0, "step": 1199 }, { "epoch": 0.6609294320137694, "grad_norm": 0.34093979001045227, "learning_rate": 1.9346882467727325e-05, "loss": 0.0002, "step": 1200 }, { "epoch": 0.6614802065404475, "grad_norm": 60.169490814208984, "learning_rate": 1.9222411964072884e-05, "loss": 0.0387, "step": 1201 }, { "epoch": 0.6620309810671257, "grad_norm": 20.716371536254883, "learning_rate": 1.9098300562505266e-05, "loss": 0.0061, "step": 1202 }, { "epoch": 0.6625817555938038, "grad_norm": 0.04583056643605232, "learning_rate": 1.8974548814769944e-05, "loss": 0.0001, "step": 1203 }, { "epoch": 0.6631325301204819, "grad_norm": 6.579235553741455, "learning_rate": 1.8851157271013442e-05, "loss": 0.0152, "step": 1204 }, { "epoch": 0.6636833046471601, "grad_norm": 0.12049424648284912, "learning_rate": 1.872812647978095e-05, "loss": 0.0001, "step": 1205 }, { "epoch": 0.6642340791738383, "grad_norm": 0.6648346185684204, "learning_rate": 1.8605456988014015e-05, "loss": 0.0002, "step": 1206 }, { "epoch": 0.6647848537005163, "grad_norm": 0.033920131623744965, "learning_rate": 1.8483149341047923e-05, "loss": 0.0, "step": 1207 }, { "epoch": 0.6653356282271945, "grad_norm": 0.0012317304499447346, "learning_rate": 1.8361204082609352e-05, "loss": 0.0, "step": 1208 }, { "epoch": 0.6658864027538727, "grad_norm": 0.0391898937523365, "learning_rate": 1.8239621754813995e-05, "loss": 0.0, "step": 1209 }, { "epoch": 0.6664371772805507, "grad_norm": 0.040248189121484756, "learning_rate": 1.811840289816409e-05, "loss": 0.0, "step": 1210 }, { "epoch": 0.6669879518072289, "grad_norm": 0.005994821432977915, "learning_rate": 1.799754805154603e-05, "loss": 0.0, "step": 1211 }, { "epoch": 0.6675387263339071, "grad_norm": 0.644835352897644, "learning_rate": 1.787705775222802e-05, "loss": 0.0005, "step": 1212 }, { "epoch": 0.6680895008605852, "grad_norm": 0.005988036748021841, "learning_rate": 1.775693253585763e-05, "loss": 0.0, "step": 1213 }, { "epoch": 0.6686402753872633, "grad_norm": 0.1871015429496765, "learning_rate": 1.763717293645939e-05, "loss": 0.0001, "step": 1214 }, { "epoch": 0.6691910499139415, "grad_norm": 0.11890304088592529, "learning_rate": 1.7517779486432495e-05, "loss": 0.0001, "step": 1215 }, { "epoch": 0.6697418244406196, "grad_norm": 0.0010532401502132416, "learning_rate": 1.7398752716548395e-05, "loss": 0.0, "step": 1216 }, { "epoch": 0.6702925989672978, "grad_norm": 11.517462730407715, "learning_rate": 1.728009315594843e-05, "loss": 0.0166, "step": 1217 }, { "epoch": 0.6708433734939759, "grad_norm": 0.7917474508285522, "learning_rate": 1.716180133214149e-05, "loss": 0.0023, "step": 1218 }, { "epoch": 0.671394148020654, "grad_norm": 0.14306794106960297, "learning_rate": 1.704387777100165e-05, "loss": 0.0002, "step": 1219 }, { "epoch": 0.6719449225473322, "grad_norm": 9.24384593963623, "learning_rate": 1.6926322996765897e-05, "loss": 0.0274, "step": 1220 }, { "epoch": 0.6724956970740104, "grad_norm": 0.07585463672876358, "learning_rate": 1.6809137532031704e-05, "loss": 0.0001, "step": 1221 }, { "epoch": 0.6730464716006884, "grad_norm": 0.11331991851329803, "learning_rate": 1.6692321897754758e-05, "loss": 0.0, "step": 1222 }, { "epoch": 0.6735972461273666, "grad_norm": 3.4845287799835205, "learning_rate": 1.65758766132467e-05, "loss": 0.0044, "step": 1223 }, { "epoch": 0.6741480206540448, "grad_norm": 30.960084915161133, "learning_rate": 1.6459802196172668e-05, "loss": 0.0079, "step": 1224 }, { "epoch": 0.6746987951807228, "grad_norm": 0.0033825919963419437, "learning_rate": 1.634409916254914e-05, "loss": 0.0, "step": 1225 }, { "epoch": 0.675249569707401, "grad_norm": 0.010637586936354637, "learning_rate": 1.622876802674158e-05, "loss": 0.0, "step": 1226 }, { "epoch": 0.6758003442340792, "grad_norm": 0.0037515556905418634, "learning_rate": 1.6113809301462125e-05, "loss": 0.0, "step": 1227 }, { "epoch": 0.6763511187607573, "grad_norm": 0.004635954741388559, "learning_rate": 1.599922349776738e-05, "loss": 0.0, "step": 1228 }, { "epoch": 0.6769018932874354, "grad_norm": 4.071500778198242, "learning_rate": 1.5885011125056047e-05, "loss": 0.005, "step": 1229 }, { "epoch": 0.6774526678141136, "grad_norm": 0.006481868214905262, "learning_rate": 1.5771172691066794e-05, "loss": 0.0, "step": 1230 }, { "epoch": 0.6780034423407917, "grad_norm": 6.379063606262207, "learning_rate": 1.565770870187585e-05, "loss": 0.0054, "step": 1231 }, { "epoch": 0.6785542168674699, "grad_norm": 0.007611383218318224, "learning_rate": 1.5544619661894864e-05, "loss": 0.0, "step": 1232 }, { "epoch": 0.679104991394148, "grad_norm": 0.030380571261048317, "learning_rate": 1.543190607386861e-05, "loss": 0.0, "step": 1233 }, { "epoch": 0.6796557659208262, "grad_norm": 0.0029648577328771353, "learning_rate": 1.5319568438872745e-05, "loss": 0.0, "step": 1234 }, { "epoch": 0.6802065404475043, "grad_norm": 0.009032072499394417, "learning_rate": 1.520760725631164e-05, "loss": 0.0, "step": 1235 }, { "epoch": 0.6807573149741825, "grad_norm": 3.791918992996216, "learning_rate": 1.5096023023916094e-05, "loss": 0.0078, "step": 1236 }, { "epoch": 0.6813080895008606, "grad_norm": 0.030180688947439194, "learning_rate": 1.498481623774115e-05, "loss": 0.0, "step": 1237 }, { "epoch": 0.6818588640275387, "grad_norm": 5.113369941711426, "learning_rate": 1.4873987392163947e-05, "loss": 0.0016, "step": 1238 }, { "epoch": 0.6824096385542169, "grad_norm": 0.005653590429574251, "learning_rate": 1.4763536979881354e-05, "loss": 0.0, "step": 1239 }, { "epoch": 0.6829604130808951, "grad_norm": 10.294276237487793, "learning_rate": 1.4653465491908003e-05, "loss": 0.0058, "step": 1240 }, { "epoch": 0.6835111876075731, "grad_norm": 0.017583025619387627, "learning_rate": 1.4543773417573925e-05, "loss": 0.0, "step": 1241 }, { "epoch": 0.6840619621342513, "grad_norm": 0.019949622452259064, "learning_rate": 1.4434461244522458e-05, "loss": 0.0, "step": 1242 }, { "epoch": 0.6846127366609295, "grad_norm": 0.0036968078929930925, "learning_rate": 1.4325529458708065e-05, "loss": 0.0, "step": 1243 }, { "epoch": 0.6851635111876075, "grad_norm": 0.007728655356913805, "learning_rate": 1.4216978544394177e-05, "loss": 0.0, "step": 1244 }, { "epoch": 0.6857142857142857, "grad_norm": 0.021823743358254433, "learning_rate": 1.4108808984151023e-05, "loss": 0.0, "step": 1245 }, { "epoch": 0.6862650602409639, "grad_norm": 3.9599990844726562, "learning_rate": 1.4001021258853509e-05, "loss": 0.0154, "step": 1246 }, { "epoch": 0.686815834767642, "grad_norm": 0.021933233365416527, "learning_rate": 1.3893615847679065e-05, "loss": 0.0, "step": 1247 }, { "epoch": 0.6873666092943201, "grad_norm": 0.10486134141683578, "learning_rate": 1.3786593228105494e-05, "loss": 0.0001, "step": 1248 }, { "epoch": 0.6879173838209983, "grad_norm": 0.02882656268775463, "learning_rate": 1.3679953875908957e-05, "loss": 0.0, "step": 1249 }, { "epoch": 0.6884681583476764, "grad_norm": 0.28168728947639465, "learning_rate": 1.3573698265161683e-05, "loss": 0.0003, "step": 1250 }, { "epoch": 0.6890189328743546, "grad_norm": 12.428107261657715, "learning_rate": 1.3467826868229994e-05, "loss": 0.0126, "step": 1251 }, { "epoch": 0.6895697074010327, "grad_norm": 41.68017578125, "learning_rate": 1.3362340155772146e-05, "loss": 0.0016, "step": 1252 }, { "epoch": 0.6901204819277108, "grad_norm": 0.004966510459780693, "learning_rate": 1.3257238596736266e-05, "loss": 0.0, "step": 1253 }, { "epoch": 0.690671256454389, "grad_norm": 20.071935653686523, "learning_rate": 1.3152522658358245e-05, "loss": 0.0185, "step": 1254 }, { "epoch": 0.6912220309810672, "grad_norm": 0.004030510783195496, "learning_rate": 1.3048192806159721e-05, "loss": 0.0, "step": 1255 }, { "epoch": 0.6917728055077452, "grad_norm": 4.145101070404053, "learning_rate": 1.2944249503945894e-05, "loss": 0.0029, "step": 1256 }, { "epoch": 0.6923235800344234, "grad_norm": 0.09861322492361069, "learning_rate": 1.2840693213803545e-05, "loss": 0.0001, "step": 1257 }, { "epoch": 0.6928743545611016, "grad_norm": 0.22644849121570587, "learning_rate": 1.2737524396099032e-05, "loss": 0.0, "step": 1258 }, { "epoch": 0.6934251290877796, "grad_norm": 0.0009723069379106164, "learning_rate": 1.2634743509476088e-05, "loss": 0.0, "step": 1259 }, { "epoch": 0.6939759036144578, "grad_norm": 0.006863076239824295, "learning_rate": 1.2532351010853916e-05, "loss": 0.0, "step": 1260 }, { "epoch": 0.694526678141136, "grad_norm": 0.006150909699499607, "learning_rate": 1.243034735542512e-05, "loss": 0.0, "step": 1261 }, { "epoch": 0.6950774526678141, "grad_norm": 0.09807290136814117, "learning_rate": 1.2328732996653669e-05, "loss": 0.0001, "step": 1262 }, { "epoch": 0.6956282271944922, "grad_norm": 0.0024463830050081015, "learning_rate": 1.2227508386272878e-05, "loss": 0.0, "step": 1263 }, { "epoch": 0.6961790017211704, "grad_norm": 0.0017220525769516826, "learning_rate": 1.212667397428342e-05, "loss": 0.0, "step": 1264 }, { "epoch": 0.6967297762478485, "grad_norm": 0.001602950389496982, "learning_rate": 1.2026230208951306e-05, "loss": 0.0, "step": 1265 }, { "epoch": 0.6972805507745267, "grad_norm": 0.002950621536001563, "learning_rate": 1.1926177536805905e-05, "loss": 0.0, "step": 1266 }, { "epoch": 0.6978313253012048, "grad_norm": 0.1427912712097168, "learning_rate": 1.1826516402637989e-05, "loss": 0.0001, "step": 1267 }, { "epoch": 0.6983820998278829, "grad_norm": 5.550673007965088, "learning_rate": 1.1727247249497685e-05, "loss": 0.011, "step": 1268 }, { "epoch": 0.6989328743545611, "grad_norm": 0.22811011970043182, "learning_rate": 1.1628370518692533e-05, "loss": 0.0002, "step": 1269 }, { "epoch": 0.6994836488812393, "grad_norm": 0.002816747408360243, "learning_rate": 1.152988664978556e-05, "loss": 0.0, "step": 1270 }, { "epoch": 0.7000344234079174, "grad_norm": 0.05251504108309746, "learning_rate": 1.1431796080593283e-05, "loss": 0.0001, "step": 1271 }, { "epoch": 0.7005851979345955, "grad_norm": 0.6719489097595215, "learning_rate": 1.1334099247183783e-05, "loss": 0.0006, "step": 1272 }, { "epoch": 0.7011359724612737, "grad_norm": 0.0034861667081713676, "learning_rate": 1.1236796583874787e-05, "loss": 0.0, "step": 1273 }, { "epoch": 0.7016867469879519, "grad_norm": 1.1858373880386353, "learning_rate": 1.1139888523231678e-05, "loss": 0.0032, "step": 1274 }, { "epoch": 0.7022375215146299, "grad_norm": 3.075684070587158, "learning_rate": 1.1043375496065611e-05, "loss": 0.0065, "step": 1275 }, { "epoch": 0.7027882960413081, "grad_norm": 4.249173164367676, "learning_rate": 1.0947257931431642e-05, "loss": 0.0015, "step": 1276 }, { "epoch": 0.7033390705679863, "grad_norm": 0.45814889669418335, "learning_rate": 1.0851536256626705e-05, "loss": 0.0003, "step": 1277 }, { "epoch": 0.7038898450946643, "grad_norm": 0.006351792253553867, "learning_rate": 1.0756210897187812e-05, "loss": 0.0, "step": 1278 }, { "epoch": 0.7044406196213425, "grad_norm": 0.013387829065322876, "learning_rate": 1.0661282276890127e-05, "loss": 0.0, "step": 1279 }, { "epoch": 0.7049913941480207, "grad_norm": 0.10735993087291718, "learning_rate": 1.0566750817745074e-05, "loss": 0.0001, "step": 1280 }, { "epoch": 0.7055421686746988, "grad_norm": 1.825049877166748, "learning_rate": 1.0472616939998492e-05, "loss": 0.0007, "step": 1281 }, { "epoch": 0.7060929432013769, "grad_norm": 0.06405424326658249, "learning_rate": 1.0378881062128731e-05, "loss": 0.0, "step": 1282 }, { "epoch": 0.7066437177280551, "grad_norm": 0.06038561463356018, "learning_rate": 1.0285543600844804e-05, "loss": 0.0, "step": 1283 }, { "epoch": 0.7071944922547332, "grad_norm": 0.008663682267069817, "learning_rate": 1.019260497108453e-05, "loss": 0.0, "step": 1284 }, { "epoch": 0.7077452667814114, "grad_norm": 2.3917040824890137, "learning_rate": 1.010006558601274e-05, "loss": 0.0005, "step": 1285 }, { "epoch": 0.7082960413080895, "grad_norm": 0.009731964208185673, "learning_rate": 1.000792585701934e-05, "loss": 0.0, "step": 1286 }, { "epoch": 0.7088468158347676, "grad_norm": 0.00465196929872036, "learning_rate": 9.91618619371757e-06, "loss": 0.0, "step": 1287 }, { "epoch": 0.7093975903614458, "grad_norm": 0.06420578062534332, "learning_rate": 9.82484700394215e-06, "loss": 0.0001, "step": 1288 }, { "epoch": 0.709948364888124, "grad_norm": 0.002349059097468853, "learning_rate": 9.73390869374743e-06, "loss": 0.0, "step": 1289 }, { "epoch": 0.710499139414802, "grad_norm": 0.24351459741592407, "learning_rate": 9.643371667405698e-06, "loss": 0.0001, "step": 1290 }, { "epoch": 0.7110499139414802, "grad_norm": 0.0989309772849083, "learning_rate": 9.553236327405246e-06, "loss": 0.0001, "step": 1291 }, { "epoch": 0.7116006884681584, "grad_norm": 0.004126241430640221, "learning_rate": 9.463503074448677e-06, "loss": 0.0, "step": 1292 }, { "epoch": 0.7121514629948364, "grad_norm": 21.740692138671875, "learning_rate": 9.374172307451068e-06, "loss": 0.0341, "step": 1293 }, { "epoch": 0.7127022375215146, "grad_norm": 1.8497823476791382, "learning_rate": 9.285244423538197e-06, "loss": 0.0075, "step": 1294 }, { "epoch": 0.7132530120481928, "grad_norm": 0.2676040530204773, "learning_rate": 9.196719818044886e-06, "loss": 0.0002, "step": 1295 }, { "epoch": 0.7138037865748709, "grad_norm": 0.010861944407224655, "learning_rate": 9.108598884513053e-06, "loss": 0.0, "step": 1296 }, { "epoch": 0.714354561101549, "grad_norm": 0.0009497642167843878, "learning_rate": 9.020882014690136e-06, "loss": 0.0, "step": 1297 }, { "epoch": 0.7149053356282272, "grad_norm": 0.7260568737983704, "learning_rate": 8.933569598527247e-06, "loss": 0.0005, "step": 1298 }, { "epoch": 0.7154561101549053, "grad_norm": 7.981448173522949, "learning_rate": 8.846662024177477e-06, "loss": 0.0096, "step": 1299 }, { "epoch": 0.7160068846815835, "grad_norm": 0.006492141634225845, "learning_rate": 8.760159677994172e-06, "loss": 0.0, "step": 1300 }, { "epoch": 0.7165576592082616, "grad_norm": 5.62687349319458, "learning_rate": 8.674062944529216e-06, "loss": 0.0154, "step": 1301 }, { "epoch": 0.7171084337349397, "grad_norm": 26.390893936157227, "learning_rate": 8.588372206531292e-06, "loss": 0.0145, "step": 1302 }, { "epoch": 0.7176592082616179, "grad_norm": 0.8518588542938232, "learning_rate": 8.503087844944213e-06, "loss": 0.0014, "step": 1303 }, { "epoch": 0.7182099827882961, "grad_norm": 3.6683707237243652, "learning_rate": 8.418210238905256e-06, "loss": 0.0226, "step": 1304 }, { "epoch": 0.7187607573149741, "grad_norm": 0.189906045794487, "learning_rate": 8.333739765743398e-06, "loss": 0.0003, "step": 1305 }, { "epoch": 0.7193115318416523, "grad_norm": 6.024000644683838, "learning_rate": 8.249676800977658e-06, "loss": 0.0022, "step": 1306 }, { "epoch": 0.7198623063683305, "grad_norm": 0.0026775836013257504, "learning_rate": 8.16602171831553e-06, "loss": 0.0, "step": 1307 }, { "epoch": 0.7204130808950086, "grad_norm": 0.3145427107810974, "learning_rate": 8.082774889651168e-06, "loss": 0.0001, "step": 1308 }, { "epoch": 0.7209638554216867, "grad_norm": 8.958983421325684, "learning_rate": 7.999936685063835e-06, "loss": 0.0092, "step": 1309 }, { "epoch": 0.7215146299483649, "grad_norm": 0.003721661167219281, "learning_rate": 7.91750747281621e-06, "loss": 0.0, "step": 1310 }, { "epoch": 0.7220654044750431, "grad_norm": 0.07062265276908875, "learning_rate": 7.835487619352811e-06, "loss": 0.0001, "step": 1311 }, { "epoch": 0.7226161790017211, "grad_norm": 4.393032550811768, "learning_rate": 7.753877489298244e-06, "loss": 0.0045, "step": 1312 }, { "epoch": 0.7231669535283993, "grad_norm": 0.0029092449694871902, "learning_rate": 7.67267744545579e-06, "loss": 0.0, "step": 1313 }, { "epoch": 0.7237177280550775, "grad_norm": 0.004748050589114428, "learning_rate": 7.591887848805545e-06, "loss": 0.0, "step": 1314 }, { "epoch": 0.7242685025817556, "grad_norm": 0.0064840069971978664, "learning_rate": 7.5115090585029966e-06, "loss": 0.0, "step": 1315 }, { "epoch": 0.7248192771084337, "grad_norm": 0.016844289377331734, "learning_rate": 7.431541431877342e-06, "loss": 0.0, "step": 1316 }, { "epoch": 0.7253700516351119, "grad_norm": 0.007927708327770233, "learning_rate": 7.351985324429933e-06, "loss": 0.0, "step": 1317 }, { "epoch": 0.72592082616179, "grad_norm": 2.70413875579834, "learning_rate": 7.272841089832694e-06, "loss": 0.0028, "step": 1318 }, { "epoch": 0.7264716006884682, "grad_norm": 0.11179018020629883, "learning_rate": 7.194109079926514e-06, "loss": 0.0, "step": 1319 }, { "epoch": 0.7270223752151463, "grad_norm": 3.2051963806152344, "learning_rate": 7.115789644719728e-06, "loss": 0.0057, "step": 1320 }, { "epoch": 0.7275731497418244, "grad_norm": 0.025669926777482033, "learning_rate": 7.037883132386547e-06, "loss": 0.0, "step": 1321 }, { "epoch": 0.7281239242685026, "grad_norm": 0.13069964945316315, "learning_rate": 6.960389889265517e-06, "loss": 0.0002, "step": 1322 }, { "epoch": 0.7286746987951808, "grad_norm": 0.06460897624492645, "learning_rate": 6.883310259857944e-06, "loss": 0.0001, "step": 1323 }, { "epoch": 0.7292254733218588, "grad_norm": 0.0024216710589826107, "learning_rate": 6.806644586826383e-06, "loss": 0.0, "step": 1324 }, { "epoch": 0.729776247848537, "grad_norm": 0.34324705600738525, "learning_rate": 6.730393210993147e-06, "loss": 0.0003, "step": 1325 }, { "epoch": 0.7303270223752152, "grad_norm": 21.349599838256836, "learning_rate": 6.654556471338746e-06, "loss": 0.0167, "step": 1326 }, { "epoch": 0.7308777969018933, "grad_norm": 0.04000944644212723, "learning_rate": 6.579134705000412e-06, "loss": 0.0, "step": 1327 }, { "epoch": 0.7314285714285714, "grad_norm": 0.001637598848901689, "learning_rate": 6.504128247270546e-06, "loss": 0.0, "step": 1328 }, { "epoch": 0.7319793459552496, "grad_norm": 0.0130823515355587, "learning_rate": 6.429537431595312e-06, "loss": 0.0, "step": 1329 }, { "epoch": 0.7325301204819277, "grad_norm": 0.027401890605688095, "learning_rate": 6.355362589573077e-06, "loss": 0.0, "step": 1330 }, { "epoch": 0.7330808950086058, "grad_norm": 4.384512901306152, "learning_rate": 6.2816040509530165e-06, "loss": 0.0012, "step": 1331 }, { "epoch": 0.733631669535284, "grad_norm": 0.007645327597856522, "learning_rate": 6.2082621436335475e-06, "loss": 0.0, "step": 1332 }, { "epoch": 0.7341824440619621, "grad_norm": 5.937259674072266, "learning_rate": 6.135337193660962e-06, "loss": 0.0049, "step": 1333 }, { "epoch": 0.7347332185886403, "grad_norm": 3.533447027206421, "learning_rate": 6.062829525227909e-06, "loss": 0.0037, "step": 1334 }, { "epoch": 0.7352839931153184, "grad_norm": 0.021448787301778793, "learning_rate": 5.990739460672024e-06, "loss": 0.0, "step": 1335 }, { "epoch": 0.7358347676419965, "grad_norm": 0.030548710376024246, "learning_rate": 5.9190673204744255e-06, "loss": 0.0, "step": 1336 }, { "epoch": 0.7363855421686747, "grad_norm": 0.2512188255786896, "learning_rate": 5.84781342325833e-06, "loss": 0.0001, "step": 1337 }, { "epoch": 0.7369363166953529, "grad_norm": 0.4013170301914215, "learning_rate": 5.77697808578761e-06, "loss": 0.0007, "step": 1338 }, { "epoch": 0.7374870912220309, "grad_norm": 0.49961724877357483, "learning_rate": 5.706561622965467e-06, "loss": 0.0003, "step": 1339 }, { "epoch": 0.7380378657487091, "grad_norm": 0.2584999203681946, "learning_rate": 5.636564347832907e-06, "loss": 0.0001, "step": 1340 }, { "epoch": 0.7385886402753873, "grad_norm": 0.002199718030169606, "learning_rate": 5.566986571567401e-06, "loss": 0.0, "step": 1341 }, { "epoch": 0.7391394148020654, "grad_norm": 1.3660738468170166, "learning_rate": 5.497828603481569e-06, "loss": 0.0029, "step": 1342 }, { "epoch": 0.7396901893287435, "grad_norm": 0.011288951151072979, "learning_rate": 5.429090751021704e-06, "loss": 0.0, "step": 1343 }, { "epoch": 0.7402409638554217, "grad_norm": 0.7769904136657715, "learning_rate": 5.3607733197664436e-06, "loss": 0.0016, "step": 1344 }, { "epoch": 0.7407917383820998, "grad_norm": 0.0041184755973517895, "learning_rate": 5.2928766134254345e-06, "loss": 0.0, "step": 1345 }, { "epoch": 0.741342512908778, "grad_norm": 0.1276787966489792, "learning_rate": 5.225400933837954e-06, "loss": 0.0001, "step": 1346 }, { "epoch": 0.7418932874354561, "grad_norm": 2.6570403575897217, "learning_rate": 5.158346580971573e-06, "loss": 0.0011, "step": 1347 }, { "epoch": 0.7424440619621343, "grad_norm": 0.3625442087650299, "learning_rate": 5.091713852920854e-06, "loss": 0.0003, "step": 1348 }, { "epoch": 0.7429948364888124, "grad_norm": 33.33551788330078, "learning_rate": 5.025503045905933e-06, "loss": 0.0008, "step": 1349 }, { "epoch": 0.7435456110154905, "grad_norm": 0.0995141789317131, "learning_rate": 4.959714454271369e-06, "loss": 0.0, "step": 1350 }, { "epoch": 0.7440963855421687, "grad_norm": 5.254178524017334, "learning_rate": 4.8943483704846475e-06, "loss": 0.0087, "step": 1351 }, { "epoch": 0.7446471600688468, "grad_norm": 0.15599283576011658, "learning_rate": 4.829405085134997e-06, "loss": 0.0002, "step": 1352 }, { "epoch": 0.745197934595525, "grad_norm": 0.012318381108343601, "learning_rate": 4.764884886932086e-06, "loss": 0.0, "step": 1353 }, { "epoch": 0.7457487091222031, "grad_norm": 2.744201421737671, "learning_rate": 4.700788062704687e-06, "loss": 0.0048, "step": 1354 }, { "epoch": 0.7462994836488812, "grad_norm": 0.11001694202423096, "learning_rate": 4.6371148973994525e-06, "loss": 0.0001, "step": 1355 }, { "epoch": 0.7468502581755594, "grad_norm": 4.165911674499512, "learning_rate": 4.573865674079625e-06, "loss": 0.0115, "step": 1356 }, { "epoch": 0.7474010327022376, "grad_norm": 1.2358767986297607, "learning_rate": 4.511040673923828e-06, "loss": 0.0025, "step": 1357 }, { "epoch": 0.7479518072289156, "grad_norm": 0.003075912594795227, "learning_rate": 4.448640176224694e-06, "loss": 0.0, "step": 1358 }, { "epoch": 0.7485025817555938, "grad_norm": 7.379143714904785, "learning_rate": 4.386664458387779e-06, "loss": 0.0125, "step": 1359 }, { "epoch": 0.749053356282272, "grad_norm": 0.006688182707875967, "learning_rate": 4.325113795930203e-06, "loss": 0.0, "step": 1360 }, { "epoch": 0.74960413080895, "grad_norm": 0.006307372823357582, "learning_rate": 4.263988462479484e-06, "loss": 0.0, "step": 1361 }, { "epoch": 0.7501549053356282, "grad_norm": 0.04635660722851753, "learning_rate": 4.203288729772326e-06, "loss": 0.0, "step": 1362 }, { "epoch": 0.7507056798623064, "grad_norm": 0.0073508513160049915, "learning_rate": 4.143014867653383e-06, "loss": 0.0, "step": 1363 }, { "epoch": 0.7512564543889845, "grad_norm": 0.004286817274987698, "learning_rate": 4.083167144074073e-06, "loss": 0.0, "step": 1364 }, { "epoch": 0.7518072289156627, "grad_norm": 0.0023021779488772154, "learning_rate": 4.023745825091407e-06, "loss": 0.0, "step": 1365 }, { "epoch": 0.7523580034423408, "grad_norm": 0.034736454486846924, "learning_rate": 3.964751174866765e-06, "loss": 0.0, "step": 1366 }, { "epoch": 0.7529087779690189, "grad_norm": 0.43322134017944336, "learning_rate": 3.906183455664725e-06, "loss": 0.0002, "step": 1367 }, { "epoch": 0.7534595524956971, "grad_norm": 4.027128219604492, "learning_rate": 3.84804292785198e-06, "loss": 0.0041, "step": 1368 }, { "epoch": 0.7540103270223752, "grad_norm": 5.799943923950195, "learning_rate": 3.7903298498960572e-06, "loss": 0.002, "step": 1369 }, { "epoch": 0.7545611015490533, "grad_norm": 1.1865922212600708, "learning_rate": 3.7330444783642338e-06, "loss": 0.0015, "step": 1370 }, { "epoch": 0.7551118760757315, "grad_norm": 0.23426827788352966, "learning_rate": 3.676187067922421e-06, "loss": 0.0002, "step": 1371 }, { "epoch": 0.7556626506024097, "grad_norm": 3.185793876647949, "learning_rate": 3.619757871333973e-06, "loss": 0.0069, "step": 1372 }, { "epoch": 0.7562134251290877, "grad_norm": 0.26282617449760437, "learning_rate": 3.563757139458579e-06, "loss": 0.0003, "step": 1373 }, { "epoch": 0.7567641996557659, "grad_norm": 0.10954123735427856, "learning_rate": 3.5081851212512175e-06, "loss": 0.0, "step": 1374 }, { "epoch": 0.7573149741824441, "grad_norm": 0.03658856078982353, "learning_rate": 3.4530420637609363e-06, "loss": 0.0, "step": 1375 }, { "epoch": 0.7578657487091222, "grad_norm": 0.031913258135318756, "learning_rate": 3.3983282121298086e-06, "loss": 0.0001, "step": 1376 }, { "epoch": 0.7584165232358003, "grad_norm": 0.08342316001653671, "learning_rate": 3.3440438095919126e-06, "loss": 0.0, "step": 1377 }, { "epoch": 0.7589672977624785, "grad_norm": 3.030644416809082, "learning_rate": 3.290189097472096e-06, "loss": 0.0019, "step": 1378 }, { "epoch": 0.7595180722891566, "grad_norm": 0.08356063067913055, "learning_rate": 3.236764315185037e-06, "loss": 0.0001, "step": 1379 }, { "epoch": 0.7600688468158348, "grad_norm": 0.003935623448342085, "learning_rate": 3.1837697002341293e-06, "loss": 0.0, "step": 1380 }, { "epoch": 0.7606196213425129, "grad_norm": 1.5438226461410522, "learning_rate": 3.131205488210409e-06, "loss": 0.0017, "step": 1381 }, { "epoch": 0.761170395869191, "grad_norm": 0.006664223503321409, "learning_rate": 3.0790719127915646e-06, "loss": 0.0, "step": 1382 }, { "epoch": 0.7617211703958692, "grad_norm": 0.41365811228752136, "learning_rate": 3.0273692057408265e-06, "loss": 0.0001, "step": 1383 }, { "epoch": 0.7622719449225474, "grad_norm": 0.7512823343276978, "learning_rate": 2.976097596905969e-06, "loss": 0.0004, "step": 1384 }, { "epoch": 0.7628227194492255, "grad_norm": 0.05560838058590889, "learning_rate": 2.9252573142183326e-06, "loss": 0.0001, "step": 1385 }, { "epoch": 0.7633734939759036, "grad_norm": 0.12114834785461426, "learning_rate": 2.874848583691714e-06, "loss": 0.0001, "step": 1386 }, { "epoch": 0.7639242685025818, "grad_norm": 0.003411223879083991, "learning_rate": 2.8248716294214774e-06, "loss": 0.0, "step": 1387 }, { "epoch": 0.76447504302926, "grad_norm": 3.396744728088379, "learning_rate": 2.7753266735834338e-06, "loss": 0.0025, "step": 1388 }, { "epoch": 0.765025817555938, "grad_norm": 0.11682251840829849, "learning_rate": 2.7262139364329643e-06, "loss": 0.0001, "step": 1389 }, { "epoch": 0.7655765920826162, "grad_norm": 0.0082024484872818, "learning_rate": 2.677533636303964e-06, "loss": 0.0, "step": 1390 }, { "epoch": 0.7661273666092944, "grad_norm": 2.9342269897460938, "learning_rate": 2.6292859896079213e-06, "loss": 0.0019, "step": 1391 }, { "epoch": 0.7666781411359724, "grad_norm": 0.014128429815173149, "learning_rate": 2.581471210832931e-06, "loss": 0.0, "step": 1392 }, { "epoch": 0.7672289156626506, "grad_norm": 2.022939682006836, "learning_rate": 2.5340895125427364e-06, "loss": 0.0036, "step": 1393 }, { "epoch": 0.7677796901893288, "grad_norm": 0.007080804090946913, "learning_rate": 2.4871411053757898e-06, "loss": 0.0, "step": 1394 }, { "epoch": 0.7683304647160069, "grad_norm": 0.0009527353104203939, "learning_rate": 2.440626198044327e-06, "loss": 0.0, "step": 1395 }, { "epoch": 0.768881239242685, "grad_norm": 0.010615051724016666, "learning_rate": 2.394544997333437e-06, "loss": 0.0, "step": 1396 }, { "epoch": 0.7694320137693632, "grad_norm": 0.002693140646442771, "learning_rate": 2.3488977081001394e-06, "loss": 0.0, "step": 1397 }, { "epoch": 0.7699827882960413, "grad_norm": 0.006079079583287239, "learning_rate": 2.3036845332724543e-06, "loss": 0.0, "step": 1398 }, { "epoch": 0.7705335628227195, "grad_norm": 0.011672936379909515, "learning_rate": 2.2589056738485324e-06, "loss": 0.0, "step": 1399 }, { "epoch": 0.7710843373493976, "grad_norm": 0.004134076647460461, "learning_rate": 2.2145613288957478e-06, "loss": 0.0, "step": 1400 }, { "epoch": 0.7716351118760757, "grad_norm": 11.089314460754395, "learning_rate": 2.170651695549786e-06, "loss": 0.0113, "step": 1401 }, { "epoch": 0.7721858864027539, "grad_norm": 1.6987656354904175, "learning_rate": 2.1271769690138332e-06, "loss": 0.0024, "step": 1402 }, { "epoch": 0.772736660929432, "grad_norm": 3.2635018825531006, "learning_rate": 2.084137342557646e-06, "loss": 0.0018, "step": 1403 }, { "epoch": 0.7732874354561101, "grad_norm": 1.581158995628357, "learning_rate": 2.0415330075166937e-06, "loss": 0.001, "step": 1404 }, { "epoch": 0.7738382099827883, "grad_norm": 2.175628423690796, "learning_rate": 1.9993641532913833e-06, "loss": 0.003, "step": 1405 }, { "epoch": 0.7743889845094665, "grad_norm": 0.16958962380886078, "learning_rate": 1.9576309673461357e-06, "loss": 0.0001, "step": 1406 }, { "epoch": 0.7749397590361445, "grad_norm": 0.10555793344974518, "learning_rate": 1.916333635208556e-06, "loss": 0.0001, "step": 1407 }, { "epoch": 0.7754905335628227, "grad_norm": 0.006902970839291811, "learning_rate": 1.8754723404686425e-06, "loss": 0.0, "step": 1408 }, { "epoch": 0.7760413080895009, "grad_norm": 0.020889397710561752, "learning_rate": 1.8350472647780116e-06, "loss": 0.0, "step": 1409 }, { "epoch": 0.776592082616179, "grad_norm": 0.04529467225074768, "learning_rate": 1.7950585878489856e-06, "loss": 0.0, "step": 1410 }, { "epoch": 0.7771428571428571, "grad_norm": 0.005028935614973307, "learning_rate": 1.7555064874538397e-06, "loss": 0.0, "step": 1411 }, { "epoch": 0.7776936316695353, "grad_norm": 0.0008711517439223826, "learning_rate": 1.7163911394240672e-06, "loss": 0.0, "step": 1412 }, { "epoch": 0.7782444061962134, "grad_norm": 1.4885623455047607, "learning_rate": 1.6777127176495043e-06, "loss": 0.0002, "step": 1413 }, { "epoch": 0.7787951807228916, "grad_norm": 0.23050819337368011, "learning_rate": 1.6394713940776296e-06, "loss": 0.0002, "step": 1414 }, { "epoch": 0.7793459552495697, "grad_norm": 3.158535957336426, "learning_rate": 1.6016673387127646e-06, "loss": 0.0011, "step": 1415 }, { "epoch": 0.7798967297762478, "grad_norm": 0.09719827026128769, "learning_rate": 1.5643007196153302e-06, "loss": 0.0001, "step": 1416 }, { "epoch": 0.780447504302926, "grad_norm": 3.3785789012908936, "learning_rate": 1.5273717029010925e-06, "loss": 0.0236, "step": 1417 }, { "epoch": 0.7809982788296042, "grad_norm": 9.680088996887207, "learning_rate": 1.4908804527404286e-06, "loss": 0.013, "step": 1418 }, { "epoch": 0.7815490533562822, "grad_norm": 0.003274108050391078, "learning_rate": 1.4548271313575835e-06, "loss": 0.0, "step": 1419 }, { "epoch": 0.7820998278829604, "grad_norm": 0.0325576514005661, "learning_rate": 1.4192118990299707e-06, "loss": 0.0, "step": 1420 }, { "epoch": 0.7826506024096386, "grad_norm": 1.2686595916748047, "learning_rate": 1.3840349140874619e-06, "loss": 0.001, "step": 1421 }, { "epoch": 0.7832013769363166, "grad_norm": 0.08304214477539062, "learning_rate": 1.3492963329116537e-06, "loss": 0.0001, "step": 1422 }, { "epoch": 0.7837521514629948, "grad_norm": 0.0262454804033041, "learning_rate": 1.3149963099352014e-06, "loss": 0.0, "step": 1423 }, { "epoch": 0.784302925989673, "grad_norm": 0.03751380741596222, "learning_rate": 1.2811349976411202e-06, "loss": 0.0, "step": 1424 }, { "epoch": 0.7848537005163512, "grad_norm": 3.2223174571990967, "learning_rate": 1.2477125465620853e-06, "loss": 0.0078, "step": 1425 }, { "epoch": 0.7854044750430292, "grad_norm": 0.007997360080480576, "learning_rate": 1.2147291052798216e-06, "loss": 0.0, "step": 1426 }, { "epoch": 0.7859552495697074, "grad_norm": 0.02939125895500183, "learning_rate": 1.1821848204243814e-06, "loss": 0.0, "step": 1427 }, { "epoch": 0.7865060240963856, "grad_norm": 0.042306166142225266, "learning_rate": 1.1500798366735233e-06, "loss": 0.0, "step": 1428 }, { "epoch": 0.7870567986230637, "grad_norm": 5.503622055053711, "learning_rate": 1.1184142967520794e-06, "loss": 0.0039, "step": 1429 }, { "epoch": 0.7876075731497418, "grad_norm": 0.0017208909848704934, "learning_rate": 1.0871883414312777e-06, "loss": 0.0, "step": 1430 }, { "epoch": 0.78815834767642, "grad_norm": 1.9671458005905151, "learning_rate": 1.0564021095281652e-06, "loss": 0.0028, "step": 1431 }, { "epoch": 0.7887091222030981, "grad_norm": 3.082493543624878, "learning_rate": 1.0260557379049519e-06, "loss": 0.0038, "step": 1432 }, { "epoch": 0.7892598967297763, "grad_norm": 14.470429420471191, "learning_rate": 9.96149361468457e-07, "loss": 0.0199, "step": 1433 }, { "epoch": 0.7898106712564544, "grad_norm": 0.03234907612204552, "learning_rate": 9.66683113169431e-07, "loss": 0.0, "step": 1434 }, { "epoch": 0.7903614457831325, "grad_norm": 11.065224647521973, "learning_rate": 9.376571240020227e-07, "loss": 0.0209, "step": 1435 }, { "epoch": 0.7909122203098107, "grad_norm": 0.041658949106931686, "learning_rate": 9.090715230031688e-07, "loss": 0.0001, "step": 1436 }, { "epoch": 0.7914629948364889, "grad_norm": 0.03237476944923401, "learning_rate": 8.809264372520609e-07, "loss": 0.0, "step": 1437 }, { "epoch": 0.7920137693631669, "grad_norm": 3.8435652256011963, "learning_rate": 8.532219918695128e-07, "loss": 0.0027, "step": 1438 }, { "epoch": 0.7925645438898451, "grad_norm": 5.219860553741455, "learning_rate": 8.259583100174606e-07, "loss": 0.0024, "step": 1439 }, { "epoch": 0.7931153184165233, "grad_norm": 0.024953359737992287, "learning_rate": 7.991355128984079e-07, "loss": 0.0, "step": 1440 }, { "epoch": 0.7936660929432013, "grad_norm": 0.03742039203643799, "learning_rate": 7.727537197548707e-07, "loss": 0.0, "step": 1441 }, { "epoch": 0.7942168674698795, "grad_norm": 1.4680094718933105, "learning_rate": 7.468130478688218e-07, "loss": 0.0017, "step": 1442 }, { "epoch": 0.7947676419965577, "grad_norm": 0.6728762984275818, "learning_rate": 7.213136125612586e-07, "loss": 0.0012, "step": 1443 }, { "epoch": 0.7953184165232358, "grad_norm": 0.004860843066126108, "learning_rate": 6.962555271915805e-07, "loss": 0.0, "step": 1444 }, { "epoch": 0.7958691910499139, "grad_norm": 0.011788596399128437, "learning_rate": 6.716389031571568e-07, "loss": 0.0, "step": 1445 }, { "epoch": 0.7964199655765921, "grad_norm": 0.0006118030869401991, "learning_rate": 6.474638498928265e-07, "loss": 0.0, "step": 1446 }, { "epoch": 0.7969707401032702, "grad_norm": 8.702446937561035, "learning_rate": 6.237304748703543e-07, "loss": 0.052, "step": 1447 }, { "epoch": 0.7975215146299484, "grad_norm": 0.04058964550495148, "learning_rate": 6.004388835980423e-07, "loss": 0.0001, "step": 1448 }, { "epoch": 0.7980722891566265, "grad_norm": 0.009473210200667381, "learning_rate": 5.77589179620186e-07, "loss": 0.0, "step": 1449 }, { "epoch": 0.7986230636833046, "grad_norm": 1.6105542182922363, "learning_rate": 5.55181464516652e-07, "loss": 0.0018, "step": 1450 }, { "epoch": 0.7991738382099828, "grad_norm": 12.676631927490234, "learning_rate": 5.332158379024122e-07, "loss": 0.023, "step": 1451 }, { "epoch": 0.799724612736661, "grad_norm": 1.0852091312408447, "learning_rate": 5.116923974270993e-07, "loss": 0.0017, "step": 1452 }, { "epoch": 0.800275387263339, "grad_norm": 0.0023765272926539183, "learning_rate": 4.906112387745965e-07, "loss": 0.0, "step": 1453 }, { "epoch": 0.8008261617900172, "grad_norm": 0.06535768508911133, "learning_rate": 4.6997245566257064e-07, "loss": 0.0001, "step": 1454 }, { "epoch": 0.8013769363166954, "grad_norm": 0.14733462035655975, "learning_rate": 4.497761398421063e-07, "loss": 0.0001, "step": 1455 }, { "epoch": 0.8019277108433734, "grad_norm": 9.1502103805542, "learning_rate": 4.3002238109723927e-07, "loss": 0.0202, "step": 1456 }, { "epoch": 0.8024784853700516, "grad_norm": 0.057801272720098495, "learning_rate": 4.107112672446123e-07, "loss": 0.0, "step": 1457 }, { "epoch": 0.8030292598967298, "grad_norm": 5.182955265045166, "learning_rate": 3.9184288413306456e-07, "loss": 0.0044, "step": 1458 }, { "epoch": 0.8035800344234079, "grad_norm": 0.005904569756239653, "learning_rate": 3.734173156432208e-07, "loss": 0.0, "step": 1459 }, { "epoch": 0.804130808950086, "grad_norm": 3.568734645843506, "learning_rate": 3.554346436871581e-07, "loss": 0.0038, "step": 1460 }, { "epoch": 0.8046815834767642, "grad_norm": 0.13475464284420013, "learning_rate": 3.3789494820803957e-07, "loss": 0.0002, "step": 1461 }, { "epoch": 0.8052323580034424, "grad_norm": 0.0019097113981842995, "learning_rate": 3.2079830717972606e-07, "loss": 0.0, "step": 1462 }, { "epoch": 0.8057831325301205, "grad_norm": 0.00276440498419106, "learning_rate": 3.041447966064648e-07, "loss": 0.0, "step": 1463 }, { "epoch": 0.8063339070567986, "grad_norm": 0.0031745252199470997, "learning_rate": 2.8793449052254563e-07, "loss": 0.0, "step": 1464 }, { "epoch": 0.8068846815834768, "grad_norm": 0.015292392112314701, "learning_rate": 2.721674609919345e-07, "loss": 0.0, "step": 1465 }, { "epoch": 0.8074354561101549, "grad_norm": 0.5834283232688904, "learning_rate": 2.568437781080069e-07, "loss": 0.0003, "step": 1466 }, { "epoch": 0.8079862306368331, "grad_norm": 0.011508706025779247, "learning_rate": 2.4196350999320384e-07, "loss": 0.0, "step": 1467 }, { "epoch": 0.8085370051635112, "grad_norm": 0.2335948944091797, "learning_rate": 2.275267227987321e-07, "loss": 0.0001, "step": 1468 }, { "epoch": 0.8090877796901893, "grad_norm": 0.1580556184053421, "learning_rate": 2.135334807042866e-07, "loss": 0.0001, "step": 1469 }, { "epoch": 0.8096385542168675, "grad_norm": 4.0680251121521, "learning_rate": 1.9998384591773944e-07, "loss": 0.0036, "step": 1470 }, { "epoch": 0.8101893287435457, "grad_norm": 0.02373497560620308, "learning_rate": 1.8687787867489592e-07, "loss": 0.0, "step": 1471 }, { "epoch": 0.8107401032702237, "grad_norm": 1.0623420476913452, "learning_rate": 1.7421563723919454e-07, "loss": 0.0005, "step": 1472 }, { "epoch": 0.8112908777969019, "grad_norm": 1.2653566598892212, "learning_rate": 1.6199717790145174e-07, "loss": 0.0025, "step": 1473 }, { "epoch": 0.8118416523235801, "grad_norm": 1.4965075254440308, "learning_rate": 1.5022255497962879e-07, "loss": 0.003, "step": 1474 }, { "epoch": 0.8123924268502581, "grad_norm": 1.3762314319610596, "learning_rate": 1.3889182081860962e-07, "loss": 0.0027, "step": 1475 }, { "epoch": 0.8129432013769363, "grad_norm": 0.4332352876663208, "learning_rate": 1.2800502578991235e-07, "loss": 0.0002, "step": 1476 }, { "epoch": 0.8134939759036145, "grad_norm": 7.788033485412598, "learning_rate": 1.1756221829148928e-07, "loss": 0.0088, "step": 1477 }, { "epoch": 0.8140447504302926, "grad_norm": 7.791640281677246, "learning_rate": 1.0756344474753821e-07, "loss": 0.0037, "step": 1478 }, { "epoch": 0.8145955249569707, "grad_norm": 1.1662095785140991, "learning_rate": 9.800874960826933e-08, "loss": 0.0027, "step": 1479 }, { "epoch": 0.8151462994836489, "grad_norm": 0.7223013639450073, "learning_rate": 8.889817534969425e-08, "loss": 0.0002, "step": 1480 }, { "epoch": 0.815697074010327, "grad_norm": 0.08738849312067032, "learning_rate": 8.023176247348163e-08, "loss": 0.0001, "step": 1481 }, { "epoch": 0.8162478485370052, "grad_norm": 0.0217811968177557, "learning_rate": 7.200954950673522e-08, "loss": 0.0, "step": 1482 }, { "epoch": 0.8167986230636833, "grad_norm": 3.0596630573272705, "learning_rate": 6.423157300184946e-08, "loss": 0.0075, "step": 1483 }, { "epoch": 0.8173493975903614, "grad_norm": 0.0014560967683792114, "learning_rate": 5.6897867536331864e-08, "loss": 0.0, "step": 1484 }, { "epoch": 0.8179001721170396, "grad_norm": 0.012103653512895107, "learning_rate": 5.000846571264761e-08, "loss": 0.0, "step": 1485 }, { "epoch": 0.8184509466437178, "grad_norm": 0.031385134905576706, "learning_rate": 4.35633981580974e-08, "loss": 0.0, "step": 1486 }, { "epoch": 0.8190017211703958, "grad_norm": 0.0024626590311527252, "learning_rate": 3.756269352462871e-08, "loss": 0.0, "step": 1487 }, { "epoch": 0.819552495697074, "grad_norm": 0.004636548459529877, "learning_rate": 3.20063784888025e-08, "loss": 0.0, "step": 1488 }, { "epoch": 0.8201032702237522, "grad_norm": 0.004233711399137974, "learning_rate": 2.6894477751548964e-08, "loss": 0.0, "step": 1489 }, { "epoch": 0.8206540447504302, "grad_norm": 0.985896110534668, "learning_rate": 2.222701403818972e-08, "loss": 0.0017, "step": 1490 }, { "epoch": 0.8212048192771084, "grad_norm": 0.17059260606765747, "learning_rate": 1.8004008098226887e-08, "loss": 0.0001, "step": 1491 }, { "epoch": 0.8217555938037866, "grad_norm": 0.04419637471437454, "learning_rate": 1.4225478705309769e-08, "loss": 0.0, "step": 1492 }, { "epoch": 0.8223063683304647, "grad_norm": 0.5388388633728027, "learning_rate": 1.0891442657134932e-08, "loss": 0.0006, "step": 1493 }, { "epoch": 0.8228571428571428, "grad_norm": 0.3162759244441986, "learning_rate": 8.001914775401798e-09, "loss": 0.0003, "step": 1494 }, { "epoch": 0.823407917383821, "grad_norm": 0.6399761438369751, "learning_rate": 5.5569079056794206e-09, "loss": 0.0001, "step": 1495 }, { "epoch": 0.8239586919104991, "grad_norm": 0.006261682137846947, "learning_rate": 3.5564329174064824e-09, "loss": 0.0, "step": 1496 }, { "epoch": 0.8245094664371773, "grad_norm": 0.002352055162191391, "learning_rate": 2.0004987038246824e-09, "loss": 0.0, "step": 1497 }, { "epoch": 0.8250602409638554, "grad_norm": 0.13489021360874176, "learning_rate": 8.891121819565306e-10, "loss": 0.0001, "step": 1498 }, { "epoch": 0.8256110154905336, "grad_norm": 0.0109406728297472, "learning_rate": 2.2227829252763344e-10, "loss": 0.0, "step": 1499 }, { "epoch": 0.8261617900172117, "grad_norm": 0.27021676301956177, "learning_rate": 0.0, "loss": 0.0002, "step": 1500 }, { "epoch": 0.8261617900172117, "eval_loss": 0.0032487923745065928, "eval_runtime": 233.1866, "eval_samples_per_second": 13.114, "eval_steps_per_second": 6.557, "step": 1500 } ], "logging_steps": 1, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 375, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8308246047741706e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }