{ "best_metric": 0.5870135426521301, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 3.007518796992481, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015037593984962405, "grad_norm": 0.4809579849243164, "learning_rate": 7e-06, "loss": 0.5655, "step": 1 }, { "epoch": 0.015037593984962405, "eval_loss": 0.8995361328125, "eval_runtime": 2.4534, "eval_samples_per_second": 45.65, "eval_steps_per_second": 11.413, "step": 1 }, { "epoch": 0.03007518796992481, "grad_norm": 0.6736605167388916, "learning_rate": 1.4e-05, "loss": 0.7411, "step": 2 }, { "epoch": 0.045112781954887216, "grad_norm": 0.6587191820144653, "learning_rate": 2.1e-05, "loss": 0.7838, "step": 3 }, { "epoch": 0.06015037593984962, "grad_norm": 0.807904839515686, "learning_rate": 2.8e-05, "loss": 0.8039, "step": 4 }, { "epoch": 0.07518796992481203, "grad_norm": 0.6833544969558716, "learning_rate": 3.5e-05, "loss": 0.8463, "step": 5 }, { "epoch": 0.09022556390977443, "grad_norm": 0.6910650730133057, "learning_rate": 4.2e-05, "loss": 0.8464, "step": 6 }, { "epoch": 0.10526315789473684, "grad_norm": 0.7096790671348572, "learning_rate": 4.899999999999999e-05, "loss": 0.7382, "step": 7 }, { "epoch": 0.12030075187969924, "grad_norm": 0.8323878645896912, "learning_rate": 5.6e-05, "loss": 0.7977, "step": 8 }, { "epoch": 0.13533834586466165, "grad_norm": 0.8664156198501587, "learning_rate": 6.3e-05, "loss": 0.7916, "step": 9 }, { "epoch": 0.15037593984962405, "grad_norm": 0.8904569149017334, "learning_rate": 7e-05, "loss": 0.728, "step": 10 }, { "epoch": 0.16541353383458646, "grad_norm": 0.9623700380325317, "learning_rate": 6.999521567473641e-05, "loss": 0.755, "step": 11 }, { "epoch": 0.18045112781954886, "grad_norm": 0.8271142840385437, "learning_rate": 6.998086400693241e-05, "loss": 0.7323, "step": 12 }, { "epoch": 0.19548872180451127, "grad_norm": 0.8464103937149048, "learning_rate": 6.995694892019065e-05, "loss": 0.65, "step": 13 }, { "epoch": 0.21052631578947367, "grad_norm": 0.9229158759117126, "learning_rate": 6.99234769526571e-05, "loss": 0.7554, "step": 14 }, { "epoch": 0.22556390977443608, "grad_norm": 1.0574674606323242, "learning_rate": 6.988045725523343e-05, "loss": 0.7286, "step": 15 }, { "epoch": 0.24060150375939848, "grad_norm": 2.083634376525879, "learning_rate": 6.982790158907539e-05, "loss": 0.7849, "step": 16 }, { "epoch": 0.2556390977443609, "grad_norm": 0.6636488437652588, "learning_rate": 6.976582432237733e-05, "loss": 0.5637, "step": 17 }, { "epoch": 0.2706766917293233, "grad_norm": 0.8397290706634521, "learning_rate": 6.969424242644413e-05, "loss": 0.7196, "step": 18 }, { "epoch": 0.2857142857142857, "grad_norm": 0.6587305068969727, "learning_rate": 6.961317547105138e-05, "loss": 0.6415, "step": 19 }, { "epoch": 0.3007518796992481, "grad_norm": 0.5852208733558655, "learning_rate": 6.952264561909527e-05, "loss": 0.6695, "step": 20 }, { "epoch": 0.3157894736842105, "grad_norm": 0.5851012468338013, "learning_rate": 6.942267762053337e-05, "loss": 0.6713, "step": 21 }, { "epoch": 0.3308270676691729, "grad_norm": 0.5719961524009705, "learning_rate": 6.931329880561832e-05, "loss": 0.6639, "step": 22 }, { "epoch": 0.3458646616541353, "grad_norm": 0.623251736164093, "learning_rate": 6.919453907742597e-05, "loss": 0.6757, "step": 23 }, { "epoch": 0.3609022556390977, "grad_norm": 0.6857374310493469, "learning_rate": 6.90664309036802e-05, "loss": 0.7215, "step": 24 }, { "epoch": 0.37593984962406013, "grad_norm": 0.6974391341209412, "learning_rate": 6.892900930787656e-05, "loss": 0.6228, "step": 25 }, { "epoch": 0.39097744360902253, "grad_norm": 0.7447807192802429, "learning_rate": 6.87823118597072e-05, "loss": 0.6592, "step": 26 }, { "epoch": 0.40601503759398494, "grad_norm": 0.7813267111778259, "learning_rate": 6.862637866478969e-05, "loss": 0.6166, "step": 27 }, { "epoch": 0.42105263157894735, "grad_norm": 0.7480762600898743, "learning_rate": 6.846125235370252e-05, "loss": 0.6013, "step": 28 }, { "epoch": 0.43609022556390975, "grad_norm": 0.7865514755249023, "learning_rate": 6.828697807033038e-05, "loss": 0.64, "step": 29 }, { "epoch": 0.45112781954887216, "grad_norm": 0.9381106495857239, "learning_rate": 6.81036034595222e-05, "loss": 0.61, "step": 30 }, { "epoch": 0.46616541353383456, "grad_norm": 1.1231513023376465, "learning_rate": 6.791117865406564e-05, "loss": 0.6334, "step": 31 }, { "epoch": 0.48120300751879697, "grad_norm": 1.9280123710632324, "learning_rate": 6.770975626098112e-05, "loss": 0.6585, "step": 32 }, { "epoch": 0.49624060150375937, "grad_norm": 0.4580252468585968, "learning_rate": 6.749939134713974e-05, "loss": 0.6021, "step": 33 }, { "epoch": 0.5112781954887218, "grad_norm": 0.5338541269302368, "learning_rate": 6.728014142420846e-05, "loss": 0.6002, "step": 34 }, { "epoch": 0.5263157894736842, "grad_norm": 0.6137806177139282, "learning_rate": 6.7052066432927e-05, "loss": 0.6757, "step": 35 }, { "epoch": 0.5413533834586466, "grad_norm": 0.6993142366409302, "learning_rate": 6.681522872672069e-05, "loss": 0.7284, "step": 36 }, { "epoch": 0.556390977443609, "grad_norm": 0.6657317280769348, "learning_rate": 6.656969305465356e-05, "loss": 0.6837, "step": 37 }, { "epoch": 0.5714285714285714, "grad_norm": 0.6112419366836548, "learning_rate": 6.631552654372672e-05, "loss": 0.6093, "step": 38 }, { "epoch": 0.5864661654135338, "grad_norm": 0.6157020330429077, "learning_rate": 6.60527986805264e-05, "loss": 0.6449, "step": 39 }, { "epoch": 0.6015037593984962, "grad_norm": 0.6100159287452698, "learning_rate": 6.578158129222711e-05, "loss": 0.649, "step": 40 }, { "epoch": 0.6165413533834586, "grad_norm": 0.5631024837493896, "learning_rate": 6.550194852695469e-05, "loss": 0.6376, "step": 41 }, { "epoch": 0.631578947368421, "grad_norm": 0.6128990054130554, "learning_rate": 6.521397683351509e-05, "loss": 0.664, "step": 42 }, { "epoch": 0.6466165413533834, "grad_norm": 0.658621609210968, "learning_rate": 6.491774494049386e-05, "loss": 0.6494, "step": 43 }, { "epoch": 0.6616541353383458, "grad_norm": 0.6706719398498535, "learning_rate": 6.461333383473272e-05, "loss": 0.5795, "step": 44 }, { "epoch": 0.6766917293233082, "grad_norm": 0.8453061580657959, "learning_rate": 6.430082673918849e-05, "loss": 0.6183, "step": 45 }, { "epoch": 0.6917293233082706, "grad_norm": 0.815949022769928, "learning_rate": 6.398030909018069e-05, "loss": 0.5968, "step": 46 }, { "epoch": 0.706766917293233, "grad_norm": 0.9384602904319763, "learning_rate": 6.365186851403423e-05, "loss": 0.5945, "step": 47 }, { "epoch": 0.7218045112781954, "grad_norm": 1.298033356666565, "learning_rate": 6.331559480312315e-05, "loss": 0.6575, "step": 48 }, { "epoch": 0.7368421052631579, "grad_norm": 0.3282231092453003, "learning_rate": 6.297157989132236e-05, "loss": 0.5261, "step": 49 }, { "epoch": 0.7518796992481203, "grad_norm": 0.38446760177612305, "learning_rate": 6.261991782887377e-05, "loss": 0.6484, "step": 50 }, { "epoch": 0.7518796992481203, "eval_loss": 0.6199080348014832, "eval_runtime": 2.4646, "eval_samples_per_second": 45.443, "eval_steps_per_second": 11.361, "step": 50 }, { "epoch": 0.7669172932330827, "grad_norm": 0.4243997037410736, "learning_rate": 6.226070475667393e-05, "loss": 0.6936, "step": 51 }, { "epoch": 0.7819548872180451, "grad_norm": 0.4423169195652008, "learning_rate": 6.189403887999006e-05, "loss": 0.5825, "step": 52 }, { "epoch": 0.7969924812030075, "grad_norm": 0.5177990794181824, "learning_rate": 6.152002044161171e-05, "loss": 0.5981, "step": 53 }, { "epoch": 0.8120300751879699, "grad_norm": 0.5316641926765442, "learning_rate": 6.113875169444539e-05, "loss": 0.5955, "step": 54 }, { "epoch": 0.8270676691729323, "grad_norm": 0.6013980507850647, "learning_rate": 6.0750336873559605e-05, "loss": 0.6729, "step": 55 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5759268403053284, "learning_rate": 6.035488216768811e-05, "loss": 0.6079, "step": 56 }, { "epoch": 0.8571428571428571, "grad_norm": 0.6138127446174622, "learning_rate": 5.9952495690198894e-05, "loss": 0.6268, "step": 57 }, { "epoch": 0.8721804511278195, "grad_norm": 0.6106196641921997, "learning_rate": 5.954328744953709e-05, "loss": 0.5805, "step": 58 }, { "epoch": 0.8872180451127819, "grad_norm": 0.5879772305488586, "learning_rate": 5.91273693191498e-05, "loss": 0.5844, "step": 59 }, { "epoch": 0.9022556390977443, "grad_norm": 0.6408531665802002, "learning_rate": 5.870485500690094e-05, "loss": 0.6236, "step": 60 }, { "epoch": 0.9172932330827067, "grad_norm": 0.6717394590377808, "learning_rate": 5.827586002398468e-05, "loss": 0.5644, "step": 61 }, { "epoch": 0.9323308270676691, "grad_norm": 0.7435616254806519, "learning_rate": 5.784050165334589e-05, "loss": 0.5954, "step": 62 }, { "epoch": 0.9473684210526315, "grad_norm": 0.9809845685958862, "learning_rate": 5.739889891761608e-05, "loss": 0.5608, "step": 63 }, { "epoch": 0.9624060150375939, "grad_norm": 1.4896681308746338, "learning_rate": 5.6951172546573794e-05, "loss": 0.6609, "step": 64 }, { "epoch": 0.9774436090225563, "grad_norm": 0.4128533601760864, "learning_rate": 5.6497444944138376e-05, "loss": 0.6618, "step": 65 }, { "epoch": 0.9924812030075187, "grad_norm": 0.6409559845924377, "learning_rate": 5.603784015490587e-05, "loss": 0.5844, "step": 66 }, { "epoch": 1.0075187969924813, "grad_norm": 1.571585774421692, "learning_rate": 5.557248383023655e-05, "loss": 0.8216, "step": 67 }, { "epoch": 1.0225563909774436, "grad_norm": 0.3379616439342499, "learning_rate": 5.510150319390302e-05, "loss": 0.5735, "step": 68 }, { "epoch": 1.037593984962406, "grad_norm": 0.38901475071907043, "learning_rate": 5.4625027007308546e-05, "loss": 0.607, "step": 69 }, { "epoch": 1.0526315789473684, "grad_norm": 0.3873230814933777, "learning_rate": 5.414318553428494e-05, "loss": 0.5423, "step": 70 }, { "epoch": 1.0676691729323309, "grad_norm": 0.4160054922103882, "learning_rate": 5.3656110505479776e-05, "loss": 0.5218, "step": 71 }, { "epoch": 1.0827067669172932, "grad_norm": 0.4787563681602478, "learning_rate": 5.316393508234253e-05, "loss": 0.5987, "step": 72 }, { "epoch": 1.0977443609022557, "grad_norm": 0.4826807379722595, "learning_rate": 5.266679382071953e-05, "loss": 0.5264, "step": 73 }, { "epoch": 1.112781954887218, "grad_norm": 0.5345706939697266, "learning_rate": 5.216482263406778e-05, "loss": 0.5316, "step": 74 }, { "epoch": 1.1278195488721805, "grad_norm": 0.5583795309066772, "learning_rate": 5.1658158756297576e-05, "loss": 0.5566, "step": 75 }, { "epoch": 1.1428571428571428, "grad_norm": 0.5581653118133545, "learning_rate": 5.114694070425407e-05, "loss": 0.5542, "step": 76 }, { "epoch": 1.1578947368421053, "grad_norm": 0.5767646431922913, "learning_rate": 5.063130823984823e-05, "loss": 0.4702, "step": 77 }, { "epoch": 1.1729323308270676, "grad_norm": 0.6357953548431396, "learning_rate": 5.011140233184724e-05, "loss": 0.4772, "step": 78 }, { "epoch": 1.1879699248120301, "grad_norm": 0.6541237831115723, "learning_rate": 4.958736511733516e-05, "loss": 0.436, "step": 79 }, { "epoch": 1.2030075187969924, "grad_norm": 0.6701495051383972, "learning_rate": 4.905933986285393e-05, "loss": 0.4437, "step": 80 }, { "epoch": 1.218045112781955, "grad_norm": 0.812445342540741, "learning_rate": 4.8527470925235824e-05, "loss": 0.3972, "step": 81 }, { "epoch": 1.2330827067669172, "grad_norm": 1.2333282232284546, "learning_rate": 4.799190371213772e-05, "loss": 0.5098, "step": 82 }, { "epoch": 1.2481203007518797, "grad_norm": 0.7253397107124329, "learning_rate": 4.745278464228808e-05, "loss": 0.3373, "step": 83 }, { "epoch": 1.263157894736842, "grad_norm": 0.3850756585597992, "learning_rate": 4.69102611054575e-05, "loss": 0.5143, "step": 84 }, { "epoch": 1.2781954887218046, "grad_norm": 0.5035658478736877, "learning_rate": 4.6364481422163926e-05, "loss": 0.6007, "step": 85 }, { "epoch": 1.2932330827067668, "grad_norm": 0.535111665725708, "learning_rate": 4.581559480312316e-05, "loss": 0.6376, "step": 86 }, { "epoch": 1.3082706766917294, "grad_norm": 0.6467974185943604, "learning_rate": 4.526375130845627e-05, "loss": 0.6319, "step": 87 }, { "epoch": 1.3233082706766917, "grad_norm": 0.5997623205184937, "learning_rate": 4.4709101806664554e-05, "loss": 0.5534, "step": 88 }, { "epoch": 1.3383458646616542, "grad_norm": 0.6482927799224854, "learning_rate": 4.4151797933383685e-05, "loss": 0.5675, "step": 89 }, { "epoch": 1.3533834586466165, "grad_norm": 0.6287254095077515, "learning_rate": 4.359199204992797e-05, "loss": 0.5392, "step": 90 }, { "epoch": 1.368421052631579, "grad_norm": 0.6130253076553345, "learning_rate": 4.30298372016363e-05, "loss": 0.5295, "step": 91 }, { "epoch": 1.3834586466165413, "grad_norm": 0.626999020576477, "learning_rate": 4.246548707603114e-05, "loss": 0.5849, "step": 92 }, { "epoch": 1.3984962406015038, "grad_norm": 0.6120505928993225, "learning_rate": 4.1899095960801805e-05, "loss": 0.4657, "step": 93 }, { "epoch": 1.413533834586466, "grad_norm": 0.6539551019668579, "learning_rate": 4.133081870162385e-05, "loss": 0.4869, "step": 94 }, { "epoch": 1.4285714285714286, "grad_norm": 0.6927069425582886, "learning_rate": 4.076081065982569e-05, "loss": 0.4645, "step": 95 }, { "epoch": 1.443609022556391, "grad_norm": 0.7693597674369812, "learning_rate": 4.018922766991447e-05, "loss": 0.46, "step": 96 }, { "epoch": 1.4586466165413534, "grad_norm": 0.8337491154670715, "learning_rate": 3.961622599697241e-05, "loss": 0.4384, "step": 97 }, { "epoch": 1.4736842105263157, "grad_norm": 1.05588960647583, "learning_rate": 3.9041962293935516e-05, "loss": 0.3641, "step": 98 }, { "epoch": 1.4887218045112782, "grad_norm": 0.7424226403236389, "learning_rate": 3.84665935587662e-05, "loss": 0.4112, "step": 99 }, { "epoch": 1.5037593984962405, "grad_norm": 0.3569641411304474, "learning_rate": 3.7890277091531636e-05, "loss": 0.5217, "step": 100 }, { "epoch": 1.5037593984962405, "eval_loss": 0.5945790410041809, "eval_runtime": 2.4681, "eval_samples_per_second": 45.378, "eval_steps_per_second": 11.345, "step": 100 }, { "epoch": 1.518796992481203, "grad_norm": 0.422756552696228, "learning_rate": 3.7313170451399475e-05, "loss": 0.5949, "step": 101 }, { "epoch": 1.5338345864661656, "grad_norm": 0.4702007472515106, "learning_rate": 3.673543141356278e-05, "loss": 0.5806, "step": 102 }, { "epoch": 1.5488721804511278, "grad_norm": 0.5038368105888367, "learning_rate": 3.6157217926105783e-05, "loss": 0.5763, "step": 103 }, { "epoch": 1.5639097744360901, "grad_norm": 0.520751953125, "learning_rate": 3.557868806682255e-05, "loss": 0.5575, "step": 104 }, { "epoch": 1.5789473684210527, "grad_norm": 0.5443906188011169, "learning_rate": 3.5e-05, "loss": 0.5049, "step": 105 }, { "epoch": 1.5939849624060152, "grad_norm": 0.6066420078277588, "learning_rate": 3.442131193317745e-05, "loss": 0.5007, "step": 106 }, { "epoch": 1.6090225563909775, "grad_norm": 0.5944270491600037, "learning_rate": 3.384278207389421e-05, "loss": 0.4581, "step": 107 }, { "epoch": 1.6240601503759398, "grad_norm": 0.6081573367118835, "learning_rate": 3.3264568586437216e-05, "loss": 0.4556, "step": 108 }, { "epoch": 1.6390977443609023, "grad_norm": 0.6466420888900757, "learning_rate": 3.268682954860052e-05, "loss": 0.4439, "step": 109 }, { "epoch": 1.6541353383458648, "grad_norm": 0.7060896158218384, "learning_rate": 3.210972290846837e-05, "loss": 0.4741, "step": 110 }, { "epoch": 1.669172932330827, "grad_norm": 0.7146097421646118, "learning_rate": 3.15334064412338e-05, "loss": 0.4322, "step": 111 }, { "epoch": 1.6842105263157894, "grad_norm": 0.8088813424110413, "learning_rate": 3.0958037706064485e-05, "loss": 0.4446, "step": 112 }, { "epoch": 1.699248120300752, "grad_norm": 0.905190646648407, "learning_rate": 3.038377400302758e-05, "loss": 0.3997, "step": 113 }, { "epoch": 1.7142857142857144, "grad_norm": 1.247326135635376, "learning_rate": 2.9810772330085524e-05, "loss": 0.4648, "step": 114 }, { "epoch": 1.7293233082706767, "grad_norm": 0.8141568303108215, "learning_rate": 2.9239189340174306e-05, "loss": 0.3535, "step": 115 }, { "epoch": 1.744360902255639, "grad_norm": 0.36818474531173706, "learning_rate": 2.8669181298376163e-05, "loss": 0.5282, "step": 116 }, { "epoch": 1.7593984962406015, "grad_norm": 0.4283076822757721, "learning_rate": 2.8100904039198193e-05, "loss": 0.611, "step": 117 }, { "epoch": 1.774436090225564, "grad_norm": 0.4514261782169342, "learning_rate": 2.7534512923968863e-05, "loss": 0.5514, "step": 118 }, { "epoch": 1.7894736842105263, "grad_norm": 0.49255284667015076, "learning_rate": 2.6970162798363695e-05, "loss": 0.5096, "step": 119 }, { "epoch": 1.8045112781954886, "grad_norm": 0.5348407626152039, "learning_rate": 2.640800795007203e-05, "loss": 0.5451, "step": 120 }, { "epoch": 1.8195488721804511, "grad_norm": 0.5862839818000793, "learning_rate": 2.5848202066616305e-05, "loss": 0.5232, "step": 121 }, { "epoch": 1.8345864661654137, "grad_norm": 0.6133275032043457, "learning_rate": 2.5290898193335446e-05, "loss": 0.5803, "step": 122 }, { "epoch": 1.849624060150376, "grad_norm": 0.6256411671638489, "learning_rate": 2.4736248691543736e-05, "loss": 0.4742, "step": 123 }, { "epoch": 1.8646616541353382, "grad_norm": 0.6894042491912842, "learning_rate": 2.4184405196876842e-05, "loss": 0.5336, "step": 124 }, { "epoch": 1.8796992481203008, "grad_norm": 0.6677897572517395, "learning_rate": 2.363551857783608e-05, "loss": 0.4437, "step": 125 }, { "epoch": 1.8947368421052633, "grad_norm": 0.7080979943275452, "learning_rate": 2.308973889454249e-05, "loss": 0.4744, "step": 126 }, { "epoch": 1.9097744360902256, "grad_norm": 0.7204222679138184, "learning_rate": 2.2547215357711918e-05, "loss": 0.3941, "step": 127 }, { "epoch": 1.9248120300751879, "grad_norm": 0.8160737156867981, "learning_rate": 2.2008096287862266e-05, "loss": 0.3873, "step": 128 }, { "epoch": 1.9398496240601504, "grad_norm": 0.8907105922698975, "learning_rate": 2.1472529074764177e-05, "loss": 0.3808, "step": 129 }, { "epoch": 1.954887218045113, "grad_norm": 1.2315212488174438, "learning_rate": 2.0940660137146074e-05, "loss": 0.3455, "step": 130 }, { "epoch": 1.9699248120300752, "grad_norm": 1.0352067947387695, "learning_rate": 2.041263488266484e-05, "loss": 0.5057, "step": 131 }, { "epoch": 1.9849624060150375, "grad_norm": 0.576481282711029, "learning_rate": 1.988859766815275e-05, "loss": 0.4717, "step": 132 }, { "epoch": 2.0, "grad_norm": 1.5741318464279175, "learning_rate": 1.9368691760151773e-05, "loss": 0.5513, "step": 133 }, { "epoch": 2.0150375939849625, "grad_norm": 0.3388630449771881, "learning_rate": 1.885305929574593e-05, "loss": 0.5123, "step": 134 }, { "epoch": 2.030075187969925, "grad_norm": 0.39713987708091736, "learning_rate": 1.8341841243702424e-05, "loss": 0.5497, "step": 135 }, { "epoch": 2.045112781954887, "grad_norm": 0.4295600950717926, "learning_rate": 1.7835177365932225e-05, "loss": 0.5128, "step": 136 }, { "epoch": 2.0601503759398496, "grad_norm": 0.44112977385520935, "learning_rate": 1.7333206179280478e-05, "loss": 0.4821, "step": 137 }, { "epoch": 2.075187969924812, "grad_norm": 0.48101627826690674, "learning_rate": 1.6836064917657478e-05, "loss": 0.4567, "step": 138 }, { "epoch": 2.090225563909774, "grad_norm": 0.5259006023406982, "learning_rate": 1.6343889494520224e-05, "loss": 0.4561, "step": 139 }, { "epoch": 2.1052631578947367, "grad_norm": 0.5476028323173523, "learning_rate": 1.5856814465715064e-05, "loss": 0.4576, "step": 140 }, { "epoch": 2.1203007518796992, "grad_norm": 0.5646714568138123, "learning_rate": 1.5374972992691458e-05, "loss": 0.4859, "step": 141 }, { "epoch": 2.1353383458646618, "grad_norm": 0.6168443560600281, "learning_rate": 1.4898496806096974e-05, "loss": 0.488, "step": 142 }, { "epoch": 2.1503759398496243, "grad_norm": 0.5997370481491089, "learning_rate": 1.4427516169763444e-05, "loss": 0.346, "step": 143 }, { "epoch": 2.1654135338345863, "grad_norm": 0.6405689120292664, "learning_rate": 1.396215984509412e-05, "loss": 0.3542, "step": 144 }, { "epoch": 2.180451127819549, "grad_norm": 0.7130200266838074, "learning_rate": 1.3502555055861625e-05, "loss": 0.4121, "step": 145 }, { "epoch": 2.1954887218045114, "grad_norm": 0.7100954651832581, "learning_rate": 1.3048827453426203e-05, "loss": 0.3368, "step": 146 }, { "epoch": 2.2105263157894735, "grad_norm": 0.7974827885627747, "learning_rate": 1.2601101082383917e-05, "loss": 0.4117, "step": 147 }, { "epoch": 2.225563909774436, "grad_norm": 0.8104152083396912, "learning_rate": 1.2159498346654094e-05, "loss": 0.2426, "step": 148 }, { "epoch": 2.2406015037593985, "grad_norm": 1.0993908643722534, "learning_rate": 1.1724139976015306e-05, "loss": 0.169, "step": 149 }, { "epoch": 2.255639097744361, "grad_norm": 0.36070406436920166, "learning_rate": 1.1295144993099068e-05, "loss": 0.4709, "step": 150 }, { "epoch": 2.255639097744361, "eval_loss": 0.5920043587684631, "eval_runtime": 2.4717, "eval_samples_per_second": 45.312, "eval_steps_per_second": 11.328, "step": 150 }, { "epoch": 2.2706766917293235, "grad_norm": 0.4493859112262726, "learning_rate": 1.0872630680850196e-05, "loss": 0.5158, "step": 151 }, { "epoch": 2.2857142857142856, "grad_norm": 0.49576085805892944, "learning_rate": 1.0456712550462898e-05, "loss": 0.5995, "step": 152 }, { "epoch": 2.300751879699248, "grad_norm": 0.5357233285903931, "learning_rate": 1.0047504309801104e-05, "loss": 0.5769, "step": 153 }, { "epoch": 2.3157894736842106, "grad_norm": 0.5386041402816772, "learning_rate": 9.645117832311886e-06, "loss": 0.4945, "step": 154 }, { "epoch": 2.3308270676691727, "grad_norm": 0.6453455686569214, "learning_rate": 9.249663126440394e-06, "loss": 0.5349, "step": 155 }, { "epoch": 2.345864661654135, "grad_norm": 0.6314446330070496, "learning_rate": 8.861248305554624e-06, "loss": 0.4873, "step": 156 }, { "epoch": 2.3609022556390977, "grad_norm": 0.6192399859428406, "learning_rate": 8.47997955838829e-06, "loss": 0.4064, "step": 157 }, { "epoch": 2.3759398496240602, "grad_norm": 0.6588654518127441, "learning_rate": 8.10596112000994e-06, "loss": 0.4504, "step": 158 }, { "epoch": 2.3909774436090228, "grad_norm": 0.6549960970878601, "learning_rate": 7.739295243326067e-06, "loss": 0.4422, "step": 159 }, { "epoch": 2.406015037593985, "grad_norm": 0.7286242842674255, "learning_rate": 7.380082171126228e-06, "loss": 0.4036, "step": 160 }, { "epoch": 2.4210526315789473, "grad_norm": 0.7317733764648438, "learning_rate": 7.028420108677635e-06, "loss": 0.3783, "step": 161 }, { "epoch": 2.43609022556391, "grad_norm": 0.7959410548210144, "learning_rate": 6.684405196876842e-06, "loss": 0.3368, "step": 162 }, { "epoch": 2.451127819548872, "grad_norm": 0.8299567699432373, "learning_rate": 6.3481314859657675e-06, "loss": 0.3001, "step": 163 }, { "epoch": 2.4661654135338344, "grad_norm": 0.91346675157547, "learning_rate": 6.019690909819298e-06, "loss": 0.2692, "step": 164 }, { "epoch": 2.481203007518797, "grad_norm": 1.0719012022018433, "learning_rate": 5.6991732608115e-06, "loss": 0.1983, "step": 165 }, { "epoch": 2.4962406015037595, "grad_norm": 0.3378448188304901, "learning_rate": 5.386666165267256e-06, "loss": 0.4802, "step": 166 }, { "epoch": 2.511278195488722, "grad_norm": 0.4162678122520447, "learning_rate": 5.08225505950613e-06, "loss": 0.5439, "step": 167 }, { "epoch": 2.526315789473684, "grad_norm": 0.46295762062072754, "learning_rate": 4.786023166484913e-06, "loss": 0.5161, "step": 168 }, { "epoch": 2.5413533834586466, "grad_norm": 0.5272025465965271, "learning_rate": 4.498051473045291e-06, "loss": 0.538, "step": 169 }, { "epoch": 2.556390977443609, "grad_norm": 0.5329490303993225, "learning_rate": 4.218418707772886e-06, "loss": 0.458, "step": 170 }, { "epoch": 2.571428571428571, "grad_norm": 0.5895003080368042, "learning_rate": 3.947201319473587e-06, "loss": 0.4601, "step": 171 }, { "epoch": 2.5864661654135337, "grad_norm": 0.6356887221336365, "learning_rate": 3.684473456273278e-06, "loss": 0.4857, "step": 172 }, { "epoch": 2.601503759398496, "grad_norm": 0.6523851156234741, "learning_rate": 3.4303069453464383e-06, "loss": 0.4262, "step": 173 }, { "epoch": 2.6165413533834587, "grad_norm": 0.6722204089164734, "learning_rate": 3.184771273279312e-06, "loss": 0.4229, "step": 174 }, { "epoch": 2.6315789473684212, "grad_norm": 0.7110894918441772, "learning_rate": 2.947933567072987e-06, "loss": 0.391, "step": 175 }, { "epoch": 2.6466165413533833, "grad_norm": 0.7454285621643066, "learning_rate": 2.719858575791534e-06, "loss": 0.392, "step": 176 }, { "epoch": 2.661654135338346, "grad_norm": 0.7555717825889587, "learning_rate": 2.500608652860256e-06, "loss": 0.3266, "step": 177 }, { "epoch": 2.6766917293233083, "grad_norm": 0.8166589736938477, "learning_rate": 2.2902437390188737e-06, "loss": 0.3201, "step": 178 }, { "epoch": 2.6917293233082704, "grad_norm": 0.8460665941238403, "learning_rate": 2.0888213459343587e-06, "loss": 0.2792, "step": 179 }, { "epoch": 2.706766917293233, "grad_norm": 1.0361336469650269, "learning_rate": 1.8963965404777875e-06, "loss": 0.3334, "step": 180 }, { "epoch": 2.7218045112781954, "grad_norm": 1.1610829830169678, "learning_rate": 1.7130219296696263e-06, "loss": 0.1709, "step": 181 }, { "epoch": 2.736842105263158, "grad_norm": 0.37739837169647217, "learning_rate": 1.5387476462974824e-06, "loss": 0.4486, "step": 182 }, { "epoch": 2.7518796992481205, "grad_norm": 0.4364118278026581, "learning_rate": 1.3736213352103147e-06, "loss": 0.5489, "step": 183 }, { "epoch": 2.7669172932330826, "grad_norm": 0.4895642101764679, "learning_rate": 1.2176881402928002e-06, "loss": 0.572, "step": 184 }, { "epoch": 2.781954887218045, "grad_norm": 0.5160538554191589, "learning_rate": 1.0709906921234367e-06, "loss": 0.518, "step": 185 }, { "epoch": 2.7969924812030076, "grad_norm": 0.5683267712593079, "learning_rate": 9.33569096319799e-07, "loss": 0.5391, "step": 186 }, { "epoch": 2.8120300751879697, "grad_norm": 0.5757749676704407, "learning_rate": 8.054609225740255e-07, "loss": 0.5024, "step": 187 }, { "epoch": 2.827067669172932, "grad_norm": 0.6100803017616272, "learning_rate": 6.867011943816724e-07, "loss": 0.4719, "step": 188 }, { "epoch": 2.8421052631578947, "grad_norm": 0.6376129388809204, "learning_rate": 5.77322379466617e-07, "loss": 0.4347, "step": 189 }, { "epoch": 2.857142857142857, "grad_norm": 0.6683883666992188, "learning_rate": 4.773543809047186e-07, "loss": 0.4115, "step": 190 }, { "epoch": 2.8721804511278197, "grad_norm": 0.7268216609954834, "learning_rate": 3.868245289486027e-07, "loss": 0.4384, "step": 191 }, { "epoch": 2.887218045112782, "grad_norm": 0.7266953587532043, "learning_rate": 3.0575757355586817e-07, "loss": 0.3966, "step": 192 }, { "epoch": 2.9022556390977443, "grad_norm": 0.7034627795219421, "learning_rate": 2.3417567762266497e-07, "loss": 0.3116, "step": 193 }, { "epoch": 2.917293233082707, "grad_norm": 0.8307739496231079, "learning_rate": 1.7209841092460043e-07, "loss": 0.3587, "step": 194 }, { "epoch": 2.932330827067669, "grad_norm": 0.7999587059020996, "learning_rate": 1.1954274476655534e-07, "loss": 0.2889, "step": 195 }, { "epoch": 2.9473684210526314, "grad_norm": 0.9190281629562378, "learning_rate": 7.652304734289127e-08, "loss": 0.2466, "step": 196 }, { "epoch": 2.962406015037594, "grad_norm": 1.0203853845596313, "learning_rate": 4.30510798093342e-08, "loss": 0.1898, "step": 197 }, { "epoch": 2.9774436090225564, "grad_norm": 0.4999811351299286, "learning_rate": 1.9135993067588284e-08, "loss": 0.5179, "step": 198 }, { "epoch": 2.992481203007519, "grad_norm": 0.7291033864021301, "learning_rate": 4.784325263584854e-09, "loss": 0.4436, "step": 199 }, { "epoch": 3.007518796992481, "grad_norm": 1.9097191095352173, "learning_rate": 0.0, "loss": 0.4941, "step": 200 }, { "epoch": 3.007518796992481, "eval_loss": 0.5870135426521301, "eval_runtime": 2.4754, "eval_samples_per_second": 45.245, "eval_steps_per_second": 11.311, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 4, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.09457497473024e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }