{ "best_metric": NaN, "best_model_checkpoint": "miner_id_24/checkpoint-50", "epoch": 0.05394470667565745, "eval_steps": 50, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00017981568891885816, "grad_norm": NaN, "learning_rate": 1e-05, "loss": 0.0, "step": 1 }, { "epoch": 0.00017981568891885816, "eval_loss": NaN, "eval_runtime": 169.0918, "eval_samples_per_second": 55.396, "eval_steps_per_second": 13.85, "step": 1 }, { "epoch": 0.0003596313778377163, "grad_norm": NaN, "learning_rate": 2e-05, "loss": 0.0, "step": 2 }, { "epoch": 0.0005394470667565745, "grad_norm": NaN, "learning_rate": 3e-05, "loss": 0.0, "step": 3 }, { "epoch": 0.0007192627556754326, "grad_norm": NaN, "learning_rate": 4e-05, "loss": 0.0, "step": 4 }, { "epoch": 0.0008990784445942908, "grad_norm": NaN, "learning_rate": 5e-05, "loss": 0.0, "step": 5 }, { "epoch": 0.001078894133513149, "grad_norm": NaN, "learning_rate": 6e-05, "loss": 0.0, "step": 6 }, { "epoch": 0.0012587098224320073, "grad_norm": NaN, "learning_rate": 7e-05, "loss": 0.0, "step": 7 }, { "epoch": 0.0014385255113508653, "grad_norm": NaN, "learning_rate": 8e-05, "loss": 0.0, "step": 8 }, { "epoch": 0.0016183412002697235, "grad_norm": NaN, "learning_rate": 9e-05, "loss": 0.6066, "step": 9 }, { "epoch": 0.0017981568891885817, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 2.2056, "step": 10 }, { "epoch": 0.0019779725781074397, "grad_norm": NaN, "learning_rate": 9.99989723479183e-05, "loss": 0.0, "step": 11 }, { "epoch": 0.002157788267026298, "grad_norm": NaN, "learning_rate": 9.999588943391597e-05, "loss": 0.0, "step": 12 }, { "epoch": 0.002337603955945156, "grad_norm": NaN, "learning_rate": 9.999075138471951e-05, "loss": 0.0, "step": 13 }, { "epoch": 0.0025174196448640145, "grad_norm": NaN, "learning_rate": 9.9983558411534e-05, "loss": 0.0, "step": 14 }, { "epoch": 0.0026972353337828725, "grad_norm": NaN, "learning_rate": 9.99743108100344e-05, "loss": 0.0, "step": 15 }, { "epoch": 0.0028770510227017305, "grad_norm": NaN, "learning_rate": 9.996300896035339e-05, "loss": 0.0, "step": 16 }, { "epoch": 0.003056866711620589, "grad_norm": NaN, "learning_rate": 9.994965332706573e-05, "loss": 0.0, "step": 17 }, { "epoch": 0.003236682400539447, "grad_norm": NaN, "learning_rate": 9.993424445916923e-05, "loss": 0.0, "step": 18 }, { "epoch": 0.0034164980894583054, "grad_norm": NaN, "learning_rate": 9.991678299006205e-05, "loss": 0.0, "step": 19 }, { "epoch": 0.0035963137783771634, "grad_norm": NaN, "learning_rate": 9.989726963751682e-05, "loss": 0.0, "step": 20 }, { "epoch": 0.003776129467296022, "grad_norm": NaN, "learning_rate": 9.987570520365104e-05, "loss": 0.0, "step": 21 }, { "epoch": 0.003955945156214879, "grad_norm": NaN, "learning_rate": 9.98520905748941e-05, "loss": 0.0, "step": 22 }, { "epoch": 0.004135760845133738, "grad_norm": NaN, "learning_rate": 9.982642672195092e-05, "loss": 0.0, "step": 23 }, { "epoch": 0.004315576534052596, "grad_norm": NaN, "learning_rate": 9.979871469976196e-05, "loss": 0.0, "step": 24 }, { "epoch": 0.004495392222971455, "grad_norm": NaN, "learning_rate": 9.976895564745991e-05, "loss": 0.0, "step": 25 }, { "epoch": 0.004675207911890312, "grad_norm": NaN, "learning_rate": 9.973715078832288e-05, "loss": 0.0, "step": 26 }, { "epoch": 0.004855023600809171, "grad_norm": NaN, "learning_rate": 9.970330142972401e-05, "loss": 0.0, "step": 27 }, { "epoch": 0.005034839289728029, "grad_norm": NaN, "learning_rate": 9.966740896307791e-05, "loss": 0.0, "step": 28 }, { "epoch": 0.005214654978646887, "grad_norm": NaN, "learning_rate": 9.962947486378326e-05, "loss": 0.0, "step": 29 }, { "epoch": 0.005394470667565745, "grad_norm": NaN, "learning_rate": 9.95895006911623e-05, "loss": 0.0, "step": 30 }, { "epoch": 0.0055742863564846035, "grad_norm": NaN, "learning_rate": 9.954748808839674e-05, "loss": 0.0, "step": 31 }, { "epoch": 0.005754102045403461, "grad_norm": NaN, "learning_rate": 9.95034387824601e-05, "loss": 0.0, "step": 32 }, { "epoch": 0.0059339177343223195, "grad_norm": NaN, "learning_rate": 9.945735458404681e-05, "loss": 0.0, "step": 33 }, { "epoch": 0.006113733423241178, "grad_norm": NaN, "learning_rate": 9.940923738749778e-05, "loss": 0.0, "step": 34 }, { "epoch": 0.006293549112160036, "grad_norm": NaN, "learning_rate": 9.935908917072252e-05, "loss": 0.0, "step": 35 }, { "epoch": 0.006473364801078894, "grad_norm": NaN, "learning_rate": 9.930691199511775e-05, "loss": 0.0, "step": 36 }, { "epoch": 0.006653180489997752, "grad_norm": NaN, "learning_rate": 9.925270800548285e-05, "loss": 0.0, "step": 37 }, { "epoch": 0.006832996178916611, "grad_norm": NaN, "learning_rate": 9.919647942993148e-05, "loss": 0.0, "step": 38 }, { "epoch": 0.007012811867835468, "grad_norm": NaN, "learning_rate": 9.91382285798002e-05, "loss": 0.0, "step": 39 }, { "epoch": 0.007192627556754327, "grad_norm": NaN, "learning_rate": 9.907795784955327e-05, "loss": 0.0, "step": 40 }, { "epoch": 0.007372443245673185, "grad_norm": NaN, "learning_rate": 9.901566971668437e-05, "loss": 0.0, "step": 41 }, { "epoch": 0.007552258934592044, "grad_norm": NaN, "learning_rate": 9.895136674161465e-05, "loss": 0.0, "step": 42 }, { "epoch": 0.007732074623510901, "grad_norm": NaN, "learning_rate": 9.888505156758759e-05, "loss": 0.0, "step": 43 }, { "epoch": 0.007911890312429759, "grad_norm": NaN, "learning_rate": 9.881672692056021e-05, "loss": 0.0, "step": 44 }, { "epoch": 0.008091706001348618, "grad_norm": NaN, "learning_rate": 9.874639560909117e-05, "loss": 0.0, "step": 45 }, { "epoch": 0.008271521690267476, "grad_norm": NaN, "learning_rate": 9.867406052422524e-05, "loss": 0.0, "step": 46 }, { "epoch": 0.008451337379186335, "grad_norm": NaN, "learning_rate": 9.859972463937441e-05, "loss": 0.0, "step": 47 }, { "epoch": 0.008631153068105192, "grad_norm": NaN, "learning_rate": 9.852339101019574e-05, "loss": 0.0, "step": 48 }, { "epoch": 0.00881096875702405, "grad_norm": NaN, "learning_rate": 9.844506277446577e-05, "loss": 0.0, "step": 49 }, { "epoch": 0.00899078444594291, "grad_norm": NaN, "learning_rate": 9.836474315195147e-05, "loss": 0.0, "step": 50 }, { "epoch": 0.00899078444594291, "eval_loss": NaN, "eval_runtime": 168.0705, "eval_samples_per_second": 55.733, "eval_steps_per_second": 13.935, "step": 50 }, { "epoch": 0.009170600134861767, "grad_norm": NaN, "learning_rate": 9.828243544427796e-05, "loss": 0.0, "step": 51 }, { "epoch": 0.009350415823780624, "grad_norm": NaN, "learning_rate": 9.819814303479267e-05, "loss": 0.0, "step": 52 }, { "epoch": 0.009530231512699484, "grad_norm": NaN, "learning_rate": 9.811186938842645e-05, "loss": 0.0, "step": 53 }, { "epoch": 0.009710047201618341, "grad_norm": NaN, "learning_rate": 9.802361805155097e-05, "loss": 0.0, "step": 54 }, { "epoch": 0.009889862890537199, "grad_norm": NaN, "learning_rate": 9.793339265183303e-05, "loss": 0.0, "step": 55 }, { "epoch": 0.010069678579456058, "grad_norm": NaN, "learning_rate": 9.784119689808544e-05, "loss": 0.0, "step": 56 }, { "epoch": 0.010249494268374916, "grad_norm": NaN, "learning_rate": 9.774703458011453e-05, "loss": 0.0, "step": 57 }, { "epoch": 0.010429309957293773, "grad_norm": NaN, "learning_rate": 9.765090956856436e-05, "loss": 0.0, "step": 58 }, { "epoch": 0.010609125646212633, "grad_norm": NaN, "learning_rate": 9.755282581475769e-05, "loss": 0.9026, "step": 59 }, { "epoch": 0.01078894133513149, "grad_norm": NaN, "learning_rate": 9.745278735053343e-05, "loss": 1.7658, "step": 60 }, { "epoch": 0.010968757024050348, "grad_norm": NaN, "learning_rate": 9.735079828808107e-05, "loss": 0.0, "step": 61 }, { "epoch": 0.011148572712969207, "grad_norm": NaN, "learning_rate": 9.724686281977146e-05, "loss": 0.0, "step": 62 }, { "epoch": 0.011328388401888064, "grad_norm": NaN, "learning_rate": 9.714098521798465e-05, "loss": 0.0, "step": 63 }, { "epoch": 0.011508204090806922, "grad_norm": NaN, "learning_rate": 9.703316983493414e-05, "loss": 0.0, "step": 64 }, { "epoch": 0.011688019779725781, "grad_norm": NaN, "learning_rate": 9.692342110248802e-05, "loss": 0.0, "step": 65 }, { "epoch": 0.011867835468644639, "grad_norm": NaN, "learning_rate": 9.681174353198687e-05, "loss": 0.0, "step": 66 }, { "epoch": 0.012047651157563498, "grad_norm": NaN, "learning_rate": 9.669814171405816e-05, "loss": 0.0, "step": 67 }, { "epoch": 0.012227466846482356, "grad_norm": NaN, "learning_rate": 9.65826203184277e-05, "loss": 0.0, "step": 68 }, { "epoch": 0.012407282535401213, "grad_norm": NaN, "learning_rate": 9.64651840937276e-05, "loss": 0.0, "step": 69 }, { "epoch": 0.012587098224320073, "grad_norm": NaN, "learning_rate": 9.63458378673011e-05, "loss": 0.0, "step": 70 }, { "epoch": 0.01276691391323893, "grad_norm": NaN, "learning_rate": 9.622458654500409e-05, "loss": 0.0, "step": 71 }, { "epoch": 0.012946729602157788, "grad_norm": NaN, "learning_rate": 9.610143511100354e-05, "loss": 0.0, "step": 72 }, { "epoch": 0.013126545291076647, "grad_norm": NaN, "learning_rate": 9.597638862757255e-05, "loss": 0.0, "step": 73 }, { "epoch": 0.013306360979995505, "grad_norm": NaN, "learning_rate": 9.584945223488227e-05, "loss": 0.0, "step": 74 }, { "epoch": 0.013486176668914362, "grad_norm": NaN, "learning_rate": 9.572063115079063e-05, "loss": 0.0, "step": 75 }, { "epoch": 0.013665992357833221, "grad_norm": NaN, "learning_rate": 9.558993067062785e-05, "loss": 0.0, "step": 76 }, { "epoch": 0.013845808046752079, "grad_norm": NaN, "learning_rate": 9.545735616697875e-05, "loss": 0.0, "step": 77 }, { "epoch": 0.014025623735670937, "grad_norm": NaN, "learning_rate": 9.53229130894619e-05, "loss": 0.0, "step": 78 }, { "epoch": 0.014205439424589796, "grad_norm": NaN, "learning_rate": 9.518660696450568e-05, "loss": 0.0, "step": 79 }, { "epoch": 0.014385255113508653, "grad_norm": NaN, "learning_rate": 9.504844339512095e-05, "loss": 0.0, "step": 80 }, { "epoch": 0.014565070802427511, "grad_norm": NaN, "learning_rate": 9.490842806067095e-05, "loss": 0.0, "step": 81 }, { "epoch": 0.01474488649134637, "grad_norm": NaN, "learning_rate": 9.476656671663765e-05, "loss": 0.0, "step": 82 }, { "epoch": 0.014924702180265228, "grad_norm": NaN, "learning_rate": 9.46228651943853e-05, "loss": 0.0, "step": 83 }, { "epoch": 0.015104517869184087, "grad_norm": NaN, "learning_rate": 9.44773294009206e-05, "loss": 0.0, "step": 84 }, { "epoch": 0.015284333558102945, "grad_norm": NaN, "learning_rate": 9.432996531865002e-05, "loss": 0.0, "step": 85 }, { "epoch": 0.015464149247021802, "grad_norm": NaN, "learning_rate": 9.418077900513377e-05, "loss": 0.0, "step": 86 }, { "epoch": 0.01564396493594066, "grad_norm": NaN, "learning_rate": 9.40297765928369e-05, "loss": 0.0, "step": 87 }, { "epoch": 0.015823780624859517, "grad_norm": NaN, "learning_rate": 9.387696428887716e-05, "loss": 0.0, "step": 88 }, { "epoch": 0.016003596313778377, "grad_norm": NaN, "learning_rate": 9.372234837476978e-05, "loss": 0.0, "step": 89 }, { "epoch": 0.016183412002697236, "grad_norm": NaN, "learning_rate": 9.356593520616948e-05, "loss": 0.0, "step": 90 }, { "epoch": 0.016363227691616092, "grad_norm": NaN, "learning_rate": 9.340773121260893e-05, "loss": 0.0, "step": 91 }, { "epoch": 0.01654304338053495, "grad_norm": NaN, "learning_rate": 9.324774289723468e-05, "loss": 0.0, "step": 92 }, { "epoch": 0.01672285906945381, "grad_norm": NaN, "learning_rate": 9.308597683653975e-05, "loss": 0.0, "step": 93 }, { "epoch": 0.01690267475837267, "grad_norm": NaN, "learning_rate": 9.292243968009331e-05, "loss": 0.0, "step": 94 }, { "epoch": 0.017082490447291526, "grad_norm": NaN, "learning_rate": 9.275713815026731e-05, "loss": 0.0, "step": 95 }, { "epoch": 0.017262306136210385, "grad_norm": NaN, "learning_rate": 9.259007904196023e-05, "loss": 0.0, "step": 96 }, { "epoch": 0.017442121825129244, "grad_norm": NaN, "learning_rate": 9.242126922231763e-05, "loss": 0.0, "step": 97 }, { "epoch": 0.0176219375140481, "grad_norm": NaN, "learning_rate": 9.225071563045007e-05, "loss": 0.0, "step": 98 }, { "epoch": 0.01780175320296696, "grad_norm": NaN, "learning_rate": 9.207842527714767e-05, "loss": 0.0, "step": 99 }, { "epoch": 0.01798156889188582, "grad_norm": NaN, "learning_rate": 9.190440524459203e-05, "loss": 0.0, "step": 100 }, { "epoch": 0.01798156889188582, "eval_loss": NaN, "eval_runtime": 168.0661, "eval_samples_per_second": 55.734, "eval_steps_per_second": 13.935, "step": 100 }, { "epoch": 0.018161384580804674, "grad_norm": NaN, "learning_rate": 9.172866268606513e-05, "loss": 0.0, "step": 101 }, { "epoch": 0.018341200269723534, "grad_norm": NaN, "learning_rate": 9.155120482565521e-05, "loss": 0.0, "step": 102 }, { "epoch": 0.018521015958642393, "grad_norm": NaN, "learning_rate": 9.137203895795983e-05, "loss": 0.0, "step": 103 }, { "epoch": 0.01870083164756125, "grad_norm": NaN, "learning_rate": 9.119117244778607e-05, "loss": 0.0, "step": 104 }, { "epoch": 0.018880647336480108, "grad_norm": NaN, "learning_rate": 9.10086127298478e-05, "loss": 0.0, "step": 105 }, { "epoch": 0.019060463025398967, "grad_norm": NaN, "learning_rate": 9.082436730845993e-05, "loss": 0.0, "step": 106 }, { "epoch": 0.019240278714317823, "grad_norm": NaN, "learning_rate": 9.063844375723014e-05, "loss": 0.0, "step": 107 }, { "epoch": 0.019420094403236682, "grad_norm": NaN, "learning_rate": 9.045084971874738e-05, "loss": 0.0, "step": 108 }, { "epoch": 0.019599910092155542, "grad_norm": 0.06007186695933342, "learning_rate": 9.02615929042678e-05, "loss": 1.2147, "step": 109 }, { "epoch": 0.019779725781074398, "grad_norm": NaN, "learning_rate": 9.007068109339784e-05, "loss": 1.8944, "step": 110 }, { "epoch": 0.019959541469993257, "grad_norm": NaN, "learning_rate": 8.987812213377424e-05, "loss": 0.0, "step": 111 }, { "epoch": 0.020139357158912116, "grad_norm": NaN, "learning_rate": 8.968392394074164e-05, "loss": 0.0, "step": 112 }, { "epoch": 0.020319172847830972, "grad_norm": NaN, "learning_rate": 8.948809449702711e-05, "loss": 0.0, "step": 113 }, { "epoch": 0.02049898853674983, "grad_norm": NaN, "learning_rate": 8.929064185241213e-05, "loss": 0.0, "step": 114 }, { "epoch": 0.02067880422566869, "grad_norm": NaN, "learning_rate": 8.90915741234015e-05, "loss": 0.0, "step": 115 }, { "epoch": 0.020858619914587546, "grad_norm": NaN, "learning_rate": 8.889089949288986e-05, "loss": 0.0, "step": 116 }, { "epoch": 0.021038435603506406, "grad_norm": NaN, "learning_rate": 8.868862620982534e-05, "loss": 0.0, "step": 117 }, { "epoch": 0.021218251292425265, "grad_norm": NaN, "learning_rate": 8.848476258887031e-05, "loss": 0.0, "step": 118 }, { "epoch": 0.02139806698134412, "grad_norm": NaN, "learning_rate": 8.827931701005974e-05, "loss": 0.0, "step": 119 }, { "epoch": 0.02157788267026298, "grad_norm": NaN, "learning_rate": 8.807229791845673e-05, "loss": 0.0, "step": 120 }, { "epoch": 0.02175769835918184, "grad_norm": NaN, "learning_rate": 8.786371382380528e-05, "loss": 0.0, "step": 121 }, { "epoch": 0.021937514048100695, "grad_norm": NaN, "learning_rate": 8.765357330018056e-05, "loss": 0.0, "step": 122 }, { "epoch": 0.022117329737019555, "grad_norm": NaN, "learning_rate": 8.744188498563641e-05, "loss": 0.0, "step": 123 }, { "epoch": 0.022297145425938414, "grad_norm": NaN, "learning_rate": 8.722865758185035e-05, "loss": 0.0, "step": 124 }, { "epoch": 0.02247696111485727, "grad_norm": NaN, "learning_rate": 8.701389985376578e-05, "loss": 0.0, "step": 125 }, { "epoch": 0.02265677680377613, "grad_norm": NaN, "learning_rate": 8.679762062923175e-05, "loss": 0.0, "step": 126 }, { "epoch": 0.02283659249269499, "grad_norm": NaN, "learning_rate": 8.657982879864007e-05, "loss": 0.0, "step": 127 }, { "epoch": 0.023016408181613844, "grad_norm": NaN, "learning_rate": 8.636053331455987e-05, "loss": 0.0, "step": 128 }, { "epoch": 0.023196223870532703, "grad_norm": NaN, "learning_rate": 8.613974319136958e-05, "loss": 0.0, "step": 129 }, { "epoch": 0.023376039559451563, "grad_norm": NaN, "learning_rate": 8.591746750488639e-05, "loss": 0.0, "step": 130 }, { "epoch": 0.023555855248370422, "grad_norm": NaN, "learning_rate": 8.569371539199316e-05, "loss": 0.0, "step": 131 }, { "epoch": 0.023735670937289278, "grad_norm": NaN, "learning_rate": 8.54684960502629e-05, "loss": 0.0, "step": 132 }, { "epoch": 0.023915486626208137, "grad_norm": NaN, "learning_rate": 8.524181873758059e-05, "loss": 0.0, "step": 133 }, { "epoch": 0.024095302315126996, "grad_norm": NaN, "learning_rate": 8.501369277176276e-05, "loss": 0.0, "step": 134 }, { "epoch": 0.024275118004045852, "grad_norm": NaN, "learning_rate": 8.478412753017433e-05, "loss": 0.0, "step": 135 }, { "epoch": 0.02445493369296471, "grad_norm": NaN, "learning_rate": 8.455313244934324e-05, "loss": 0.0, "step": 136 }, { "epoch": 0.02463474938188357, "grad_norm": NaN, "learning_rate": 8.432071702457252e-05, "loss": 0.0, "step": 137 }, { "epoch": 0.024814565070802427, "grad_norm": NaN, "learning_rate": 8.408689080954998e-05, "loss": 0.0, "step": 138 }, { "epoch": 0.024994380759721286, "grad_norm": NaN, "learning_rate": 8.385166341595548e-05, "loss": 0.0, "step": 139 }, { "epoch": 0.025174196448640145, "grad_norm": NaN, "learning_rate": 8.361504451306585e-05, "loss": 0.0, "step": 140 }, { "epoch": 0.025354012137559, "grad_norm": NaN, "learning_rate": 8.33770438273574e-05, "loss": 0.0, "step": 141 }, { "epoch": 0.02553382782647786, "grad_norm": NaN, "learning_rate": 8.313767114210615e-05, "loss": 0.0, "step": 142 }, { "epoch": 0.02571364351539672, "grad_norm": NaN, "learning_rate": 8.289693629698564e-05, "loss": 0.0, "step": 143 }, { "epoch": 0.025893459204315575, "grad_norm": NaN, "learning_rate": 8.265484918766243e-05, "loss": 0.0, "step": 144 }, { "epoch": 0.026073274893234435, "grad_norm": NaN, "learning_rate": 8.241141976538943e-05, "loss": 0.0, "step": 145 }, { "epoch": 0.026253090582153294, "grad_norm": NaN, "learning_rate": 8.216665803659671e-05, "loss": 0.0, "step": 146 }, { "epoch": 0.02643290627107215, "grad_norm": NaN, "learning_rate": 8.192057406248028e-05, "loss": 0.0, "step": 147 }, { "epoch": 0.02661272195999101, "grad_norm": NaN, "learning_rate": 8.167317795858851e-05, "loss": 0.0, "step": 148 }, { "epoch": 0.02679253764890987, "grad_norm": NaN, "learning_rate": 8.142447989440618e-05, "loss": 0.0, "step": 149 }, { "epoch": 0.026972353337828724, "grad_norm": NaN, "learning_rate": 8.117449009293668e-05, "loss": 0.0, "step": 150 }, { "epoch": 0.026972353337828724, "eval_loss": NaN, "eval_runtime": 167.9429, "eval_samples_per_second": 55.775, "eval_steps_per_second": 13.945, "step": 150 }, { "epoch": 0.027152169026747584, "grad_norm": NaN, "learning_rate": 8.092321883028158e-05, "loss": 0.0, "step": 151 }, { "epoch": 0.027331984715666443, "grad_norm": NaN, "learning_rate": 8.067067643521834e-05, "loss": 0.0, "step": 152 }, { "epoch": 0.0275118004045853, "grad_norm": NaN, "learning_rate": 8.041687328877567e-05, "loss": 0.0, "step": 153 }, { "epoch": 0.027691616093504158, "grad_norm": NaN, "learning_rate": 8.016181982380682e-05, "loss": 0.0, "step": 154 }, { "epoch": 0.027871431782423017, "grad_norm": NaN, "learning_rate": 7.990552652456081e-05, "loss": 0.0, "step": 155 }, { "epoch": 0.028051247471341873, "grad_norm": NaN, "learning_rate": 7.964800392625129e-05, "loss": 0.0, "step": 156 }, { "epoch": 0.028231063160260732, "grad_norm": NaN, "learning_rate": 7.938926261462366e-05, "loss": 0.0, "step": 157 }, { "epoch": 0.028410878849179592, "grad_norm": NaN, "learning_rate": 7.91293132255198e-05, "loss": 0.0, "step": 158 }, { "epoch": 0.028590694538098448, "grad_norm": NaN, "learning_rate": 7.886816644444098e-05, "loss": 0.0, "step": 159 }, { "epoch": 0.028770510227017307, "grad_norm": NaN, "learning_rate": 7.860583300610849e-05, "loss": 1.4892, "step": 160 }, { "epoch": 0.028950325915936166, "grad_norm": NaN, "learning_rate": 7.83423236940225e-05, "loss": 0.0, "step": 161 }, { "epoch": 0.029130141604855022, "grad_norm": NaN, "learning_rate": 7.807764934001874e-05, "loss": 0.0, "step": 162 }, { "epoch": 0.02930995729377388, "grad_norm": NaN, "learning_rate": 7.781182082382325e-05, "loss": 0.0, "step": 163 }, { "epoch": 0.02948977298269274, "grad_norm": NaN, "learning_rate": 7.754484907260513e-05, "loss": 0.0, "step": 164 }, { "epoch": 0.029669588671611596, "grad_norm": NaN, "learning_rate": 7.727674506052743e-05, "loss": 0.0, "step": 165 }, { "epoch": 0.029849404360530456, "grad_norm": NaN, "learning_rate": 7.700751980829602e-05, "loss": 0.0, "step": 166 }, { "epoch": 0.030029220049449315, "grad_norm": NaN, "learning_rate": 7.673718438270648e-05, "loss": 0.0, "step": 167 }, { "epoch": 0.030209035738368174, "grad_norm": NaN, "learning_rate": 7.646574989618938e-05, "loss": 0.0, "step": 168 }, { "epoch": 0.03038885142728703, "grad_norm": NaN, "learning_rate": 7.619322750635327e-05, "loss": 0.0, "step": 169 }, { "epoch": 0.03056866711620589, "grad_norm": NaN, "learning_rate": 7.591962841552627e-05, "loss": 0.0, "step": 170 }, { "epoch": 0.03074848280512475, "grad_norm": NaN, "learning_rate": 7.564496387029532e-05, "loss": 0.0, "step": 171 }, { "epoch": 0.030928298494043605, "grad_norm": NaN, "learning_rate": 7.536924516104411e-05, "loss": 0.0, "step": 172 }, { "epoch": 0.031108114182962464, "grad_norm": NaN, "learning_rate": 7.509248362148889e-05, "loss": 0.0, "step": 173 }, { "epoch": 0.03128792987188132, "grad_norm": NaN, "learning_rate": 7.481469062821252e-05, "loss": 0.0, "step": 174 }, { "epoch": 0.03146774556080018, "grad_norm": NaN, "learning_rate": 7.45358776001969e-05, "loss": 0.0, "step": 175 }, { "epoch": 0.031647561249719035, "grad_norm": NaN, "learning_rate": 7.425605599835361e-05, "loss": 0.0, "step": 176 }, { "epoch": 0.031827376938637894, "grad_norm": NaN, "learning_rate": 7.39752373250527e-05, "loss": 0.0, "step": 177 }, { "epoch": 0.03200719262755675, "grad_norm": NaN, "learning_rate": 7.369343312364993e-05, "loss": 0.0, "step": 178 }, { "epoch": 0.03218700831647561, "grad_norm": NaN, "learning_rate": 7.34106549780123e-05, "loss": 0.0, "step": 179 }, { "epoch": 0.03236682400539447, "grad_norm": NaN, "learning_rate": 7.312691451204178e-05, "loss": 0.0, "step": 180 }, { "epoch": 0.03254663969431333, "grad_norm": NaN, "learning_rate": 7.284222338919758e-05, "loss": 0.0, "step": 181 }, { "epoch": 0.032726455383232184, "grad_norm": NaN, "learning_rate": 7.255659331201673e-05, "loss": 0.0, "step": 182 }, { "epoch": 0.03290627107215104, "grad_norm": NaN, "learning_rate": 7.227003602163295e-05, "loss": 0.0, "step": 183 }, { "epoch": 0.0330860867610699, "grad_norm": NaN, "learning_rate": 7.198256329729412e-05, "loss": 0.0, "step": 184 }, { "epoch": 0.03326590244998876, "grad_norm": NaN, "learning_rate": 7.169418695587791e-05, "loss": 0.0, "step": 185 }, { "epoch": 0.03344571813890762, "grad_norm": NaN, "learning_rate": 7.14049188514063e-05, "loss": 0.0, "step": 186 }, { "epoch": 0.03362553382782648, "grad_norm": NaN, "learning_rate": 7.1114770874558e-05, "loss": 0.0, "step": 187 }, { "epoch": 0.03380534951674534, "grad_norm": NaN, "learning_rate": 7.082375495217995e-05, "loss": 0.0, "step": 188 }, { "epoch": 0.03398516520566419, "grad_norm": NaN, "learning_rate": 7.05318830467969e-05, "loss": 0.0, "step": 189 }, { "epoch": 0.03416498089458305, "grad_norm": NaN, "learning_rate": 7.023916715611969e-05, "loss": 0.0, "step": 190 }, { "epoch": 0.03434479658350191, "grad_norm": NaN, "learning_rate": 6.99456193125521e-05, "loss": 0.0, "step": 191 }, { "epoch": 0.03452461227242077, "grad_norm": NaN, "learning_rate": 6.965125158269619e-05, "loss": 0.0, "step": 192 }, { "epoch": 0.03470442796133963, "grad_norm": NaN, "learning_rate": 6.935607606685642e-05, "loss": 0.0, "step": 193 }, { "epoch": 0.03488424365025849, "grad_norm": NaN, "learning_rate": 6.906010489854209e-05, "loss": 0.0, "step": 194 }, { "epoch": 0.03506405933917734, "grad_norm": NaN, "learning_rate": 6.876335024396872e-05, "loss": 0.0, "step": 195 }, { "epoch": 0.0352438750280962, "grad_norm": NaN, "learning_rate": 6.846582430155783e-05, "loss": 0.0, "step": 196 }, { "epoch": 0.03542369071701506, "grad_norm": NaN, "learning_rate": 6.816753930143558e-05, "loss": 0.0, "step": 197 }, { "epoch": 0.03560350640593392, "grad_norm": NaN, "learning_rate": 6.786850750493006e-05, "loss": 0.0, "step": 198 }, { "epoch": 0.03578332209485278, "grad_norm": NaN, "learning_rate": 6.756874120406714e-05, "loss": 0.0, "step": 199 }, { "epoch": 0.03596313778377164, "grad_norm": NaN, "learning_rate": 6.726825272106538e-05, "loss": 0.0, "step": 200 }, { "epoch": 0.03596313778377164, "eval_loss": NaN, "eval_runtime": 167.7462, "eval_samples_per_second": 55.84, "eval_steps_per_second": 13.962, "step": 200 }, { "epoch": 0.03614295347269049, "grad_norm": NaN, "learning_rate": 6.696705440782938e-05, "loss": 0.0, "step": 201 }, { "epoch": 0.03632276916160935, "grad_norm": NaN, "learning_rate": 6.666515864544209e-05, "loss": 0.0, "step": 202 }, { "epoch": 0.03650258485052821, "grad_norm": NaN, "learning_rate": 6.636257784365584e-05, "loss": 0.0, "step": 203 }, { "epoch": 0.03668240053944707, "grad_norm": NaN, "learning_rate": 6.605932444038229e-05, "loss": 0.0, "step": 204 }, { "epoch": 0.03686221622836593, "grad_norm": NaN, "learning_rate": 6.575541090118105e-05, "loss": 0.0, "step": 205 }, { "epoch": 0.037042031917284786, "grad_norm": NaN, "learning_rate": 6.545084971874738e-05, "loss": 0.0, "step": 206 }, { "epoch": 0.03722184760620364, "grad_norm": NaN, "learning_rate": 6.514565341239861e-05, "loss": 0.0, "step": 207 }, { "epoch": 0.0374016632951225, "grad_norm": NaN, "learning_rate": 6.483983452755953e-05, "loss": 0.5426, "step": 208 }, { "epoch": 0.03758147898404136, "grad_norm": NaN, "learning_rate": 6.453340563524669e-05, "loss": 1.7339, "step": 209 }, { "epoch": 0.037761294672960216, "grad_norm": NaN, "learning_rate": 6.422637933155162e-05, "loss": 0.0, "step": 210 }, { "epoch": 0.037941110361879075, "grad_norm": NaN, "learning_rate": 6.391876823712317e-05, "loss": 0.0, "step": 211 }, { "epoch": 0.038120926050797935, "grad_norm": NaN, "learning_rate": 6.361058499664856e-05, "loss": 0.0, "step": 212 }, { "epoch": 0.03830074173971679, "grad_norm": NaN, "learning_rate": 6.330184227833376e-05, "loss": 0.0, "step": 213 }, { "epoch": 0.038480557428635646, "grad_norm": NaN, "learning_rate": 6.299255277338265e-05, "loss": 0.0, "step": 214 }, { "epoch": 0.038660373117554506, "grad_norm": NaN, "learning_rate": 6.268272919547537e-05, "loss": 0.0, "step": 215 }, { "epoch": 0.038840188806473365, "grad_norm": NaN, "learning_rate": 6.237238428024572e-05, "loss": 0.0, "step": 216 }, { "epoch": 0.039020004495392224, "grad_norm": NaN, "learning_rate": 6.206153078475763e-05, "loss": 0.0, "step": 217 }, { "epoch": 0.039199820184311084, "grad_norm": NaN, "learning_rate": 6.175018148698077e-05, "loss": 0.0, "step": 218 }, { "epoch": 0.039379635873229936, "grad_norm": NaN, "learning_rate": 6.143834918526527e-05, "loss": 0.0, "step": 219 }, { "epoch": 0.039559451562148795, "grad_norm": NaN, "learning_rate": 6.112604669781572e-05, "loss": 0.0, "step": 220 }, { "epoch": 0.039739267251067655, "grad_norm": NaN, "learning_rate": 6.081328686216418e-05, "loss": 0.0, "step": 221 }, { "epoch": 0.039919082939986514, "grad_norm": NaN, "learning_rate": 6.0500082534642464e-05, "loss": 0.0, "step": 222 }, { "epoch": 0.04009889862890537, "grad_norm": NaN, "learning_rate": 6.0186446589853784e-05, "loss": 0.0, "step": 223 }, { "epoch": 0.04027871431782423, "grad_norm": NaN, "learning_rate": 5.987239192014336e-05, "loss": 0.0, "step": 224 }, { "epoch": 0.04045853000674309, "grad_norm": NaN, "learning_rate": 5.955793143506863e-05, "loss": 0.0, "step": 225 }, { "epoch": 0.040638345695661944, "grad_norm": NaN, "learning_rate": 5.924307806086844e-05, "loss": 0.0, "step": 226 }, { "epoch": 0.0408181613845808, "grad_norm": NaN, "learning_rate": 5.8927844739931834e-05, "loss": 0.0, "step": 227 }, { "epoch": 0.04099797707349966, "grad_norm": NaN, "learning_rate": 5.861224443026595e-05, "loss": 0.0, "step": 228 }, { "epoch": 0.04117779276241852, "grad_norm": NaN, "learning_rate": 5.82962901049634e-05, "loss": 0.0, "step": 229 }, { "epoch": 0.04135760845133738, "grad_norm": NaN, "learning_rate": 5.7979994751668964e-05, "loss": 0.0, "step": 230 }, { "epoch": 0.04153742414025624, "grad_norm": NaN, "learning_rate": 5.766337137204579e-05, "loss": 0.0, "step": 231 }, { "epoch": 0.04171723982917509, "grad_norm": NaN, "learning_rate": 5.7346432981240904e-05, "loss": 0.0, "step": 232 }, { "epoch": 0.04189705551809395, "grad_norm": NaN, "learning_rate": 5.7029192607350146e-05, "loss": 0.0, "step": 233 }, { "epoch": 0.04207687120701281, "grad_norm": NaN, "learning_rate": 5.6711663290882776e-05, "loss": 0.0, "step": 234 }, { "epoch": 0.04225668689593167, "grad_norm": NaN, "learning_rate": 5.6393858084225305e-05, "loss": 0.0, "step": 235 }, { "epoch": 0.04243650258485053, "grad_norm": NaN, "learning_rate": 5.6075790051105023e-05, "loss": 0.0, "step": 236 }, { "epoch": 0.04261631827376939, "grad_norm": NaN, "learning_rate": 5.575747226605298e-05, "loss": 0.0, "step": 237 }, { "epoch": 0.04279613396268824, "grad_norm": NaN, "learning_rate": 5.5438917813866554e-05, "loss": 0.0, "step": 238 }, { "epoch": 0.0429759496516071, "grad_norm": NaN, "learning_rate": 5.512013978907157e-05, "loss": 0.0, "step": 239 }, { "epoch": 0.04315576534052596, "grad_norm": NaN, "learning_rate": 5.480115129538409e-05, "loss": 0.0, "step": 240 }, { "epoch": 0.04333558102944482, "grad_norm": NaN, "learning_rate": 5.448196544517168e-05, "loss": 0.0, "step": 241 }, { "epoch": 0.04351539671836368, "grad_norm": NaN, "learning_rate": 5.416259535891447e-05, "loss": 0.0, "step": 242 }, { "epoch": 0.04369521240728254, "grad_norm": NaN, "learning_rate": 5.384305416466584e-05, "loss": 0.0, "step": 243 }, { "epoch": 0.04387502809620139, "grad_norm": NaN, "learning_rate": 5.35233549975127e-05, "loss": 0.0, "step": 244 }, { "epoch": 0.04405484378512025, "grad_norm": NaN, "learning_rate": 5.320351099903565e-05, "loss": 0.0, "step": 245 }, { "epoch": 0.04423465947403911, "grad_norm": NaN, "learning_rate": 5.288353531676873e-05, "loss": 0.0, "step": 246 }, { "epoch": 0.04441447516295797, "grad_norm": NaN, "learning_rate": 5.256344110365896e-05, "loss": 0.0, "step": 247 }, { "epoch": 0.04459429085187683, "grad_norm": NaN, "learning_rate": 5.2243241517525754e-05, "loss": 0.0, "step": 248 }, { "epoch": 0.04477410654079569, "grad_norm": NaN, "learning_rate": 5.192294972051992e-05, "loss": 0.0, "step": 249 }, { "epoch": 0.04495392222971454, "grad_norm": NaN, "learning_rate": 5.1602578878582776e-05, "loss": 0.0, "step": 250 }, { "epoch": 0.04495392222971454, "eval_loss": NaN, "eval_runtime": 167.8426, "eval_samples_per_second": 55.808, "eval_steps_per_second": 13.954, "step": 250 }, { "epoch": 0.0451337379186334, "grad_norm": NaN, "learning_rate": 5.128214216090478e-05, "loss": 0.0, "step": 251 }, { "epoch": 0.04531355360755226, "grad_norm": NaN, "learning_rate": 5.0961652739384356e-05, "loss": 0.0, "step": 252 }, { "epoch": 0.04549336929647112, "grad_norm": NaN, "learning_rate": 5.064112378808637e-05, "loss": 0.0, "step": 253 }, { "epoch": 0.04567318498538998, "grad_norm": NaN, "learning_rate": 5.0320568482700556e-05, "loss": 0.0, "step": 254 }, { "epoch": 0.045853000674308836, "grad_norm": NaN, "learning_rate": 5e-05, "loss": 0.0, "step": 255 }, { "epoch": 0.04603281636322769, "grad_norm": NaN, "learning_rate": 4.967943151729945e-05, "loss": 0.0, "step": 256 }, { "epoch": 0.04621263205214655, "grad_norm": NaN, "learning_rate": 4.935887621191364e-05, "loss": 0.0, "step": 257 }, { "epoch": 0.04639244774106541, "grad_norm": NaN, "learning_rate": 4.903834726061565e-05, "loss": 0.0, "step": 258 }, { "epoch": 0.046572263429984266, "grad_norm": NaN, "learning_rate": 4.871785783909523e-05, "loss": 0.8791, "step": 259 }, { "epoch": 0.046752079118903125, "grad_norm": NaN, "learning_rate": 4.839742112141724e-05, "loss": 2.2549, "step": 260 }, { "epoch": 0.046931894807821985, "grad_norm": NaN, "learning_rate": 4.807705027948008e-05, "loss": 0.0, "step": 261 }, { "epoch": 0.047111710496740844, "grad_norm": NaN, "learning_rate": 4.775675848247427e-05, "loss": 0.0, "step": 262 }, { "epoch": 0.047291526185659696, "grad_norm": NaN, "learning_rate": 4.743655889634105e-05, "loss": 0.0, "step": 263 }, { "epoch": 0.047471341874578556, "grad_norm": NaN, "learning_rate": 4.711646468323129e-05, "loss": 0.0, "step": 264 }, { "epoch": 0.047651157563497415, "grad_norm": NaN, "learning_rate": 4.679648900096436e-05, "loss": 0.0, "step": 265 }, { "epoch": 0.047830973252416274, "grad_norm": NaN, "learning_rate": 4.64766450024873e-05, "loss": 0.0, "step": 266 }, { "epoch": 0.048010788941335134, "grad_norm": NaN, "learning_rate": 4.6156945835334184e-05, "loss": 0.0, "step": 267 }, { "epoch": 0.04819060463025399, "grad_norm": NaN, "learning_rate": 4.583740464108554e-05, "loss": 0.0, "step": 268 }, { "epoch": 0.048370420319172845, "grad_norm": NaN, "learning_rate": 4.551803455482833e-05, "loss": 0.0, "step": 269 }, { "epoch": 0.048550236008091704, "grad_norm": NaN, "learning_rate": 4.5198848704615914e-05, "loss": 0.0, "step": 270 }, { "epoch": 0.048730051697010564, "grad_norm": NaN, "learning_rate": 4.487986021092844e-05, "loss": 0.0, "step": 271 }, { "epoch": 0.04890986738592942, "grad_norm": NaN, "learning_rate": 4.4561082186133464e-05, "loss": 0.0, "step": 272 }, { "epoch": 0.04908968307484828, "grad_norm": NaN, "learning_rate": 4.424252773394704e-05, "loss": 0.0, "step": 273 }, { "epoch": 0.04926949876376714, "grad_norm": NaN, "learning_rate": 4.392420994889498e-05, "loss": 0.0, "step": 274 }, { "epoch": 0.049449314452685994, "grad_norm": NaN, "learning_rate": 4.3606141915774693e-05, "loss": 0.0, "step": 275 }, { "epoch": 0.04962913014160485, "grad_norm": NaN, "learning_rate": 4.328833670911724e-05, "loss": 0.0, "step": 276 }, { "epoch": 0.04980894583052371, "grad_norm": NaN, "learning_rate": 4.297080739264987e-05, "loss": 0.0, "step": 277 }, { "epoch": 0.04998876151944257, "grad_norm": NaN, "learning_rate": 4.265356701875911e-05, "loss": 0.0, "step": 278 }, { "epoch": 0.05016857720836143, "grad_norm": NaN, "learning_rate": 4.23366286279542e-05, "loss": 0.0, "step": 279 }, { "epoch": 0.05034839289728029, "grad_norm": NaN, "learning_rate": 4.2020005248331054e-05, "loss": 0.0, "step": 280 }, { "epoch": 0.05052820858619914, "grad_norm": NaN, "learning_rate": 4.1703709895036625e-05, "loss": 0.0, "step": 281 }, { "epoch": 0.050708024275118, "grad_norm": NaN, "learning_rate": 4.138775556973406e-05, "loss": 0.0, "step": 282 }, { "epoch": 0.05088783996403686, "grad_norm": NaN, "learning_rate": 4.107215526006817e-05, "loss": 0.0, "step": 283 }, { "epoch": 0.05106765565295572, "grad_norm": NaN, "learning_rate": 4.0756921939131565e-05, "loss": 0.0, "step": 284 }, { "epoch": 0.05124747134187458, "grad_norm": NaN, "learning_rate": 4.04420685649314e-05, "loss": 0.0, "step": 285 }, { "epoch": 0.05142728703079344, "grad_norm": NaN, "learning_rate": 4.012760807985665e-05, "loss": 0.0, "step": 286 }, { "epoch": 0.05160710271971229, "grad_norm": NaN, "learning_rate": 3.981355341014623e-05, "loss": 0.0, "step": 287 }, { "epoch": 0.05178691840863115, "grad_norm": NaN, "learning_rate": 3.9499917465357534e-05, "loss": 0.0, "step": 288 }, { "epoch": 0.05196673409755001, "grad_norm": NaN, "learning_rate": 3.9186713137835826e-05, "loss": 0.0, "step": 289 }, { "epoch": 0.05214654978646887, "grad_norm": NaN, "learning_rate": 3.887395330218429e-05, "loss": 0.0, "step": 290 }, { "epoch": 0.05232636547538773, "grad_norm": NaN, "learning_rate": 3.856165081473474e-05, "loss": 0.0, "step": 291 }, { "epoch": 0.05250618116430659, "grad_norm": NaN, "learning_rate": 3.8249818513019244e-05, "loss": 0.0, "step": 292 }, { "epoch": 0.05268599685322544, "grad_norm": NaN, "learning_rate": 3.793846921524237e-05, "loss": 0.0, "step": 293 }, { "epoch": 0.0528658125421443, "grad_norm": NaN, "learning_rate": 3.762761571975429e-05, "loss": 0.0, "step": 294 }, { "epoch": 0.05304562823106316, "grad_norm": NaN, "learning_rate": 3.731727080452464e-05, "loss": 0.0, "step": 295 }, { "epoch": 0.05322544391998202, "grad_norm": NaN, "learning_rate": 3.7007447226617366e-05, "loss": 0.0, "step": 296 }, { "epoch": 0.05340525960890088, "grad_norm": NaN, "learning_rate": 3.6698157721666246e-05, "loss": 0.0, "step": 297 }, { "epoch": 0.05358507529781974, "grad_norm": NaN, "learning_rate": 3.638941500335145e-05, "loss": 0.0, "step": 298 }, { "epoch": 0.053764890986738596, "grad_norm": NaN, "learning_rate": 3.608123176287685e-05, "loss": 0.0, "step": 299 }, { "epoch": 0.05394470667565745, "grad_norm": NaN, "learning_rate": 3.5773620668448384e-05, "loss": 0.0, "step": 300 }, { "epoch": 0.05394470667565745, "eval_loss": NaN, "eval_runtime": 168.1974, "eval_samples_per_second": 55.691, "eval_steps_per_second": 13.924, "step": 300 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 5 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4041078390784e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }