{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9889162927525035, "eval_steps": 500, "global_step": 5200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019017621014471222, "grad_norm": 5.274278163909912, "learning_rate": 4.4999999999999996e-05, "loss": 1.3065, "step": 10 }, { "epoch": 0.0038035242028942443, "grad_norm": 6.159559726715088, "learning_rate": 9.5e-05, "loss": 0.5845, "step": 20 }, { "epoch": 0.005705286304341367, "grad_norm": 0.032951805740594864, "learning_rate": 0.000145, "loss": 0.0535, "step": 30 }, { "epoch": 0.007607048405788489, "grad_norm": 0.0016232666093856096, "learning_rate": 0.00019500000000000002, "loss": 0.0011, "step": 40 }, { "epoch": 0.00950881050723561, "grad_norm": 0.03492136672139168, "learning_rate": 0.000245, "loss": 0.0004, "step": 50 }, { "epoch": 0.011410572608682733, "grad_norm": 0.03490574657917023, "learning_rate": 0.000295, "loss": 0.0002, "step": 60 }, { "epoch": 0.013312334710129855, "grad_norm": 0.2781233489513397, "learning_rate": 0.000345, "loss": 0.0028, "step": 70 }, { "epoch": 0.015214096811576977, "grad_norm": 0.2495400309562683, "learning_rate": 0.000395, "loss": 0.0054, "step": 80 }, { "epoch": 0.0171158589130241, "grad_norm": 0.016650637611746788, "learning_rate": 0.00044500000000000003, "loss": 0.0113, "step": 90 }, { "epoch": 0.01901762101447122, "grad_norm": 0.006048521026968956, "learning_rate": 0.000495, "loss": 0.0129, "step": 100 }, { "epoch": 0.020919383115918344, "grad_norm": 0.04650586470961571, "learning_rate": 0.0004999995932430571, "loss": 0.0531, "step": 110 }, { "epoch": 0.022821145217365467, "grad_norm": 0.2665582597255707, "learning_rate": 0.0004999981871713734, "loss": 0.0106, "step": 120 }, { "epoch": 0.024722907318812586, "grad_norm": 0.3457753360271454, "learning_rate": 0.0004999957767689057, "loss": 0.0111, "step": 130 }, { "epoch": 0.02662466942025971, "grad_norm": 0.3711691200733185, "learning_rate": 0.0004999923620453374, "loss": 0.0089, "step": 140 }, { "epoch": 0.028526431521706832, "grad_norm": 0.5576126575469971, "learning_rate": 0.0004999879430143867, "loss": 0.0225, "step": 150 }, { "epoch": 0.030428193623153955, "grad_norm": 0.061603959649801254, "learning_rate": 0.0004999825196938062, "loss": 0.0066, "step": 160 }, { "epoch": 0.032329955724601074, "grad_norm": 0.443553626537323, "learning_rate": 0.0004999760921053835, "loss": 0.0123, "step": 170 }, { "epoch": 0.0342317178260482, "grad_norm": 0.24774852395057678, "learning_rate": 0.0004999686602749405, "loss": 0.0153, "step": 180 }, { "epoch": 0.03613347992749532, "grad_norm": 0.07191673666238785, "learning_rate": 0.0004999602242323333, "loss": 0.0126, "step": 190 }, { "epoch": 0.03803524202894244, "grad_norm": 0.48675355315208435, "learning_rate": 0.0004999507840114525, "loss": 0.0232, "step": 200 }, { "epoch": 0.039937004130389565, "grad_norm": 0.37680506706237793, "learning_rate": 0.000499940339650223, "loss": 0.0113, "step": 210 }, { "epoch": 0.04183876623183669, "grad_norm": 0.4189215302467346, "learning_rate": 0.0004999288911906033, "loss": 0.0111, "step": 220 }, { "epoch": 0.04374052833328381, "grad_norm": 0.5190231204032898, "learning_rate": 0.0004999164386785859, "loss": 0.0181, "step": 230 }, { "epoch": 0.045642290434730934, "grad_norm": 0.3827992379665375, "learning_rate": 0.0004999029821641969, "loss": 0.0254, "step": 240 }, { "epoch": 0.04754405253617805, "grad_norm": 0.3790438771247864, "learning_rate": 0.0004998885217014959, "loss": 0.0135, "step": 250 }, { "epoch": 0.04944581463762517, "grad_norm": 0.32959797978401184, "learning_rate": 0.0004998730573485757, "loss": 0.02, "step": 260 }, { "epoch": 0.051347576739072295, "grad_norm": 0.11981680989265442, "learning_rate": 0.0004998565891675621, "loss": 0.0274, "step": 270 }, { "epoch": 0.05324933884051942, "grad_norm": 0.417461097240448, "learning_rate": 0.0004998391172246136, "loss": 0.0226, "step": 280 }, { "epoch": 0.05515110094196654, "grad_norm": 0.5383813381195068, "learning_rate": 0.0004998206415899208, "loss": 0.0198, "step": 290 }, { "epoch": 0.057052863043413664, "grad_norm": 0.49503782391548157, "learning_rate": 0.0004998011623377073, "loss": 0.0217, "step": 300 }, { "epoch": 0.05895462514486079, "grad_norm": 0.5187065005302429, "learning_rate": 0.0004997806795462279, "loss": 0.0251, "step": 310 }, { "epoch": 0.06085638724630791, "grad_norm": 0.27587956190109253, "learning_rate": 0.0004997591932977692, "loss": 0.0212, "step": 320 }, { "epoch": 0.06275814934775503, "grad_norm": 0.19010399281978607, "learning_rate": 0.000499736703678649, "loss": 0.0186, "step": 330 }, { "epoch": 0.06465991144920215, "grad_norm": 0.28244227170944214, "learning_rate": 0.0004997132107792161, "loss": 0.0281, "step": 340 }, { "epoch": 0.06656167355064928, "grad_norm": 0.3918805420398712, "learning_rate": 0.0004996887146938497, "loss": 0.0349, "step": 350 }, { "epoch": 0.0684634356520964, "grad_norm": 0.3364613354206085, "learning_rate": 0.0004996632155209592, "loss": 0.0192, "step": 360 }, { "epoch": 0.07036519775354352, "grad_norm": 0.20451515913009644, "learning_rate": 0.0004996367133629837, "loss": 0.0341, "step": 370 }, { "epoch": 0.07226695985499064, "grad_norm": 0.5754940509796143, "learning_rate": 0.0004996092083263919, "loss": 0.0327, "step": 380 }, { "epoch": 0.07416872195643776, "grad_norm": 0.5575674176216125, "learning_rate": 0.000499580700521681, "loss": 0.0407, "step": 390 }, { "epoch": 0.07607048405788489, "grad_norm": 0.4820100665092468, "learning_rate": 0.0004995511900633771, "loss": 0.0441, "step": 400 }, { "epoch": 0.077972246159332, "grad_norm": 0.49140122532844543, "learning_rate": 0.000499520677070034, "loss": 0.036, "step": 410 }, { "epoch": 0.07987400826077913, "grad_norm": 0.3462704122066498, "learning_rate": 0.0004994891616642331, "loss": 0.0325, "step": 420 }, { "epoch": 0.08177577036222625, "grad_norm": 0.3936806917190552, "learning_rate": 0.000499456643972583, "loss": 0.0406, "step": 430 }, { "epoch": 0.08367753246367338, "grad_norm": 0.3352107107639313, "learning_rate": 0.0004994231241257185, "loss": 0.0488, "step": 440 }, { "epoch": 0.08557929456512049, "grad_norm": 0.4352121353149414, "learning_rate": 0.0004993886022583009, "loss": 0.0379, "step": 450 }, { "epoch": 0.08748105666656762, "grad_norm": 0.4921802282333374, "learning_rate": 0.0004993530785090166, "loss": 0.1226, "step": 460 }, { "epoch": 0.08938281876801474, "grad_norm": 0.5758520364761353, "learning_rate": 0.000499316553020577, "loss": 0.0551, "step": 470 }, { "epoch": 0.09128458086946187, "grad_norm": 0.38870593905448914, "learning_rate": 0.0004992790259397178, "loss": 0.0497, "step": 480 }, { "epoch": 0.09318634297090898, "grad_norm": 0.361654132604599, "learning_rate": 0.0004992404974171985, "loss": 0.0599, "step": 490 }, { "epoch": 0.0950881050723561, "grad_norm": 0.452127069234848, "learning_rate": 0.000499200967607802, "loss": 0.0611, "step": 500 }, { "epoch": 0.09698986717380323, "grad_norm": 0.5591238737106323, "learning_rate": 0.0004991604366703332, "loss": 0.0586, "step": 510 }, { "epoch": 0.09889162927525035, "grad_norm": 0.40963825583457947, "learning_rate": 0.0004991189047676192, "loss": 0.048, "step": 520 }, { "epoch": 0.10079339137669747, "grad_norm": 0.5524506568908691, "learning_rate": 0.0004990763720665083, "loss": 0.0497, "step": 530 }, { "epoch": 0.10269515347814459, "grad_norm": 0.3613679111003876, "learning_rate": 0.0004990328387378695, "loss": 0.052, "step": 540 }, { "epoch": 0.10459691557959172, "grad_norm": 0.5716497898101807, "learning_rate": 0.0004989883049565912, "loss": 0.0512, "step": 550 }, { "epoch": 0.10649867768103884, "grad_norm": 0.48545169830322266, "learning_rate": 0.0004989427709015816, "loss": 0.066, "step": 560 }, { "epoch": 0.10840043978248597, "grad_norm": 0.3815110921859741, "learning_rate": 0.0004988962367557668, "loss": 0.0629, "step": 570 }, { "epoch": 0.11030220188393308, "grad_norm": 0.40193411707878113, "learning_rate": 0.000498848702706091, "loss": 0.068, "step": 580 }, { "epoch": 0.1122039639853802, "grad_norm": 0.42606788873672485, "learning_rate": 0.0004988001689435152, "loss": 0.0478, "step": 590 }, { "epoch": 0.11410572608682733, "grad_norm": 0.7104797959327698, "learning_rate": 0.0004987506356630165, "loss": 0.0607, "step": 600 }, { "epoch": 0.11600748818827444, "grad_norm": 0.3995862305164337, "learning_rate": 0.0004987001030635878, "loss": 0.0565, "step": 610 }, { "epoch": 0.11790925028972157, "grad_norm": 0.6329849362373352, "learning_rate": 0.0004986485713482361, "loss": 0.0733, "step": 620 }, { "epoch": 0.11981101239116869, "grad_norm": 0.5191243886947632, "learning_rate": 0.0004985960407239825, "loss": 0.0565, "step": 630 }, { "epoch": 0.12171277449261582, "grad_norm": 0.4808266758918762, "learning_rate": 0.0004985425114018611, "loss": 0.075, "step": 640 }, { "epoch": 0.12361453659406293, "grad_norm": 0.49313193559646606, "learning_rate": 0.000498487983596918, "loss": 0.0637, "step": 650 }, { "epoch": 0.12551629869551006, "grad_norm": 0.6945317387580872, "learning_rate": 0.0004984324575282107, "loss": 0.0778, "step": 660 }, { "epoch": 0.1274180607969572, "grad_norm": 0.3684280514717102, "learning_rate": 0.0004983759334188068, "loss": 0.0577, "step": 670 }, { "epoch": 0.1293198228984043, "grad_norm": 0.5096045136451721, "learning_rate": 0.0004983184114957836, "loss": 0.073, "step": 680 }, { "epoch": 0.13122158499985143, "grad_norm": 0.24991731345653534, "learning_rate": 0.000498259891990227, "loss": 0.079, "step": 690 }, { "epoch": 0.13312334710129856, "grad_norm": 0.4371449649333954, "learning_rate": 0.0004982003751372306, "loss": 0.0814, "step": 700 }, { "epoch": 0.13502510920274566, "grad_norm": 0.7052175402641296, "learning_rate": 0.0004981398611758942, "loss": 0.0813, "step": 710 }, { "epoch": 0.1369268713041928, "grad_norm": 0.3389616310596466, "learning_rate": 0.0004980783503493241, "loss": 0.0806, "step": 720 }, { "epoch": 0.13882863340563992, "grad_norm": 0.5502128601074219, "learning_rate": 0.0004980158429046306, "loss": 0.0623, "step": 730 }, { "epoch": 0.14073039550708705, "grad_norm": 0.6836037635803223, "learning_rate": 0.0004979523390929285, "loss": 0.0701, "step": 740 }, { "epoch": 0.14263215760853415, "grad_norm": 0.7100756168365479, "learning_rate": 0.0004978878391693346, "loss": 0.0812, "step": 750 }, { "epoch": 0.14453391970998128, "grad_norm": 0.5102308392524719, "learning_rate": 0.000497822343392968, "loss": 0.06, "step": 760 }, { "epoch": 0.1464356818114284, "grad_norm": 0.4712526202201843, "learning_rate": 0.0004977558520269484, "loss": 0.0793, "step": 770 }, { "epoch": 0.1483374439128755, "grad_norm": 0.7232615947723389, "learning_rate": 0.0004976883653383948, "loss": 0.0674, "step": 780 }, { "epoch": 0.15023920601432264, "grad_norm": 0.5397039651870728, "learning_rate": 0.0004976198835984253, "loss": 0.0613, "step": 790 }, { "epoch": 0.15214096811576977, "grad_norm": 0.5102890133857727, "learning_rate": 0.0004975504070821548, "loss": 0.0823, "step": 800 }, { "epoch": 0.1540427302172169, "grad_norm": 0.5335078239440918, "learning_rate": 0.0004974799360686952, "loss": 0.0833, "step": 810 }, { "epoch": 0.155944492318664, "grad_norm": 0.5386427640914917, "learning_rate": 0.0004974084708411535, "loss": 0.0736, "step": 820 }, { "epoch": 0.15784625442011113, "grad_norm": 0.5704069137573242, "learning_rate": 0.0004973360116866303, "loss": 0.0788, "step": 830 }, { "epoch": 0.15974801652155826, "grad_norm": 0.6669997572898865, "learning_rate": 0.0004972625588962199, "loss": 0.0713, "step": 840 }, { "epoch": 0.1616497786230054, "grad_norm": 0.4686433970928192, "learning_rate": 0.000497188112765008, "loss": 0.0768, "step": 850 }, { "epoch": 0.1635515407244525, "grad_norm": 0.622735321521759, "learning_rate": 0.0004971126735920707, "loss": 0.0857, "step": 860 }, { "epoch": 0.16545330282589962, "grad_norm": 0.4435874819755554, "learning_rate": 0.0004970362416804739, "loss": 0.108, "step": 870 }, { "epoch": 0.16735506492734675, "grad_norm": 0.6319471001625061, "learning_rate": 0.0004969588173372716, "loss": 0.1079, "step": 880 }, { "epoch": 0.16925682702879385, "grad_norm": 0.730019748210907, "learning_rate": 0.0004968804008735044, "loss": 0.1124, "step": 890 }, { "epoch": 0.17115858913024098, "grad_norm": 0.32919904589653015, "learning_rate": 0.0004968009926041991, "loss": 0.0877, "step": 900 }, { "epoch": 0.17306035123168811, "grad_norm": 0.5234276652336121, "learning_rate": 0.0004967205928483666, "loss": 0.081, "step": 910 }, { "epoch": 0.17496211333313524, "grad_norm": 0.5863314270973206, "learning_rate": 0.000496639201929001, "loss": 0.0709, "step": 920 }, { "epoch": 0.17686387543458235, "grad_norm": 0.42288297414779663, "learning_rate": 0.0004965568201730783, "loss": 0.0923, "step": 930 }, { "epoch": 0.17876563753602948, "grad_norm": 0.6558719873428345, "learning_rate": 0.0004964734479115552, "loss": 0.0787, "step": 940 }, { "epoch": 0.1806673996374766, "grad_norm": 0.42140552401542664, "learning_rate": 0.0004963890854793673, "loss": 0.0836, "step": 950 }, { "epoch": 0.18256916173892374, "grad_norm": 0.8134212493896484, "learning_rate": 0.0004963037332154281, "loss": 0.0949, "step": 960 }, { "epoch": 0.18447092384037084, "grad_norm": 0.6248766183853149, "learning_rate": 0.0004962173914626279, "loss": 0.1023, "step": 970 }, { "epoch": 0.18637268594181797, "grad_norm": 0.7062785029411316, "learning_rate": 0.0004961300605678318, "loss": 0.0872, "step": 980 }, { "epoch": 0.1882744480432651, "grad_norm": 0.5721538662910461, "learning_rate": 0.000496041740881879, "loss": 0.104, "step": 990 }, { "epoch": 0.1901762101447122, "grad_norm": 0.6057381629943848, "learning_rate": 0.0004959524327595805, "loss": 0.071, "step": 1000 }, { "epoch": 0.19207797224615933, "grad_norm": 0.5917540192604065, "learning_rate": 0.0004958621365597186, "loss": 0.0931, "step": 1010 }, { "epoch": 0.19397973434760646, "grad_norm": 0.8036534190177917, "learning_rate": 0.000495770852645045, "loss": 0.1035, "step": 1020 }, { "epoch": 0.1958814964490536, "grad_norm": 0.5942894816398621, "learning_rate": 0.0004956785813822794, "loss": 0.1072, "step": 1030 }, { "epoch": 0.1977832585505007, "grad_norm": 0.7127736210823059, "learning_rate": 0.0004955853231421077, "loss": 0.091, "step": 1040 }, { "epoch": 0.19968502065194782, "grad_norm": 0.507114827632904, "learning_rate": 0.0004954910782991814, "loss": 0.0889, "step": 1050 }, { "epoch": 0.20158678275339495, "grad_norm": 0.5299419164657593, "learning_rate": 0.000495395847232115, "loss": 0.1045, "step": 1060 }, { "epoch": 0.20348854485484205, "grad_norm": 0.4970870912075043, "learning_rate": 0.0004952996303234854, "loss": 0.0869, "step": 1070 }, { "epoch": 0.20539030695628918, "grad_norm": 0.7103607654571533, "learning_rate": 0.0004952024279598298, "loss": 0.0953, "step": 1080 }, { "epoch": 0.2072920690577363, "grad_norm": 0.38354578614234924, "learning_rate": 0.0004951042405316443, "loss": 0.109, "step": 1090 }, { "epoch": 0.20919383115918344, "grad_norm": 0.7918898463249207, "learning_rate": 0.0004950050684333823, "loss": 0.0931, "step": 1100 }, { "epoch": 0.21109559326063054, "grad_norm": 0.37249505519866943, "learning_rate": 0.0004949049120634532, "loss": 0.112, "step": 1110 }, { "epoch": 0.21299735536207767, "grad_norm": 0.4019123613834381, "learning_rate": 0.0004948037718242204, "loss": 0.1038, "step": 1120 }, { "epoch": 0.2148991174635248, "grad_norm": 0.5207439661026001, "learning_rate": 0.0004947016481219997, "loss": 0.1193, "step": 1130 }, { "epoch": 0.21680087956497193, "grad_norm": 0.41174229979515076, "learning_rate": 0.0004945985413670581, "loss": 0.0942, "step": 1140 }, { "epoch": 0.21870264166641903, "grad_norm": 0.6560219526290894, "learning_rate": 0.000494494451973612, "loss": 0.0643, "step": 1150 }, { "epoch": 0.22060440376786616, "grad_norm": 0.4203394651412964, "learning_rate": 0.0004943893803598247, "loss": 0.1187, "step": 1160 }, { "epoch": 0.2225061658693133, "grad_norm": 0.6167390942573547, "learning_rate": 0.0004942833269478063, "loss": 0.0941, "step": 1170 }, { "epoch": 0.2244079279707604, "grad_norm": 0.5587171912193298, "learning_rate": 0.0004941762921636104, "loss": 0.0665, "step": 1180 }, { "epoch": 0.22630969007220753, "grad_norm": 0.5269094109535217, "learning_rate": 0.0004940682764372336, "loss": 0.1143, "step": 1190 }, { "epoch": 0.22821145217365466, "grad_norm": 0.5605872273445129, "learning_rate": 0.0004939592802026132, "loss": 0.0913, "step": 1200 }, { "epoch": 0.23011321427510179, "grad_norm": 0.49498802423477173, "learning_rate": 0.0004938493038976251, "loss": 0.1008, "step": 1210 }, { "epoch": 0.2320149763765489, "grad_norm": 0.5630708932876587, "learning_rate": 0.0004937383479640834, "loss": 0.0885, "step": 1220 }, { "epoch": 0.23391673847799602, "grad_norm": 0.5097892880439758, "learning_rate": 0.0004936264128477368, "loss": 0.0914, "step": 1230 }, { "epoch": 0.23581850057944315, "grad_norm": 0.3842105567455292, "learning_rate": 0.0004935134989982682, "loss": 0.0792, "step": 1240 }, { "epoch": 0.23772026268089025, "grad_norm": 0.5472444891929626, "learning_rate": 0.0004933996068692922, "loss": 0.1069, "step": 1250 }, { "epoch": 0.23962202478233738, "grad_norm": 0.32953816652297974, "learning_rate": 0.0004932847369183538, "loss": 0.0745, "step": 1260 }, { "epoch": 0.2415237868837845, "grad_norm": 0.5384835600852966, "learning_rate": 0.0004931688896069258, "loss": 0.1093, "step": 1270 }, { "epoch": 0.24342554898523164, "grad_norm": 0.5735477209091187, "learning_rate": 0.000493052065400408, "loss": 0.0804, "step": 1280 }, { "epoch": 0.24532731108667874, "grad_norm": 0.47209662199020386, "learning_rate": 0.000492934264768124, "loss": 0.0848, "step": 1290 }, { "epoch": 0.24722907318812587, "grad_norm": 0.7762870192527771, "learning_rate": 0.0004928154881833208, "loss": 0.1005, "step": 1300 }, { "epoch": 0.249130835289573, "grad_norm": 0.5226859450340271, "learning_rate": 0.0004926957361231655, "loss": 0.0874, "step": 1310 }, { "epoch": 0.25103259739102013, "grad_norm": 0.6678869128227234, "learning_rate": 0.0004925750090687445, "loss": 0.0833, "step": 1320 }, { "epoch": 0.25293435949246723, "grad_norm": 0.5953888893127441, "learning_rate": 0.0004924533075050609, "loss": 0.0894, "step": 1330 }, { "epoch": 0.2548361215939144, "grad_norm": 0.5249069333076477, "learning_rate": 0.0004923306319210327, "loss": 0.1101, "step": 1340 }, { "epoch": 0.2567378836953615, "grad_norm": 0.5314027667045593, "learning_rate": 0.0004922069828094908, "loss": 0.0916, "step": 1350 }, { "epoch": 0.2586396457968086, "grad_norm": 0.4231284558773041, "learning_rate": 0.0004920823606671774, "loss": 0.0772, "step": 1360 }, { "epoch": 0.26054140789825575, "grad_norm": 0.5623658895492554, "learning_rate": 0.0004919567659947435, "loss": 0.0908, "step": 1370 }, { "epoch": 0.26244316999970285, "grad_norm": 0.5130127668380737, "learning_rate": 0.0004918301992967472, "loss": 0.0994, "step": 1380 }, { "epoch": 0.26434493210114995, "grad_norm": 0.33209264278411865, "learning_rate": 0.0004917026610816516, "loss": 0.0935, "step": 1390 }, { "epoch": 0.2662466942025971, "grad_norm": 0.39504528045654297, "learning_rate": 0.0004915741518618222, "loss": 0.0931, "step": 1400 }, { "epoch": 0.2681484563040442, "grad_norm": 0.48092228174209595, "learning_rate": 0.0004914446721535263, "loss": 0.1158, "step": 1410 }, { "epoch": 0.2700502184054913, "grad_norm": 0.5582765936851501, "learning_rate": 0.0004913142224769292, "loss": 0.0828, "step": 1420 }, { "epoch": 0.2719519805069385, "grad_norm": 0.3209240138530731, "learning_rate": 0.0004911828033560934, "loss": 0.0905, "step": 1430 }, { "epoch": 0.2738537426083856, "grad_norm": 1.9139939546585083, "learning_rate": 0.0004910504153189758, "loss": 0.1058, "step": 1440 }, { "epoch": 0.2757555047098327, "grad_norm": 0.5591135025024414, "learning_rate": 0.0004909170588974256, "loss": 0.4107, "step": 1450 }, { "epoch": 0.27765726681127983, "grad_norm": 0.3873596787452698, "learning_rate": 0.0004907827346271826, "loss": 0.1475, "step": 1460 }, { "epoch": 0.27955902891272694, "grad_norm": 0.5181601047515869, "learning_rate": 0.0004906474430478746, "loss": 0.113, "step": 1470 }, { "epoch": 0.2814607910141741, "grad_norm": 0.3383086025714874, "learning_rate": 0.0004905111847030159, "loss": 0.5679, "step": 1480 }, { "epoch": 0.2833625531156212, "grad_norm": 0.49057555198669434, "learning_rate": 0.0004903739601400039, "loss": 0.1181, "step": 1490 }, { "epoch": 0.2852643152170683, "grad_norm": 0.572108268737793, "learning_rate": 0.0004902357699101182, "loss": 0.1126, "step": 1500 }, { "epoch": 0.28716607731851546, "grad_norm": 0.40325993299484253, "learning_rate": 0.0004900966145685176, "loss": 0.0911, "step": 1510 }, { "epoch": 0.28906783941996256, "grad_norm": 0.40683841705322266, "learning_rate": 0.000489956494674238, "loss": 0.1002, "step": 1520 }, { "epoch": 0.29096960152140966, "grad_norm": 0.5523852705955505, "learning_rate": 0.0004898154107901905, "loss": 0.1002, "step": 1530 }, { "epoch": 0.2928713636228568, "grad_norm": 0.5049010515213013, "learning_rate": 0.0004896733634831589, "loss": 0.1015, "step": 1540 }, { "epoch": 0.2947731257243039, "grad_norm": 0.3494808077812195, "learning_rate": 0.0004895303533237969, "loss": 0.2635, "step": 1550 }, { "epoch": 0.296674887825751, "grad_norm": 0.5694501399993896, "learning_rate": 0.000489386380886627, "loss": 0.1048, "step": 1560 }, { "epoch": 0.2985766499271982, "grad_norm": 0.4427148997783661, "learning_rate": 0.0004892414467500371, "loss": 0.1254, "step": 1570 }, { "epoch": 0.3004784120286453, "grad_norm": 0.28379982709884644, "learning_rate": 0.0004890955514962786, "loss": 0.2777, "step": 1580 }, { "epoch": 0.30238017413009244, "grad_norm": 0.3206646740436554, "learning_rate": 0.0004889486957114642, "loss": 0.1164, "step": 1590 }, { "epoch": 0.30428193623153954, "grad_norm": 0.5560258030891418, "learning_rate": 0.0004888008799855655, "loss": 0.0796, "step": 1600 }, { "epoch": 0.30618369833298664, "grad_norm": 0.43756505846977234, "learning_rate": 0.00048865210491241, "loss": 0.1102, "step": 1610 }, { "epoch": 0.3080854604344338, "grad_norm": 0.4390459954738617, "learning_rate": 0.0004885023710896799, "loss": 0.1831, "step": 1620 }, { "epoch": 0.3099872225358809, "grad_norm": 0.3506847620010376, "learning_rate": 0.0004883516791189084, "loss": 0.1157, "step": 1630 }, { "epoch": 0.311888984637328, "grad_norm": 0.4262866675853729, "learning_rate": 0.00048820002960547844, "loss": 0.1143, "step": 1640 }, { "epoch": 0.31379074673877516, "grad_norm": 0.5669584274291992, "learning_rate": 0.0004880474231586195, "loss": 0.1037, "step": 1650 }, { "epoch": 0.31569250884022226, "grad_norm": 0.5108714699745178, "learning_rate": 0.00048789386039140535, "loss": 1.2545, "step": 1660 }, { "epoch": 0.31759427094166937, "grad_norm": 0.5437518358230591, "learning_rate": 0.00048773934192075186, "loss": 0.097, "step": 1670 }, { "epoch": 0.3194960330431165, "grad_norm": 0.7191082835197449, "learning_rate": 0.0004875838683674141, "loss": 0.0965, "step": 1680 }, { "epoch": 0.3213977951445636, "grad_norm": 0.5136532783508301, "learning_rate": 0.00048742744035598407, "loss": 0.0818, "step": 1690 }, { "epoch": 0.3232995572460108, "grad_norm": 0.5209280848503113, "learning_rate": 0.0004872700585148882, "loss": 0.0967, "step": 1700 }, { "epoch": 0.3252013193474579, "grad_norm": 0.36031627655029297, "learning_rate": 0.00048711172347638484, "loss": 0.113, "step": 1710 }, { "epoch": 0.327103081448905, "grad_norm": 0.39448150992393494, "learning_rate": 0.0004869524358765616, "loss": 0.0733, "step": 1720 }, { "epoch": 0.32900484355035214, "grad_norm": 0.4772893786430359, "learning_rate": 0.00048679219635533276, "loss": 0.0937, "step": 1730 }, { "epoch": 0.33090660565179925, "grad_norm": 0.4700233042240143, "learning_rate": 0.0004866310055564371, "loss": 0.0874, "step": 1740 }, { "epoch": 0.33280836775324635, "grad_norm": 0.444872111082077, "learning_rate": 0.00048646886412743475, "loss": 0.0994, "step": 1750 }, { "epoch": 0.3347101298546935, "grad_norm": 0.28628623485565186, "learning_rate": 0.0004863057727197049, "loss": 0.097, "step": 1760 }, { "epoch": 0.3366118919561406, "grad_norm": 0.3799450695514679, "learning_rate": 0.0004861417319884434, "loss": 0.0807, "step": 1770 }, { "epoch": 0.3385136540575877, "grad_norm": 0.34396281838417053, "learning_rate": 0.00048597674259265934, "loss": 0.0895, "step": 1780 }, { "epoch": 0.34041541615903487, "grad_norm": 0.3908982276916504, "learning_rate": 0.0004858108051951735, "loss": 0.0872, "step": 1790 }, { "epoch": 0.34231717826048197, "grad_norm": 0.35899925231933594, "learning_rate": 0.0004856439204626147, "loss": 0.0699, "step": 1800 }, { "epoch": 0.3442189403619291, "grad_norm": 0.23835837841033936, "learning_rate": 0.00048547608906541784, "loss": 0.0763, "step": 1810 }, { "epoch": 0.34612070246337623, "grad_norm": 0.3790901303291321, "learning_rate": 0.0004853073116778207, "loss": 0.1081, "step": 1820 }, { "epoch": 0.34802246456482333, "grad_norm": 0.4319717288017273, "learning_rate": 0.0004851375889778614, "loss": 0.0859, "step": 1830 }, { "epoch": 0.3499242266662705, "grad_norm": 0.3756701648235321, "learning_rate": 0.00048496692164737596, "loss": 0.1024, "step": 1840 }, { "epoch": 0.3518259887677176, "grad_norm": 0.1727105975151062, "learning_rate": 0.0004847953103719951, "loss": 0.094, "step": 1850 }, { "epoch": 0.3537277508691647, "grad_norm": 0.49954915046691895, "learning_rate": 0.0004846227558411417, "loss": 0.0923, "step": 1860 }, { "epoch": 0.35562951297061185, "grad_norm": 0.3634874224662781, "learning_rate": 0.0004844492587480283, "loss": 0.0742, "step": 1870 }, { "epoch": 0.35753127507205895, "grad_norm": 0.3322870433330536, "learning_rate": 0.0004842748197896537, "loss": 0.0939, "step": 1880 }, { "epoch": 0.35943303717350605, "grad_norm": 0.22122009098529816, "learning_rate": 0.00048409943966680057, "loss": 0.0669, "step": 1890 }, { "epoch": 0.3613347992749532, "grad_norm": 0.5171657204627991, "learning_rate": 0.00048392311908403276, "loss": 0.0649, "step": 1900 }, { "epoch": 0.3632365613764003, "grad_norm": 0.45435401797294617, "learning_rate": 0.0004837458587496921, "loss": 0.0841, "step": 1910 }, { "epoch": 0.36513832347784747, "grad_norm": 0.35136786103248596, "learning_rate": 0.00048356765937589597, "loss": 0.0915, "step": 1920 }, { "epoch": 0.3670400855792946, "grad_norm": 0.5352773666381836, "learning_rate": 0.0004833885216785338, "loss": 0.0622, "step": 1930 }, { "epoch": 0.3689418476807417, "grad_norm": 0.3906407654285431, "learning_rate": 0.0004832084463772649, "loss": 0.0739, "step": 1940 }, { "epoch": 0.37084360978218883, "grad_norm": 0.23543521761894226, "learning_rate": 0.0004830274341955152, "loss": 0.0766, "step": 1950 }, { "epoch": 0.37274537188363593, "grad_norm": 0.31237101554870605, "learning_rate": 0.0004828454858604744, "loss": 0.0668, "step": 1960 }, { "epoch": 0.37464713398508304, "grad_norm": 0.4033367931842804, "learning_rate": 0.0004826626021030931, "loss": 0.0822, "step": 1970 }, { "epoch": 0.3765488960865302, "grad_norm": 0.3153851628303528, "learning_rate": 0.0004824787836580797, "loss": 0.0506, "step": 1980 }, { "epoch": 0.3784506581879773, "grad_norm": 0.37774601578712463, "learning_rate": 0.0004822940312638977, "loss": 0.0885, "step": 1990 }, { "epoch": 0.3803524202894244, "grad_norm": 0.3738980293273926, "learning_rate": 0.0004821083456627625, "loss": 0.0705, "step": 2000 }, { "epoch": 0.38225418239087156, "grad_norm": 0.3323149085044861, "learning_rate": 0.00048192172760063866, "loss": 0.0765, "step": 2010 }, { "epoch": 0.38415594449231866, "grad_norm": 0.38756170868873596, "learning_rate": 0.0004817341778272366, "loss": 0.0826, "step": 2020 }, { "epoch": 0.38605770659376576, "grad_norm": 0.27625781297683716, "learning_rate": 0.0004815456970960098, "loss": 0.0931, "step": 2030 }, { "epoch": 0.3879594686952129, "grad_norm": 0.28700143098831177, "learning_rate": 0.00048135628616415184, "loss": 0.0782, "step": 2040 }, { "epoch": 0.38986123079666, "grad_norm": 0.3487882912158966, "learning_rate": 0.0004811659457925931, "loss": 0.0834, "step": 2050 }, { "epoch": 0.3917629928981072, "grad_norm": 0.3156881630420685, "learning_rate": 0.00048097467674599795, "loss": 0.076, "step": 2060 }, { "epoch": 0.3936647549995543, "grad_norm": 0.28736022114753723, "learning_rate": 0.0004807824797927615, "loss": 0.0656, "step": 2070 }, { "epoch": 0.3955665171010014, "grad_norm": 0.370560884475708, "learning_rate": 0.0004805893557050065, "loss": 0.0685, "step": 2080 }, { "epoch": 0.39746827920244854, "grad_norm": 0.32262367010116577, "learning_rate": 0.00048039530525858067, "loss": 0.0669, "step": 2090 }, { "epoch": 0.39937004130389564, "grad_norm": 0.21956555545330048, "learning_rate": 0.00048020032923305284, "loss": 0.0827, "step": 2100 }, { "epoch": 0.40127180340534274, "grad_norm": 0.295251727104187, "learning_rate": 0.0004800044284117104, "loss": 0.0557, "step": 2110 }, { "epoch": 0.4031735655067899, "grad_norm": 1.090646505355835, "learning_rate": 0.00047980760358155616, "loss": 0.0861, "step": 2120 }, { "epoch": 0.405075327608237, "grad_norm": 0.44668304920196533, "learning_rate": 0.0004796098555333045, "loss": 0.0726, "step": 2130 }, { "epoch": 0.4069770897096841, "grad_norm": 0.3137907385826111, "learning_rate": 0.00047941118506137915, "loss": 0.0894, "step": 2140 }, { "epoch": 0.40887885181113126, "grad_norm": 0.36221790313720703, "learning_rate": 0.0004792115929639094, "loss": 0.0716, "step": 2150 }, { "epoch": 0.41078061391257836, "grad_norm": 0.53058922290802, "learning_rate": 0.000479011080042727, "loss": 0.0684, "step": 2160 }, { "epoch": 0.4126823760140255, "grad_norm": 0.23870202898979187, "learning_rate": 0.0004788096471033629, "loss": 0.0834, "step": 2170 }, { "epoch": 0.4145841381154726, "grad_norm": 0.27887409925460815, "learning_rate": 0.0004786072949550443, "loss": 0.0614, "step": 2180 }, { "epoch": 0.4164859002169197, "grad_norm": 0.5123117566108704, "learning_rate": 0.0004784040244106909, "loss": 0.0667, "step": 2190 }, { "epoch": 0.4183876623183669, "grad_norm": 0.2506055533885956, "learning_rate": 0.0004781998362869123, "loss": 0.0651, "step": 2200 }, { "epoch": 0.420289424419814, "grad_norm": 0.48434486985206604, "learning_rate": 0.0004779947314040039, "loss": 0.0663, "step": 2210 }, { "epoch": 0.4221911865212611, "grad_norm": 0.3052046597003937, "learning_rate": 0.0004777887105859444, "loss": 0.0866, "step": 2220 }, { "epoch": 0.42409294862270824, "grad_norm": 0.4667761027812958, "learning_rate": 0.00047758177466039197, "loss": 0.1044, "step": 2230 }, { "epoch": 0.42599471072415535, "grad_norm": 0.25950753688812256, "learning_rate": 0.0004773739244586812, "loss": 0.0998, "step": 2240 }, { "epoch": 0.42789647282560245, "grad_norm": 0.38734811544418335, "learning_rate": 0.0004771651608158194, "loss": 0.0791, "step": 2250 }, { "epoch": 0.4297982349270496, "grad_norm": 0.5071670413017273, "learning_rate": 0.0004769554845704838, "loss": 0.072, "step": 2260 }, { "epoch": 0.4316999970284967, "grad_norm": 0.32346102595329285, "learning_rate": 0.00047674489656501773, "loss": 0.0563, "step": 2270 }, { "epoch": 0.43360175912994386, "grad_norm": 0.20895633101463318, "learning_rate": 0.0004765333976454273, "loss": 0.0642, "step": 2280 }, { "epoch": 0.43550352123139097, "grad_norm": 0.3482305407524109, "learning_rate": 0.00047632098866137826, "loss": 0.0716, "step": 2290 }, { "epoch": 0.43740528333283807, "grad_norm": 0.2852407991886139, "learning_rate": 0.00047610767046619225, "loss": 0.0745, "step": 2300 }, { "epoch": 0.4393070454342852, "grad_norm": 0.45966169238090515, "learning_rate": 0.0004758934439168436, "loss": 0.0741, "step": 2310 }, { "epoch": 0.44120880753573233, "grad_norm": 0.21411827206611633, "learning_rate": 0.00047567830987395597, "loss": 0.0598, "step": 2320 }, { "epoch": 0.44311056963717943, "grad_norm": 0.36762118339538574, "learning_rate": 0.0004754622692017985, "loss": 0.0837, "step": 2330 }, { "epoch": 0.4450123317386266, "grad_norm": 0.2668420076370239, "learning_rate": 0.0004752453227682827, "loss": 0.0745, "step": 2340 }, { "epoch": 0.4469140938400737, "grad_norm": 0.3845922350883484, "learning_rate": 0.00047502747144495875, "loss": 0.0747, "step": 2350 }, { "epoch": 0.4488158559415208, "grad_norm": 0.38041210174560547, "learning_rate": 0.00047480871610701213, "loss": 0.0743, "step": 2360 }, { "epoch": 0.45071761804296795, "grad_norm": 0.25837719440460205, "learning_rate": 0.00047458905763326023, "loss": 0.0689, "step": 2370 }, { "epoch": 0.45261938014441505, "grad_norm": 0.32403454184532166, "learning_rate": 0.0004743684969061484, "loss": 0.0669, "step": 2380 }, { "epoch": 0.4545211422458622, "grad_norm": 0.4562000334262848, "learning_rate": 0.00047414703481174687, "loss": 0.0888, "step": 2390 }, { "epoch": 0.4564229043473093, "grad_norm": 0.22524714469909668, "learning_rate": 0.00047392467223974686, "loss": 0.0748, "step": 2400 }, { "epoch": 0.4583246664487564, "grad_norm": 0.33924999833106995, "learning_rate": 0.0004737014100834571, "loss": 0.0757, "step": 2410 }, { "epoch": 0.46022642855020357, "grad_norm": 0.26549142599105835, "learning_rate": 0.0004734772492398003, "loss": 0.0695, "step": 2420 }, { "epoch": 0.46212819065165067, "grad_norm": 0.4339611530303955, "learning_rate": 0.0004732521906093097, "loss": 0.0868, "step": 2430 }, { "epoch": 0.4640299527530978, "grad_norm": 0.3820763826370239, "learning_rate": 0.00047302623509612484, "loss": 0.0687, "step": 2440 }, { "epoch": 0.46593171485454493, "grad_norm": 0.2838119864463806, "learning_rate": 0.00047279938360798884, "loss": 0.0636, "step": 2450 }, { "epoch": 0.46783347695599203, "grad_norm": 0.26066961884498596, "learning_rate": 0.00047257163705624394, "loss": 0.0555, "step": 2460 }, { "epoch": 0.46973523905743914, "grad_norm": 0.23981156945228577, "learning_rate": 0.00047234299635582835, "loss": 0.0606, "step": 2470 }, { "epoch": 0.4716370011588863, "grad_norm": 0.3982088267803192, "learning_rate": 0.0004721134624252722, "loss": 0.0631, "step": 2480 }, { "epoch": 0.4735387632603334, "grad_norm": 0.24350067973136902, "learning_rate": 0.00047188303618669414, "loss": 0.0664, "step": 2490 }, { "epoch": 0.4754405253617805, "grad_norm": 0.2303943932056427, "learning_rate": 0.0004716517185657977, "loss": 0.0665, "step": 2500 }, { "epoch": 0.47734228746322765, "grad_norm": 0.23348209261894226, "learning_rate": 0.00047141951049186703, "loss": 0.0572, "step": 2510 }, { "epoch": 0.47924404956467476, "grad_norm": 0.24423716962337494, "learning_rate": 0.00047118641289776395, "loss": 0.099, "step": 2520 }, { "epoch": 0.4811458116661219, "grad_norm": 0.371750146150589, "learning_rate": 0.00047095242671992346, "loss": 0.064, "step": 2530 }, { "epoch": 0.483047573767569, "grad_norm": 0.45275604724884033, "learning_rate": 0.0004707175528983506, "loss": 0.0745, "step": 2540 }, { "epoch": 0.4849493358690161, "grad_norm": 0.23620209097862244, "learning_rate": 0.00047048179237661617, "loss": 0.0594, "step": 2550 }, { "epoch": 0.4868510979704633, "grad_norm": 0.13869664072990417, "learning_rate": 0.00047024514610185316, "loss": 0.0575, "step": 2560 }, { "epoch": 0.4887528600719104, "grad_norm": 0.17631912231445312, "learning_rate": 0.00047000761502475317, "loss": 0.0434, "step": 2570 }, { "epoch": 0.4906546221733575, "grad_norm": 0.30484968423843384, "learning_rate": 0.0004697692000995621, "loss": 0.0746, "step": 2580 }, { "epoch": 0.49255638427480464, "grad_norm": 0.24606972932815552, "learning_rate": 0.0004695299022840768, "loss": 0.0639, "step": 2590 }, { "epoch": 0.49445814637625174, "grad_norm": 0.36455589532852173, "learning_rate": 0.00046928972253964087, "loss": 0.0677, "step": 2600 }, { "epoch": 0.49635990847769884, "grad_norm": 0.20996150374412537, "learning_rate": 0.0004690486618311408, "loss": 0.0648, "step": 2610 }, { "epoch": 0.498261670579146, "grad_norm": 0.12294875085353851, "learning_rate": 0.0004688067211270025, "loss": 0.0561, "step": 2620 }, { "epoch": 0.5001634326805932, "grad_norm": 0.3342492878437042, "learning_rate": 0.00046856390139918703, "loss": 0.0627, "step": 2630 }, { "epoch": 0.5020651947820403, "grad_norm": 0.28927573561668396, "learning_rate": 0.00046832020362318677, "loss": 0.0947, "step": 2640 }, { "epoch": 0.5039669568834874, "grad_norm": 0.2924613952636719, "learning_rate": 0.00046807562877802144, "loss": 0.036, "step": 2650 }, { "epoch": 0.5058687189849345, "grad_norm": 0.22599062323570251, "learning_rate": 0.0004678301778462344, "loss": 0.0671, "step": 2660 }, { "epoch": 0.5077704810863816, "grad_norm": 0.40285536646842957, "learning_rate": 0.0004675838518138885, "loss": 0.0557, "step": 2670 }, { "epoch": 0.5096722431878288, "grad_norm": 0.35334137082099915, "learning_rate": 0.0004673366516705623, "loss": 0.0639, "step": 2680 }, { "epoch": 0.5115740052892759, "grad_norm": 0.1980566680431366, "learning_rate": 0.00046708857840934564, "loss": 0.0539, "step": 2690 }, { "epoch": 0.513475767390723, "grad_norm": 0.23285962641239166, "learning_rate": 0.0004668396330268364, "loss": 0.0646, "step": 2700 }, { "epoch": 0.5153775294921701, "grad_norm": 0.23495745658874512, "learning_rate": 0.00046658981652313573, "loss": 0.0585, "step": 2710 }, { "epoch": 0.5172792915936172, "grad_norm": 0.18733032047748566, "learning_rate": 0.00046633912990184457, "loss": 0.0551, "step": 2720 }, { "epoch": 0.5191810536950643, "grad_norm": 0.23610015213489532, "learning_rate": 0.00046608757417005944, "loss": 0.0543, "step": 2730 }, { "epoch": 0.5210828157965115, "grad_norm": 0.35595059394836426, "learning_rate": 0.00046583515033836833, "loss": 0.0633, "step": 2740 }, { "epoch": 0.5229845778979586, "grad_norm": 0.23900267481803894, "learning_rate": 0.00046558185942084657, "loss": 0.0483, "step": 2750 }, { "epoch": 0.5248863399994057, "grad_norm": 0.362981915473938, "learning_rate": 0.00046532770243505313, "loss": 0.0462, "step": 2760 }, { "epoch": 0.5267881021008528, "grad_norm": 0.5131031274795532, "learning_rate": 0.0004650726804020261, "loss": 0.055, "step": 2770 }, { "epoch": 0.5286898642022999, "grad_norm": 0.22358974814414978, "learning_rate": 0.00046481679434627886, "loss": 0.0505, "step": 2780 }, { "epoch": 0.530591626303747, "grad_norm": 0.4018985331058502, "learning_rate": 0.00046456004529579574, "loss": 0.065, "step": 2790 }, { "epoch": 0.5324933884051942, "grad_norm": 0.14427751302719116, "learning_rate": 0.00046430243428202824, "loss": 0.0566, "step": 2800 }, { "epoch": 0.5343951505066413, "grad_norm": 0.2335805743932724, "learning_rate": 0.00046404396233989053, "loss": 0.0582, "step": 2810 }, { "epoch": 0.5362969126080884, "grad_norm": 0.20551510155200958, "learning_rate": 0.00046378463050775534, "loss": 0.069, "step": 2820 }, { "epoch": 0.5381986747095355, "grad_norm": 0.2913135588169098, "learning_rate": 0.0004635244398274501, "loss": 0.0787, "step": 2830 }, { "epoch": 0.5401004368109826, "grad_norm": 0.25064730644226074, "learning_rate": 0.0004632633913442524, "loss": 0.0473, "step": 2840 }, { "epoch": 0.5420021989124298, "grad_norm": 0.24504418671131134, "learning_rate": 0.0004630014861068861, "loss": 0.0724, "step": 2850 }, { "epoch": 0.543903961013877, "grad_norm": 0.14822843670845032, "learning_rate": 0.00046273872516751645, "loss": 0.057, "step": 2860 }, { "epoch": 0.545805723115324, "grad_norm": 0.3440416753292084, "learning_rate": 0.0004624751095817471, "loss": 0.0518, "step": 2870 }, { "epoch": 0.5477074852167712, "grad_norm": 0.3021232485771179, "learning_rate": 0.0004622106404086144, "loss": 0.0353, "step": 2880 }, { "epoch": 0.5496092473182183, "grad_norm": 0.27382078766822815, "learning_rate": 0.00046194531871058435, "loss": 0.0517, "step": 2890 }, { "epoch": 0.5515110094196654, "grad_norm": 0.2883986830711365, "learning_rate": 0.0004616791455535477, "loss": 0.0526, "step": 2900 }, { "epoch": 0.5534127715211126, "grad_norm": 0.1981579065322876, "learning_rate": 0.0004614121220068157, "loss": 0.0441, "step": 2910 }, { "epoch": 0.5553145336225597, "grad_norm": 0.133266881108284, "learning_rate": 0.000461144249143116, "loss": 0.0529, "step": 2920 }, { "epoch": 0.5572162957240068, "grad_norm": 0.2734558880329132, "learning_rate": 0.0004608755280385883, "loss": 0.0482, "step": 2930 }, { "epoch": 0.5591180578254539, "grad_norm": 0.1929953396320343, "learning_rate": 0.00046060595977277997, "loss": 0.0545, "step": 2940 }, { "epoch": 0.561019819926901, "grad_norm": 0.34874358773231506, "learning_rate": 0.00046033554542864157, "loss": 0.0563, "step": 2950 }, { "epoch": 0.5629215820283482, "grad_norm": 0.2735145390033722, "learning_rate": 0.00046006428609252293, "loss": 0.0534, "step": 2960 }, { "epoch": 0.5648233441297953, "grad_norm": 0.1838793009519577, "learning_rate": 0.0004597921828541682, "loss": 0.04, "step": 2970 }, { "epoch": 0.5667251062312424, "grad_norm": 0.2556362748146057, "learning_rate": 0.00045951923680671213, "loss": 0.0659, "step": 2980 }, { "epoch": 0.5686268683326895, "grad_norm": 0.2070973515510559, "learning_rate": 0.000459245449046675, "loss": 0.0449, "step": 2990 }, { "epoch": 0.5705286304341366, "grad_norm": 0.31435173749923706, "learning_rate": 0.0004589708206739587, "loss": 0.0668, "step": 3000 }, { "epoch": 0.5724303925355837, "grad_norm": 0.2508755922317505, "learning_rate": 0.0004586953527918422, "loss": 0.0538, "step": 3010 }, { "epoch": 0.5743321546370309, "grad_norm": 0.11883655935525894, "learning_rate": 0.000458419046506977, "loss": 0.0414, "step": 3020 }, { "epoch": 0.576233916738478, "grad_norm": 0.08662758767604828, "learning_rate": 0.0004581419029293828, "loss": 0.0317, "step": 3030 }, { "epoch": 0.5781356788399251, "grad_norm": 0.22283309698104858, "learning_rate": 0.0004578639231724429, "loss": 0.0392, "step": 3040 }, { "epoch": 0.5800374409413722, "grad_norm": 0.25671476125717163, "learning_rate": 0.0004575851083529, "loss": 0.0375, "step": 3050 }, { "epoch": 0.5819392030428193, "grad_norm": 0.2501511871814728, "learning_rate": 0.0004573054595908514, "loss": 0.0367, "step": 3060 }, { "epoch": 0.5838409651442665, "grad_norm": 0.196889728307724, "learning_rate": 0.00045702497800974474, "loss": 0.0887, "step": 3070 }, { "epoch": 0.5857427272457136, "grad_norm": 0.28852781653404236, "learning_rate": 0.00045674366473637317, "loss": 0.0511, "step": 3080 }, { "epoch": 0.5876444893471607, "grad_norm": 0.20641787350177765, "learning_rate": 0.00045646152090087145, "loss": 0.0594, "step": 3090 }, { "epoch": 0.5895462514486078, "grad_norm": 0.1833396703004837, "learning_rate": 0.0004561785476367106, "loss": 0.0476, "step": 3100 }, { "epoch": 0.5914480135500549, "grad_norm": 0.3527635335922241, "learning_rate": 0.0004558947460806939, "loss": 0.0429, "step": 3110 }, { "epoch": 0.593349775651502, "grad_norm": 0.2580285668373108, "learning_rate": 0.00045561011737295235, "loss": 0.0307, "step": 3120 }, { "epoch": 0.5952515377529493, "grad_norm": 0.21790006756782532, "learning_rate": 0.0004553246626569395, "loss": 0.0314, "step": 3130 }, { "epoch": 0.5971532998543964, "grad_norm": 0.30504387617111206, "learning_rate": 0.00045503838307942756, "loss": 0.0398, "step": 3140 }, { "epoch": 0.5990550619558435, "grad_norm": 0.24918900430202484, "learning_rate": 0.00045475127979050254, "loss": 0.0473, "step": 3150 }, { "epoch": 0.6009568240572906, "grad_norm": 0.30492204427719116, "learning_rate": 0.00045446335394355947, "loss": 0.0536, "step": 3160 }, { "epoch": 0.6028585861587377, "grad_norm": 0.23205405473709106, "learning_rate": 0.0004541746066952978, "loss": 0.0219, "step": 3170 }, { "epoch": 0.6047603482601849, "grad_norm": 0.25788795948028564, "learning_rate": 0.000453885039205717, "loss": 0.0458, "step": 3180 }, { "epoch": 0.606662110361632, "grad_norm": 0.14208689332008362, "learning_rate": 0.0004535946526381117, "loss": 0.0583, "step": 3190 }, { "epoch": 0.6085638724630791, "grad_norm": 0.3792324364185333, "learning_rate": 0.0004533034481590671, "loss": 0.0543, "step": 3200 }, { "epoch": 0.6104656345645262, "grad_norm": 0.11220109462738037, "learning_rate": 0.00045301142693845406, "loss": 0.06, "step": 3210 }, { "epoch": 0.6123673966659733, "grad_norm": 0.37494245171546936, "learning_rate": 0.0004527185901494247, "loss": 0.0333, "step": 3220 }, { "epoch": 0.6142691587674204, "grad_norm": 0.181362122297287, "learning_rate": 0.0004524249389684075, "loss": 0.0413, "step": 3230 }, { "epoch": 0.6161709208688676, "grad_norm": 0.1204942986369133, "learning_rate": 0.0004521304745751029, "loss": 0.0606, "step": 3240 }, { "epoch": 0.6180726829703147, "grad_norm": 0.28631624579429626, "learning_rate": 0.00045183519815247803, "loss": 0.0511, "step": 3250 }, { "epoch": 0.6199744450717618, "grad_norm": 0.3006780743598938, "learning_rate": 0.00045153911088676216, "loss": 0.0603, "step": 3260 }, { "epoch": 0.6218762071732089, "grad_norm": 0.20401684939861298, "learning_rate": 0.00045124221396744226, "loss": 0.0462, "step": 3270 }, { "epoch": 0.623777969274656, "grad_norm": 0.12001892924308777, "learning_rate": 0.00045094450858725775, "loss": 0.0624, "step": 3280 }, { "epoch": 0.6256797313761032, "grad_norm": 0.15589171648025513, "learning_rate": 0.0004506459959421962, "loss": 0.0518, "step": 3290 }, { "epoch": 0.6275814934775503, "grad_norm": 0.24381360411643982, "learning_rate": 0.0004503466772314878, "loss": 0.0516, "step": 3300 }, { "epoch": 0.6294832555789974, "grad_norm": 0.24374118447303772, "learning_rate": 0.0004500465536576015, "loss": 0.0428, "step": 3310 }, { "epoch": 0.6313850176804445, "grad_norm": 0.25958311557769775, "learning_rate": 0.00044974562642623926, "loss": 0.0385, "step": 3320 }, { "epoch": 0.6332867797818916, "grad_norm": 0.20062792301177979, "learning_rate": 0.0004494438967463318, "loss": 0.0327, "step": 3330 }, { "epoch": 0.6351885418833387, "grad_norm": 0.21975015103816986, "learning_rate": 0.0004491413658300336, "loss": 0.0476, "step": 3340 }, { "epoch": 0.6370903039847859, "grad_norm": 0.2386341094970703, "learning_rate": 0.00044883803489271785, "loss": 0.052, "step": 3350 }, { "epoch": 0.638992066086233, "grad_norm": 0.24763132631778717, "learning_rate": 0.00044853390515297176, "loss": 0.0552, "step": 3360 }, { "epoch": 0.6408938281876801, "grad_norm": 0.24076853692531586, "learning_rate": 0.00044822897783259176, "loss": 0.0456, "step": 3370 }, { "epoch": 0.6427955902891272, "grad_norm": 0.22434252500534058, "learning_rate": 0.0004479232541565782, "loss": 0.0467, "step": 3380 }, { "epoch": 0.6446973523905744, "grad_norm": 0.12866199016571045, "learning_rate": 0.00044761673535313084, "loss": 0.0454, "step": 3390 }, { "epoch": 0.6465991144920216, "grad_norm": 0.2736774682998657, "learning_rate": 0.0004473094226536436, "loss": 0.0451, "step": 3400 }, { "epoch": 0.6485008765934687, "grad_norm": 0.29224392771720886, "learning_rate": 0.0004470013172926999, "loss": 0.0508, "step": 3410 }, { "epoch": 0.6504026386949158, "grad_norm": 0.23447951674461365, "learning_rate": 0.00044669242050806766, "loss": 0.0479, "step": 3420 }, { "epoch": 0.6523044007963629, "grad_norm": 0.2928425967693329, "learning_rate": 0.0004463827335406938, "loss": 0.0266, "step": 3430 }, { "epoch": 0.65420616289781, "grad_norm": 0.282113641500473, "learning_rate": 0.0004460722576347002, "loss": 0.0442, "step": 3440 }, { "epoch": 0.6561079249992571, "grad_norm": 0.24653756618499756, "learning_rate": 0.0004457609940373777, "loss": 0.0392, "step": 3450 }, { "epoch": 0.6580096871007043, "grad_norm": 0.2997114658355713, "learning_rate": 0.00044544894399918214, "loss": 0.0477, "step": 3460 }, { "epoch": 0.6599114492021514, "grad_norm": 0.08997475355863571, "learning_rate": 0.00044513610877372814, "loss": 0.0424, "step": 3470 }, { "epoch": 0.6618132113035985, "grad_norm": 0.18397915363311768, "learning_rate": 0.00044482248961778516, "loss": 0.0339, "step": 3480 }, { "epoch": 0.6637149734050456, "grad_norm": 0.18064412474632263, "learning_rate": 0.00044450808779127185, "loss": 0.0437, "step": 3490 }, { "epoch": 0.6656167355064927, "grad_norm": 0.18558929860591888, "learning_rate": 0.00044419290455725103, "loss": 0.0554, "step": 3500 }, { "epoch": 0.6675184976079399, "grad_norm": 0.3115510940551758, "learning_rate": 0.00044387694118192477, "loss": 0.0458, "step": 3510 }, { "epoch": 0.669420259709387, "grad_norm": 0.4627574384212494, "learning_rate": 0.0004435601989346293, "loss": 0.0463, "step": 3520 }, { "epoch": 0.6713220218108341, "grad_norm": 0.17065434157848358, "learning_rate": 0.0004432426790878298, "loss": 0.0457, "step": 3530 }, { "epoch": 0.6732237839122812, "grad_norm": 0.3017977178096771, "learning_rate": 0.00044292438291711536, "loss": 0.0452, "step": 3540 }, { "epoch": 0.6751255460137283, "grad_norm": 0.21006427705287933, "learning_rate": 0.00044260531170119377, "loss": 0.041, "step": 3550 }, { "epoch": 0.6770273081151754, "grad_norm": 0.14174698293209076, "learning_rate": 0.0004422854667218865, "loss": 0.0421, "step": 3560 }, { "epoch": 0.6789290702166226, "grad_norm": 0.21689032018184662, "learning_rate": 0.00044196484926412364, "loss": 0.0426, "step": 3570 }, { "epoch": 0.6808308323180697, "grad_norm": 0.25217998027801514, "learning_rate": 0.00044164346061593827, "loss": 0.0616, "step": 3580 }, { "epoch": 0.6827325944195168, "grad_norm": 0.07392685115337372, "learning_rate": 0.0004413213020684619, "loss": 0.0366, "step": 3590 }, { "epoch": 0.6846343565209639, "grad_norm": 0.2156752347946167, "learning_rate": 0.0004409983749159189, "loss": 0.0352, "step": 3600 }, { "epoch": 0.686536118622411, "grad_norm": 0.007870941422879696, "learning_rate": 0.0004406746804556214, "loss": 0.0342, "step": 3610 }, { "epoch": 0.6884378807238583, "grad_norm": 0.18450793623924255, "learning_rate": 0.000440350219987964, "loss": 0.0292, "step": 3620 }, { "epoch": 0.6903396428253054, "grad_norm": 0.3035867214202881, "learning_rate": 0.00044002499481641876, "loss": 0.0439, "step": 3630 }, { "epoch": 0.6922414049267525, "grad_norm": 0.16944675147533417, "learning_rate": 0.0004396990062475299, "loss": 0.0368, "step": 3640 }, { "epoch": 0.6941431670281996, "grad_norm": 0.17465433478355408, "learning_rate": 0.0004393722555909081, "loss": 0.0512, "step": 3650 }, { "epoch": 0.6960449291296467, "grad_norm": 0.4197070002555847, "learning_rate": 0.0004390447441592259, "loss": 0.0494, "step": 3660 }, { "epoch": 0.6979466912310938, "grad_norm": 0.21269617974758148, "learning_rate": 0.0004387164732682122, "loss": 0.0432, "step": 3670 }, { "epoch": 0.699848453332541, "grad_norm": 0.3487708270549774, "learning_rate": 0.0004383874442366468, "loss": 0.0405, "step": 3680 }, { "epoch": 0.7017502154339881, "grad_norm": 0.3449212908744812, "learning_rate": 0.0004380576583863551, "loss": 0.0604, "step": 3690 }, { "epoch": 0.7036519775354352, "grad_norm": 0.1568797379732132, "learning_rate": 0.0004377271170422031, "loss": 0.0493, "step": 3700 }, { "epoch": 0.7055537396368823, "grad_norm": 0.18511539697647095, "learning_rate": 0.0004373958215320918, "loss": 0.0313, "step": 3710 }, { "epoch": 0.7074555017383294, "grad_norm": 0.12269023805856705, "learning_rate": 0.0004370637731869521, "loss": 0.0343, "step": 3720 }, { "epoch": 0.7093572638397766, "grad_norm": 0.1673561930656433, "learning_rate": 0.000436730973340739, "loss": 0.033, "step": 3730 }, { "epoch": 0.7112590259412237, "grad_norm": 0.3525213301181793, "learning_rate": 0.00043639742333042675, "loss": 0.1303, "step": 3740 }, { "epoch": 0.7131607880426708, "grad_norm": 0.33365005254745483, "learning_rate": 0.00043606312449600334, "loss": 0.1418, "step": 3750 }, { "epoch": 0.7150625501441179, "grad_norm": 0.3005249500274658, "learning_rate": 0.00043572807818046484, "loss": 0.0664, "step": 3760 }, { "epoch": 0.716964312245565, "grad_norm": 0.4880094826221466, "learning_rate": 0.00043539228572981036, "loss": 0.0788, "step": 3770 }, { "epoch": 0.7188660743470121, "grad_norm": 0.7306382656097412, "learning_rate": 0.00043505574849303654, "loss": 0.1543, "step": 3780 }, { "epoch": 0.7207678364484593, "grad_norm": 0.3662378489971161, "learning_rate": 0.00043471846782213184, "loss": 0.0682, "step": 3790 }, { "epoch": 0.7226695985499064, "grad_norm": 0.4383717477321625, "learning_rate": 0.0004343804450720717, "loss": 0.4356, "step": 3800 }, { "epoch": 0.7245713606513535, "grad_norm": 0.24195566773414612, "learning_rate": 0.0004340416816008125, "loss": 0.058, "step": 3810 }, { "epoch": 0.7264731227528006, "grad_norm": 0.746701717376709, "learning_rate": 0.00043370217876928643, "loss": 0.137, "step": 3820 }, { "epoch": 0.7283748848542477, "grad_norm": 0.33316388726234436, "learning_rate": 0.000433361937941396, "loss": 0.0669, "step": 3830 }, { "epoch": 0.7302766469556949, "grad_norm": 0.5440863370895386, "learning_rate": 0.00043302096048400846, "loss": 0.0573, "step": 3840 }, { "epoch": 0.732178409057142, "grad_norm": 0.20268595218658447, "learning_rate": 0.00043267924776695034, "loss": 0.0647, "step": 3850 }, { "epoch": 0.7340801711585891, "grad_norm": 0.12712806463241577, "learning_rate": 0.0004323368011630021, "loss": 0.0656, "step": 3860 }, { "epoch": 0.7359819332600362, "grad_norm": 0.25622451305389404, "learning_rate": 0.00043199362204789224, "loss": 0.0369, "step": 3870 }, { "epoch": 0.7378836953614833, "grad_norm": 0.48822298645973206, "learning_rate": 0.0004316497118002922, "loss": 0.0612, "step": 3880 }, { "epoch": 0.7397854574629305, "grad_norm": 0.6518691778182983, "learning_rate": 0.0004313050718018107, "loss": 0.0536, "step": 3890 }, { "epoch": 0.7416872195643777, "grad_norm": 0.4019169509410858, "learning_rate": 0.0004309597034369878, "loss": 0.0511, "step": 3900 }, { "epoch": 0.7435889816658248, "grad_norm": 0.3208947479724884, "learning_rate": 0.00043061360809329007, "loss": 0.0612, "step": 3910 }, { "epoch": 0.7454907437672719, "grad_norm": 0.6742276549339294, "learning_rate": 0.0004302667871611045, "loss": 0.05, "step": 3920 }, { "epoch": 0.747392505868719, "grad_norm": 0.32614463567733765, "learning_rate": 0.0004299192420337326, "loss": 0.0508, "step": 3930 }, { "epoch": 0.7492942679701661, "grad_norm": 0.407650887966156, "learning_rate": 0.0004295709741073859, "loss": 0.0459, "step": 3940 }, { "epoch": 0.7511960300716132, "grad_norm": 0.11781582981348038, "learning_rate": 0.00042922198478117927, "loss": 0.0452, "step": 3950 }, { "epoch": 0.7530977921730604, "grad_norm": 0.10125182569026947, "learning_rate": 0.0004288722754571257, "loss": 0.0391, "step": 3960 }, { "epoch": 0.7549995542745075, "grad_norm": 0.19000868499279022, "learning_rate": 0.00042852184754013075, "loss": 0.0345, "step": 3970 }, { "epoch": 0.7569013163759546, "grad_norm": 0.00830161478370428, "learning_rate": 0.00042817070243798686, "loss": 0.0446, "step": 3980 }, { "epoch": 0.7588030784774017, "grad_norm": 0.37715670466423035, "learning_rate": 0.0004278188415613675, "loss": 0.0533, "step": 3990 }, { "epoch": 0.7607048405788488, "grad_norm": 0.02961985021829605, "learning_rate": 0.000427466266323822, "loss": 0.0274, "step": 4000 }, { "epoch": 0.762606602680296, "grad_norm": 0.3095683753490448, "learning_rate": 0.00042711297814176897, "loss": 0.0333, "step": 4010 }, { "epoch": 0.7645083647817431, "grad_norm": 0.17599205672740936, "learning_rate": 0.0004267589784344917, "loss": 0.0354, "step": 4020 }, { "epoch": 0.7664101268831902, "grad_norm": 0.29849332571029663, "learning_rate": 0.00042640426862413164, "loss": 0.0388, "step": 4030 }, { "epoch": 0.7683118889846373, "grad_norm": 0.2587386667728424, "learning_rate": 0.00042604885013568304, "loss": 0.0535, "step": 4040 }, { "epoch": 0.7702136510860844, "grad_norm": 0.1985943764448166, "learning_rate": 0.00042569272439698725, "loss": 0.0357, "step": 4050 }, { "epoch": 0.7721154131875315, "grad_norm": 0.28795111179351807, "learning_rate": 0.00042533589283872677, "loss": 0.0437, "step": 4060 }, { "epoch": 0.7740171752889787, "grad_norm": 0.15408733487129211, "learning_rate": 0.0004249783568944197, "loss": 0.051, "step": 4070 }, { "epoch": 0.7759189373904258, "grad_norm": 0.35579803586006165, "learning_rate": 0.00042462011800041376, "loss": 0.0445, "step": 4080 }, { "epoch": 0.7778206994918729, "grad_norm": 0.1227014809846878, "learning_rate": 0.0004242611775958809, "loss": 0.0333, "step": 4090 }, { "epoch": 0.77972246159332, "grad_norm": 0.3193877339363098, "learning_rate": 0.0004239015371228111, "loss": 0.0273, "step": 4100 }, { "epoch": 0.7816242236947671, "grad_norm": 0.09365034103393555, "learning_rate": 0.0004235411980260069, "loss": 0.0252, "step": 4110 }, { "epoch": 0.7835259857962144, "grad_norm": 0.28869110345840454, "learning_rate": 0.0004231801617530773, "loss": 0.035, "step": 4120 }, { "epoch": 0.7854277478976615, "grad_norm": 0.2648963928222656, "learning_rate": 0.0004228184297544323, "loss": 0.0579, "step": 4130 }, { "epoch": 0.7873295099991086, "grad_norm": 0.24735620617866516, "learning_rate": 0.00042245600348327664, "loss": 0.0321, "step": 4140 }, { "epoch": 0.7892312721005557, "grad_norm": 0.14888009428977966, "learning_rate": 0.00042209288439560444, "loss": 0.0347, "step": 4150 }, { "epoch": 0.7911330342020028, "grad_norm": 0.11169561743736267, "learning_rate": 0.0004217290739501929, "loss": 0.0409, "step": 4160 }, { "epoch": 0.7930347963034499, "grad_norm": 0.1095479279756546, "learning_rate": 0.0004213645736085968, "loss": 0.0323, "step": 4170 }, { "epoch": 0.7949365584048971, "grad_norm": 0.21435153484344482, "learning_rate": 0.00042099938483514235, "loss": 0.0418, "step": 4180 }, { "epoch": 0.7968383205063442, "grad_norm": 0.1274380087852478, "learning_rate": 0.0004206335090969215, "loss": 0.04, "step": 4190 }, { "epoch": 0.7987400826077913, "grad_norm": 0.10879164189100266, "learning_rate": 0.00042026694786378603, "loss": 0.0443, "step": 4200 }, { "epoch": 0.8006418447092384, "grad_norm": 0.21737106144428253, "learning_rate": 0.00041989970260834157, "loss": 0.0333, "step": 4210 }, { "epoch": 0.8025436068106855, "grad_norm": 0.3080846071243286, "learning_rate": 0.00041953177480594163, "loss": 0.0521, "step": 4220 }, { "epoch": 0.8044453689121327, "grad_norm": 0.23476336896419525, "learning_rate": 0.0004191631659346818, "loss": 0.0312, "step": 4230 }, { "epoch": 0.8063471310135798, "grad_norm": 0.311959832906723, "learning_rate": 0.00041879387747539376, "loss": 0.0351, "step": 4240 }, { "epoch": 0.8082488931150269, "grad_norm": 0.24322810769081116, "learning_rate": 0.0004184239109116393, "loss": 0.0426, "step": 4250 }, { "epoch": 0.810150655216474, "grad_norm": 0.2684605121612549, "learning_rate": 0.00041805326772970455, "loss": 0.0313, "step": 4260 }, { "epoch": 0.8120524173179211, "grad_norm": 0.2264401763677597, "learning_rate": 0.0004176819494185936, "loss": 0.0499, "step": 4270 }, { "epoch": 0.8139541794193682, "grad_norm": 0.12556996941566467, "learning_rate": 0.00041730995747002296, "loss": 0.0269, "step": 4280 }, { "epoch": 0.8158559415208154, "grad_norm": 0.22863665223121643, "learning_rate": 0.00041693729337841546, "loss": 0.0404, "step": 4290 }, { "epoch": 0.8177577036222625, "grad_norm": 0.007670534774661064, "learning_rate": 0.00041656395864089383, "loss": 0.0318, "step": 4300 }, { "epoch": 0.8196594657237096, "grad_norm": 0.2585577368736267, "learning_rate": 0.0004161899547572753, "loss": 0.0293, "step": 4310 }, { "epoch": 0.8215612278251567, "grad_norm": 0.36932700872421265, "learning_rate": 0.00041581528323006526, "loss": 0.0379, "step": 4320 }, { "epoch": 0.8234629899266038, "grad_norm": 0.2459120750427246, "learning_rate": 0.0004154399455644512, "loss": 0.0281, "step": 4330 }, { "epoch": 0.825364752028051, "grad_norm": 0.03090517409145832, "learning_rate": 0.0004150639432682967, "loss": 0.0401, "step": 4340 }, { "epoch": 0.8272665141294981, "grad_norm": 0.11881538480520248, "learning_rate": 0.0004146872778521355, "loss": 0.0376, "step": 4350 }, { "epoch": 0.8291682762309452, "grad_norm": 0.2136705368757248, "learning_rate": 0.0004143099508291652, "loss": 0.0247, "step": 4360 }, { "epoch": 0.8310700383323923, "grad_norm": 0.22370411455631256, "learning_rate": 0.00041393196371524143, "loss": 0.0287, "step": 4370 }, { "epoch": 0.8329718004338394, "grad_norm": 0.26691102981567383, "learning_rate": 0.00041355331802887156, "loss": 0.0329, "step": 4380 }, { "epoch": 0.8348735625352866, "grad_norm": 0.16438519954681396, "learning_rate": 0.00041317401529120866, "loss": 0.0281, "step": 4390 }, { "epoch": 0.8367753246367338, "grad_norm": 0.1785949021577835, "learning_rate": 0.0004127940570260456, "loss": 0.0313, "step": 4400 }, { "epoch": 0.8386770867381809, "grad_norm": 0.11144755035638809, "learning_rate": 0.00041241344475980823, "loss": 0.0283, "step": 4410 }, { "epoch": 0.840578848839628, "grad_norm": 0.3532226085662842, "learning_rate": 0.00041203218002155046, "loss": 0.027, "step": 4420 }, { "epoch": 0.8424806109410751, "grad_norm": 0.15130820870399475, "learning_rate": 0.0004116502643429469, "loss": 0.0283, "step": 4430 }, { "epoch": 0.8443823730425222, "grad_norm": 0.10683543235063553, "learning_rate": 0.00041126769925828733, "loss": 0.0256, "step": 4440 }, { "epoch": 0.8462841351439694, "grad_norm": 0.12012962251901627, "learning_rate": 0.0004108844863044706, "loss": 0.0244, "step": 4450 }, { "epoch": 0.8481858972454165, "grad_norm": 0.209054633975029, "learning_rate": 0.00041050062702099795, "loss": 0.0371, "step": 4460 }, { "epoch": 0.8500876593468636, "grad_norm": 0.27558189630508423, "learning_rate": 0.00041011612294996746, "loss": 0.0241, "step": 4470 }, { "epoch": 0.8519894214483107, "grad_norm": 0.2076425403356552, "learning_rate": 0.0004097309756360674, "loss": 0.0376, "step": 4480 }, { "epoch": 0.8538911835497578, "grad_norm": 0.1804133802652359, "learning_rate": 0.00040934518662657035, "loss": 0.0412, "step": 4490 }, { "epoch": 0.8557929456512049, "grad_norm": 0.30146369338035583, "learning_rate": 0.0004089587574713264, "loss": 0.0271, "step": 4500 }, { "epoch": 0.8576947077526521, "grad_norm": 0.24618133902549744, "learning_rate": 0.00040857168972275785, "loss": 0.0348, "step": 4510 }, { "epoch": 0.8595964698540992, "grad_norm": 0.1946696937084198, "learning_rate": 0.00040818398493585185, "loss": 0.0217, "step": 4520 }, { "epoch": 0.8614982319555463, "grad_norm": 0.14857546985149384, "learning_rate": 0.0004077956446681554, "loss": 0.0429, "step": 4530 }, { "epoch": 0.8633999940569934, "grad_norm": 0.1880268156528473, "learning_rate": 0.0004074066704797682, "loss": 0.0476, "step": 4540 }, { "epoch": 0.8653017561584405, "grad_norm": 0.13485951721668243, "learning_rate": 0.00040701706393333635, "loss": 0.0369, "step": 4550 }, { "epoch": 0.8672035182598877, "grad_norm": 0.022607989609241486, "learning_rate": 0.00040662682659404684, "loss": 0.017, "step": 4560 }, { "epoch": 0.8691052803613348, "grad_norm": 0.4116274416446686, "learning_rate": 0.00040623596002962027, "loss": 0.0333, "step": 4570 }, { "epoch": 0.8710070424627819, "grad_norm": 0.1159391701221466, "learning_rate": 0.0004058444658103055, "loss": 0.0206, "step": 4580 }, { "epoch": 0.872908804564229, "grad_norm": 0.22013840079307556, "learning_rate": 0.00040545234550887264, "loss": 0.0394, "step": 4590 }, { "epoch": 0.8748105666656761, "grad_norm": 0.1728079915046692, "learning_rate": 0.0004050596007006072, "loss": 0.0435, "step": 4600 }, { "epoch": 0.8767123287671232, "grad_norm": 0.14915741980075836, "learning_rate": 0.0004046662329633032, "loss": 0.0402, "step": 4610 }, { "epoch": 0.8786140908685705, "grad_norm": 0.20652537047863007, "learning_rate": 0.0004042722438772576, "loss": 0.037, "step": 4620 }, { "epoch": 0.8805158529700176, "grad_norm": 0.2016768902540207, "learning_rate": 0.00040387763502526325, "loss": 0.0247, "step": 4630 }, { "epoch": 0.8824176150714647, "grad_norm": 0.07532606273889542, "learning_rate": 0.00040348240799260296, "loss": 0.0302, "step": 4640 }, { "epoch": 0.8843193771729118, "grad_norm": 0.16426606476306915, "learning_rate": 0.00040308656436704294, "loss": 0.0219, "step": 4650 }, { "epoch": 0.8862211392743589, "grad_norm": 0.2340356707572937, "learning_rate": 0.0004026901057388265, "loss": 0.0347, "step": 4660 }, { "epoch": 0.8881229013758061, "grad_norm": 0.019909365102648735, "learning_rate": 0.0004022930337006676, "loss": 0.0411, "step": 4670 }, { "epoch": 0.8900246634772532, "grad_norm": 0.17835919559001923, "learning_rate": 0.0004018953498477444, "loss": 0.0393, "step": 4680 }, { "epoch": 0.8919264255787003, "grad_norm": 0.13876459002494812, "learning_rate": 0.00040149705577769313, "loss": 0.0401, "step": 4690 }, { "epoch": 0.8938281876801474, "grad_norm": 0.16416268050670624, "learning_rate": 0.00040109815309060135, "loss": 0.0396, "step": 4700 }, { "epoch": 0.8957299497815945, "grad_norm": 0.09078884869813919, "learning_rate": 0.0004006986433890017, "loss": 0.039, "step": 4710 }, { "epoch": 0.8976317118830416, "grad_norm": 0.08101935684680939, "learning_rate": 0.00040029852827786535, "loss": 0.0269, "step": 4720 }, { "epoch": 0.8995334739844888, "grad_norm": 0.12669484317302704, "learning_rate": 0.00039989780936459566, "loss": 0.0392, "step": 4730 }, { "epoch": 0.9014352360859359, "grad_norm": 0.11005831509828568, "learning_rate": 0.00039949648825902165, "loss": 0.0378, "step": 4740 }, { "epoch": 0.903336998187383, "grad_norm": 0.21079300343990326, "learning_rate": 0.0003990945665733916, "loss": 0.0334, "step": 4750 }, { "epoch": 0.9052387602888301, "grad_norm": 0.07260707765817642, "learning_rate": 0.0003986920459223665, "loss": 0.0287, "step": 4760 }, { "epoch": 0.9071405223902772, "grad_norm": 0.01923411153256893, "learning_rate": 0.0003982889279230135, "loss": 0.0454, "step": 4770 }, { "epoch": 0.9090422844917244, "grad_norm": 0.13614831864833832, "learning_rate": 0.0003978852141947998, "loss": 0.0407, "step": 4780 }, { "epoch": 0.9109440465931715, "grad_norm": 0.1618584245443344, "learning_rate": 0.00039748090635958555, "loss": 0.0389, "step": 4790 }, { "epoch": 0.9128458086946186, "grad_norm": 0.16158536076545715, "learning_rate": 0.00039707600604161773, "loss": 0.0343, "step": 4800 }, { "epoch": 0.9147475707960657, "grad_norm": 0.1736435741186142, "learning_rate": 0.00039667051486752357, "loss": 0.0204, "step": 4810 }, { "epoch": 0.9166493328975128, "grad_norm": 0.15715183317661285, "learning_rate": 0.000396264434466304, "loss": 0.0423, "step": 4820 }, { "epoch": 0.9185510949989599, "grad_norm": 0.1514957845211029, "learning_rate": 0.00039585776646932703, "loss": 0.0329, "step": 4830 }, { "epoch": 0.9204528571004071, "grad_norm": 0.0464215911924839, "learning_rate": 0.0003954505125103212, "loss": 0.0281, "step": 4840 }, { "epoch": 0.9223546192018542, "grad_norm": 0.24331432580947876, "learning_rate": 0.0003950426742253692, "loss": 0.0385, "step": 4850 }, { "epoch": 0.9242563813033013, "grad_norm": 0.006427076645195484, "learning_rate": 0.00039463425325290095, "loss": 0.0312, "step": 4860 }, { "epoch": 0.9261581434047484, "grad_norm": 0.3959774971008301, "learning_rate": 0.00039422525123368755, "loss": 0.0168, "step": 4870 }, { "epoch": 0.9280599055061955, "grad_norm": 0.14587469398975372, "learning_rate": 0.0003938156698108342, "loss": 0.0319, "step": 4880 }, { "epoch": 0.9299616676076428, "grad_norm": 0.1308862864971161, "learning_rate": 0.0003934055106297735, "loss": 0.0247, "step": 4890 }, { "epoch": 0.9318634297090899, "grad_norm": 0.03495126590132713, "learning_rate": 0.0003929947753382596, "loss": 0.0199, "step": 4900 }, { "epoch": 0.933765191810537, "grad_norm": 0.1539415717124939, "learning_rate": 0.0003925834655863608, "loss": 0.0375, "step": 4910 }, { "epoch": 0.9356669539119841, "grad_norm": 0.1579003781080246, "learning_rate": 0.00039217158302645326, "loss": 0.0295, "step": 4920 }, { "epoch": 0.9375687160134312, "grad_norm": 0.007031635381281376, "learning_rate": 0.00039175912931321426, "loss": 0.0258, "step": 4930 }, { "epoch": 0.9394704781148783, "grad_norm": 0.3409572243690491, "learning_rate": 0.00039134610610361574, "loss": 0.0351, "step": 4940 }, { "epoch": 0.9413722402163255, "grad_norm": 0.27617499232292175, "learning_rate": 0.00039093251505691745, "loss": 0.0224, "step": 4950 }, { "epoch": 0.9432740023177726, "grad_norm": 0.18574129045009613, "learning_rate": 0.00039051835783466025, "loss": 0.0258, "step": 4960 }, { "epoch": 0.9451757644192197, "grad_norm": 0.006668297573924065, "learning_rate": 0.0003901036361006596, "loss": 0.0241, "step": 4970 }, { "epoch": 0.9470775265206668, "grad_norm": 0.14266005158424377, "learning_rate": 0.00038968835152099874, "loss": 0.0262, "step": 4980 }, { "epoch": 0.9489792886221139, "grad_norm": 0.18967001140117645, "learning_rate": 0.00038927250576402227, "loss": 0.0233, "step": 4990 }, { "epoch": 0.950881050723561, "grad_norm": 0.1983349472284317, "learning_rate": 0.00038885610050032896, "loss": 0.0364, "step": 5000 }, { "epoch": 0.9527828128250082, "grad_norm": 0.1807214319705963, "learning_rate": 0.00038843913740276546, "loss": 0.0345, "step": 5010 }, { "epoch": 0.9546845749264553, "grad_norm": 0.336378812789917, "learning_rate": 0.0003880216181464195, "loss": 0.021, "step": 5020 }, { "epoch": 0.9565863370279024, "grad_norm": 0.2094445377588272, "learning_rate": 0.0003876035444086129, "loss": 0.0453, "step": 5030 }, { "epoch": 0.9584880991293495, "grad_norm": 0.1133272647857666, "learning_rate": 0.0003871849178688952, "loss": 0.0339, "step": 5040 }, { "epoch": 0.9603898612307966, "grad_norm": 0.21298140287399292, "learning_rate": 0.00038676574020903666, "loss": 0.0417, "step": 5050 }, { "epoch": 0.9622916233322438, "grad_norm": 0.2184475064277649, "learning_rate": 0.00038634601311302166, "loss": 0.02, "step": 5060 }, { "epoch": 0.9641933854336909, "grad_norm": 0.13946540653705597, "learning_rate": 0.0003859257382670417, "loss": 0.0258, "step": 5070 }, { "epoch": 0.966095147535138, "grad_norm": 0.09975899010896683, "learning_rate": 0.000385504917359489, "loss": 0.0261, "step": 5080 }, { "epoch": 0.9679969096365851, "grad_norm": 0.10357426851987839, "learning_rate": 0.00038508355208094925, "loss": 0.0367, "step": 5090 }, { "epoch": 0.9698986717380322, "grad_norm": 0.2898187041282654, "learning_rate": 0.0003846616441241953, "loss": 0.0398, "step": 5100 }, { "epoch": 0.9718004338394793, "grad_norm": 0.21133796870708466, "learning_rate": 0.00038423919518418, "loss": 0.0311, "step": 5110 }, { "epoch": 0.9737021959409266, "grad_norm": 0.20074529945850372, "learning_rate": 0.0003838162069580296, "loss": 0.0319, "step": 5120 }, { "epoch": 0.9756039580423737, "grad_norm": 0.10800763219594955, "learning_rate": 0.0003833926811450368, "loss": 0.0331, "step": 5130 }, { "epoch": 0.9775057201438208, "grad_norm": 0.17509090900421143, "learning_rate": 0.0003829686194466539, "loss": 0.0176, "step": 5140 }, { "epoch": 0.9794074822452679, "grad_norm": 0.16533738374710083, "learning_rate": 0.0003825440235664863, "loss": 0.0182, "step": 5150 }, { "epoch": 0.981309244346715, "grad_norm": 0.15933182835578918, "learning_rate": 0.000382118895210285, "loss": 0.0398, "step": 5160 }, { "epoch": 0.9832110064481622, "grad_norm": 0.2111063301563263, "learning_rate": 0.0003816932360859405, "loss": 0.0247, "step": 5170 }, { "epoch": 0.9851127685496093, "grad_norm": 0.18295446038246155, "learning_rate": 0.0003812670479034754, "loss": 0.041, "step": 5180 }, { "epoch": 0.9870145306510564, "grad_norm": 0.17066188156604767, "learning_rate": 0.0003808403323750379, "loss": 0.032, "step": 5190 }, { "epoch": 0.9889162927525035, "grad_norm": 0.17090731859207153, "learning_rate": 0.00038041309121489443, "loss": 0.0313, "step": 5200 } ], "logging_steps": 10, "max_steps": 15774, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.3194238878926234e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }