diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,58197 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 50, + "global_step": 8124, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00036927621861152144, + "grad_norm": 0.9271662831306458, + "learning_rate": 4e-05, + "loss": 2.3883, + "step": 1 + }, + { + "epoch": 0.0007385524372230429, + "grad_norm": 0.88088059425354, + "learning_rate": 8e-05, + "loss": 2.4973, + "step": 2 + }, + { + "epoch": 0.0011078286558345643, + "grad_norm": 0.9519290924072266, + "learning_rate": 0.00012, + "loss": 2.4782, + "step": 3 + }, + { + "epoch": 0.0014771048744460858, + "grad_norm": 0.9948570728302002, + "learning_rate": 0.00016, + "loss": 2.4998, + "step": 4 + }, + { + "epoch": 0.001846381093057607, + "grad_norm": 0.7471245527267456, + "learning_rate": 0.0002, + "loss": 2.1436, + "step": 5 + }, + { + "epoch": 0.0022156573116691287, + "grad_norm": 0.8344650268554688, + "learning_rate": 0.0001999753664244365, + "loss": 1.9049, + "step": 6 + }, + { + "epoch": 0.00258493353028065, + "grad_norm": 0.7397900819778442, + "learning_rate": 0.000199950732848873, + "loss": 1.5714, + "step": 7 + }, + { + "epoch": 0.0029542097488921715, + "grad_norm": 0.9788260459899902, + "learning_rate": 0.00019992609927330953, + "loss": 1.3497, + "step": 8 + }, + { + "epoch": 0.0033234859675036928, + "grad_norm": 0.9803362488746643, + "learning_rate": 0.00019990146569774604, + "loss": 1.1523, + "step": 9 + }, + { + "epoch": 0.003692762186115214, + "grad_norm": 1.1928863525390625, + "learning_rate": 0.00019987683212218253, + "loss": 0.8251, + "step": 10 + }, + { + "epoch": 0.004062038404726735, + "grad_norm": 0.8220493197441101, + "learning_rate": 0.00019985219854661904, + "loss": 0.8677, + "step": 11 + }, + { + "epoch": 0.004431314623338257, + "grad_norm": 0.6582696437835693, + "learning_rate": 0.00019982756497105556, + "loss": 0.7035, + "step": 12 + }, + { + "epoch": 0.0048005908419497785, + "grad_norm": 0.71855229139328, + "learning_rate": 0.00019980293139549207, + "loss": 0.6519, + "step": 13 + }, + { + "epoch": 0.0051698670605613, + "grad_norm": 0.44830963015556335, + "learning_rate": 0.00019977829781992856, + "loss": 0.5839, + "step": 14 + }, + { + "epoch": 0.005539143279172821, + "grad_norm": 0.495028555393219, + "learning_rate": 0.00019975366424436507, + "loss": 0.5893, + "step": 15 + }, + { + "epoch": 0.005908419497784343, + "grad_norm": 0.39308783411979675, + "learning_rate": 0.00019972903066880156, + "loss": 0.5092, + "step": 16 + }, + { + "epoch": 0.006277695716395864, + "grad_norm": 0.47888630628585815, + "learning_rate": 0.0001997043970932381, + "loss": 0.5774, + "step": 17 + }, + { + "epoch": 0.0066469719350073855, + "grad_norm": 0.6578012704849243, + "learning_rate": 0.0001996797635176746, + "loss": 0.6483, + "step": 18 + }, + { + "epoch": 0.007016248153618907, + "grad_norm": 0.6908981800079346, + "learning_rate": 0.0001996551299421111, + "loss": 0.5636, + "step": 19 + }, + { + "epoch": 0.007385524372230428, + "grad_norm": 0.43489450216293335, + "learning_rate": 0.0001996304963665476, + "loss": 0.5406, + "step": 20 + }, + { + "epoch": 0.00775480059084195, + "grad_norm": 0.3869227170944214, + "learning_rate": 0.0001996058627909841, + "loss": 0.5641, + "step": 21 + }, + { + "epoch": 0.00812407680945347, + "grad_norm": 0.5425620079040527, + "learning_rate": 0.00019958122921542062, + "loss": 0.5257, + "step": 22 + }, + { + "epoch": 0.008493353028064993, + "grad_norm": 0.6320693492889404, + "learning_rate": 0.00019955659563985714, + "loss": 0.5719, + "step": 23 + }, + { + "epoch": 0.008862629246676515, + "grad_norm": 0.464958131313324, + "learning_rate": 0.00019953196206429363, + "loss": 0.5563, + "step": 24 + }, + { + "epoch": 0.009231905465288036, + "grad_norm": 0.4515001177787781, + "learning_rate": 0.00019950732848873014, + "loss": 0.4673, + "step": 25 + }, + { + "epoch": 0.009601181683899557, + "grad_norm": 0.49136847257614136, + "learning_rate": 0.00019948269491316665, + "loss": 0.4557, + "step": 26 + }, + { + "epoch": 0.009970457902511078, + "grad_norm": 0.46212202310562134, + "learning_rate": 0.00019945806133760317, + "loss": 0.5515, + "step": 27 + }, + { + "epoch": 0.0103397341211226, + "grad_norm": 0.4864141345024109, + "learning_rate": 0.00019943342776203966, + "loss": 0.5798, + "step": 28 + }, + { + "epoch": 0.01070901033973412, + "grad_norm": 0.4452303349971771, + "learning_rate": 0.00019940879418647617, + "loss": 0.4315, + "step": 29 + }, + { + "epoch": 0.011078286558345642, + "grad_norm": 0.38728904724121094, + "learning_rate": 0.00019938416061091269, + "loss": 0.4725, + "step": 30 + }, + { + "epoch": 0.011447562776957163, + "grad_norm": 0.3894054591655731, + "learning_rate": 0.0001993595270353492, + "loss": 0.5628, + "step": 31 + }, + { + "epoch": 0.011816838995568686, + "grad_norm": 0.36863842606544495, + "learning_rate": 0.0001993348934597857, + "loss": 0.4811, + "step": 32 + }, + { + "epoch": 0.012186115214180207, + "grad_norm": 0.45459088683128357, + "learning_rate": 0.0001993102598842222, + "loss": 0.4594, + "step": 33 + }, + { + "epoch": 0.012555391432791729, + "grad_norm": 0.3369868993759155, + "learning_rate": 0.0001992856263086587, + "loss": 0.4485, + "step": 34 + }, + { + "epoch": 0.01292466765140325, + "grad_norm": 0.3974998891353607, + "learning_rate": 0.00019926099273309523, + "loss": 0.3911, + "step": 35 + }, + { + "epoch": 0.013293943870014771, + "grad_norm": 0.36964720487594604, + "learning_rate": 0.00019923635915753172, + "loss": 0.5175, + "step": 36 + }, + { + "epoch": 0.013663220088626292, + "grad_norm": 0.34102922677993774, + "learning_rate": 0.00019921172558196823, + "loss": 0.4615, + "step": 37 + }, + { + "epoch": 0.014032496307237814, + "grad_norm": 0.37175315618515015, + "learning_rate": 0.00019918709200640472, + "loss": 0.4277, + "step": 38 + }, + { + "epoch": 0.014401772525849335, + "grad_norm": 0.3571164011955261, + "learning_rate": 0.00019916245843084124, + "loss": 0.414, + "step": 39 + }, + { + "epoch": 0.014771048744460856, + "grad_norm": 0.4616919457912445, + "learning_rate": 0.00019913782485527775, + "loss": 0.5431, + "step": 40 + }, + { + "epoch": 0.015140324963072379, + "grad_norm": 0.4270848333835602, + "learning_rate": 0.00019911319127971427, + "loss": 0.5241, + "step": 41 + }, + { + "epoch": 0.0155096011816839, + "grad_norm": 0.3516715466976166, + "learning_rate": 0.00019908855770415075, + "loss": 0.3697, + "step": 42 + }, + { + "epoch": 0.01587887740029542, + "grad_norm": 0.385493665933609, + "learning_rate": 0.00019906392412858727, + "loss": 0.4971, + "step": 43 + }, + { + "epoch": 0.01624815361890694, + "grad_norm": 0.43616804480552673, + "learning_rate": 0.00019903929055302378, + "loss": 0.4602, + "step": 44 + }, + { + "epoch": 0.016617429837518464, + "grad_norm": 0.34476238489151, + "learning_rate": 0.0001990146569774603, + "loss": 0.5088, + "step": 45 + }, + { + "epoch": 0.016986706056129987, + "grad_norm": 0.3416641652584076, + "learning_rate": 0.00019899002340189678, + "loss": 0.4123, + "step": 46 + }, + { + "epoch": 0.017355982274741506, + "grad_norm": 0.3711118698120117, + "learning_rate": 0.0001989653898263333, + "loss": 0.446, + "step": 47 + }, + { + "epoch": 0.01772525849335303, + "grad_norm": 0.35145944356918335, + "learning_rate": 0.0001989407562507698, + "loss": 0.4833, + "step": 48 + }, + { + "epoch": 0.01809453471196455, + "grad_norm": 0.37488019466400146, + "learning_rate": 0.00019891612267520633, + "loss": 0.4807, + "step": 49 + }, + { + "epoch": 0.01846381093057607, + "grad_norm": 0.3423296809196472, + "learning_rate": 0.00019889148909964282, + "loss": 0.4943, + "step": 50 + }, + { + "epoch": 0.01846381093057607, + "eval_loss": 7.699151515960693, + "eval_runtime": 7.2094, + "eval_samples_per_second": 6.935, + "eval_steps_per_second": 0.971, + "step": 50 + }, + { + "epoch": 0.01883308714918759, + "grad_norm": 0.34390905499458313, + "learning_rate": 0.00019886685552407933, + "loss": 0.3801, + "step": 51 + }, + { + "epoch": 0.019202363367799114, + "grad_norm": 0.39482685923576355, + "learning_rate": 0.00019884222194851582, + "loss": 0.5052, + "step": 52 + }, + { + "epoch": 0.019571639586410634, + "grad_norm": 0.40202629566192627, + "learning_rate": 0.00019881758837295233, + "loss": 0.4766, + "step": 53 + }, + { + "epoch": 0.019940915805022157, + "grad_norm": 0.3378605544567108, + "learning_rate": 0.00019879295479738885, + "loss": 0.4197, + "step": 54 + }, + { + "epoch": 0.02031019202363368, + "grad_norm": 0.30033889412879944, + "learning_rate": 0.00019876832122182536, + "loss": 0.3855, + "step": 55 + }, + { + "epoch": 0.0206794682422452, + "grad_norm": 0.3419307470321655, + "learning_rate": 0.00019874368764626185, + "loss": 0.4573, + "step": 56 + }, + { + "epoch": 0.021048744460856722, + "grad_norm": 0.33110278844833374, + "learning_rate": 0.00019871905407069836, + "loss": 0.4847, + "step": 57 + }, + { + "epoch": 0.02141802067946824, + "grad_norm": 0.33952057361602783, + "learning_rate": 0.00019869442049513488, + "loss": 0.404, + "step": 58 + }, + { + "epoch": 0.021787296898079764, + "grad_norm": 0.4051787853240967, + "learning_rate": 0.0001986697869195714, + "loss": 0.4875, + "step": 59 + }, + { + "epoch": 0.022156573116691284, + "grad_norm": 0.4196843206882477, + "learning_rate": 0.00019864515334400788, + "loss": 0.4758, + "step": 60 + }, + { + "epoch": 0.022525849335302807, + "grad_norm": 0.39869606494903564, + "learning_rate": 0.0001986205197684444, + "loss": 0.4799, + "step": 61 + }, + { + "epoch": 0.022895125553914326, + "grad_norm": 0.34429123997688293, + "learning_rate": 0.0001985958861928809, + "loss": 0.4738, + "step": 62 + }, + { + "epoch": 0.02326440177252585, + "grad_norm": 0.3623432517051697, + "learning_rate": 0.00019857125261731743, + "loss": 0.5323, + "step": 63 + }, + { + "epoch": 0.023633677991137372, + "grad_norm": 0.29430320858955383, + "learning_rate": 0.0001985466190417539, + "loss": 0.4141, + "step": 64 + }, + { + "epoch": 0.024002954209748892, + "grad_norm": 0.37135806679725647, + "learning_rate": 0.00019852198546619043, + "loss": 0.4358, + "step": 65 + }, + { + "epoch": 0.024372230428360415, + "grad_norm": 0.32982441782951355, + "learning_rate": 0.00019849735189062691, + "loss": 0.4663, + "step": 66 + }, + { + "epoch": 0.024741506646971934, + "grad_norm": 0.3640348017215729, + "learning_rate": 0.00019847271831506346, + "loss": 0.4802, + "step": 67 + }, + { + "epoch": 0.025110782865583457, + "grad_norm": 0.29507142305374146, + "learning_rate": 0.00019844808473949994, + "loss": 0.3541, + "step": 68 + }, + { + "epoch": 0.025480059084194977, + "grad_norm": 0.31904470920562744, + "learning_rate": 0.00019842345116393646, + "loss": 0.4135, + "step": 69 + }, + { + "epoch": 0.0258493353028065, + "grad_norm": 0.5192385911941528, + "learning_rate": 0.00019839881758837295, + "loss": 0.4902, + "step": 70 + }, + { + "epoch": 0.02621861152141802, + "grad_norm": 0.3420427441596985, + "learning_rate": 0.00019837418401280946, + "loss": 0.4691, + "step": 71 + }, + { + "epoch": 0.026587887740029542, + "grad_norm": 0.40519043803215027, + "learning_rate": 0.00019834955043724598, + "loss": 0.4685, + "step": 72 + }, + { + "epoch": 0.026957163958641065, + "grad_norm": 0.31065571308135986, + "learning_rate": 0.0001983249168616825, + "loss": 0.4017, + "step": 73 + }, + { + "epoch": 0.027326440177252585, + "grad_norm": 0.35353246331214905, + "learning_rate": 0.00019830028328611898, + "loss": 0.5233, + "step": 74 + }, + { + "epoch": 0.027695716395864108, + "grad_norm": 0.36298489570617676, + "learning_rate": 0.0001982756497105555, + "loss": 0.4821, + "step": 75 + }, + { + "epoch": 0.028064992614475627, + "grad_norm": 0.43392282724380493, + "learning_rate": 0.000198251016134992, + "loss": 0.5163, + "step": 76 + }, + { + "epoch": 0.02843426883308715, + "grad_norm": 0.39038610458374023, + "learning_rate": 0.00019822638255942852, + "loss": 0.503, + "step": 77 + }, + { + "epoch": 0.02880354505169867, + "grad_norm": 0.3230394721031189, + "learning_rate": 0.000198201748983865, + "loss": 0.4861, + "step": 78 + }, + { + "epoch": 0.029172821270310192, + "grad_norm": 0.301174134016037, + "learning_rate": 0.00019817711540830152, + "loss": 0.4156, + "step": 79 + }, + { + "epoch": 0.029542097488921712, + "grad_norm": 0.2888117730617523, + "learning_rate": 0.000198152481832738, + "loss": 0.4177, + "step": 80 + }, + { + "epoch": 0.029911373707533235, + "grad_norm": 0.3205660283565521, + "learning_rate": 0.00019812784825717455, + "loss": 0.3874, + "step": 81 + }, + { + "epoch": 0.030280649926144758, + "grad_norm": 0.30243366956710815, + "learning_rate": 0.00019810321468161104, + "loss": 0.4556, + "step": 82 + }, + { + "epoch": 0.030649926144756277, + "grad_norm": 0.2838730216026306, + "learning_rate": 0.00019807858110604756, + "loss": 0.398, + "step": 83 + }, + { + "epoch": 0.0310192023633678, + "grad_norm": 0.29214543104171753, + "learning_rate": 0.00019805394753048404, + "loss": 0.4796, + "step": 84 + }, + { + "epoch": 0.03138847858197932, + "grad_norm": 0.3265981674194336, + "learning_rate": 0.00019802931395492056, + "loss": 0.4212, + "step": 85 + }, + { + "epoch": 0.03175775480059084, + "grad_norm": 0.35232314467430115, + "learning_rate": 0.00019800468037935707, + "loss": 0.4241, + "step": 86 + }, + { + "epoch": 0.03212703101920236, + "grad_norm": 0.276351660490036, + "learning_rate": 0.0001979800468037936, + "loss": 0.3585, + "step": 87 + }, + { + "epoch": 0.03249630723781388, + "grad_norm": 0.32632961869239807, + "learning_rate": 0.00019795541322823007, + "loss": 0.4378, + "step": 88 + }, + { + "epoch": 0.03286558345642541, + "grad_norm": 0.2885938286781311, + "learning_rate": 0.0001979307796526666, + "loss": 0.367, + "step": 89 + }, + { + "epoch": 0.03323485967503693, + "grad_norm": 0.3009279668331146, + "learning_rate": 0.0001979061460771031, + "loss": 0.3774, + "step": 90 + }, + { + "epoch": 0.03360413589364845, + "grad_norm": 0.30113980174064636, + "learning_rate": 0.00019788151250153962, + "loss": 0.421, + "step": 91 + }, + { + "epoch": 0.033973412112259974, + "grad_norm": 0.32623499631881714, + "learning_rate": 0.0001978568789259761, + "loss": 0.4139, + "step": 92 + }, + { + "epoch": 0.03434268833087149, + "grad_norm": 0.3049476742744446, + "learning_rate": 0.00019783224535041262, + "loss": 0.4059, + "step": 93 + }, + { + "epoch": 0.03471196454948301, + "grad_norm": 0.4079396724700928, + "learning_rate": 0.00019780761177484913, + "loss": 0.4292, + "step": 94 + }, + { + "epoch": 0.03508124076809453, + "grad_norm": 0.36075252294540405, + "learning_rate": 0.00019778297819928565, + "loss": 0.4218, + "step": 95 + }, + { + "epoch": 0.03545051698670606, + "grad_norm": 0.36485761404037476, + "learning_rate": 0.00019775834462372214, + "loss": 0.4383, + "step": 96 + }, + { + "epoch": 0.03581979320531758, + "grad_norm": 0.287807434797287, + "learning_rate": 0.00019773371104815865, + "loss": 0.417, + "step": 97 + }, + { + "epoch": 0.0361890694239291, + "grad_norm": 0.348663330078125, + "learning_rate": 0.00019770907747259514, + "loss": 0.4747, + "step": 98 + }, + { + "epoch": 0.03655834564254062, + "grad_norm": 0.264282763004303, + "learning_rate": 0.00019768444389703168, + "loss": 0.4087, + "step": 99 + }, + { + "epoch": 0.03692762186115214, + "grad_norm": 0.2607477605342865, + "learning_rate": 0.00019765981032146817, + "loss": 0.3513, + "step": 100 + }, + { + "epoch": 0.03692762186115214, + "eval_loss": 7.595666408538818, + "eval_runtime": 6.9172, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 1.012, + "step": 100 + }, + { + "epoch": 0.03729689807976366, + "grad_norm": 0.30780139565467834, + "learning_rate": 0.00019763517674590468, + "loss": 0.4822, + "step": 101 + }, + { + "epoch": 0.03766617429837518, + "grad_norm": 0.26572558283805847, + "learning_rate": 0.00019761054317034117, + "loss": 0.36, + "step": 102 + }, + { + "epoch": 0.03803545051698671, + "grad_norm": 0.3111974895000458, + "learning_rate": 0.00019758590959477769, + "loss": 0.3702, + "step": 103 + }, + { + "epoch": 0.03840472673559823, + "grad_norm": 0.3563987612724304, + "learning_rate": 0.0001975612760192142, + "loss": 0.411, + "step": 104 + }, + { + "epoch": 0.03877400295420975, + "grad_norm": 0.3589745759963989, + "learning_rate": 0.00019753664244365071, + "loss": 0.4023, + "step": 105 + }, + { + "epoch": 0.03914327917282127, + "grad_norm": 0.4102230966091156, + "learning_rate": 0.0001975120088680872, + "loss": 0.4701, + "step": 106 + }, + { + "epoch": 0.039512555391432794, + "grad_norm": 0.36882904171943665, + "learning_rate": 0.00019748737529252372, + "loss": 0.3644, + "step": 107 + }, + { + "epoch": 0.03988183161004431, + "grad_norm": 0.30516114830970764, + "learning_rate": 0.00019746274171696023, + "loss": 0.4135, + "step": 108 + }, + { + "epoch": 0.04025110782865583, + "grad_norm": 0.27239319682121277, + "learning_rate": 0.00019743810814139675, + "loss": 0.3978, + "step": 109 + }, + { + "epoch": 0.04062038404726736, + "grad_norm": 0.3528960049152374, + "learning_rate": 0.00019741347456583323, + "loss": 0.3629, + "step": 110 + }, + { + "epoch": 0.04098966026587888, + "grad_norm": 0.33681753277778625, + "learning_rate": 0.00019738884099026975, + "loss": 0.4085, + "step": 111 + }, + { + "epoch": 0.0413589364844904, + "grad_norm": 0.3217534124851227, + "learning_rate": 0.00019736420741470624, + "loss": 0.4998, + "step": 112 + }, + { + "epoch": 0.04172821270310192, + "grad_norm": 0.34834039211273193, + "learning_rate": 0.00019733957383914278, + "loss": 0.3901, + "step": 113 + }, + { + "epoch": 0.042097488921713444, + "grad_norm": 0.3033251464366913, + "learning_rate": 0.00019731494026357927, + "loss": 0.4318, + "step": 114 + }, + { + "epoch": 0.042466765140324964, + "grad_norm": 0.30152222514152527, + "learning_rate": 0.00019729030668801578, + "loss": 0.3838, + "step": 115 + }, + { + "epoch": 0.04283604135893648, + "grad_norm": 0.26252543926239014, + "learning_rate": 0.00019726567311245227, + "loss": 0.3112, + "step": 116 + }, + { + "epoch": 0.043205317577548, + "grad_norm": 0.30053412914276123, + "learning_rate": 0.00019724103953688878, + "loss": 0.4626, + "step": 117 + }, + { + "epoch": 0.04357459379615953, + "grad_norm": 0.4096478223800659, + "learning_rate": 0.0001972164059613253, + "loss": 0.5144, + "step": 118 + }, + { + "epoch": 0.04394387001477105, + "grad_norm": 0.3301101624965668, + "learning_rate": 0.0001971917723857618, + "loss": 0.3704, + "step": 119 + }, + { + "epoch": 0.04431314623338257, + "grad_norm": 0.3132151663303375, + "learning_rate": 0.0001971671388101983, + "loss": 0.3997, + "step": 120 + }, + { + "epoch": 0.044682422451994094, + "grad_norm": 0.2632290720939636, + "learning_rate": 0.0001971425052346348, + "loss": 0.3255, + "step": 121 + }, + { + "epoch": 0.045051698670605614, + "grad_norm": 0.2516363561153412, + "learning_rate": 0.00019711787165907133, + "loss": 0.3154, + "step": 122 + }, + { + "epoch": 0.04542097488921713, + "grad_norm": 0.3500913679599762, + "learning_rate": 0.00019709323808350784, + "loss": 0.4724, + "step": 123 + }, + { + "epoch": 0.04579025110782865, + "grad_norm": 0.2939763367176056, + "learning_rate": 0.00019706860450794433, + "loss": 0.3685, + "step": 124 + }, + { + "epoch": 0.04615952732644018, + "grad_norm": 0.30508339405059814, + "learning_rate": 0.00019704397093238084, + "loss": 0.4158, + "step": 125 + }, + { + "epoch": 0.0465288035450517, + "grad_norm": 0.3385158181190491, + "learning_rate": 0.00019701933735681736, + "loss": 0.3789, + "step": 126 + }, + { + "epoch": 0.04689807976366322, + "grad_norm": 0.33476635813713074, + "learning_rate": 0.00019699470378125387, + "loss": 0.4632, + "step": 127 + }, + { + "epoch": 0.047267355982274745, + "grad_norm": 0.3433752954006195, + "learning_rate": 0.00019697007020569036, + "loss": 0.4054, + "step": 128 + }, + { + "epoch": 0.047636632200886264, + "grad_norm": 0.29373449087142944, + "learning_rate": 0.00019694543663012688, + "loss": 0.3967, + "step": 129 + }, + { + "epoch": 0.048005908419497784, + "grad_norm": 0.3482401967048645, + "learning_rate": 0.00019692080305456336, + "loss": 0.5018, + "step": 130 + }, + { + "epoch": 0.0483751846381093, + "grad_norm": 0.33771812915802, + "learning_rate": 0.0001968961694789999, + "loss": 0.4113, + "step": 131 + }, + { + "epoch": 0.04874446085672083, + "grad_norm": 0.2921206057071686, + "learning_rate": 0.0001968715359034364, + "loss": 0.3901, + "step": 132 + }, + { + "epoch": 0.04911373707533235, + "grad_norm": 0.30001071095466614, + "learning_rate": 0.0001968469023278729, + "loss": 0.4336, + "step": 133 + }, + { + "epoch": 0.04948301329394387, + "grad_norm": 0.33289170265197754, + "learning_rate": 0.0001968222687523094, + "loss": 0.3591, + "step": 134 + }, + { + "epoch": 0.04985228951255539, + "grad_norm": 0.32086479663848877, + "learning_rate": 0.0001967976351767459, + "loss": 0.3866, + "step": 135 + }, + { + "epoch": 0.050221565731166914, + "grad_norm": 0.3464393615722656, + "learning_rate": 0.00019677300160118242, + "loss": 0.5666, + "step": 136 + }, + { + "epoch": 0.050590841949778434, + "grad_norm": 0.2988763451576233, + "learning_rate": 0.00019674836802561894, + "loss": 0.3846, + "step": 137 + }, + { + "epoch": 0.05096011816838995, + "grad_norm": 0.28799182176589966, + "learning_rate": 0.00019672373445005543, + "loss": 0.3591, + "step": 138 + }, + { + "epoch": 0.05132939438700148, + "grad_norm": 0.2651379406452179, + "learning_rate": 0.00019669910087449194, + "loss": 0.3491, + "step": 139 + }, + { + "epoch": 0.051698670605613, + "grad_norm": 0.3103938102722168, + "learning_rate": 0.00019667446729892846, + "loss": 0.3934, + "step": 140 + }, + { + "epoch": 0.05206794682422452, + "grad_norm": 0.3438890874385834, + "learning_rate": 0.00019664983372336497, + "loss": 0.5211, + "step": 141 + }, + { + "epoch": 0.05243722304283604, + "grad_norm": 0.32043004035949707, + "learning_rate": 0.00019662520014780146, + "loss": 0.3791, + "step": 142 + }, + { + "epoch": 0.052806499261447565, + "grad_norm": 0.3433363437652588, + "learning_rate": 0.00019660056657223797, + "loss": 0.4582, + "step": 143 + }, + { + "epoch": 0.053175775480059084, + "grad_norm": 0.34730157256126404, + "learning_rate": 0.00019657593299667446, + "loss": 0.4381, + "step": 144 + }, + { + "epoch": 0.053545051698670604, + "grad_norm": 0.30199557542800903, + "learning_rate": 0.000196551299421111, + "loss": 0.4061, + "step": 145 + }, + { + "epoch": 0.05391432791728213, + "grad_norm": 0.3475738763809204, + "learning_rate": 0.0001965266658455475, + "loss": 0.3945, + "step": 146 + }, + { + "epoch": 0.05428360413589365, + "grad_norm": 0.28861764073371887, + "learning_rate": 0.000196502032269984, + "loss": 0.4133, + "step": 147 + }, + { + "epoch": 0.05465288035450517, + "grad_norm": 0.3122514486312866, + "learning_rate": 0.0001964773986944205, + "loss": 0.3963, + "step": 148 + }, + { + "epoch": 0.05502215657311669, + "grad_norm": 0.2826829254627228, + "learning_rate": 0.000196452765118857, + "loss": 0.3409, + "step": 149 + }, + { + "epoch": 0.055391432791728215, + "grad_norm": 0.2843906581401825, + "learning_rate": 0.00019642813154329352, + "loss": 0.3333, + "step": 150 + }, + { + "epoch": 0.055391432791728215, + "eval_loss": 7.590583324432373, + "eval_runtime": 7.0297, + "eval_samples_per_second": 7.113, + "eval_steps_per_second": 0.996, + "step": 150 + }, + { + "epoch": 0.055760709010339735, + "grad_norm": 0.3123917579650879, + "learning_rate": 0.00019640349796773004, + "loss": 0.4046, + "step": 151 + }, + { + "epoch": 0.056129985228951254, + "grad_norm": 0.3409869074821472, + "learning_rate": 0.00019637886439216652, + "loss": 0.4826, + "step": 152 + }, + { + "epoch": 0.056499261447562774, + "grad_norm": 0.30526500940322876, + "learning_rate": 0.00019635423081660304, + "loss": 0.4138, + "step": 153 + }, + { + "epoch": 0.0568685376661743, + "grad_norm": 0.2681126594543457, + "learning_rate": 0.00019632959724103955, + "loss": 0.3492, + "step": 154 + }, + { + "epoch": 0.05723781388478582, + "grad_norm": 0.37410515546798706, + "learning_rate": 0.00019630496366547607, + "loss": 0.4561, + "step": 155 + }, + { + "epoch": 0.05760709010339734, + "grad_norm": 0.2942816913127899, + "learning_rate": 0.00019628033008991255, + "loss": 0.4086, + "step": 156 + }, + { + "epoch": 0.057976366322008865, + "grad_norm": 0.27548208832740784, + "learning_rate": 0.00019625569651434907, + "loss": 0.3849, + "step": 157 + }, + { + "epoch": 0.058345642540620385, + "grad_norm": 0.306073397397995, + "learning_rate": 0.00019623106293878556, + "loss": 0.4165, + "step": 158 + }, + { + "epoch": 0.058714918759231904, + "grad_norm": 0.4765852689743042, + "learning_rate": 0.0001962064293632221, + "loss": 0.3426, + "step": 159 + }, + { + "epoch": 0.059084194977843424, + "grad_norm": 0.3074716627597809, + "learning_rate": 0.00019618179578765859, + "loss": 0.3948, + "step": 160 + }, + { + "epoch": 0.05945347119645495, + "grad_norm": 0.2598515748977661, + "learning_rate": 0.0001961571622120951, + "loss": 0.3351, + "step": 161 + }, + { + "epoch": 0.05982274741506647, + "grad_norm": 0.3224346935749054, + "learning_rate": 0.0001961325286365316, + "loss": 0.4008, + "step": 162 + }, + { + "epoch": 0.06019202363367799, + "grad_norm": 0.26676636934280396, + "learning_rate": 0.00019610789506096813, + "loss": 0.327, + "step": 163 + }, + { + "epoch": 0.060561299852289516, + "grad_norm": 0.34058332443237305, + "learning_rate": 0.00019608326148540462, + "loss": 0.3992, + "step": 164 + }, + { + "epoch": 0.060930576070901035, + "grad_norm": 0.27629032731056213, + "learning_rate": 0.00019605862790984113, + "loss": 0.4296, + "step": 165 + }, + { + "epoch": 0.061299852289512555, + "grad_norm": 0.30480247735977173, + "learning_rate": 0.00019603399433427762, + "loss": 0.3839, + "step": 166 + }, + { + "epoch": 0.061669128508124074, + "grad_norm": 0.29698097705841064, + "learning_rate": 0.00019600936075871413, + "loss": 0.3852, + "step": 167 + }, + { + "epoch": 0.0620384047267356, + "grad_norm": 0.3043414354324341, + "learning_rate": 0.00019598472718315065, + "loss": 0.3654, + "step": 168 + }, + { + "epoch": 0.06240768094534712, + "grad_norm": 0.3302856385707855, + "learning_rate": 0.00019596009360758716, + "loss": 0.4524, + "step": 169 + }, + { + "epoch": 0.06277695716395865, + "grad_norm": 0.2712551951408386, + "learning_rate": 0.00019593546003202365, + "loss": 0.3621, + "step": 170 + }, + { + "epoch": 0.06314623338257017, + "grad_norm": 0.29931533336639404, + "learning_rate": 0.00019591082645646017, + "loss": 0.4093, + "step": 171 + }, + { + "epoch": 0.06351550960118169, + "grad_norm": 0.3207045793533325, + "learning_rate": 0.00019588619288089668, + "loss": 0.3306, + "step": 172 + }, + { + "epoch": 0.0638847858197932, + "grad_norm": 0.37446922063827515, + "learning_rate": 0.0001958615593053332, + "loss": 0.4196, + "step": 173 + }, + { + "epoch": 0.06425406203840472, + "grad_norm": 0.3298546075820923, + "learning_rate": 0.00019583692572976968, + "loss": 0.4887, + "step": 174 + }, + { + "epoch": 0.06462333825701624, + "grad_norm": 0.2995353639125824, + "learning_rate": 0.0001958122921542062, + "loss": 0.3489, + "step": 175 + }, + { + "epoch": 0.06499261447562776, + "grad_norm": 0.3142220675945282, + "learning_rate": 0.00019578765857864268, + "loss": 0.4094, + "step": 176 + }, + { + "epoch": 0.0653618906942393, + "grad_norm": 0.26946571469306946, + "learning_rate": 0.00019576302500307923, + "loss": 0.3705, + "step": 177 + }, + { + "epoch": 0.06573116691285082, + "grad_norm": 0.3591237962245941, + "learning_rate": 0.00019573839142751571, + "loss": 0.414, + "step": 178 + }, + { + "epoch": 0.06610044313146234, + "grad_norm": 0.24683284759521484, + "learning_rate": 0.00019571375785195223, + "loss": 0.3237, + "step": 179 + }, + { + "epoch": 0.06646971935007386, + "grad_norm": 0.2725733816623688, + "learning_rate": 0.00019568912427638872, + "loss": 0.305, + "step": 180 + }, + { + "epoch": 0.06683899556868537, + "grad_norm": 0.2675776779651642, + "learning_rate": 0.00019566449070082523, + "loss": 0.3568, + "step": 181 + }, + { + "epoch": 0.0672082717872969, + "grad_norm": 0.30420437455177307, + "learning_rate": 0.00019563985712526175, + "loss": 0.3836, + "step": 182 + }, + { + "epoch": 0.06757754800590841, + "grad_norm": 0.36870890855789185, + "learning_rate": 0.00019561522354969826, + "loss": 0.5199, + "step": 183 + }, + { + "epoch": 0.06794682422451995, + "grad_norm": 0.33839285373687744, + "learning_rate": 0.00019559058997413475, + "loss": 0.4877, + "step": 184 + }, + { + "epoch": 0.06831610044313147, + "grad_norm": 0.32313859462738037, + "learning_rate": 0.00019556595639857126, + "loss": 0.4604, + "step": 185 + }, + { + "epoch": 0.06868537666174299, + "grad_norm": 0.2729129195213318, + "learning_rate": 0.00019554132282300778, + "loss": 0.3299, + "step": 186 + }, + { + "epoch": 0.0690546528803545, + "grad_norm": 0.2671365439891815, + "learning_rate": 0.0001955166892474443, + "loss": 0.3187, + "step": 187 + }, + { + "epoch": 0.06942392909896603, + "grad_norm": 0.2555955946445465, + "learning_rate": 0.00019549205567188078, + "loss": 0.2984, + "step": 188 + }, + { + "epoch": 0.06979320531757754, + "grad_norm": 0.28530117869377136, + "learning_rate": 0.0001954674220963173, + "loss": 0.3348, + "step": 189 + }, + { + "epoch": 0.07016248153618906, + "grad_norm": 0.3249630928039551, + "learning_rate": 0.00019544278852075378, + "loss": 0.5253, + "step": 190 + }, + { + "epoch": 0.0705317577548006, + "grad_norm": 0.33182090520858765, + "learning_rate": 0.00019541815494519032, + "loss": 0.4761, + "step": 191 + }, + { + "epoch": 0.07090103397341212, + "grad_norm": 0.3631516695022583, + "learning_rate": 0.0001953935213696268, + "loss": 0.4011, + "step": 192 + }, + { + "epoch": 0.07127031019202364, + "grad_norm": 0.3483794331550598, + "learning_rate": 0.00019536888779406333, + "loss": 0.5041, + "step": 193 + }, + { + "epoch": 0.07163958641063516, + "grad_norm": 0.2953607141971588, + "learning_rate": 0.0001953442542184998, + "loss": 0.3428, + "step": 194 + }, + { + "epoch": 0.07200886262924668, + "grad_norm": 0.32699698209762573, + "learning_rate": 0.00019531962064293633, + "loss": 0.3822, + "step": 195 + }, + { + "epoch": 0.0723781388478582, + "grad_norm": 0.40169933438301086, + "learning_rate": 0.00019529498706737284, + "loss": 0.4866, + "step": 196 + }, + { + "epoch": 0.07274741506646971, + "grad_norm": 0.3008585572242737, + "learning_rate": 0.00019527035349180936, + "loss": 0.3931, + "step": 197 + }, + { + "epoch": 0.07311669128508123, + "grad_norm": 0.3343890905380249, + "learning_rate": 0.00019524571991624584, + "loss": 0.4818, + "step": 198 + }, + { + "epoch": 0.07348596750369277, + "grad_norm": 0.2947738468647003, + "learning_rate": 0.00019522108634068236, + "loss": 0.3153, + "step": 199 + }, + { + "epoch": 0.07385524372230429, + "grad_norm": 0.5027809739112854, + "learning_rate": 0.00019519645276511887, + "loss": 0.5032, + "step": 200 + }, + { + "epoch": 0.07385524372230429, + "eval_loss": 7.606281757354736, + "eval_runtime": 7.1363, + "eval_samples_per_second": 7.006, + "eval_steps_per_second": 0.981, + "step": 200 + }, + { + "epoch": 0.0742245199409158, + "grad_norm": 0.34096887707710266, + "learning_rate": 0.0001951718191895554, + "loss": 0.4879, + "step": 201 + }, + { + "epoch": 0.07459379615952733, + "grad_norm": 0.31350183486938477, + "learning_rate": 0.00019514718561399188, + "loss": 0.4272, + "step": 202 + }, + { + "epoch": 0.07496307237813885, + "grad_norm": 0.31799545884132385, + "learning_rate": 0.0001951225520384284, + "loss": 0.4551, + "step": 203 + }, + { + "epoch": 0.07533234859675036, + "grad_norm": 0.38535112142562866, + "learning_rate": 0.0001950979184628649, + "loss": 0.4394, + "step": 204 + }, + { + "epoch": 0.07570162481536188, + "grad_norm": 0.3350127935409546, + "learning_rate": 0.00019507328488730142, + "loss": 0.3825, + "step": 205 + }, + { + "epoch": 0.07607090103397342, + "grad_norm": 0.3102962374687195, + "learning_rate": 0.0001950486513117379, + "loss": 0.4046, + "step": 206 + }, + { + "epoch": 0.07644017725258494, + "grad_norm": 0.23887918889522552, + "learning_rate": 0.00019502401773617442, + "loss": 0.307, + "step": 207 + }, + { + "epoch": 0.07680945347119646, + "grad_norm": 0.31021204590797424, + "learning_rate": 0.0001949993841606109, + "loss": 0.4068, + "step": 208 + }, + { + "epoch": 0.07717872968980798, + "grad_norm": 0.28989648818969727, + "learning_rate": 0.00019497475058504745, + "loss": 0.3574, + "step": 209 + }, + { + "epoch": 0.0775480059084195, + "grad_norm": 0.27846693992614746, + "learning_rate": 0.00019495011700948394, + "loss": 0.3755, + "step": 210 + }, + { + "epoch": 0.07791728212703102, + "grad_norm": 0.30180665850639343, + "learning_rate": 0.00019492548343392045, + "loss": 0.4028, + "step": 211 + }, + { + "epoch": 0.07828655834564253, + "grad_norm": 0.279528945684433, + "learning_rate": 0.00019490084985835694, + "loss": 0.3874, + "step": 212 + }, + { + "epoch": 0.07865583456425407, + "grad_norm": 0.2765534818172455, + "learning_rate": 0.00019487621628279346, + "loss": 0.3544, + "step": 213 + }, + { + "epoch": 0.07902511078286559, + "grad_norm": 0.33842530846595764, + "learning_rate": 0.00019485158270722997, + "loss": 0.461, + "step": 214 + }, + { + "epoch": 0.0793943870014771, + "grad_norm": 0.33343109488487244, + "learning_rate": 0.00019482694913166648, + "loss": 0.4103, + "step": 215 + }, + { + "epoch": 0.07976366322008863, + "grad_norm": 0.332978218793869, + "learning_rate": 0.00019480231555610297, + "loss": 0.4103, + "step": 216 + }, + { + "epoch": 0.08013293943870015, + "grad_norm": 0.27333706617355347, + "learning_rate": 0.0001947776819805395, + "loss": 0.3705, + "step": 217 + }, + { + "epoch": 0.08050221565731167, + "grad_norm": 0.3391788601875305, + "learning_rate": 0.000194753048404976, + "loss": 0.4383, + "step": 218 + }, + { + "epoch": 0.08087149187592318, + "grad_norm": 0.26999446749687195, + "learning_rate": 0.00019472841482941252, + "loss": 0.2942, + "step": 219 + }, + { + "epoch": 0.08124076809453472, + "grad_norm": 0.29670652747154236, + "learning_rate": 0.000194703781253849, + "loss": 0.3808, + "step": 220 + }, + { + "epoch": 0.08161004431314624, + "grad_norm": 0.31347814202308655, + "learning_rate": 0.00019467914767828552, + "loss": 0.4008, + "step": 221 + }, + { + "epoch": 0.08197932053175776, + "grad_norm": 0.2620101869106293, + "learning_rate": 0.000194654514102722, + "loss": 0.3715, + "step": 222 + }, + { + "epoch": 0.08234859675036928, + "grad_norm": 0.31677815318107605, + "learning_rate": 0.00019462988052715855, + "loss": 0.4463, + "step": 223 + }, + { + "epoch": 0.0827178729689808, + "grad_norm": 0.30082473158836365, + "learning_rate": 0.00019460524695159504, + "loss": 0.3854, + "step": 224 + }, + { + "epoch": 0.08308714918759232, + "grad_norm": 0.3013985753059387, + "learning_rate": 0.00019458061337603155, + "loss": 0.418, + "step": 225 + }, + { + "epoch": 0.08345642540620384, + "grad_norm": 0.258806049823761, + "learning_rate": 0.00019455597980046804, + "loss": 0.3697, + "step": 226 + }, + { + "epoch": 0.08382570162481537, + "grad_norm": 0.2760540843009949, + "learning_rate": 0.00019453134622490455, + "loss": 0.3442, + "step": 227 + }, + { + "epoch": 0.08419497784342689, + "grad_norm": 0.23491321504116058, + "learning_rate": 0.00019450671264934107, + "loss": 0.2841, + "step": 228 + }, + { + "epoch": 0.08456425406203841, + "grad_norm": 0.2904042601585388, + "learning_rate": 0.00019448207907377758, + "loss": 0.3567, + "step": 229 + }, + { + "epoch": 0.08493353028064993, + "grad_norm": 0.3854878544807434, + "learning_rate": 0.00019445744549821407, + "loss": 0.5345, + "step": 230 + }, + { + "epoch": 0.08530280649926145, + "grad_norm": 0.28415659070014954, + "learning_rate": 0.00019443281192265058, + "loss": 0.4123, + "step": 231 + }, + { + "epoch": 0.08567208271787297, + "grad_norm": 0.3094068169593811, + "learning_rate": 0.0001944081783470871, + "loss": 0.4024, + "step": 232 + }, + { + "epoch": 0.08604135893648449, + "grad_norm": 0.25084954500198364, + "learning_rate": 0.0001943835447715236, + "loss": 0.3446, + "step": 233 + }, + { + "epoch": 0.086410635155096, + "grad_norm": 0.3149997591972351, + "learning_rate": 0.0001943589111959601, + "loss": 0.4271, + "step": 234 + }, + { + "epoch": 0.08677991137370754, + "grad_norm": 0.3112153112888336, + "learning_rate": 0.00019433427762039661, + "loss": 0.4092, + "step": 235 + }, + { + "epoch": 0.08714918759231906, + "grad_norm": 0.28938791155815125, + "learning_rate": 0.00019430964404483313, + "loss": 0.3827, + "step": 236 + }, + { + "epoch": 0.08751846381093058, + "grad_norm": 0.30853238701820374, + "learning_rate": 0.00019428501046926962, + "loss": 0.3705, + "step": 237 + }, + { + "epoch": 0.0878877400295421, + "grad_norm": 0.3236076235771179, + "learning_rate": 0.00019426037689370613, + "loss": 0.4961, + "step": 238 + }, + { + "epoch": 0.08825701624815362, + "grad_norm": 0.29427987337112427, + "learning_rate": 0.00019423574331814262, + "loss": 0.4023, + "step": 239 + }, + { + "epoch": 0.08862629246676514, + "grad_norm": 0.2497962862253189, + "learning_rate": 0.00019421110974257913, + "loss": 0.2933, + "step": 240 + }, + { + "epoch": 0.08899556868537666, + "grad_norm": 0.31414633989334106, + "learning_rate": 0.00019418647616701565, + "loss": 0.4387, + "step": 241 + }, + { + "epoch": 0.08936484490398819, + "grad_norm": 0.3126966059207916, + "learning_rate": 0.00019416184259145216, + "loss": 0.4267, + "step": 242 + }, + { + "epoch": 0.08973412112259971, + "grad_norm": 0.34239670634269714, + "learning_rate": 0.00019413720901588865, + "loss": 0.4555, + "step": 243 + }, + { + "epoch": 0.09010339734121123, + "grad_norm": 0.32149040699005127, + "learning_rate": 0.00019411257544032517, + "loss": 0.4632, + "step": 244 + }, + { + "epoch": 0.09047267355982275, + "grad_norm": 0.31332024931907654, + "learning_rate": 0.00019408794186476168, + "loss": 0.4247, + "step": 245 + }, + { + "epoch": 0.09084194977843427, + "grad_norm": 0.29506734013557434, + "learning_rate": 0.0001940633082891982, + "loss": 0.4519, + "step": 246 + }, + { + "epoch": 0.09121122599704579, + "grad_norm": 0.3080827295780182, + "learning_rate": 0.00019403867471363468, + "loss": 0.3943, + "step": 247 + }, + { + "epoch": 0.0915805022156573, + "grad_norm": 0.2912343740463257, + "learning_rate": 0.0001940140411380712, + "loss": 0.4771, + "step": 248 + }, + { + "epoch": 0.09194977843426884, + "grad_norm": 0.2911964952945709, + "learning_rate": 0.00019398940756250768, + "loss": 0.3964, + "step": 249 + }, + { + "epoch": 0.09231905465288036, + "grad_norm": 0.26767492294311523, + "learning_rate": 0.00019396477398694423, + "loss": 0.3481, + "step": 250 + }, + { + "epoch": 0.09231905465288036, + "eval_loss": 7.698209285736084, + "eval_runtime": 6.9204, + "eval_samples_per_second": 7.225, + "eval_steps_per_second": 1.011, + "step": 250 + }, + { + "epoch": 0.09268833087149188, + "grad_norm": 0.3298513889312744, + "learning_rate": 0.00019394014041138071, + "loss": 0.4542, + "step": 251 + }, + { + "epoch": 0.0930576070901034, + "grad_norm": 0.33949264883995056, + "learning_rate": 0.00019391550683581723, + "loss": 0.4617, + "step": 252 + }, + { + "epoch": 0.09342688330871492, + "grad_norm": 0.3252466917037964, + "learning_rate": 0.00019389087326025372, + "loss": 0.4509, + "step": 253 + }, + { + "epoch": 0.09379615952732644, + "grad_norm": 0.388219952583313, + "learning_rate": 0.00019386623968469023, + "loss": 0.3606, + "step": 254 + }, + { + "epoch": 0.09416543574593796, + "grad_norm": 0.29722291231155396, + "learning_rate": 0.00019384160610912675, + "loss": 0.3635, + "step": 255 + }, + { + "epoch": 0.09453471196454949, + "grad_norm": 0.3315044343471527, + "learning_rate": 0.00019381697253356326, + "loss": 0.322, + "step": 256 + }, + { + "epoch": 0.09490398818316101, + "grad_norm": 0.2928721010684967, + "learning_rate": 0.00019379233895799975, + "loss": 0.403, + "step": 257 + }, + { + "epoch": 0.09527326440177253, + "grad_norm": 0.3018152713775635, + "learning_rate": 0.00019376770538243626, + "loss": 0.4419, + "step": 258 + }, + { + "epoch": 0.09564254062038405, + "grad_norm": 0.2882957458496094, + "learning_rate": 0.00019374307180687278, + "loss": 0.3995, + "step": 259 + }, + { + "epoch": 0.09601181683899557, + "grad_norm": 0.3042584955692291, + "learning_rate": 0.0001937184382313093, + "loss": 0.4324, + "step": 260 + }, + { + "epoch": 0.09638109305760709, + "grad_norm": 0.2930835783481598, + "learning_rate": 0.00019369380465574578, + "loss": 0.4016, + "step": 261 + }, + { + "epoch": 0.0967503692762186, + "grad_norm": 0.27013394236564636, + "learning_rate": 0.0001936691710801823, + "loss": 0.3631, + "step": 262 + }, + { + "epoch": 0.09711964549483014, + "grad_norm": 0.26821768283843994, + "learning_rate": 0.0001936445375046188, + "loss": 0.3161, + "step": 263 + }, + { + "epoch": 0.09748892171344166, + "grad_norm": 0.3145991563796997, + "learning_rate": 0.00019361990392905532, + "loss": 0.4056, + "step": 264 + }, + { + "epoch": 0.09785819793205318, + "grad_norm": 0.4326309859752655, + "learning_rate": 0.0001935952703534918, + "loss": 0.4242, + "step": 265 + }, + { + "epoch": 0.0982274741506647, + "grad_norm": 0.2894769012928009, + "learning_rate": 0.00019357063677792832, + "loss": 0.4024, + "step": 266 + }, + { + "epoch": 0.09859675036927622, + "grad_norm": 0.3390533924102783, + "learning_rate": 0.0001935460032023648, + "loss": 0.4014, + "step": 267 + }, + { + "epoch": 0.09896602658788774, + "grad_norm": 0.37753161787986755, + "learning_rate": 0.00019352136962680135, + "loss": 0.4585, + "step": 268 + }, + { + "epoch": 0.09933530280649926, + "grad_norm": 0.24638783931732178, + "learning_rate": 0.00019349673605123784, + "loss": 0.3092, + "step": 269 + }, + { + "epoch": 0.09970457902511078, + "grad_norm": 0.3004394471645355, + "learning_rate": 0.00019347210247567436, + "loss": 0.4057, + "step": 270 + }, + { + "epoch": 0.10007385524372231, + "grad_norm": 0.33128201961517334, + "learning_rate": 0.00019344746890011084, + "loss": 0.3571, + "step": 271 + }, + { + "epoch": 0.10044313146233383, + "grad_norm": 0.2760719954967499, + "learning_rate": 0.00019342283532454736, + "loss": 0.2891, + "step": 272 + }, + { + "epoch": 0.10081240768094535, + "grad_norm": 0.2924439609050751, + "learning_rate": 0.00019339820174898387, + "loss": 0.3333, + "step": 273 + }, + { + "epoch": 0.10118168389955687, + "grad_norm": 0.29440510272979736, + "learning_rate": 0.0001933735681734204, + "loss": 0.3616, + "step": 274 + }, + { + "epoch": 0.10155096011816839, + "grad_norm": 0.32632264494895935, + "learning_rate": 0.00019334893459785688, + "loss": 0.4118, + "step": 275 + }, + { + "epoch": 0.1019202363367799, + "grad_norm": 0.2541729509830475, + "learning_rate": 0.0001933243010222934, + "loss": 0.3578, + "step": 276 + }, + { + "epoch": 0.10228951255539143, + "grad_norm": 0.27940472960472107, + "learning_rate": 0.0001932996674467299, + "loss": 0.3891, + "step": 277 + }, + { + "epoch": 0.10265878877400296, + "grad_norm": 0.2847420573234558, + "learning_rate": 0.00019327503387116642, + "loss": 0.3375, + "step": 278 + }, + { + "epoch": 0.10302806499261448, + "grad_norm": 0.3491232395172119, + "learning_rate": 0.0001932504002956029, + "loss": 0.4197, + "step": 279 + }, + { + "epoch": 0.103397341211226, + "grad_norm": 0.27968740463256836, + "learning_rate": 0.00019322576672003942, + "loss": 0.3837, + "step": 280 + }, + { + "epoch": 0.10376661742983752, + "grad_norm": 0.3751474618911743, + "learning_rate": 0.0001932011331444759, + "loss": 0.4774, + "step": 281 + }, + { + "epoch": 0.10413589364844904, + "grad_norm": 0.2843906879425049, + "learning_rate": 0.00019317649956891245, + "loss": 0.3505, + "step": 282 + }, + { + "epoch": 0.10450516986706056, + "grad_norm": 0.3035489618778229, + "learning_rate": 0.00019315186599334894, + "loss": 0.3379, + "step": 283 + }, + { + "epoch": 0.10487444608567208, + "grad_norm": 0.26670679450035095, + "learning_rate": 0.00019312723241778545, + "loss": 0.3033, + "step": 284 + }, + { + "epoch": 0.10524372230428361, + "grad_norm": 0.2777157127857208, + "learning_rate": 0.00019310259884222194, + "loss": 0.3486, + "step": 285 + }, + { + "epoch": 0.10561299852289513, + "grad_norm": 0.28091469407081604, + "learning_rate": 0.00019307796526665845, + "loss": 0.341, + "step": 286 + }, + { + "epoch": 0.10598227474150665, + "grad_norm": 0.2901500463485718, + "learning_rate": 0.00019305333169109497, + "loss": 0.4077, + "step": 287 + }, + { + "epoch": 0.10635155096011817, + "grad_norm": 0.27171966433525085, + "learning_rate": 0.00019302869811553148, + "loss": 0.3557, + "step": 288 + }, + { + "epoch": 0.10672082717872969, + "grad_norm": 0.4258878529071808, + "learning_rate": 0.00019300406453996797, + "loss": 0.4984, + "step": 289 + }, + { + "epoch": 0.10709010339734121, + "grad_norm": 0.33638089895248413, + "learning_rate": 0.00019297943096440449, + "loss": 0.3913, + "step": 290 + }, + { + "epoch": 0.10745937961595273, + "grad_norm": 0.28694894909858704, + "learning_rate": 0.000192954797388841, + "loss": 0.3559, + "step": 291 + }, + { + "epoch": 0.10782865583456426, + "grad_norm": 0.32328546047210693, + "learning_rate": 0.00019293016381327752, + "loss": 0.3458, + "step": 292 + }, + { + "epoch": 0.10819793205317578, + "grad_norm": 0.2909404933452606, + "learning_rate": 0.000192905530237714, + "loss": 0.3829, + "step": 293 + }, + { + "epoch": 0.1085672082717873, + "grad_norm": 0.31008490920066833, + "learning_rate": 0.00019288089666215052, + "loss": 0.346, + "step": 294 + }, + { + "epoch": 0.10893648449039882, + "grad_norm": 0.3429504632949829, + "learning_rate": 0.00019285626308658703, + "loss": 0.2725, + "step": 295 + }, + { + "epoch": 0.10930576070901034, + "grad_norm": 0.22843964397907257, + "learning_rate": 0.00019283162951102355, + "loss": 0.2714, + "step": 296 + }, + { + "epoch": 0.10967503692762186, + "grad_norm": 0.3285588324069977, + "learning_rate": 0.00019280699593546003, + "loss": 0.4223, + "step": 297 + }, + { + "epoch": 0.11004431314623338, + "grad_norm": 0.30291518568992615, + "learning_rate": 0.00019278236235989655, + "loss": 0.3247, + "step": 298 + }, + { + "epoch": 0.11041358936484491, + "grad_norm": 0.30423396825790405, + "learning_rate": 0.00019275772878433304, + "loss": 0.4042, + "step": 299 + }, + { + "epoch": 0.11078286558345643, + "grad_norm": 0.2661963105201721, + "learning_rate": 0.00019273309520876958, + "loss": 0.3369, + "step": 300 + }, + { + "epoch": 0.11078286558345643, + "eval_loss": 7.82208251953125, + "eval_runtime": 6.9138, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 1.012, + "step": 300 + }, + { + "epoch": 0.11115214180206795, + "grad_norm": 0.29318246245384216, + "learning_rate": 0.00019270846163320607, + "loss": 0.3857, + "step": 301 + }, + { + "epoch": 0.11152141802067947, + "grad_norm": 0.33537909388542175, + "learning_rate": 0.00019268382805764258, + "loss": 0.3939, + "step": 302 + }, + { + "epoch": 0.11189069423929099, + "grad_norm": 0.2596702575683594, + "learning_rate": 0.00019265919448207907, + "loss": 0.3183, + "step": 303 + }, + { + "epoch": 0.11225997045790251, + "grad_norm": 0.32874003052711487, + "learning_rate": 0.00019263456090651558, + "loss": 0.3488, + "step": 304 + }, + { + "epoch": 0.11262924667651403, + "grad_norm": 0.37063878774642944, + "learning_rate": 0.0001926099273309521, + "loss": 0.4932, + "step": 305 + }, + { + "epoch": 0.11299852289512555, + "grad_norm": 0.36670756340026855, + "learning_rate": 0.0001925852937553886, + "loss": 0.428, + "step": 306 + }, + { + "epoch": 0.11336779911373708, + "grad_norm": 0.3560425937175751, + "learning_rate": 0.0001925606601798251, + "loss": 0.3646, + "step": 307 + }, + { + "epoch": 0.1137370753323486, + "grad_norm": 0.2823483347892761, + "learning_rate": 0.00019253602660426161, + "loss": 0.3584, + "step": 308 + }, + { + "epoch": 0.11410635155096012, + "grad_norm": 0.7822741866111755, + "learning_rate": 0.00019251139302869813, + "loss": 0.3855, + "step": 309 + }, + { + "epoch": 0.11447562776957164, + "grad_norm": 0.316175639629364, + "learning_rate": 0.00019248675945313464, + "loss": 0.4222, + "step": 310 + }, + { + "epoch": 0.11484490398818316, + "grad_norm": 0.25412991642951965, + "learning_rate": 0.00019246212587757113, + "loss": 0.3711, + "step": 311 + }, + { + "epoch": 0.11521418020679468, + "grad_norm": 0.2927306592464447, + "learning_rate": 0.00019243749230200765, + "loss": 0.4183, + "step": 312 + }, + { + "epoch": 0.1155834564254062, + "grad_norm": 0.3077561855316162, + "learning_rate": 0.00019241285872644413, + "loss": 0.4072, + "step": 313 + }, + { + "epoch": 0.11595273264401773, + "grad_norm": 0.2560058832168579, + "learning_rate": 0.00019238822515088068, + "loss": 0.278, + "step": 314 + }, + { + "epoch": 0.11632200886262925, + "grad_norm": 0.3226087987422943, + "learning_rate": 0.00019236359157531716, + "loss": 0.4393, + "step": 315 + }, + { + "epoch": 0.11669128508124077, + "grad_norm": 0.31423744559288025, + "learning_rate": 0.00019233895799975368, + "loss": 0.4141, + "step": 316 + }, + { + "epoch": 0.11706056129985229, + "grad_norm": 0.30898934602737427, + "learning_rate": 0.00019231432442419016, + "loss": 0.3859, + "step": 317 + }, + { + "epoch": 0.11742983751846381, + "grad_norm": 0.30242830514907837, + "learning_rate": 0.00019228969084862668, + "loss": 0.3707, + "step": 318 + }, + { + "epoch": 0.11779911373707533, + "grad_norm": 0.3828931152820587, + "learning_rate": 0.0001922650572730632, + "loss": 0.4634, + "step": 319 + }, + { + "epoch": 0.11816838995568685, + "grad_norm": 0.29980096220970154, + "learning_rate": 0.0001922404236974997, + "loss": 0.407, + "step": 320 + }, + { + "epoch": 0.11853766617429838, + "grad_norm": 0.3550015091896057, + "learning_rate": 0.0001922157901219362, + "loss": 0.4743, + "step": 321 + }, + { + "epoch": 0.1189069423929099, + "grad_norm": 0.33236315846443176, + "learning_rate": 0.0001921911565463727, + "loss": 0.4378, + "step": 322 + }, + { + "epoch": 0.11927621861152142, + "grad_norm": 0.3312847316265106, + "learning_rate": 0.00019216652297080923, + "loss": 0.3815, + "step": 323 + }, + { + "epoch": 0.11964549483013294, + "grad_norm": 0.2941649556159973, + "learning_rate": 0.00019214188939524574, + "loss": 0.3539, + "step": 324 + }, + { + "epoch": 0.12001477104874446, + "grad_norm": 0.28852352499961853, + "learning_rate": 0.00019211725581968223, + "loss": 0.3667, + "step": 325 + }, + { + "epoch": 0.12038404726735598, + "grad_norm": 0.27327391505241394, + "learning_rate": 0.00019209262224411874, + "loss": 0.3468, + "step": 326 + }, + { + "epoch": 0.1207533234859675, + "grad_norm": 0.5207664370536804, + "learning_rate": 0.00019206798866855523, + "loss": 0.5375, + "step": 327 + }, + { + "epoch": 0.12112259970457903, + "grad_norm": 0.29434478282928467, + "learning_rate": 0.00019204335509299177, + "loss": 0.3568, + "step": 328 + }, + { + "epoch": 0.12149187592319055, + "grad_norm": 0.32505714893341064, + "learning_rate": 0.00019201872151742826, + "loss": 0.3523, + "step": 329 + }, + { + "epoch": 0.12186115214180207, + "grad_norm": 0.2976304590702057, + "learning_rate": 0.00019199408794186477, + "loss": 0.3962, + "step": 330 + }, + { + "epoch": 0.12223042836041359, + "grad_norm": 0.28575676679611206, + "learning_rate": 0.00019196945436630126, + "loss": 0.3674, + "step": 331 + }, + { + "epoch": 0.12259970457902511, + "grad_norm": 0.286398321390152, + "learning_rate": 0.00019194482079073778, + "loss": 0.4329, + "step": 332 + }, + { + "epoch": 0.12296898079763663, + "grad_norm": 0.37335601449012756, + "learning_rate": 0.0001919201872151743, + "loss": 0.4117, + "step": 333 + }, + { + "epoch": 0.12333825701624815, + "grad_norm": 0.34859809279441833, + "learning_rate": 0.0001918955536396108, + "loss": 0.4448, + "step": 334 + }, + { + "epoch": 0.12370753323485968, + "grad_norm": 0.3215370774269104, + "learning_rate": 0.0001918709200640473, + "loss": 0.4229, + "step": 335 + }, + { + "epoch": 0.1240768094534712, + "grad_norm": 0.25345876812934875, + "learning_rate": 0.0001918462864884838, + "loss": 0.3007, + "step": 336 + }, + { + "epoch": 0.12444608567208272, + "grad_norm": 0.23834021389484406, + "learning_rate": 0.00019182165291292032, + "loss": 0.3227, + "step": 337 + }, + { + "epoch": 0.12481536189069424, + "grad_norm": 0.39364761114120483, + "learning_rate": 0.00019179701933735684, + "loss": 0.3609, + "step": 338 + }, + { + "epoch": 0.12518463810930577, + "grad_norm": 0.32490667700767517, + "learning_rate": 0.00019177238576179332, + "loss": 0.3746, + "step": 339 + }, + { + "epoch": 0.1255539143279173, + "grad_norm": 0.344700425863266, + "learning_rate": 0.00019174775218622984, + "loss": 0.3941, + "step": 340 + }, + { + "epoch": 0.1259231905465288, + "grad_norm": 0.32623812556266785, + "learning_rate": 0.00019172311861066635, + "loss": 0.4114, + "step": 341 + }, + { + "epoch": 0.12629246676514033, + "grad_norm": 0.28320956230163574, + "learning_rate": 0.00019169848503510287, + "loss": 0.3801, + "step": 342 + }, + { + "epoch": 0.12666174298375185, + "grad_norm": 0.2658449113368988, + "learning_rate": 0.00019167385145953936, + "loss": 0.318, + "step": 343 + }, + { + "epoch": 0.12703101920236337, + "grad_norm": 0.3121381402015686, + "learning_rate": 0.00019164921788397587, + "loss": 0.3544, + "step": 344 + }, + { + "epoch": 0.1274002954209749, + "grad_norm": 0.32068705558776855, + "learning_rate": 0.00019162458430841236, + "loss": 0.3833, + "step": 345 + }, + { + "epoch": 0.1277695716395864, + "grad_norm": 0.2748071551322937, + "learning_rate": 0.0001915999507328489, + "loss": 0.3449, + "step": 346 + }, + { + "epoch": 0.12813884785819793, + "grad_norm": 0.2816767394542694, + "learning_rate": 0.0001915753171572854, + "loss": 0.3271, + "step": 347 + }, + { + "epoch": 0.12850812407680945, + "grad_norm": 0.2567208409309387, + "learning_rate": 0.0001915506835817219, + "loss": 0.3163, + "step": 348 + }, + { + "epoch": 0.12887740029542097, + "grad_norm": 0.24986563622951508, + "learning_rate": 0.0001915260500061584, + "loss": 0.3639, + "step": 349 + }, + { + "epoch": 0.1292466765140325, + "grad_norm": 0.32256218791007996, + "learning_rate": 0.0001915014164305949, + "loss": 0.3763, + "step": 350 + }, + { + "epoch": 0.1292466765140325, + "eval_loss": 7.9321770668029785, + "eval_runtime": 6.9378, + "eval_samples_per_second": 7.207, + "eval_steps_per_second": 1.009, + "step": 350 + }, + { + "epoch": 0.129615952732644, + "grad_norm": 0.8587042093276978, + "learning_rate": 0.00019147678285503142, + "loss": 0.4857, + "step": 351 + }, + { + "epoch": 0.12998522895125553, + "grad_norm": 0.2607508897781372, + "learning_rate": 0.00019145214927946793, + "loss": 0.2744, + "step": 352 + }, + { + "epoch": 0.13035450516986707, + "grad_norm": 0.304496169090271, + "learning_rate": 0.00019142751570390442, + "loss": 0.3612, + "step": 353 + }, + { + "epoch": 0.1307237813884786, + "grad_norm": 0.3805667757987976, + "learning_rate": 0.00019140288212834094, + "loss": 0.4094, + "step": 354 + }, + { + "epoch": 0.1310930576070901, + "grad_norm": 0.2820616662502289, + "learning_rate": 0.00019137824855277745, + "loss": 0.3452, + "step": 355 + }, + { + "epoch": 0.13146233382570163, + "grad_norm": 0.29570889472961426, + "learning_rate": 0.00019135361497721396, + "loss": 0.3724, + "step": 356 + }, + { + "epoch": 0.13183161004431315, + "grad_norm": 0.27747437357902527, + "learning_rate": 0.00019132898140165045, + "loss": 0.3785, + "step": 357 + }, + { + "epoch": 0.13220088626292467, + "grad_norm": 0.2890758216381073, + "learning_rate": 0.00019130434782608697, + "loss": 0.361, + "step": 358 + }, + { + "epoch": 0.1325701624815362, + "grad_norm": 0.32133665680885315, + "learning_rate": 0.00019127971425052345, + "loss": 0.403, + "step": 359 + }, + { + "epoch": 0.1329394387001477, + "grad_norm": 0.29666760563850403, + "learning_rate": 0.00019125508067496, + "loss": 0.4117, + "step": 360 + }, + { + "epoch": 0.13330871491875923, + "grad_norm": 0.3286416530609131, + "learning_rate": 0.00019123044709939648, + "loss": 0.4368, + "step": 361 + }, + { + "epoch": 0.13367799113737075, + "grad_norm": 0.30330100655555725, + "learning_rate": 0.000191205813523833, + "loss": 0.4062, + "step": 362 + }, + { + "epoch": 0.13404726735598227, + "grad_norm": 0.32072365283966064, + "learning_rate": 0.00019118117994826949, + "loss": 0.398, + "step": 363 + }, + { + "epoch": 0.1344165435745938, + "grad_norm": 0.2915611267089844, + "learning_rate": 0.000191156546372706, + "loss": 0.3502, + "step": 364 + }, + { + "epoch": 0.1347858197932053, + "grad_norm": 0.2943446636199951, + "learning_rate": 0.00019113191279714252, + "loss": 0.3449, + "step": 365 + }, + { + "epoch": 0.13515509601181683, + "grad_norm": 0.2828218936920166, + "learning_rate": 0.00019110727922157903, + "loss": 0.3233, + "step": 366 + }, + { + "epoch": 0.13552437223042835, + "grad_norm": 0.279194712638855, + "learning_rate": 0.00019108264564601552, + "loss": 0.3439, + "step": 367 + }, + { + "epoch": 0.1358936484490399, + "grad_norm": 0.32922396063804626, + "learning_rate": 0.00019105801207045203, + "loss": 0.3571, + "step": 368 + }, + { + "epoch": 0.1362629246676514, + "grad_norm": 0.2880307137966156, + "learning_rate": 0.00019103337849488855, + "loss": 0.3711, + "step": 369 + }, + { + "epoch": 0.13663220088626293, + "grad_norm": 0.36342552304267883, + "learning_rate": 0.00019100874491932506, + "loss": 0.4433, + "step": 370 + }, + { + "epoch": 0.13700147710487445, + "grad_norm": 0.3126503527164459, + "learning_rate": 0.00019098411134376155, + "loss": 0.3954, + "step": 371 + }, + { + "epoch": 0.13737075332348597, + "grad_norm": 0.33674049377441406, + "learning_rate": 0.00019095947776819806, + "loss": 0.3139, + "step": 372 + }, + { + "epoch": 0.1377400295420975, + "grad_norm": 0.25850239396095276, + "learning_rate": 0.00019093484419263458, + "loss": 0.3133, + "step": 373 + }, + { + "epoch": 0.138109305760709, + "grad_norm": 0.24507209658622742, + "learning_rate": 0.0001909102106170711, + "loss": 0.2983, + "step": 374 + }, + { + "epoch": 0.13847858197932053, + "grad_norm": 0.3395346701145172, + "learning_rate": 0.00019088557704150758, + "loss": 0.3894, + "step": 375 + }, + { + "epoch": 0.13884785819793205, + "grad_norm": 0.37446025013923645, + "learning_rate": 0.0001908609434659441, + "loss": 0.4543, + "step": 376 + }, + { + "epoch": 0.13921713441654357, + "grad_norm": 0.32740628719329834, + "learning_rate": 0.00019083630989038058, + "loss": 0.4217, + "step": 377 + }, + { + "epoch": 0.1395864106351551, + "grad_norm": 0.423298180103302, + "learning_rate": 0.00019081167631481712, + "loss": 0.3645, + "step": 378 + }, + { + "epoch": 0.1399556868537666, + "grad_norm": 0.2733071446418762, + "learning_rate": 0.0001907870427392536, + "loss": 0.3138, + "step": 379 + }, + { + "epoch": 0.14032496307237813, + "grad_norm": 0.26174455881118774, + "learning_rate": 0.00019076240916369013, + "loss": 0.322, + "step": 380 + }, + { + "epoch": 0.14069423929098965, + "grad_norm": 0.2634088099002838, + "learning_rate": 0.00019073777558812661, + "loss": 0.3466, + "step": 381 + }, + { + "epoch": 0.1410635155096012, + "grad_norm": 0.27490049600601196, + "learning_rate": 0.00019071314201256313, + "loss": 0.3589, + "step": 382 + }, + { + "epoch": 0.14143279172821271, + "grad_norm": 0.30026838183403015, + "learning_rate": 0.00019068850843699964, + "loss": 0.3761, + "step": 383 + }, + { + "epoch": 0.14180206794682423, + "grad_norm": 0.30868440866470337, + "learning_rate": 0.00019066387486143616, + "loss": 0.4435, + "step": 384 + }, + { + "epoch": 0.14217134416543575, + "grad_norm": 0.3078373074531555, + "learning_rate": 0.00019063924128587265, + "loss": 0.3015, + "step": 385 + }, + { + "epoch": 0.14254062038404727, + "grad_norm": 0.27882182598114014, + "learning_rate": 0.00019061460771030916, + "loss": 0.3767, + "step": 386 + }, + { + "epoch": 0.1429098966026588, + "grad_norm": 0.2654573619365692, + "learning_rate": 0.00019058997413474567, + "loss": 0.3293, + "step": 387 + }, + { + "epoch": 0.1432791728212703, + "grad_norm": 0.3311960697174072, + "learning_rate": 0.0001905653405591822, + "loss": 0.4541, + "step": 388 + }, + { + "epoch": 0.14364844903988183, + "grad_norm": 0.3342706263065338, + "learning_rate": 0.00019054070698361868, + "loss": 0.3981, + "step": 389 + }, + { + "epoch": 0.14401772525849335, + "grad_norm": 0.24865975975990295, + "learning_rate": 0.0001905160734080552, + "loss": 0.2944, + "step": 390 + }, + { + "epoch": 0.14438700147710487, + "grad_norm": 0.3011600077152252, + "learning_rate": 0.00019049143983249168, + "loss": 0.4008, + "step": 391 + }, + { + "epoch": 0.1447562776957164, + "grad_norm": 0.25898277759552, + "learning_rate": 0.00019046680625692822, + "loss": 0.3053, + "step": 392 + }, + { + "epoch": 0.1451255539143279, + "grad_norm": 0.2723996639251709, + "learning_rate": 0.0001904421726813647, + "loss": 0.3454, + "step": 393 + }, + { + "epoch": 0.14549483013293943, + "grad_norm": 0.2535833418369293, + "learning_rate": 0.00019041753910580122, + "loss": 0.3995, + "step": 394 + }, + { + "epoch": 0.14586410635155095, + "grad_norm": 0.2934975028038025, + "learning_rate": 0.0001903929055302377, + "loss": 0.3866, + "step": 395 + }, + { + "epoch": 0.14623338257016247, + "grad_norm": 0.3095795214176178, + "learning_rate": 0.00019036827195467423, + "loss": 0.3741, + "step": 396 + }, + { + "epoch": 0.14660265878877402, + "grad_norm": 0.2689141631126404, + "learning_rate": 0.00019034363837911074, + "loss": 0.3469, + "step": 397 + }, + { + "epoch": 0.14697193500738553, + "grad_norm": 0.24755823612213135, + "learning_rate": 0.00019031900480354725, + "loss": 0.3461, + "step": 398 + }, + { + "epoch": 0.14734121122599705, + "grad_norm": 0.26234883069992065, + "learning_rate": 0.00019029437122798374, + "loss": 0.3608, + "step": 399 + }, + { + "epoch": 0.14771048744460857, + "grad_norm": 0.34892693161964417, + "learning_rate": 0.00019026973765242026, + "loss": 0.446, + "step": 400 + }, + { + "epoch": 0.14771048744460857, + "eval_loss": 7.540850639343262, + "eval_runtime": 6.9197, + "eval_samples_per_second": 7.226, + "eval_steps_per_second": 1.012, + "step": 400 + }, + { + "epoch": 0.1480797636632201, + "grad_norm": 0.31576693058013916, + "learning_rate": 0.00019024510407685677, + "loss": 0.3966, + "step": 401 + }, + { + "epoch": 0.1484490398818316, + "grad_norm": 0.2870180606842041, + "learning_rate": 0.00019022047050129329, + "loss": 0.3375, + "step": 402 + }, + { + "epoch": 0.14881831610044313, + "grad_norm": 0.24565967917442322, + "learning_rate": 0.00019019583692572977, + "loss": 0.308, + "step": 403 + }, + { + "epoch": 0.14918759231905465, + "grad_norm": 0.3703746497631073, + "learning_rate": 0.0001901712033501663, + "loss": 0.3764, + "step": 404 + }, + { + "epoch": 0.14955686853766617, + "grad_norm": 0.27821844816207886, + "learning_rate": 0.0001901465697746028, + "loss": 0.3456, + "step": 405 + }, + { + "epoch": 0.1499261447562777, + "grad_norm": 0.27104848623275757, + "learning_rate": 0.00019012193619903932, + "loss": 0.3934, + "step": 406 + }, + { + "epoch": 0.1502954209748892, + "grad_norm": 0.2912060022354126, + "learning_rate": 0.0001900973026234758, + "loss": 0.3832, + "step": 407 + }, + { + "epoch": 0.15066469719350073, + "grad_norm": 0.3446679711341858, + "learning_rate": 0.00019007266904791232, + "loss": 0.3986, + "step": 408 + }, + { + "epoch": 0.15103397341211225, + "grad_norm": 0.3842846751213074, + "learning_rate": 0.0001900480354723488, + "loss": 0.441, + "step": 409 + }, + { + "epoch": 0.15140324963072377, + "grad_norm": 0.28692421317100525, + "learning_rate": 0.00019002340189678535, + "loss": 0.3299, + "step": 410 + }, + { + "epoch": 0.15177252584933532, + "grad_norm": 0.3028642237186432, + "learning_rate": 0.00018999876832122184, + "loss": 0.3812, + "step": 411 + }, + { + "epoch": 0.15214180206794684, + "grad_norm": 0.2658669054508209, + "learning_rate": 0.00018997413474565835, + "loss": 0.3182, + "step": 412 + }, + { + "epoch": 0.15251107828655835, + "grad_norm": 0.2910005450248718, + "learning_rate": 0.00018994950117009484, + "loss": 0.4075, + "step": 413 + }, + { + "epoch": 0.15288035450516987, + "grad_norm": 0.35665634274482727, + "learning_rate": 0.00018992486759453135, + "loss": 0.37, + "step": 414 + }, + { + "epoch": 0.1532496307237814, + "grad_norm": 0.29228347539901733, + "learning_rate": 0.00018990023401896787, + "loss": 0.3374, + "step": 415 + }, + { + "epoch": 0.1536189069423929, + "grad_norm": 0.3172018527984619, + "learning_rate": 0.00018987560044340438, + "loss": 0.3141, + "step": 416 + }, + { + "epoch": 0.15398818316100443, + "grad_norm": 0.2540213465690613, + "learning_rate": 0.00018985096686784087, + "loss": 0.301, + "step": 417 + }, + { + "epoch": 0.15435745937961595, + "grad_norm": 0.2916422188282013, + "learning_rate": 0.00018982633329227738, + "loss": 0.335, + "step": 418 + }, + { + "epoch": 0.15472673559822747, + "grad_norm": 0.3135952353477478, + "learning_rate": 0.0001898016997167139, + "loss": 0.3827, + "step": 419 + }, + { + "epoch": 0.155096011816839, + "grad_norm": 0.3074333965778351, + "learning_rate": 0.00018977706614115041, + "loss": 0.3998, + "step": 420 + }, + { + "epoch": 0.1554652880354505, + "grad_norm": 0.43735846877098083, + "learning_rate": 0.0001897524325655869, + "loss": 0.459, + "step": 421 + }, + { + "epoch": 0.15583456425406203, + "grad_norm": 0.25438621640205383, + "learning_rate": 0.00018972779899002342, + "loss": 0.2827, + "step": 422 + }, + { + "epoch": 0.15620384047267355, + "grad_norm": 0.2913402318954468, + "learning_rate": 0.0001897031654144599, + "loss": 0.4021, + "step": 423 + }, + { + "epoch": 0.15657311669128507, + "grad_norm": 0.26882025599479675, + "learning_rate": 0.00018967853183889645, + "loss": 0.3673, + "step": 424 + }, + { + "epoch": 0.15694239290989662, + "grad_norm": 0.31678059697151184, + "learning_rate": 0.00018965389826333293, + "loss": 0.3481, + "step": 425 + }, + { + "epoch": 0.15731166912850814, + "grad_norm": 0.2833426892757416, + "learning_rate": 0.00018962926468776945, + "loss": 0.3195, + "step": 426 + }, + { + "epoch": 0.15768094534711966, + "grad_norm": 0.3256889581680298, + "learning_rate": 0.00018960463111220593, + "loss": 0.3514, + "step": 427 + }, + { + "epoch": 0.15805022156573117, + "grad_norm": 0.29935166239738464, + "learning_rate": 0.00018957999753664245, + "loss": 0.4135, + "step": 428 + }, + { + "epoch": 0.1584194977843427, + "grad_norm": 0.3257281184196472, + "learning_rate": 0.00018955536396107896, + "loss": 0.4791, + "step": 429 + }, + { + "epoch": 0.1587887740029542, + "grad_norm": 0.3422739803791046, + "learning_rate": 0.00018953073038551548, + "loss": 0.3734, + "step": 430 + }, + { + "epoch": 0.15915805022156573, + "grad_norm": 0.28668829798698425, + "learning_rate": 0.00018950609680995197, + "loss": 0.4097, + "step": 431 + }, + { + "epoch": 0.15952732644017725, + "grad_norm": 0.2483820766210556, + "learning_rate": 0.00018948146323438848, + "loss": 0.3578, + "step": 432 + }, + { + "epoch": 0.15989660265878877, + "grad_norm": 0.36205053329467773, + "learning_rate": 0.000189456829658825, + "loss": 0.3743, + "step": 433 + }, + { + "epoch": 0.1602658788774003, + "grad_norm": 0.32113391160964966, + "learning_rate": 0.0001894321960832615, + "loss": 0.4787, + "step": 434 + }, + { + "epoch": 0.1606351550960118, + "grad_norm": 0.3053653836250305, + "learning_rate": 0.000189407562507698, + "loss": 0.3688, + "step": 435 + }, + { + "epoch": 0.16100443131462333, + "grad_norm": 0.30101513862609863, + "learning_rate": 0.0001893829289321345, + "loss": 0.3871, + "step": 436 + }, + { + "epoch": 0.16137370753323485, + "grad_norm": 0.2840447723865509, + "learning_rate": 0.00018935829535657103, + "loss": 0.3602, + "step": 437 + }, + { + "epoch": 0.16174298375184637, + "grad_norm": 0.25173690915107727, + "learning_rate": 0.00018933366178100754, + "loss": 0.3069, + "step": 438 + }, + { + "epoch": 0.1621122599704579, + "grad_norm": 0.29010775685310364, + "learning_rate": 0.00018930902820544403, + "loss": 0.4142, + "step": 439 + }, + { + "epoch": 0.16248153618906944, + "grad_norm": 0.3123351037502289, + "learning_rate": 0.00018928439462988054, + "loss": 0.3595, + "step": 440 + }, + { + "epoch": 0.16285081240768096, + "grad_norm": 0.328711599111557, + "learning_rate": 0.00018925976105431703, + "loss": 0.3695, + "step": 441 + }, + { + "epoch": 0.16322008862629248, + "grad_norm": 0.2994988262653351, + "learning_rate": 0.00018923512747875357, + "loss": 0.3846, + "step": 442 + }, + { + "epoch": 0.163589364844904, + "grad_norm": 0.28901907801628113, + "learning_rate": 0.00018921049390319006, + "loss": 0.3962, + "step": 443 + }, + { + "epoch": 0.16395864106351551, + "grad_norm": 0.26698020100593567, + "learning_rate": 0.00018918586032762658, + "loss": 0.3025, + "step": 444 + }, + { + "epoch": 0.16432791728212703, + "grad_norm": 0.2793664336204529, + "learning_rate": 0.00018916122675206306, + "loss": 0.3376, + "step": 445 + }, + { + "epoch": 0.16469719350073855, + "grad_norm": 0.2733173668384552, + "learning_rate": 0.00018913659317649958, + "loss": 0.2961, + "step": 446 + }, + { + "epoch": 0.16506646971935007, + "grad_norm": 0.2825769782066345, + "learning_rate": 0.0001891119596009361, + "loss": 0.3152, + "step": 447 + }, + { + "epoch": 0.1654357459379616, + "grad_norm": 0.26671573519706726, + "learning_rate": 0.0001890873260253726, + "loss": 0.3121, + "step": 448 + }, + { + "epoch": 0.1658050221565731, + "grad_norm": 0.2881945073604584, + "learning_rate": 0.0001890626924498091, + "loss": 0.3875, + "step": 449 + }, + { + "epoch": 0.16617429837518463, + "grad_norm": 0.518216073513031, + "learning_rate": 0.0001890380588742456, + "loss": 0.4236, + "step": 450 + }, + { + "epoch": 0.16617429837518463, + "eval_loss": 7.578165054321289, + "eval_runtime": 6.9178, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 1.012, + "step": 450 + }, + { + "epoch": 0.16654357459379615, + "grad_norm": 0.3728052079677582, + "learning_rate": 0.00018901342529868212, + "loss": 0.3705, + "step": 451 + }, + { + "epoch": 0.16691285081240767, + "grad_norm": 0.3082796633243561, + "learning_rate": 0.00018898879172311864, + "loss": 0.3137, + "step": 452 + }, + { + "epoch": 0.1672821270310192, + "grad_norm": 0.28793081641197205, + "learning_rate": 0.00018896415814755513, + "loss": 0.4069, + "step": 453 + }, + { + "epoch": 0.16765140324963074, + "grad_norm": 0.3412259817123413, + "learning_rate": 0.00018893952457199164, + "loss": 0.423, + "step": 454 + }, + { + "epoch": 0.16802067946824226, + "grad_norm": 0.3220004737377167, + "learning_rate": 0.00018891489099642813, + "loss": 0.3531, + "step": 455 + }, + { + "epoch": 0.16838995568685378, + "grad_norm": 0.2635413706302643, + "learning_rate": 0.00018889025742086467, + "loss": 0.3653, + "step": 456 + }, + { + "epoch": 0.1687592319054653, + "grad_norm": 0.4020637273788452, + "learning_rate": 0.00018886562384530116, + "loss": 0.4517, + "step": 457 + }, + { + "epoch": 0.16912850812407682, + "grad_norm": 0.3290424346923828, + "learning_rate": 0.00018884099026973767, + "loss": 0.4558, + "step": 458 + }, + { + "epoch": 0.16949778434268833, + "grad_norm": 0.3512584865093231, + "learning_rate": 0.00018881635669417416, + "loss": 0.3883, + "step": 459 + }, + { + "epoch": 0.16986706056129985, + "grad_norm": 0.27051299810409546, + "learning_rate": 0.00018879172311861067, + "loss": 0.3462, + "step": 460 + }, + { + "epoch": 0.17023633677991137, + "grad_norm": 0.32740721106529236, + "learning_rate": 0.0001887670895430472, + "loss": 0.3482, + "step": 461 + }, + { + "epoch": 0.1706056129985229, + "grad_norm": 0.29077714681625366, + "learning_rate": 0.0001887424559674837, + "loss": 0.3195, + "step": 462 + }, + { + "epoch": 0.1709748892171344, + "grad_norm": 0.2554857134819031, + "learning_rate": 0.0001887178223919202, + "loss": 0.3418, + "step": 463 + }, + { + "epoch": 0.17134416543574593, + "grad_norm": 0.3015806972980499, + "learning_rate": 0.0001886931888163567, + "loss": 0.4394, + "step": 464 + }, + { + "epoch": 0.17171344165435745, + "grad_norm": 0.29238954186439514, + "learning_rate": 0.00018866855524079322, + "loss": 0.3977, + "step": 465 + }, + { + "epoch": 0.17208271787296897, + "grad_norm": 0.31837034225463867, + "learning_rate": 0.00018864392166522973, + "loss": 0.4241, + "step": 466 + }, + { + "epoch": 0.1724519940915805, + "grad_norm": 0.45523160696029663, + "learning_rate": 0.00018861928808966622, + "loss": 0.4731, + "step": 467 + }, + { + "epoch": 0.172821270310192, + "grad_norm": 0.3064796030521393, + "learning_rate": 0.00018859465451410274, + "loss": 0.3707, + "step": 468 + }, + { + "epoch": 0.17319054652880356, + "grad_norm": 0.27165788412094116, + "learning_rate": 0.00018857002093853922, + "loss": 0.3787, + "step": 469 + }, + { + "epoch": 0.17355982274741508, + "grad_norm": 0.28276559710502625, + "learning_rate": 0.00018854538736297574, + "loss": 0.3718, + "step": 470 + }, + { + "epoch": 0.1739290989660266, + "grad_norm": 0.2306227684020996, + "learning_rate": 0.00018852075378741225, + "loss": 0.2854, + "step": 471 + }, + { + "epoch": 0.17429837518463812, + "grad_norm": 0.2953512370586395, + "learning_rate": 0.00018849612021184874, + "loss": 0.4538, + "step": 472 + }, + { + "epoch": 0.17466765140324964, + "grad_norm": 0.3433714210987091, + "learning_rate": 0.00018847148663628526, + "loss": 0.3665, + "step": 473 + }, + { + "epoch": 0.17503692762186115, + "grad_norm": 0.30472332239151, + "learning_rate": 0.00018844685306072177, + "loss": 0.3809, + "step": 474 + }, + { + "epoch": 0.17540620384047267, + "grad_norm": 0.2550714910030365, + "learning_rate": 0.00018842221948515829, + "loss": 0.2875, + "step": 475 + }, + { + "epoch": 0.1757754800590842, + "grad_norm": 0.3965039849281311, + "learning_rate": 0.00018839758590959477, + "loss": 0.3937, + "step": 476 + }, + { + "epoch": 0.1761447562776957, + "grad_norm": 0.3124586343765259, + "learning_rate": 0.0001883729523340313, + "loss": 0.35, + "step": 477 + }, + { + "epoch": 0.17651403249630723, + "grad_norm": 0.2857762277126312, + "learning_rate": 0.0001883483187584678, + "loss": 0.3817, + "step": 478 + }, + { + "epoch": 0.17688330871491875, + "grad_norm": 0.3115213215351105, + "learning_rate": 0.00018832368518290432, + "loss": 0.3659, + "step": 479 + }, + { + "epoch": 0.17725258493353027, + "grad_norm": 0.27608948945999146, + "learning_rate": 0.0001882990516073408, + "loss": 0.365, + "step": 480 + }, + { + "epoch": 0.1776218611521418, + "grad_norm": 0.28605446219444275, + "learning_rate": 0.00018827441803177732, + "loss": 0.3442, + "step": 481 + }, + { + "epoch": 0.1779911373707533, + "grad_norm": 0.32482242584228516, + "learning_rate": 0.0001882497844562138, + "loss": 0.4342, + "step": 482 + }, + { + "epoch": 0.17836041358936486, + "grad_norm": 0.32246559858322144, + "learning_rate": 0.00018822515088065035, + "loss": 0.3867, + "step": 483 + }, + { + "epoch": 0.17872968980797638, + "grad_norm": 0.2884840965270996, + "learning_rate": 0.00018820051730508684, + "loss": 0.3263, + "step": 484 + }, + { + "epoch": 0.1790989660265879, + "grad_norm": 0.36319419741630554, + "learning_rate": 0.00018817588372952335, + "loss": 0.3853, + "step": 485 + }, + { + "epoch": 0.17946824224519942, + "grad_norm": 0.27709540724754333, + "learning_rate": 0.00018815125015395984, + "loss": 0.3496, + "step": 486 + }, + { + "epoch": 0.17983751846381094, + "grad_norm": 0.27379247546195984, + "learning_rate": 0.00018812661657839635, + "loss": 0.3718, + "step": 487 + }, + { + "epoch": 0.18020679468242246, + "grad_norm": 0.2610037922859192, + "learning_rate": 0.00018810198300283287, + "loss": 0.3121, + "step": 488 + }, + { + "epoch": 0.18057607090103397, + "grad_norm": 0.27584755420684814, + "learning_rate": 0.00018807734942726938, + "loss": 0.3411, + "step": 489 + }, + { + "epoch": 0.1809453471196455, + "grad_norm": 0.2690613865852356, + "learning_rate": 0.00018805271585170587, + "loss": 0.3177, + "step": 490 + }, + { + "epoch": 0.181314623338257, + "grad_norm": 0.2773762047290802, + "learning_rate": 0.00018802808227614238, + "loss": 0.336, + "step": 491 + }, + { + "epoch": 0.18168389955686853, + "grad_norm": 0.2810940444469452, + "learning_rate": 0.0001880034487005789, + "loss": 0.3297, + "step": 492 + }, + { + "epoch": 0.18205317577548005, + "grad_norm": 0.24947303533554077, + "learning_rate": 0.0001879788151250154, + "loss": 0.3454, + "step": 493 + }, + { + "epoch": 0.18242245199409157, + "grad_norm": 0.32752540707588196, + "learning_rate": 0.0001879541815494519, + "loss": 0.3289, + "step": 494 + }, + { + "epoch": 0.1827917282127031, + "grad_norm": 0.28684183955192566, + "learning_rate": 0.00018792954797388842, + "loss": 0.4221, + "step": 495 + }, + { + "epoch": 0.1831610044313146, + "grad_norm": 0.2908467948436737, + "learning_rate": 0.0001879049143983249, + "loss": 0.2949, + "step": 496 + }, + { + "epoch": 0.18353028064992616, + "grad_norm": 0.2830837368965149, + "learning_rate": 0.00018788028082276144, + "loss": 0.3823, + "step": 497 + }, + { + "epoch": 0.18389955686853768, + "grad_norm": 0.2559138536453247, + "learning_rate": 0.00018785564724719793, + "loss": 0.3584, + "step": 498 + }, + { + "epoch": 0.1842688330871492, + "grad_norm": 0.26681581139564514, + "learning_rate": 0.00018783101367163445, + "loss": 0.3581, + "step": 499 + }, + { + "epoch": 0.18463810930576072, + "grad_norm": 0.29130181670188904, + "learning_rate": 0.00018780638009607093, + "loss": 0.3704, + "step": 500 + }, + { + "epoch": 0.18463810930576072, + "eval_loss": 7.8280534744262695, + "eval_runtime": 7.1712, + "eval_samples_per_second": 6.972, + "eval_steps_per_second": 0.976, + "step": 500 + }, + { + "epoch": 0.18500738552437224, + "grad_norm": 0.24769262969493866, + "learning_rate": 0.00018778174652050745, + "loss": 0.2826, + "step": 501 + }, + { + "epoch": 0.18537666174298376, + "grad_norm": 0.26274457573890686, + "learning_rate": 0.00018775711294494396, + "loss": 0.3039, + "step": 502 + }, + { + "epoch": 0.18574593796159528, + "grad_norm": 0.31332075595855713, + "learning_rate": 0.00018773247936938048, + "loss": 0.3442, + "step": 503 + }, + { + "epoch": 0.1861152141802068, + "grad_norm": 0.31300967931747437, + "learning_rate": 0.00018770784579381697, + "loss": 0.3758, + "step": 504 + }, + { + "epoch": 0.18648449039881831, + "grad_norm": 0.2740127742290497, + "learning_rate": 0.00018768321221825348, + "loss": 0.3487, + "step": 505 + }, + { + "epoch": 0.18685376661742983, + "grad_norm": 0.29850566387176514, + "learning_rate": 0.00018765857864269, + "loss": 0.3769, + "step": 506 + }, + { + "epoch": 0.18722304283604135, + "grad_norm": 0.30501025915145874, + "learning_rate": 0.0001876339450671265, + "loss": 0.338, + "step": 507 + }, + { + "epoch": 0.18759231905465287, + "grad_norm": 0.28575384616851807, + "learning_rate": 0.000187609311491563, + "loss": 0.3429, + "step": 508 + }, + { + "epoch": 0.1879615952732644, + "grad_norm": 0.27363353967666626, + "learning_rate": 0.0001875846779159995, + "loss": 0.3057, + "step": 509 + }, + { + "epoch": 0.1883308714918759, + "grad_norm": 0.2838500142097473, + "learning_rate": 0.00018756004434043603, + "loss": 0.377, + "step": 510 + }, + { + "epoch": 0.18870014771048743, + "grad_norm": 0.2866860330104828, + "learning_rate": 0.00018753541076487254, + "loss": 0.3183, + "step": 511 + }, + { + "epoch": 0.18906942392909898, + "grad_norm": 0.30567336082458496, + "learning_rate": 0.00018751077718930903, + "loss": 0.4165, + "step": 512 + }, + { + "epoch": 0.1894387001477105, + "grad_norm": 0.29118427634239197, + "learning_rate": 0.00018748614361374554, + "loss": 0.4027, + "step": 513 + }, + { + "epoch": 0.18980797636632202, + "grad_norm": 0.31201428174972534, + "learning_rate": 0.00018746151003818203, + "loss": 0.3624, + "step": 514 + }, + { + "epoch": 0.19017725258493354, + "grad_norm": 0.30729058384895325, + "learning_rate": 0.00018743687646261857, + "loss": 0.4049, + "step": 515 + }, + { + "epoch": 0.19054652880354506, + "grad_norm": 0.39005765318870544, + "learning_rate": 0.00018741224288705506, + "loss": 0.3956, + "step": 516 + }, + { + "epoch": 0.19091580502215658, + "grad_norm": 0.3322615325450897, + "learning_rate": 0.00018738760931149157, + "loss": 0.3387, + "step": 517 + }, + { + "epoch": 0.1912850812407681, + "grad_norm": 0.23067909479141235, + "learning_rate": 0.00018736297573592806, + "loss": 0.2331, + "step": 518 + }, + { + "epoch": 0.19165435745937962, + "grad_norm": 0.3623248338699341, + "learning_rate": 0.00018733834216036458, + "loss": 0.3328, + "step": 519 + }, + { + "epoch": 0.19202363367799113, + "grad_norm": 0.2990085184574127, + "learning_rate": 0.0001873137085848011, + "loss": 0.3761, + "step": 520 + }, + { + "epoch": 0.19239290989660265, + "grad_norm": 0.2480216771364212, + "learning_rate": 0.0001872890750092376, + "loss": 0.3163, + "step": 521 + }, + { + "epoch": 0.19276218611521417, + "grad_norm": 0.2659197151660919, + "learning_rate": 0.0001872644414336741, + "loss": 0.3176, + "step": 522 + }, + { + "epoch": 0.1931314623338257, + "grad_norm": 0.3514041602611542, + "learning_rate": 0.0001872398078581106, + "loss": 0.4081, + "step": 523 + }, + { + "epoch": 0.1935007385524372, + "grad_norm": 0.26306578516960144, + "learning_rate": 0.00018721517428254712, + "loss": 0.3521, + "step": 524 + }, + { + "epoch": 0.19387001477104873, + "grad_norm": 0.3017035722732544, + "learning_rate": 0.00018719054070698364, + "loss": 0.3464, + "step": 525 + }, + { + "epoch": 0.19423929098966028, + "grad_norm": 0.28066661953926086, + "learning_rate": 0.00018716590713142013, + "loss": 0.3132, + "step": 526 + }, + { + "epoch": 0.1946085672082718, + "grad_norm": 0.3137780725955963, + "learning_rate": 0.00018714127355585664, + "loss": 0.3437, + "step": 527 + }, + { + "epoch": 0.19497784342688332, + "grad_norm": 0.34142574667930603, + "learning_rate": 0.00018711663998029313, + "loss": 0.3467, + "step": 528 + }, + { + "epoch": 0.19534711964549484, + "grad_norm": 0.2694997191429138, + "learning_rate": 0.00018709200640472967, + "loss": 0.3244, + "step": 529 + }, + { + "epoch": 0.19571639586410636, + "grad_norm": 0.2794923484325409, + "learning_rate": 0.00018706737282916616, + "loss": 0.3471, + "step": 530 + }, + { + "epoch": 0.19608567208271788, + "grad_norm": 0.23955784738063812, + "learning_rate": 0.00018704273925360267, + "loss": 0.2798, + "step": 531 + }, + { + "epoch": 0.1964549483013294, + "grad_norm": 0.280259907245636, + "learning_rate": 0.00018701810567803916, + "loss": 0.3385, + "step": 532 + }, + { + "epoch": 0.19682422451994092, + "grad_norm": 0.30550122261047363, + "learning_rate": 0.00018699347210247567, + "loss": 0.3911, + "step": 533 + }, + { + "epoch": 0.19719350073855244, + "grad_norm": 0.2588183581829071, + "learning_rate": 0.0001869688385269122, + "loss": 0.3178, + "step": 534 + }, + { + "epoch": 0.19756277695716395, + "grad_norm": 0.31514012813568115, + "learning_rate": 0.0001869442049513487, + "loss": 0.3889, + "step": 535 + }, + { + "epoch": 0.19793205317577547, + "grad_norm": 0.3196103572845459, + "learning_rate": 0.0001869195713757852, + "loss": 0.4085, + "step": 536 + }, + { + "epoch": 0.198301329394387, + "grad_norm": 0.26055750250816345, + "learning_rate": 0.0001868949378002217, + "loss": 0.2888, + "step": 537 + }, + { + "epoch": 0.1986706056129985, + "grad_norm": 0.23201389610767365, + "learning_rate": 0.00018687030422465822, + "loss": 0.2781, + "step": 538 + }, + { + "epoch": 0.19903988183161003, + "grad_norm": 0.3453541398048401, + "learning_rate": 0.00018684567064909473, + "loss": 0.4182, + "step": 539 + }, + { + "epoch": 0.19940915805022155, + "grad_norm": 0.2894502878189087, + "learning_rate": 0.00018682103707353122, + "loss": 0.3088, + "step": 540 + }, + { + "epoch": 0.1997784342688331, + "grad_norm": 0.3013773560523987, + "learning_rate": 0.00018679640349796774, + "loss": 0.3858, + "step": 541 + }, + { + "epoch": 0.20014771048744462, + "grad_norm": 0.2649989724159241, + "learning_rate": 0.00018677176992240425, + "loss": 0.3397, + "step": 542 + }, + { + "epoch": 0.20051698670605614, + "grad_norm": 0.2491888701915741, + "learning_rate": 0.00018674713634684077, + "loss": 0.3981, + "step": 543 + }, + { + "epoch": 0.20088626292466766, + "grad_norm": 0.3280429542064667, + "learning_rate": 0.00018672250277127725, + "loss": 0.461, + "step": 544 + }, + { + "epoch": 0.20125553914327918, + "grad_norm": 0.29260045289993286, + "learning_rate": 0.00018669786919571377, + "loss": 0.4402, + "step": 545 + }, + { + "epoch": 0.2016248153618907, + "grad_norm": 0.28776174783706665, + "learning_rate": 0.00018667323562015026, + "loss": 0.3989, + "step": 546 + }, + { + "epoch": 0.20199409158050222, + "grad_norm": 0.24941587448120117, + "learning_rate": 0.0001866486020445868, + "loss": 0.3191, + "step": 547 + }, + { + "epoch": 0.20236336779911374, + "grad_norm": 0.3085111677646637, + "learning_rate": 0.00018662396846902328, + "loss": 0.283, + "step": 548 + }, + { + "epoch": 0.20273264401772526, + "grad_norm": 0.2765428125858307, + "learning_rate": 0.0001865993348934598, + "loss": 0.3578, + "step": 549 + }, + { + "epoch": 0.20310192023633677, + "grad_norm": 0.27120059728622437, + "learning_rate": 0.0001865747013178963, + "loss": 0.3519, + "step": 550 + }, + { + "epoch": 0.20310192023633677, + "eval_loss": 8.065503120422363, + "eval_runtime": 6.9537, + "eval_samples_per_second": 7.19, + "eval_steps_per_second": 1.007, + "step": 550 + }, + { + "epoch": 0.2034711964549483, + "grad_norm": 0.32950153946876526, + "learning_rate": 0.0001865500677423328, + "loss": 0.4247, + "step": 551 + }, + { + "epoch": 0.2038404726735598, + "grad_norm": 0.30643901228904724, + "learning_rate": 0.00018652543416676932, + "loss": 0.4113, + "step": 552 + }, + { + "epoch": 0.20420974889217133, + "grad_norm": 0.2926720380783081, + "learning_rate": 0.00018650080059120583, + "loss": 0.3453, + "step": 553 + }, + { + "epoch": 0.20457902511078285, + "grad_norm": 0.2336941808462143, + "learning_rate": 0.00018647616701564232, + "loss": 0.2955, + "step": 554 + }, + { + "epoch": 0.2049483013293944, + "grad_norm": 0.26778921484947205, + "learning_rate": 0.00018645153344007883, + "loss": 0.3264, + "step": 555 + }, + { + "epoch": 0.20531757754800592, + "grad_norm": 0.24116957187652588, + "learning_rate": 0.00018642689986451535, + "loss": 0.3072, + "step": 556 + }, + { + "epoch": 0.20568685376661744, + "grad_norm": 0.3223934471607208, + "learning_rate": 0.00018640226628895186, + "loss": 0.325, + "step": 557 + }, + { + "epoch": 0.20605612998522896, + "grad_norm": 0.2811295986175537, + "learning_rate": 0.00018637763271338835, + "loss": 0.3179, + "step": 558 + }, + { + "epoch": 0.20642540620384048, + "grad_norm": 0.30208703875541687, + "learning_rate": 0.00018635299913782486, + "loss": 0.4123, + "step": 559 + }, + { + "epoch": 0.206794682422452, + "grad_norm": 0.29882189631462097, + "learning_rate": 0.00018632836556226135, + "loss": 0.3773, + "step": 560 + }, + { + "epoch": 0.20716395864106352, + "grad_norm": 0.2922777235507965, + "learning_rate": 0.0001863037319866979, + "loss": 0.3055, + "step": 561 + }, + { + "epoch": 0.20753323485967504, + "grad_norm": 0.24089719355106354, + "learning_rate": 0.00018627909841113438, + "loss": 0.2964, + "step": 562 + }, + { + "epoch": 0.20790251107828656, + "grad_norm": 0.30070650577545166, + "learning_rate": 0.0001862544648355709, + "loss": 0.4017, + "step": 563 + }, + { + "epoch": 0.20827178729689808, + "grad_norm": 0.32056915760040283, + "learning_rate": 0.00018622983126000738, + "loss": 0.3823, + "step": 564 + }, + { + "epoch": 0.2086410635155096, + "grad_norm": 0.30773088335990906, + "learning_rate": 0.0001862051976844439, + "loss": 0.4545, + "step": 565 + }, + { + "epoch": 0.20901033973412111, + "grad_norm": 0.32103025913238525, + "learning_rate": 0.0001861805641088804, + "loss": 0.3432, + "step": 566 + }, + { + "epoch": 0.20937961595273263, + "grad_norm": 0.2710670828819275, + "learning_rate": 0.00018615593053331693, + "loss": 0.3332, + "step": 567 + }, + { + "epoch": 0.20974889217134415, + "grad_norm": 0.2406773567199707, + "learning_rate": 0.00018613129695775341, + "loss": 0.2752, + "step": 568 + }, + { + "epoch": 0.2101181683899557, + "grad_norm": 0.3111715614795685, + "learning_rate": 0.00018610666338218993, + "loss": 0.2992, + "step": 569 + }, + { + "epoch": 0.21048744460856722, + "grad_norm": 0.29249367117881775, + "learning_rate": 0.00018608202980662644, + "loss": 0.407, + "step": 570 + }, + { + "epoch": 0.21085672082717874, + "grad_norm": 0.25834277272224426, + "learning_rate": 0.00018605739623106296, + "loss": 0.3157, + "step": 571 + }, + { + "epoch": 0.21122599704579026, + "grad_norm": 0.3324020504951477, + "learning_rate": 0.00018603276265549945, + "loss": 0.3695, + "step": 572 + }, + { + "epoch": 0.21159527326440178, + "grad_norm": 0.24943684041500092, + "learning_rate": 0.00018600812907993596, + "loss": 0.3541, + "step": 573 + }, + { + "epoch": 0.2119645494830133, + "grad_norm": 0.277592271566391, + "learning_rate": 0.00018598349550437248, + "loss": 0.3517, + "step": 574 + }, + { + "epoch": 0.21233382570162482, + "grad_norm": 0.24661049246788025, + "learning_rate": 0.000185958861928809, + "loss": 0.3035, + "step": 575 + }, + { + "epoch": 0.21270310192023634, + "grad_norm": 0.2558872401714325, + "learning_rate": 0.00018593422835324548, + "loss": 0.2883, + "step": 576 + }, + { + "epoch": 0.21307237813884786, + "grad_norm": 0.2520885765552521, + "learning_rate": 0.000185909594777682, + "loss": 0.3017, + "step": 577 + }, + { + "epoch": 0.21344165435745938, + "grad_norm": 0.2577928304672241, + "learning_rate": 0.00018588496120211848, + "loss": 0.3, + "step": 578 + }, + { + "epoch": 0.2138109305760709, + "grad_norm": 0.3452261984348297, + "learning_rate": 0.00018586032762655502, + "loss": 0.4417, + "step": 579 + }, + { + "epoch": 0.21418020679468242, + "grad_norm": 0.25366154313087463, + "learning_rate": 0.0001858356940509915, + "loss": 0.3167, + "step": 580 + }, + { + "epoch": 0.21454948301329393, + "grad_norm": 0.29611727595329285, + "learning_rate": 0.00018581106047542802, + "loss": 0.284, + "step": 581 + }, + { + "epoch": 0.21491875923190545, + "grad_norm": 0.27096664905548096, + "learning_rate": 0.0001857864268998645, + "loss": 0.3919, + "step": 582 + }, + { + "epoch": 0.21528803545051697, + "grad_norm": 0.3325769305229187, + "learning_rate": 0.00018576179332430103, + "loss": 0.4605, + "step": 583 + }, + { + "epoch": 0.21565731166912852, + "grad_norm": 0.2713104486465454, + "learning_rate": 0.00018573715974873754, + "loss": 0.3677, + "step": 584 + }, + { + "epoch": 0.21602658788774004, + "grad_norm": 0.282330185174942, + "learning_rate": 0.00018571252617317406, + "loss": 0.2968, + "step": 585 + }, + { + "epoch": 0.21639586410635156, + "grad_norm": 0.28354090452194214, + "learning_rate": 0.00018568789259761054, + "loss": 0.3376, + "step": 586 + }, + { + "epoch": 0.21676514032496308, + "grad_norm": 0.3000122606754303, + "learning_rate": 0.00018566325902204706, + "loss": 0.3734, + "step": 587 + }, + { + "epoch": 0.2171344165435746, + "grad_norm": 0.2997065484523773, + "learning_rate": 0.00018563862544648357, + "loss": 0.4104, + "step": 588 + }, + { + "epoch": 0.21750369276218612, + "grad_norm": 0.23571345210075378, + "learning_rate": 0.0001856139918709201, + "loss": 0.312, + "step": 589 + }, + { + "epoch": 0.21787296898079764, + "grad_norm": 0.22919754683971405, + "learning_rate": 0.00018558935829535657, + "loss": 0.2519, + "step": 590 + }, + { + "epoch": 0.21824224519940916, + "grad_norm": 0.2568872272968292, + "learning_rate": 0.0001855647247197931, + "loss": 0.3489, + "step": 591 + }, + { + "epoch": 0.21861152141802068, + "grad_norm": 0.24573929607868195, + "learning_rate": 0.00018554009114422958, + "loss": 0.302, + "step": 592 + }, + { + "epoch": 0.2189807976366322, + "grad_norm": 0.3064424395561218, + "learning_rate": 0.00018551545756866612, + "loss": 0.3728, + "step": 593 + }, + { + "epoch": 0.21935007385524372, + "grad_norm": 0.2694615125656128, + "learning_rate": 0.0001854908239931026, + "loss": 0.3477, + "step": 594 + }, + { + "epoch": 0.21971935007385524, + "grad_norm": 0.2890640199184418, + "learning_rate": 0.00018546619041753912, + "loss": 0.3882, + "step": 595 + }, + { + "epoch": 0.22008862629246675, + "grad_norm": 0.28973162174224854, + "learning_rate": 0.0001854415568419756, + "loss": 0.37, + "step": 596 + }, + { + "epoch": 0.22045790251107827, + "grad_norm": 0.3029009699821472, + "learning_rate": 0.00018541692326641212, + "loss": 0.3662, + "step": 597 + }, + { + "epoch": 0.22082717872968982, + "grad_norm": 0.25944048166275024, + "learning_rate": 0.00018539228969084864, + "loss": 0.3511, + "step": 598 + }, + { + "epoch": 0.22119645494830134, + "grad_norm": 0.27884286642074585, + "learning_rate": 0.00018536765611528515, + "loss": 0.3468, + "step": 599 + }, + { + "epoch": 0.22156573116691286, + "grad_norm": 0.2858894467353821, + "learning_rate": 0.00018534302253972164, + "loss": 0.339, + "step": 600 + }, + { + "epoch": 0.22156573116691286, + "eval_loss": 7.835242748260498, + "eval_runtime": 6.9189, + "eval_samples_per_second": 7.227, + "eval_steps_per_second": 1.012, + "step": 600 + }, + { + "epoch": 0.22193500738552438, + "grad_norm": 0.26113972067832947, + "learning_rate": 0.00018531838896415815, + "loss": 0.375, + "step": 601 + }, + { + "epoch": 0.2223042836041359, + "grad_norm": 0.4488375186920166, + "learning_rate": 0.00018529375538859467, + "loss": 0.4265, + "step": 602 + }, + { + "epoch": 0.22267355982274742, + "grad_norm": 0.31581351161003113, + "learning_rate": 0.00018526912181303118, + "loss": 0.4291, + "step": 603 + }, + { + "epoch": 0.22304283604135894, + "grad_norm": 0.3254198431968689, + "learning_rate": 0.00018524448823746767, + "loss": 0.304, + "step": 604 + }, + { + "epoch": 0.22341211225997046, + "grad_norm": 0.32129237055778503, + "learning_rate": 0.00018521985466190419, + "loss": 0.4416, + "step": 605 + }, + { + "epoch": 0.22378138847858198, + "grad_norm": 0.29720792174339294, + "learning_rate": 0.0001851952210863407, + "loss": 0.3792, + "step": 606 + }, + { + "epoch": 0.2241506646971935, + "grad_norm": 0.29661765694618225, + "learning_rate": 0.00018517058751077721, + "loss": 0.348, + "step": 607 + }, + { + "epoch": 0.22451994091580502, + "grad_norm": 0.25023216009140015, + "learning_rate": 0.0001851459539352137, + "loss": 0.3183, + "step": 608 + }, + { + "epoch": 0.22488921713441654, + "grad_norm": 0.25598829984664917, + "learning_rate": 0.00018512132035965022, + "loss": 0.3268, + "step": 609 + }, + { + "epoch": 0.22525849335302806, + "grad_norm": 0.33751747012138367, + "learning_rate": 0.0001850966867840867, + "loss": 0.3527, + "step": 610 + }, + { + "epoch": 0.22562776957163957, + "grad_norm": 0.3107909858226776, + "learning_rate": 0.00018507205320852325, + "loss": 0.3617, + "step": 611 + }, + { + "epoch": 0.2259970457902511, + "grad_norm": 0.2475201040506363, + "learning_rate": 0.00018504741963295973, + "loss": 0.3791, + "step": 612 + }, + { + "epoch": 0.22636632200886264, + "grad_norm": 0.31140202283859253, + "learning_rate": 0.00018502278605739625, + "loss": 0.3911, + "step": 613 + }, + { + "epoch": 0.22673559822747416, + "grad_norm": 0.25788649916648865, + "learning_rate": 0.00018499815248183274, + "loss": 0.3229, + "step": 614 + }, + { + "epoch": 0.22710487444608568, + "grad_norm": 0.2878996431827545, + "learning_rate": 0.00018497351890626925, + "loss": 0.3463, + "step": 615 + }, + { + "epoch": 0.2274741506646972, + "grad_norm": 0.3253519535064697, + "learning_rate": 0.00018494888533070577, + "loss": 0.3739, + "step": 616 + }, + { + "epoch": 0.22784342688330872, + "grad_norm": 0.2856617271900177, + "learning_rate": 0.00018492425175514228, + "loss": 0.3758, + "step": 617 + }, + { + "epoch": 0.22821270310192024, + "grad_norm": 0.30674973130226135, + "learning_rate": 0.00018489961817957877, + "loss": 0.3559, + "step": 618 + }, + { + "epoch": 0.22858197932053176, + "grad_norm": 0.2839246392250061, + "learning_rate": 0.00018487498460401528, + "loss": 0.3162, + "step": 619 + }, + { + "epoch": 0.22895125553914328, + "grad_norm": 0.27280157804489136, + "learning_rate": 0.0001848503510284518, + "loss": 0.2978, + "step": 620 + }, + { + "epoch": 0.2293205317577548, + "grad_norm": 0.337427020072937, + "learning_rate": 0.0001848257174528883, + "loss": 0.4273, + "step": 621 + }, + { + "epoch": 0.22968980797636632, + "grad_norm": 0.31877660751342773, + "learning_rate": 0.0001848010838773248, + "loss": 0.383, + "step": 622 + }, + { + "epoch": 0.23005908419497784, + "grad_norm": 0.2836788296699524, + "learning_rate": 0.0001847764503017613, + "loss": 0.3476, + "step": 623 + }, + { + "epoch": 0.23042836041358936, + "grad_norm": 0.25409194827079773, + "learning_rate": 0.0001847518167261978, + "loss": 0.3265, + "step": 624 + }, + { + "epoch": 0.23079763663220088, + "grad_norm": 0.2683696150779724, + "learning_rate": 0.00018472718315063434, + "loss": 0.3515, + "step": 625 + }, + { + "epoch": 0.2311669128508124, + "grad_norm": 0.24589329957962036, + "learning_rate": 0.00018470254957507083, + "loss": 0.2713, + "step": 626 + }, + { + "epoch": 0.23153618906942394, + "grad_norm": 0.263883501291275, + "learning_rate": 0.00018467791599950734, + "loss": 0.313, + "step": 627 + }, + { + "epoch": 0.23190546528803546, + "grad_norm": 0.23859520256519318, + "learning_rate": 0.00018465328242394383, + "loss": 0.2742, + "step": 628 + }, + { + "epoch": 0.23227474150664698, + "grad_norm": 0.3602641820907593, + "learning_rate": 0.00018462864884838035, + "loss": 0.3939, + "step": 629 + }, + { + "epoch": 0.2326440177252585, + "grad_norm": 0.31765761971473694, + "learning_rate": 0.00018460401527281686, + "loss": 0.4711, + "step": 630 + }, + { + "epoch": 0.23301329394387002, + "grad_norm": 0.3336300551891327, + "learning_rate": 0.00018457938169725338, + "loss": 0.3929, + "step": 631 + }, + { + "epoch": 0.23338257016248154, + "grad_norm": 0.26948854327201843, + "learning_rate": 0.00018455474812168986, + "loss": 0.2945, + "step": 632 + }, + { + "epoch": 0.23375184638109306, + "grad_norm": 0.25854113698005676, + "learning_rate": 0.00018453011454612638, + "loss": 0.364, + "step": 633 + }, + { + "epoch": 0.23412112259970458, + "grad_norm": 0.3249545395374298, + "learning_rate": 0.0001845054809705629, + "loss": 0.4313, + "step": 634 + }, + { + "epoch": 0.2344903988183161, + "grad_norm": 0.26184847950935364, + "learning_rate": 0.0001844808473949994, + "loss": 0.3484, + "step": 635 + }, + { + "epoch": 0.23485967503692762, + "grad_norm": 0.27996307611465454, + "learning_rate": 0.0001844562138194359, + "loss": 0.3321, + "step": 636 + }, + { + "epoch": 0.23522895125553914, + "grad_norm": 0.2890424132347107, + "learning_rate": 0.0001844315802438724, + "loss": 0.3298, + "step": 637 + }, + { + "epoch": 0.23559822747415066, + "grad_norm": 0.25146883726119995, + "learning_rate": 0.0001844069466683089, + "loss": 0.3076, + "step": 638 + }, + { + "epoch": 0.23596750369276218, + "grad_norm": 0.36291494965553284, + "learning_rate": 0.00018438231309274544, + "loss": 0.45, + "step": 639 + }, + { + "epoch": 0.2363367799113737, + "grad_norm": 0.28212517499923706, + "learning_rate": 0.00018435767951718193, + "loss": 0.3174, + "step": 640 + }, + { + "epoch": 0.23670605612998524, + "grad_norm": 0.2811421751976013, + "learning_rate": 0.00018433304594161844, + "loss": 0.3551, + "step": 641 + }, + { + "epoch": 0.23707533234859676, + "grad_norm": 0.26826873421669006, + "learning_rate": 0.00018430841236605493, + "loss": 0.3482, + "step": 642 + }, + { + "epoch": 0.23744460856720828, + "grad_norm": 0.288368821144104, + "learning_rate": 0.00018428377879049144, + "loss": 0.3249, + "step": 643 + }, + { + "epoch": 0.2378138847858198, + "grad_norm": 0.2753399610519409, + "learning_rate": 0.00018425914521492796, + "loss": 0.3572, + "step": 644 + }, + { + "epoch": 0.23818316100443132, + "grad_norm": 0.25386354327201843, + "learning_rate": 0.00018423451163936447, + "loss": 0.2843, + "step": 645 + }, + { + "epoch": 0.23855243722304284, + "grad_norm": 0.26572418212890625, + "learning_rate": 0.00018420987806380096, + "loss": 0.337, + "step": 646 + }, + { + "epoch": 0.23892171344165436, + "grad_norm": 0.36823752522468567, + "learning_rate": 0.00018418524448823748, + "loss": 0.415, + "step": 647 + }, + { + "epoch": 0.23929098966026588, + "grad_norm": 0.3368772268295288, + "learning_rate": 0.000184160610912674, + "loss": 0.3858, + "step": 648 + }, + { + "epoch": 0.2396602658788774, + "grad_norm": 0.4017770290374756, + "learning_rate": 0.0001841359773371105, + "loss": 0.3018, + "step": 649 + }, + { + "epoch": 0.24002954209748892, + "grad_norm": 0.22857922315597534, + "learning_rate": 0.000184111343761547, + "loss": 0.267, + "step": 650 + }, + { + "epoch": 0.24002954209748892, + "eval_loss": 7.790600776672363, + "eval_runtime": 6.9163, + "eval_samples_per_second": 7.229, + "eval_steps_per_second": 1.012, + "step": 650 + }, + { + "epoch": 0.24039881831610044, + "grad_norm": 0.293861448764801, + "learning_rate": 0.0001840867101859835, + "loss": 0.2774, + "step": 651 + }, + { + "epoch": 0.24076809453471196, + "grad_norm": 0.26243409514427185, + "learning_rate": 0.00018406207661042002, + "loss": 0.325, + "step": 652 + }, + { + "epoch": 0.24113737075332348, + "grad_norm": 0.30947738885879517, + "learning_rate": 0.00018403744303485654, + "loss": 0.3486, + "step": 653 + }, + { + "epoch": 0.241506646971935, + "grad_norm": 0.2856808602809906, + "learning_rate": 0.00018401280945929302, + "loss": 0.4366, + "step": 654 + }, + { + "epoch": 0.24187592319054652, + "grad_norm": 0.259084016084671, + "learning_rate": 0.00018398817588372954, + "loss": 0.3209, + "step": 655 + }, + { + "epoch": 0.24224519940915806, + "grad_norm": 0.25716495513916016, + "learning_rate": 0.00018396354230816603, + "loss": 0.3055, + "step": 656 + }, + { + "epoch": 0.24261447562776958, + "grad_norm": 0.2595136761665344, + "learning_rate": 0.00018393890873260257, + "loss": 0.3161, + "step": 657 + }, + { + "epoch": 0.2429837518463811, + "grad_norm": 0.3223538100719452, + "learning_rate": 0.00018391427515703905, + "loss": 0.4696, + "step": 658 + }, + { + "epoch": 0.24335302806499262, + "grad_norm": 0.28250429034233093, + "learning_rate": 0.00018388964158147557, + "loss": 0.366, + "step": 659 + }, + { + "epoch": 0.24372230428360414, + "grad_norm": 0.2720170319080353, + "learning_rate": 0.00018386500800591206, + "loss": 0.3066, + "step": 660 + }, + { + "epoch": 0.24409158050221566, + "grad_norm": 0.2541826367378235, + "learning_rate": 0.00018384037443034857, + "loss": 0.3227, + "step": 661 + }, + { + "epoch": 0.24446085672082718, + "grad_norm": 0.39735937118530273, + "learning_rate": 0.00018381574085478509, + "loss": 0.4538, + "step": 662 + }, + { + "epoch": 0.2448301329394387, + "grad_norm": 0.29610827565193176, + "learning_rate": 0.0001837911072792216, + "loss": 0.3878, + "step": 663 + }, + { + "epoch": 0.24519940915805022, + "grad_norm": 0.2725978195667267, + "learning_rate": 0.0001837664737036581, + "loss": 0.3051, + "step": 664 + }, + { + "epoch": 0.24556868537666174, + "grad_norm": 0.3400838077068329, + "learning_rate": 0.0001837418401280946, + "loss": 0.4011, + "step": 665 + }, + { + "epoch": 0.24593796159527326, + "grad_norm": 0.26927271485328674, + "learning_rate": 0.00018371720655253112, + "loss": 0.3139, + "step": 666 + }, + { + "epoch": 0.24630723781388478, + "grad_norm": 0.31705546379089355, + "learning_rate": 0.00018369257297696763, + "loss": 0.3719, + "step": 667 + }, + { + "epoch": 0.2466765140324963, + "grad_norm": 0.24875083565711975, + "learning_rate": 0.00018366793940140412, + "loss": 0.3054, + "step": 668 + }, + { + "epoch": 0.24704579025110782, + "grad_norm": 0.46187645196914673, + "learning_rate": 0.00018364330582584063, + "loss": 0.2764, + "step": 669 + }, + { + "epoch": 0.24741506646971936, + "grad_norm": 0.2528071105480194, + "learning_rate": 0.00018361867225027712, + "loss": 0.3232, + "step": 670 + }, + { + "epoch": 0.24778434268833088, + "grad_norm": 0.2717245817184448, + "learning_rate": 0.00018359403867471366, + "loss": 0.3097, + "step": 671 + }, + { + "epoch": 0.2481536189069424, + "grad_norm": 0.24125029146671295, + "learning_rate": 0.00018356940509915015, + "loss": 0.2983, + "step": 672 + }, + { + "epoch": 0.24852289512555392, + "grad_norm": 0.2709982097148895, + "learning_rate": 0.00018354477152358667, + "loss": 0.3068, + "step": 673 + }, + { + "epoch": 0.24889217134416544, + "grad_norm": 0.26097947359085083, + "learning_rate": 0.00018352013794802315, + "loss": 0.2994, + "step": 674 + }, + { + "epoch": 0.24926144756277696, + "grad_norm": 0.2783623933792114, + "learning_rate": 0.00018349550437245967, + "loss": 0.3492, + "step": 675 + }, + { + "epoch": 0.24963072378138848, + "grad_norm": 0.26279401779174805, + "learning_rate": 0.00018347087079689618, + "loss": 0.2945, + "step": 676 + }, + { + "epoch": 0.25, + "grad_norm": 0.3207119107246399, + "learning_rate": 0.0001834462372213327, + "loss": 0.3704, + "step": 677 + }, + { + "epoch": 0.25036927621861155, + "grad_norm": 0.25572821497917175, + "learning_rate": 0.00018342160364576918, + "loss": 0.323, + "step": 678 + }, + { + "epoch": 0.25073855243722304, + "grad_norm": 0.29006001353263855, + "learning_rate": 0.0001833969700702057, + "loss": 0.3518, + "step": 679 + }, + { + "epoch": 0.2511078286558346, + "grad_norm": 0.3809874951839447, + "learning_rate": 0.00018337233649464221, + "loss": 0.4361, + "step": 680 + }, + { + "epoch": 0.2514771048744461, + "grad_norm": 0.3497256934642792, + "learning_rate": 0.00018334770291907873, + "loss": 0.3276, + "step": 681 + }, + { + "epoch": 0.2518463810930576, + "grad_norm": 0.2930906414985657, + "learning_rate": 0.00018332306934351522, + "loss": 0.3754, + "step": 682 + }, + { + "epoch": 0.2522156573116691, + "grad_norm": 0.2927539050579071, + "learning_rate": 0.00018329843576795173, + "loss": 0.4167, + "step": 683 + }, + { + "epoch": 0.25258493353028066, + "grad_norm": 0.27059948444366455, + "learning_rate": 0.00018327380219238825, + "loss": 0.3208, + "step": 684 + }, + { + "epoch": 0.25295420974889216, + "grad_norm": 0.3991597592830658, + "learning_rate": 0.00018324916861682476, + "loss": 0.4825, + "step": 685 + }, + { + "epoch": 0.2533234859675037, + "grad_norm": 0.24698469042778015, + "learning_rate": 0.00018322453504126125, + "loss": 0.3237, + "step": 686 + }, + { + "epoch": 0.2536927621861152, + "grad_norm": 0.3192894756793976, + "learning_rate": 0.00018319990146569776, + "loss": 0.3818, + "step": 687 + }, + { + "epoch": 0.25406203840472674, + "grad_norm": 0.2911951243877411, + "learning_rate": 0.00018317526789013425, + "loss": 0.3657, + "step": 688 + }, + { + "epoch": 0.25443131462333823, + "grad_norm": 0.23214633762836456, + "learning_rate": 0.0001831506343145708, + "loss": 0.3537, + "step": 689 + }, + { + "epoch": 0.2548005908419498, + "grad_norm": 0.2587379515171051, + "learning_rate": 0.00018312600073900728, + "loss": 0.3142, + "step": 690 + }, + { + "epoch": 0.2551698670605613, + "grad_norm": 0.2375342845916748, + "learning_rate": 0.0001831013671634438, + "loss": 0.281, + "step": 691 + }, + { + "epoch": 0.2555391432791728, + "grad_norm": 0.2895774245262146, + "learning_rate": 0.00018307673358788028, + "loss": 0.3247, + "step": 692 + }, + { + "epoch": 0.25590841949778437, + "grad_norm": 0.27866557240486145, + "learning_rate": 0.0001830521000123168, + "loss": 0.3159, + "step": 693 + }, + { + "epoch": 0.25627769571639586, + "grad_norm": 0.2720243036746979, + "learning_rate": 0.0001830274664367533, + "loss": 0.3723, + "step": 694 + }, + { + "epoch": 0.2566469719350074, + "grad_norm": 0.301096111536026, + "learning_rate": 0.00018300283286118983, + "loss": 0.3225, + "step": 695 + }, + { + "epoch": 0.2570162481536189, + "grad_norm": 0.28419265151023865, + "learning_rate": 0.0001829781992856263, + "loss": 0.3678, + "step": 696 + }, + { + "epoch": 0.25738552437223045, + "grad_norm": 0.25694453716278076, + "learning_rate": 0.00018295356571006283, + "loss": 0.2579, + "step": 697 + }, + { + "epoch": 0.25775480059084194, + "grad_norm": 0.27401110529899597, + "learning_rate": 0.00018292893213449934, + "loss": 0.315, + "step": 698 + }, + { + "epoch": 0.2581240768094535, + "grad_norm": 0.34122467041015625, + "learning_rate": 0.00018290429855893586, + "loss": 0.4364, + "step": 699 + }, + { + "epoch": 0.258493353028065, + "grad_norm": 0.28709614276885986, + "learning_rate": 0.00018287966498337234, + "loss": 0.3614, + "step": 700 + }, + { + "epoch": 0.258493353028065, + "eval_loss": 7.569185733795166, + "eval_runtime": 6.9707, + "eval_samples_per_second": 7.173, + "eval_steps_per_second": 1.004, + "step": 700 + }, + { + "epoch": 0.2588626292466765, + "grad_norm": 0.3032025992870331, + "learning_rate": 0.00018285503140780883, + "loss": 0.3466, + "step": 701 + }, + { + "epoch": 0.259231905465288, + "grad_norm": 0.28988832235336304, + "learning_rate": 0.00018283039783224535, + "loss": 0.3816, + "step": 702 + }, + { + "epoch": 0.25960118168389956, + "grad_norm": 0.2672080993652344, + "learning_rate": 0.00018280576425668186, + "loss": 0.345, + "step": 703 + }, + { + "epoch": 0.25997045790251105, + "grad_norm": 0.3715870976448059, + "learning_rate": 0.00018278113068111838, + "loss": 0.3808, + "step": 704 + }, + { + "epoch": 0.2603397341211226, + "grad_norm": 0.26245784759521484, + "learning_rate": 0.00018275649710555486, + "loss": 0.3229, + "step": 705 + }, + { + "epoch": 0.26070901033973415, + "grad_norm": 0.2798522412776947, + "learning_rate": 0.00018273186352999138, + "loss": 0.242, + "step": 706 + }, + { + "epoch": 0.26107828655834564, + "grad_norm": 0.3050256073474884, + "learning_rate": 0.0001827072299544279, + "loss": 0.3203, + "step": 707 + }, + { + "epoch": 0.2614475627769572, + "grad_norm": 0.25486963987350464, + "learning_rate": 0.0001826825963788644, + "loss": 0.3702, + "step": 708 + }, + { + "epoch": 0.2618168389955687, + "grad_norm": 0.2902330756187439, + "learning_rate": 0.0001826579628033009, + "loss": 0.3138, + "step": 709 + }, + { + "epoch": 0.2621861152141802, + "grad_norm": 0.2677933871746063, + "learning_rate": 0.0001826333292277374, + "loss": 0.3324, + "step": 710 + }, + { + "epoch": 0.2625553914327917, + "grad_norm": 0.306822270154953, + "learning_rate": 0.00018260869565217392, + "loss": 0.2795, + "step": 711 + }, + { + "epoch": 0.26292466765140327, + "grad_norm": 0.2645772695541382, + "learning_rate": 0.00018258406207661044, + "loss": 0.2773, + "step": 712 + }, + { + "epoch": 0.26329394387001476, + "grad_norm": 0.28124552965164185, + "learning_rate": 0.00018255942850104693, + "loss": 0.3348, + "step": 713 + }, + { + "epoch": 0.2636632200886263, + "grad_norm": 0.2847256362438202, + "learning_rate": 0.00018253479492548344, + "loss": 0.3276, + "step": 714 + }, + { + "epoch": 0.2640324963072378, + "grad_norm": 0.3100880980491638, + "learning_rate": 0.00018251016134991993, + "loss": 0.3278, + "step": 715 + }, + { + "epoch": 0.26440177252584934, + "grad_norm": 0.24322248995304108, + "learning_rate": 0.00018248552777435647, + "loss": 0.3122, + "step": 716 + }, + { + "epoch": 0.26477104874446084, + "grad_norm": 0.31305819749832153, + "learning_rate": 0.00018246089419879296, + "loss": 0.4232, + "step": 717 + }, + { + "epoch": 0.2651403249630724, + "grad_norm": 0.24644005298614502, + "learning_rate": 0.00018243626062322947, + "loss": 0.2839, + "step": 718 + }, + { + "epoch": 0.2655096011816839, + "grad_norm": 0.28868818283081055, + "learning_rate": 0.00018241162704766596, + "loss": 0.3434, + "step": 719 + }, + { + "epoch": 0.2658788774002954, + "grad_norm": 0.3566708266735077, + "learning_rate": 0.00018238699347210247, + "loss": 0.3674, + "step": 720 + }, + { + "epoch": 0.26624815361890697, + "grad_norm": 0.285990446805954, + "learning_rate": 0.000182362359896539, + "loss": 0.3202, + "step": 721 + }, + { + "epoch": 0.26661742983751846, + "grad_norm": 0.28163185715675354, + "learning_rate": 0.0001823377263209755, + "loss": 0.3235, + "step": 722 + }, + { + "epoch": 0.26698670605613, + "grad_norm": 0.30181750655174255, + "learning_rate": 0.000182313092745412, + "loss": 0.3769, + "step": 723 + }, + { + "epoch": 0.2673559822747415, + "grad_norm": 0.29629477858543396, + "learning_rate": 0.0001822884591698485, + "loss": 0.4508, + "step": 724 + }, + { + "epoch": 0.26772525849335305, + "grad_norm": 0.30531978607177734, + "learning_rate": 0.00018226382559428502, + "loss": 0.3721, + "step": 725 + }, + { + "epoch": 0.26809453471196454, + "grad_norm": 0.2935096323490143, + "learning_rate": 0.00018223919201872154, + "loss": 0.4026, + "step": 726 + }, + { + "epoch": 0.2684638109305761, + "grad_norm": 0.2841987609863281, + "learning_rate": 0.00018221455844315802, + "loss": 0.3567, + "step": 727 + }, + { + "epoch": 0.2688330871491876, + "grad_norm": 0.26891282200813293, + "learning_rate": 0.00018218992486759454, + "loss": 0.2895, + "step": 728 + }, + { + "epoch": 0.2692023633677991, + "grad_norm": 0.2891632616519928, + "learning_rate": 0.00018216529129203103, + "loss": 0.3791, + "step": 729 + }, + { + "epoch": 0.2695716395864106, + "grad_norm": 0.2506559193134308, + "learning_rate": 0.00018214065771646757, + "loss": 0.3148, + "step": 730 + }, + { + "epoch": 0.26994091580502216, + "grad_norm": 0.26931077241897583, + "learning_rate": 0.00018211602414090405, + "loss": 0.2893, + "step": 731 + }, + { + "epoch": 0.27031019202363366, + "grad_norm": 0.25128957629203796, + "learning_rate": 0.00018209139056534057, + "loss": 0.2796, + "step": 732 + }, + { + "epoch": 0.2706794682422452, + "grad_norm": 0.26875555515289307, + "learning_rate": 0.00018206675698977706, + "loss": 0.3373, + "step": 733 + }, + { + "epoch": 0.2710487444608567, + "grad_norm": 0.2901384234428406, + "learning_rate": 0.00018204212341421357, + "loss": 0.3636, + "step": 734 + }, + { + "epoch": 0.27141802067946824, + "grad_norm": 0.5337287783622742, + "learning_rate": 0.00018201748983865009, + "loss": 0.3672, + "step": 735 + }, + { + "epoch": 0.2717872968980798, + "grad_norm": 0.2673599421977997, + "learning_rate": 0.0001819928562630866, + "loss": 0.3151, + "step": 736 + }, + { + "epoch": 0.2721565731166913, + "grad_norm": 0.3110290467739105, + "learning_rate": 0.0001819682226875231, + "loss": 0.364, + "step": 737 + }, + { + "epoch": 0.2725258493353028, + "grad_norm": 0.27627620100975037, + "learning_rate": 0.0001819435891119596, + "loss": 0.3297, + "step": 738 + }, + { + "epoch": 0.2728951255539143, + "grad_norm": 0.38521748781204224, + "learning_rate": 0.00018191895553639612, + "loss": 0.3879, + "step": 739 + }, + { + "epoch": 0.27326440177252587, + "grad_norm": 0.30822885036468506, + "learning_rate": 0.00018189432196083263, + "loss": 0.3561, + "step": 740 + }, + { + "epoch": 0.27363367799113736, + "grad_norm": 0.32634204626083374, + "learning_rate": 0.00018186968838526912, + "loss": 0.3153, + "step": 741 + }, + { + "epoch": 0.2740029542097489, + "grad_norm": 0.2870141565799713, + "learning_rate": 0.00018184505480970563, + "loss": 0.3658, + "step": 742 + }, + { + "epoch": 0.2743722304283604, + "grad_norm": 0.36711204051971436, + "learning_rate": 0.00018182042123414215, + "loss": 0.4398, + "step": 743 + }, + { + "epoch": 0.27474150664697194, + "grad_norm": 0.29540640115737915, + "learning_rate": 0.00018179578765857866, + "loss": 0.3993, + "step": 744 + }, + { + "epoch": 0.27511078286558344, + "grad_norm": 0.306117445230484, + "learning_rate": 0.00018177115408301515, + "loss": 0.4187, + "step": 745 + }, + { + "epoch": 0.275480059084195, + "grad_norm": 0.28601935505867004, + "learning_rate": 0.00018174652050745167, + "loss": 0.3492, + "step": 746 + }, + { + "epoch": 0.2758493353028065, + "grad_norm": 0.2790848910808563, + "learning_rate": 0.00018172188693188815, + "loss": 0.2995, + "step": 747 + }, + { + "epoch": 0.276218611521418, + "grad_norm": 0.3157104551792145, + "learning_rate": 0.0001816972533563247, + "loss": 0.3469, + "step": 748 + }, + { + "epoch": 0.2765878877400295, + "grad_norm": 0.3087219297885895, + "learning_rate": 0.00018167261978076118, + "loss": 0.3602, + "step": 749 + }, + { + "epoch": 0.27695716395864106, + "grad_norm": 0.29983654618263245, + "learning_rate": 0.0001816479862051977, + "loss": 0.3561, + "step": 750 + }, + { + "epoch": 0.27695716395864106, + "eval_loss": 7.710749626159668, + "eval_runtime": 6.9179, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 1.012, + "step": 750 + }, + { + "epoch": 0.2773264401772526, + "grad_norm": 0.2899138331413269, + "learning_rate": 0.00018162335262963418, + "loss": 0.3655, + "step": 751 + }, + { + "epoch": 0.2776957163958641, + "grad_norm": 0.2892729640007019, + "learning_rate": 0.0001815987190540707, + "loss": 0.3775, + "step": 752 + }, + { + "epoch": 0.27806499261447565, + "grad_norm": 0.21820051968097687, + "learning_rate": 0.00018157408547850721, + "loss": 0.2736, + "step": 753 + }, + { + "epoch": 0.27843426883308714, + "grad_norm": 0.28620773553848267, + "learning_rate": 0.00018154945190294373, + "loss": 0.401, + "step": 754 + }, + { + "epoch": 0.2788035450516987, + "grad_norm": 0.28439512848854065, + "learning_rate": 0.00018152481832738022, + "loss": 0.321, + "step": 755 + }, + { + "epoch": 0.2791728212703102, + "grad_norm": 0.2741285562515259, + "learning_rate": 0.00018150018475181673, + "loss": 0.3478, + "step": 756 + }, + { + "epoch": 0.2795420974889217, + "grad_norm": 0.26128458976745605, + "learning_rate": 0.00018147555117625325, + "loss": 0.3155, + "step": 757 + }, + { + "epoch": 0.2799113737075332, + "grad_norm": 0.2934739589691162, + "learning_rate": 0.00018145091760068976, + "loss": 0.4206, + "step": 758 + }, + { + "epoch": 0.28028064992614476, + "grad_norm": 0.2968691885471344, + "learning_rate": 0.00018142628402512625, + "loss": 0.434, + "step": 759 + }, + { + "epoch": 0.28064992614475626, + "grad_norm": 0.2692506015300751, + "learning_rate": 0.00018140165044956276, + "loss": 0.3097, + "step": 760 + }, + { + "epoch": 0.2810192023633678, + "grad_norm": 0.33005863428115845, + "learning_rate": 0.00018137701687399925, + "loss": 0.3176, + "step": 761 + }, + { + "epoch": 0.2813884785819793, + "grad_norm": 0.26975953578948975, + "learning_rate": 0.0001813523832984358, + "loss": 0.3404, + "step": 762 + }, + { + "epoch": 0.28175775480059084, + "grad_norm": 0.3143404424190521, + "learning_rate": 0.00018132774972287228, + "loss": 0.3582, + "step": 763 + }, + { + "epoch": 0.2821270310192024, + "grad_norm": 0.2800186574459076, + "learning_rate": 0.0001813031161473088, + "loss": 0.3268, + "step": 764 + }, + { + "epoch": 0.2824963072378139, + "grad_norm": 0.2704242169857025, + "learning_rate": 0.00018127848257174528, + "loss": 0.3449, + "step": 765 + }, + { + "epoch": 0.28286558345642543, + "grad_norm": 0.27032992243766785, + "learning_rate": 0.0001812538489961818, + "loss": 0.3438, + "step": 766 + }, + { + "epoch": 0.2832348596750369, + "grad_norm": 0.43337303400039673, + "learning_rate": 0.0001812292154206183, + "loss": 0.3398, + "step": 767 + }, + { + "epoch": 0.28360413589364847, + "grad_norm": 0.439068466424942, + "learning_rate": 0.00018120458184505482, + "loss": 0.3659, + "step": 768 + }, + { + "epoch": 0.28397341211225996, + "grad_norm": 0.24385535717010498, + "learning_rate": 0.0001811799482694913, + "loss": 0.2749, + "step": 769 + }, + { + "epoch": 0.2843426883308715, + "grad_norm": 0.3012402653694153, + "learning_rate": 0.00018115531469392783, + "loss": 0.35, + "step": 770 + }, + { + "epoch": 0.284711964549483, + "grad_norm": 0.293885737657547, + "learning_rate": 0.00018113068111836434, + "loss": 0.3521, + "step": 771 + }, + { + "epoch": 0.28508124076809455, + "grad_norm": 0.26932981610298157, + "learning_rate": 0.00018110604754280086, + "loss": 0.3864, + "step": 772 + }, + { + "epoch": 0.28545051698670604, + "grad_norm": 0.2916109561920166, + "learning_rate": 0.00018108141396723734, + "loss": 0.383, + "step": 773 + }, + { + "epoch": 0.2858197932053176, + "grad_norm": 0.3528326153755188, + "learning_rate": 0.00018105678039167386, + "loss": 0.377, + "step": 774 + }, + { + "epoch": 0.2861890694239291, + "grad_norm": 0.23773305118083954, + "learning_rate": 0.00018103214681611035, + "loss": 0.2861, + "step": 775 + }, + { + "epoch": 0.2865583456425406, + "grad_norm": 0.3272114396095276, + "learning_rate": 0.0001810075132405469, + "loss": 0.3615, + "step": 776 + }, + { + "epoch": 0.2869276218611521, + "grad_norm": 0.3330075740814209, + "learning_rate": 0.00018098287966498338, + "loss": 0.3752, + "step": 777 + }, + { + "epoch": 0.28729689807976366, + "grad_norm": 0.3110154867172241, + "learning_rate": 0.0001809582460894199, + "loss": 0.3512, + "step": 778 + }, + { + "epoch": 0.2876661742983752, + "grad_norm": 0.30666112899780273, + "learning_rate": 0.00018093361251385638, + "loss": 0.3613, + "step": 779 + }, + { + "epoch": 0.2880354505169867, + "grad_norm": 0.3322407603263855, + "learning_rate": 0.0001809089789382929, + "loss": 0.3785, + "step": 780 + }, + { + "epoch": 0.28840472673559825, + "grad_norm": 0.27546021342277527, + "learning_rate": 0.0001808843453627294, + "loss": 0.308, + "step": 781 + }, + { + "epoch": 0.28877400295420974, + "grad_norm": 0.2651527225971222, + "learning_rate": 0.00018085971178716592, + "loss": 0.3074, + "step": 782 + }, + { + "epoch": 0.2891432791728213, + "grad_norm": 0.2930489480495453, + "learning_rate": 0.0001808350782116024, + "loss": 0.4028, + "step": 783 + }, + { + "epoch": 0.2895125553914328, + "grad_norm": 0.2625216245651245, + "learning_rate": 0.00018081044463603892, + "loss": 0.3048, + "step": 784 + }, + { + "epoch": 0.2898818316100443, + "grad_norm": 0.3637246787548065, + "learning_rate": 0.00018078581106047544, + "loss": 0.4241, + "step": 785 + }, + { + "epoch": 0.2902511078286558, + "grad_norm": 0.26004502177238464, + "learning_rate": 0.00018076117748491195, + "loss": 0.3047, + "step": 786 + }, + { + "epoch": 0.29062038404726737, + "grad_norm": 0.273285448551178, + "learning_rate": 0.00018073654390934844, + "loss": 0.3314, + "step": 787 + }, + { + "epoch": 0.29098966026587886, + "grad_norm": 0.2540383040904999, + "learning_rate": 0.00018071191033378496, + "loss": 0.3339, + "step": 788 + }, + { + "epoch": 0.2913589364844904, + "grad_norm": 0.284517377614975, + "learning_rate": 0.00018068727675822147, + "loss": 0.4181, + "step": 789 + }, + { + "epoch": 0.2917282127031019, + "grad_norm": 0.2687482237815857, + "learning_rate": 0.00018066264318265798, + "loss": 0.3643, + "step": 790 + }, + { + "epoch": 0.29209748892171344, + "grad_norm": 0.26633602380752563, + "learning_rate": 0.00018063800960709447, + "loss": 0.3483, + "step": 791 + }, + { + "epoch": 0.29246676514032494, + "grad_norm": 0.2679583728313446, + "learning_rate": 0.000180613376031531, + "loss": 0.3356, + "step": 792 + }, + { + "epoch": 0.2928360413589365, + "grad_norm": 0.2590503990650177, + "learning_rate": 0.00018058874245596747, + "loss": 0.2984, + "step": 793 + }, + { + "epoch": 0.29320531757754803, + "grad_norm": 0.2677006125450134, + "learning_rate": 0.00018056410888040402, + "loss": 0.3222, + "step": 794 + }, + { + "epoch": 0.2935745937961595, + "grad_norm": 0.27977511286735535, + "learning_rate": 0.0001805394753048405, + "loss": 0.3224, + "step": 795 + }, + { + "epoch": 0.29394387001477107, + "grad_norm": 0.28381040692329407, + "learning_rate": 0.00018051484172927702, + "loss": 0.3501, + "step": 796 + }, + { + "epoch": 0.29431314623338256, + "grad_norm": 0.24731963872909546, + "learning_rate": 0.0001804902081537135, + "loss": 0.2848, + "step": 797 + }, + { + "epoch": 0.2946824224519941, + "grad_norm": 0.28228527307510376, + "learning_rate": 0.00018046557457815002, + "loss": 0.3254, + "step": 798 + }, + { + "epoch": 0.2950516986706056, + "grad_norm": 0.3394302725791931, + "learning_rate": 0.00018044094100258653, + "loss": 0.3856, + "step": 799 + }, + { + "epoch": 0.29542097488921715, + "grad_norm": 0.24118614196777344, + "learning_rate": 0.00018041630742702305, + "loss": 0.3632, + "step": 800 + }, + { + "epoch": 0.29542097488921715, + "eval_loss": 7.635326385498047, + "eval_runtime": 7.5047, + "eval_samples_per_second": 6.662, + "eval_steps_per_second": 0.933, + "step": 800 + }, + { + "epoch": 0.29579025110782864, + "grad_norm": 0.2576707899570465, + "learning_rate": 0.00018039167385145954, + "loss": 0.3001, + "step": 801 + }, + { + "epoch": 0.2961595273264402, + "grad_norm": 0.27399665117263794, + "learning_rate": 0.00018036704027589605, + "loss": 0.3208, + "step": 802 + }, + { + "epoch": 0.2965288035450517, + "grad_norm": 0.30259624123573303, + "learning_rate": 0.00018034240670033257, + "loss": 0.354, + "step": 803 + }, + { + "epoch": 0.2968980797636632, + "grad_norm": 0.3033263087272644, + "learning_rate": 0.00018031777312476908, + "loss": 0.3647, + "step": 804 + }, + { + "epoch": 0.2972673559822747, + "grad_norm": 0.26793143153190613, + "learning_rate": 0.00018029313954920557, + "loss": 0.3536, + "step": 805 + }, + { + "epoch": 0.29763663220088626, + "grad_norm": 0.29811224341392517, + "learning_rate": 0.00018026850597364208, + "loss": 0.384, + "step": 806 + }, + { + "epoch": 0.2980059084194978, + "grad_norm": 0.2861284613609314, + "learning_rate": 0.00018024387239807857, + "loss": 0.3413, + "step": 807 + }, + { + "epoch": 0.2983751846381093, + "grad_norm": 0.2971009910106659, + "learning_rate": 0.0001802192388225151, + "loss": 0.3797, + "step": 808 + }, + { + "epoch": 0.29874446085672085, + "grad_norm": 0.2787891924381256, + "learning_rate": 0.0001801946052469516, + "loss": 0.3903, + "step": 809 + }, + { + "epoch": 0.29911373707533234, + "grad_norm": 0.26950135827064514, + "learning_rate": 0.00018016997167138811, + "loss": 0.3221, + "step": 810 + }, + { + "epoch": 0.2994830132939439, + "grad_norm": 0.27545055747032166, + "learning_rate": 0.0001801453380958246, + "loss": 0.3258, + "step": 811 + }, + { + "epoch": 0.2998522895125554, + "grad_norm": 0.25140896439552307, + "learning_rate": 0.00018012070452026112, + "loss": 0.315, + "step": 812 + }, + { + "epoch": 0.30022156573116693, + "grad_norm": 0.26844027638435364, + "learning_rate": 0.00018009607094469763, + "loss": 0.3037, + "step": 813 + }, + { + "epoch": 0.3005908419497784, + "grad_norm": 0.27014753222465515, + "learning_rate": 0.00018007143736913415, + "loss": 0.3642, + "step": 814 + }, + { + "epoch": 0.30096011816838997, + "grad_norm": 0.2609260380268097, + "learning_rate": 0.00018004680379357063, + "loss": 0.255, + "step": 815 + }, + { + "epoch": 0.30132939438700146, + "grad_norm": 0.23914338648319244, + "learning_rate": 0.00018002217021800715, + "loss": 0.2754, + "step": 816 + }, + { + "epoch": 0.301698670605613, + "grad_norm": 0.26947325468063354, + "learning_rate": 0.00017999753664244366, + "loss": 0.3436, + "step": 817 + }, + { + "epoch": 0.3020679468242245, + "grad_norm": 0.3559544086456299, + "learning_rate": 0.00017997290306688018, + "loss": 0.4625, + "step": 818 + }, + { + "epoch": 0.30243722304283605, + "grad_norm": 0.3010619580745697, + "learning_rate": 0.00017994826949131666, + "loss": 0.336, + "step": 819 + }, + { + "epoch": 0.30280649926144754, + "grad_norm": 0.29284095764160156, + "learning_rate": 0.00017992363591575318, + "loss": 0.3209, + "step": 820 + }, + { + "epoch": 0.3031757754800591, + "grad_norm": 0.24582554399967194, + "learning_rate": 0.0001798990023401897, + "loss": 0.2847, + "step": 821 + }, + { + "epoch": 0.30354505169867063, + "grad_norm": 0.235111266374588, + "learning_rate": 0.0001798743687646262, + "loss": 0.3022, + "step": 822 + }, + { + "epoch": 0.3039143279172821, + "grad_norm": 0.3057333528995514, + "learning_rate": 0.0001798497351890627, + "loss": 0.3788, + "step": 823 + }, + { + "epoch": 0.30428360413589367, + "grad_norm": 0.30990734696388245, + "learning_rate": 0.0001798251016134992, + "loss": 0.35, + "step": 824 + }, + { + "epoch": 0.30465288035450516, + "grad_norm": 0.2965867221355438, + "learning_rate": 0.0001798004680379357, + "loss": 0.3635, + "step": 825 + }, + { + "epoch": 0.3050221565731167, + "grad_norm": 0.28720253705978394, + "learning_rate": 0.00017977583446237224, + "loss": 0.3862, + "step": 826 + }, + { + "epoch": 0.3053914327917282, + "grad_norm": 0.3264056146144867, + "learning_rate": 0.00017975120088680873, + "loss": 0.3566, + "step": 827 + }, + { + "epoch": 0.30576070901033975, + "grad_norm": 0.27594080567359924, + "learning_rate": 0.00017972656731124524, + "loss": 0.3364, + "step": 828 + }, + { + "epoch": 0.30612998522895124, + "grad_norm": 0.2566494941711426, + "learning_rate": 0.00017970193373568173, + "loss": 0.3112, + "step": 829 + }, + { + "epoch": 0.3064992614475628, + "grad_norm": 0.24703358113765717, + "learning_rate": 0.00017967730016011824, + "loss": 0.2979, + "step": 830 + }, + { + "epoch": 0.3068685376661743, + "grad_norm": 0.2710617184638977, + "learning_rate": 0.00017965266658455476, + "loss": 0.3252, + "step": 831 + }, + { + "epoch": 0.3072378138847858, + "grad_norm": 0.2870490550994873, + "learning_rate": 0.00017962803300899127, + "loss": 0.3347, + "step": 832 + }, + { + "epoch": 0.3076070901033973, + "grad_norm": 0.27746692299842834, + "learning_rate": 0.00017960339943342776, + "loss": 0.3283, + "step": 833 + }, + { + "epoch": 0.30797636632200887, + "grad_norm": 0.30770808458328247, + "learning_rate": 0.00017957876585786428, + "loss": 0.3707, + "step": 834 + }, + { + "epoch": 0.30834564254062036, + "grad_norm": 0.33171936869621277, + "learning_rate": 0.0001795541322823008, + "loss": 0.3122, + "step": 835 + }, + { + "epoch": 0.3087149187592319, + "grad_norm": 0.24512360990047455, + "learning_rate": 0.0001795294987067373, + "loss": 0.2951, + "step": 836 + }, + { + "epoch": 0.30908419497784345, + "grad_norm": 0.24775078892707825, + "learning_rate": 0.0001795048651311738, + "loss": 0.312, + "step": 837 + }, + { + "epoch": 0.30945347119645494, + "grad_norm": 0.261923611164093, + "learning_rate": 0.0001794802315556103, + "loss": 0.3328, + "step": 838 + }, + { + "epoch": 0.3098227474150665, + "grad_norm": 0.42608585953712463, + "learning_rate": 0.0001794555979800468, + "loss": 0.3736, + "step": 839 + }, + { + "epoch": 0.310192023633678, + "grad_norm": 0.26447972655296326, + "learning_rate": 0.00017943096440448334, + "loss": 0.337, + "step": 840 + }, + { + "epoch": 0.31056129985228953, + "grad_norm": 0.22069180011749268, + "learning_rate": 0.00017940633082891982, + "loss": 0.231, + "step": 841 + }, + { + "epoch": 0.310930576070901, + "grad_norm": 0.2970902621746063, + "learning_rate": 0.00017938169725335634, + "loss": 0.3954, + "step": 842 + }, + { + "epoch": 0.31129985228951257, + "grad_norm": 0.3341182470321655, + "learning_rate": 0.00017935706367779283, + "loss": 0.409, + "step": 843 + }, + { + "epoch": 0.31166912850812406, + "grad_norm": 0.2773878276348114, + "learning_rate": 0.00017933243010222934, + "loss": 0.3392, + "step": 844 + }, + { + "epoch": 0.3120384047267356, + "grad_norm": 0.2898205518722534, + "learning_rate": 0.00017930779652666586, + "loss": 0.3002, + "step": 845 + }, + { + "epoch": 0.3124076809453471, + "grad_norm": 0.24817512929439545, + "learning_rate": 0.00017928316295110237, + "loss": 0.2876, + "step": 846 + }, + { + "epoch": 0.31277695716395865, + "grad_norm": 0.2827116847038269, + "learning_rate": 0.00017925852937553886, + "loss": 0.3111, + "step": 847 + }, + { + "epoch": 0.31314623338257014, + "grad_norm": 0.2370031327009201, + "learning_rate": 0.00017923389579997537, + "loss": 0.2753, + "step": 848 + }, + { + "epoch": 0.3135155096011817, + "grad_norm": 0.26568716764450073, + "learning_rate": 0.0001792092622244119, + "loss": 0.3289, + "step": 849 + }, + { + "epoch": 0.31388478581979323, + "grad_norm": 0.32577386498451233, + "learning_rate": 0.0001791846286488484, + "loss": 0.3725, + "step": 850 + }, + { + "epoch": 0.31388478581979323, + "eval_loss": 7.8960041999816895, + "eval_runtime": 6.9331, + "eval_samples_per_second": 7.212, + "eval_steps_per_second": 1.01, + "step": 850 + }, + { + "epoch": 0.3142540620384047, + "grad_norm": 0.22731703519821167, + "learning_rate": 0.0001791599950732849, + "loss": 0.244, + "step": 851 + }, + { + "epoch": 0.31462333825701627, + "grad_norm": 0.24874229729175568, + "learning_rate": 0.0001791353614977214, + "loss": 0.327, + "step": 852 + }, + { + "epoch": 0.31499261447562776, + "grad_norm": 0.3693811595439911, + "learning_rate": 0.00017911072792215792, + "loss": 0.3216, + "step": 853 + }, + { + "epoch": 0.3153618906942393, + "grad_norm": 0.35060736536979675, + "learning_rate": 0.00017908609434659443, + "loss": 0.4056, + "step": 854 + }, + { + "epoch": 0.3157311669128508, + "grad_norm": 0.2921643853187561, + "learning_rate": 0.00017906146077103092, + "loss": 0.4148, + "step": 855 + }, + { + "epoch": 0.31610044313146235, + "grad_norm": 0.24892932176589966, + "learning_rate": 0.00017903682719546744, + "loss": 0.308, + "step": 856 + }, + { + "epoch": 0.31646971935007384, + "grad_norm": 0.3194721043109894, + "learning_rate": 0.00017901219361990392, + "loss": 0.2883, + "step": 857 + }, + { + "epoch": 0.3168389955686854, + "grad_norm": 0.33443185687065125, + "learning_rate": 0.00017898756004434046, + "loss": 0.3122, + "step": 858 + }, + { + "epoch": 0.3172082717872969, + "grad_norm": 0.31378236413002014, + "learning_rate": 0.00017896292646877695, + "loss": 0.3851, + "step": 859 + }, + { + "epoch": 0.3175775480059084, + "grad_norm": 0.2370665818452835, + "learning_rate": 0.00017893829289321347, + "loss": 0.308, + "step": 860 + }, + { + "epoch": 0.3179468242245199, + "grad_norm": 0.27492281794548035, + "learning_rate": 0.00017891365931764995, + "loss": 0.2933, + "step": 861 + }, + { + "epoch": 0.31831610044313147, + "grad_norm": 0.2368503212928772, + "learning_rate": 0.00017888902574208647, + "loss": 0.2963, + "step": 862 + }, + { + "epoch": 0.31868537666174296, + "grad_norm": 0.2467430680990219, + "learning_rate": 0.00017886439216652298, + "loss": 0.2851, + "step": 863 + }, + { + "epoch": 0.3190546528803545, + "grad_norm": 0.25410428643226624, + "learning_rate": 0.0001788397585909595, + "loss": 0.2738, + "step": 864 + }, + { + "epoch": 0.31942392909896605, + "grad_norm": 0.2257508486509323, + "learning_rate": 0.00017881512501539599, + "loss": 0.2676, + "step": 865 + }, + { + "epoch": 0.31979320531757754, + "grad_norm": 0.29200345277786255, + "learning_rate": 0.0001787904914398325, + "loss": 0.3231, + "step": 866 + }, + { + "epoch": 0.3201624815361891, + "grad_norm": 0.2947424352169037, + "learning_rate": 0.00017876585786426902, + "loss": 0.3725, + "step": 867 + }, + { + "epoch": 0.3205317577548006, + "grad_norm": 0.31930410861968994, + "learning_rate": 0.00017874122428870553, + "loss": 0.4208, + "step": 868 + }, + { + "epoch": 0.32090103397341213, + "grad_norm": 0.2770059108734131, + "learning_rate": 0.00017871659071314202, + "loss": 0.2733, + "step": 869 + }, + { + "epoch": 0.3212703101920236, + "grad_norm": 0.32301560044288635, + "learning_rate": 0.00017869195713757853, + "loss": 0.2967, + "step": 870 + }, + { + "epoch": 0.32163958641063517, + "grad_norm": 0.21610920131206512, + "learning_rate": 0.00017866732356201502, + "loss": 0.2306, + "step": 871 + }, + { + "epoch": 0.32200886262924666, + "grad_norm": 0.24497228860855103, + "learning_rate": 0.00017864268998645156, + "loss": 0.3268, + "step": 872 + }, + { + "epoch": 0.3223781388478582, + "grad_norm": 0.2331826090812683, + "learning_rate": 0.00017861805641088805, + "loss": 0.3084, + "step": 873 + }, + { + "epoch": 0.3227474150664697, + "grad_norm": 0.3439067006111145, + "learning_rate": 0.00017859342283532456, + "loss": 0.4648, + "step": 874 + }, + { + "epoch": 0.32311669128508125, + "grad_norm": 0.2757513225078583, + "learning_rate": 0.00017856878925976105, + "loss": 0.3371, + "step": 875 + }, + { + "epoch": 0.32348596750369274, + "grad_norm": 0.29035484790802, + "learning_rate": 0.00017854415568419757, + "loss": 0.322, + "step": 876 + }, + { + "epoch": 0.3238552437223043, + "grad_norm": 0.26096072793006897, + "learning_rate": 0.00017851952210863408, + "loss": 0.3239, + "step": 877 + }, + { + "epoch": 0.3242245199409158, + "grad_norm": 0.28644588589668274, + "learning_rate": 0.0001784948885330706, + "loss": 0.3722, + "step": 878 + }, + { + "epoch": 0.3245937961595273, + "grad_norm": 0.3428855538368225, + "learning_rate": 0.00017847025495750708, + "loss": 0.4062, + "step": 879 + }, + { + "epoch": 0.3249630723781389, + "grad_norm": 0.247748002409935, + "learning_rate": 0.0001784456213819436, + "loss": 0.3119, + "step": 880 + }, + { + "epoch": 0.32533234859675036, + "grad_norm": 0.25992152094841003, + "learning_rate": 0.0001784209878063801, + "loss": 0.3331, + "step": 881 + }, + { + "epoch": 0.3257016248153619, + "grad_norm": 0.301998496055603, + "learning_rate": 0.00017839635423081663, + "loss": 0.3968, + "step": 882 + }, + { + "epoch": 0.3260709010339734, + "grad_norm": 0.25745338201522827, + "learning_rate": 0.00017837172065525311, + "loss": 0.3173, + "step": 883 + }, + { + "epoch": 0.32644017725258495, + "grad_norm": 0.26824715733528137, + "learning_rate": 0.00017834708707968963, + "loss": 0.3904, + "step": 884 + }, + { + "epoch": 0.32680945347119644, + "grad_norm": 0.3045341670513153, + "learning_rate": 0.00017832245350412614, + "loss": 0.3901, + "step": 885 + }, + { + "epoch": 0.327178729689808, + "grad_norm": 0.36401525139808655, + "learning_rate": 0.00017829781992856266, + "loss": 0.4275, + "step": 886 + }, + { + "epoch": 0.3275480059084195, + "grad_norm": 0.2504934072494507, + "learning_rate": 0.00017827318635299915, + "loss": 0.3246, + "step": 887 + }, + { + "epoch": 0.32791728212703103, + "grad_norm": 0.285227507352829, + "learning_rate": 0.00017824855277743566, + "loss": 0.403, + "step": 888 + }, + { + "epoch": 0.3282865583456425, + "grad_norm": 0.2748340666294098, + "learning_rate": 0.00017822391920187215, + "loss": 0.3422, + "step": 889 + }, + { + "epoch": 0.32865583456425407, + "grad_norm": 0.3067420721054077, + "learning_rate": 0.0001781992856263087, + "loss": 0.3882, + "step": 890 + }, + { + "epoch": 0.32902511078286556, + "grad_norm": 0.23144252598285675, + "learning_rate": 0.00017817465205074518, + "loss": 0.2905, + "step": 891 + }, + { + "epoch": 0.3293943870014771, + "grad_norm": 0.2322801947593689, + "learning_rate": 0.0001781500184751817, + "loss": 0.2957, + "step": 892 + }, + { + "epoch": 0.3297636632200886, + "grad_norm": 0.28140661120414734, + "learning_rate": 0.00017812538489961818, + "loss": 0.2991, + "step": 893 + }, + { + "epoch": 0.33013293943870015, + "grad_norm": 0.2845861613750458, + "learning_rate": 0.0001781007513240547, + "loss": 0.3533, + "step": 894 + }, + { + "epoch": 0.3305022156573117, + "grad_norm": 0.2729456424713135, + "learning_rate": 0.0001780761177484912, + "loss": 0.3073, + "step": 895 + }, + { + "epoch": 0.3308714918759232, + "grad_norm": 0.3154023289680481, + "learning_rate": 0.00017805148417292772, + "loss": 0.3588, + "step": 896 + }, + { + "epoch": 0.33124076809453473, + "grad_norm": 0.2474805861711502, + "learning_rate": 0.0001780268505973642, + "loss": 0.2677, + "step": 897 + }, + { + "epoch": 0.3316100443131462, + "grad_norm": 0.3318121135234833, + "learning_rate": 0.00017800221702180073, + "loss": 0.4354, + "step": 898 + }, + { + "epoch": 0.33197932053175777, + "grad_norm": 0.29484444856643677, + "learning_rate": 0.00017797758344623724, + "loss": 0.3193, + "step": 899 + }, + { + "epoch": 0.33234859675036926, + "grad_norm": 0.33257654309272766, + "learning_rate": 0.00017795294987067375, + "loss": 0.3813, + "step": 900 + }, + { + "epoch": 0.33234859675036926, + "eval_loss": 7.976132869720459, + "eval_runtime": 7.1185, + "eval_samples_per_second": 7.024, + "eval_steps_per_second": 0.983, + "step": 900 + }, + { + "epoch": 0.3327178729689808, + "grad_norm": 0.27464133501052856, + "learning_rate": 0.00017792831629511024, + "loss": 0.2678, + "step": 901 + }, + { + "epoch": 0.3330871491875923, + "grad_norm": 0.3016645014286041, + "learning_rate": 0.00017790368271954676, + "loss": 0.4005, + "step": 902 + }, + { + "epoch": 0.33345642540620385, + "grad_norm": 0.2769106328487396, + "learning_rate": 0.00017787904914398324, + "loss": 0.3507, + "step": 903 + }, + { + "epoch": 0.33382570162481534, + "grad_norm": 0.25971776247024536, + "learning_rate": 0.00017785441556841979, + "loss": 0.2636, + "step": 904 + }, + { + "epoch": 0.3341949778434269, + "grad_norm": 0.29569417238235474, + "learning_rate": 0.00017782978199285627, + "loss": 0.3145, + "step": 905 + }, + { + "epoch": 0.3345642540620384, + "grad_norm": 0.2812824249267578, + "learning_rate": 0.0001778051484172928, + "loss": 0.3325, + "step": 906 + }, + { + "epoch": 0.3349335302806499, + "grad_norm": 0.26452457904815674, + "learning_rate": 0.00017778051484172928, + "loss": 0.2804, + "step": 907 + }, + { + "epoch": 0.3353028064992615, + "grad_norm": 0.29220932722091675, + "learning_rate": 0.0001777558812661658, + "loss": 0.3536, + "step": 908 + }, + { + "epoch": 0.33567208271787297, + "grad_norm": 0.2855215072631836, + "learning_rate": 0.0001777312476906023, + "loss": 0.324, + "step": 909 + }, + { + "epoch": 0.3360413589364845, + "grad_norm": 0.2799937427043915, + "learning_rate": 0.00017770661411503882, + "loss": 0.3573, + "step": 910 + }, + { + "epoch": 0.336410635155096, + "grad_norm": 0.2758358418941498, + "learning_rate": 0.0001776819805394753, + "loss": 0.3311, + "step": 911 + }, + { + "epoch": 0.33677991137370755, + "grad_norm": 0.30094167590141296, + "learning_rate": 0.00017765734696391182, + "loss": 0.3577, + "step": 912 + }, + { + "epoch": 0.33714918759231904, + "grad_norm": 0.2670411765575409, + "learning_rate": 0.00017763271338834834, + "loss": 0.3416, + "step": 913 + }, + { + "epoch": 0.3375184638109306, + "grad_norm": 0.39020755887031555, + "learning_rate": 0.00017760807981278485, + "loss": 0.3106, + "step": 914 + }, + { + "epoch": 0.3378877400295421, + "grad_norm": 0.2880863547325134, + "learning_rate": 0.00017758344623722134, + "loss": 0.3529, + "step": 915 + }, + { + "epoch": 0.33825701624815363, + "grad_norm": 0.32445740699768066, + "learning_rate": 0.00017755881266165785, + "loss": 0.3194, + "step": 916 + }, + { + "epoch": 0.3386262924667651, + "grad_norm": 0.2782299816608429, + "learning_rate": 0.00017753417908609437, + "loss": 0.3336, + "step": 917 + }, + { + "epoch": 0.33899556868537667, + "grad_norm": 0.3180716037750244, + "learning_rate": 0.00017750954551053088, + "loss": 0.3583, + "step": 918 + }, + { + "epoch": 0.33936484490398816, + "grad_norm": 0.28461307287216187, + "learning_rate": 0.00017748491193496737, + "loss": 0.3398, + "step": 919 + }, + { + "epoch": 0.3397341211225997, + "grad_norm": 0.22245624661445618, + "learning_rate": 0.00017746027835940388, + "loss": 0.2933, + "step": 920 + }, + { + "epoch": 0.3401033973412112, + "grad_norm": 0.28383955359458923, + "learning_rate": 0.00017743564478384037, + "loss": 0.3268, + "step": 921 + }, + { + "epoch": 0.34047267355982275, + "grad_norm": 0.32046714425086975, + "learning_rate": 0.00017741101120827691, + "loss": 0.3405, + "step": 922 + }, + { + "epoch": 0.3408419497784343, + "grad_norm": 0.3406597673892975, + "learning_rate": 0.0001773863776327134, + "loss": 0.3311, + "step": 923 + }, + { + "epoch": 0.3412112259970458, + "grad_norm": 0.25953730940818787, + "learning_rate": 0.00017736174405714992, + "loss": 0.332, + "step": 924 + }, + { + "epoch": 0.34158050221565733, + "grad_norm": 0.2518807053565979, + "learning_rate": 0.0001773371104815864, + "loss": 0.2926, + "step": 925 + }, + { + "epoch": 0.3419497784342688, + "grad_norm": 0.2617349326610565, + "learning_rate": 0.00017731247690602292, + "loss": 0.2576, + "step": 926 + }, + { + "epoch": 0.3423190546528804, + "grad_norm": 0.26028379797935486, + "learning_rate": 0.00017728784333045943, + "loss": 0.3479, + "step": 927 + }, + { + "epoch": 0.34268833087149186, + "grad_norm": 0.30594491958618164, + "learning_rate": 0.00017726320975489595, + "loss": 0.3395, + "step": 928 + }, + { + "epoch": 0.3430576070901034, + "grad_norm": 0.33180734515190125, + "learning_rate": 0.00017723857617933244, + "loss": 0.355, + "step": 929 + }, + { + "epoch": 0.3434268833087149, + "grad_norm": 0.30077919363975525, + "learning_rate": 0.00017721394260376895, + "loss": 0.3689, + "step": 930 + }, + { + "epoch": 0.34379615952732645, + "grad_norm": 0.3236958384513855, + "learning_rate": 0.00017718930902820546, + "loss": 0.4404, + "step": 931 + }, + { + "epoch": 0.34416543574593794, + "grad_norm": 0.3163043260574341, + "learning_rate": 0.00017716467545264198, + "loss": 0.4347, + "step": 932 + }, + { + "epoch": 0.3445347119645495, + "grad_norm": 0.27312856912612915, + "learning_rate": 0.00017714004187707847, + "loss": 0.3328, + "step": 933 + }, + { + "epoch": 0.344903988183161, + "grad_norm": 0.28686830401420593, + "learning_rate": 0.00017711540830151495, + "loss": 0.3797, + "step": 934 + }, + { + "epoch": 0.34527326440177253, + "grad_norm": 0.29462483525276184, + "learning_rate": 0.00017709077472595147, + "loss": 0.3612, + "step": 935 + }, + { + "epoch": 0.345642540620384, + "grad_norm": 0.2514147162437439, + "learning_rate": 0.00017706614115038798, + "loss": 0.3277, + "step": 936 + }, + { + "epoch": 0.34601181683899557, + "grad_norm": 0.22290514409542084, + "learning_rate": 0.0001770415075748245, + "loss": 0.2888, + "step": 937 + }, + { + "epoch": 0.3463810930576071, + "grad_norm": 0.2678544521331787, + "learning_rate": 0.00017701687399926099, + "loss": 0.2827, + "step": 938 + }, + { + "epoch": 0.3467503692762186, + "grad_norm": 0.274467408657074, + "learning_rate": 0.0001769922404236975, + "loss": 0.3243, + "step": 939 + }, + { + "epoch": 0.34711964549483015, + "grad_norm": 0.2691059410572052, + "learning_rate": 0.00017696760684813401, + "loss": 0.3037, + "step": 940 + }, + { + "epoch": 0.34748892171344165, + "grad_norm": 0.2699129581451416, + "learning_rate": 0.00017694297327257053, + "loss": 0.2762, + "step": 941 + }, + { + "epoch": 0.3478581979320532, + "grad_norm": 0.26856863498687744, + "learning_rate": 0.00017691833969700702, + "loss": 0.2755, + "step": 942 + }, + { + "epoch": 0.3482274741506647, + "grad_norm": 0.26118382811546326, + "learning_rate": 0.00017689370612144353, + "loss": 0.2966, + "step": 943 + }, + { + "epoch": 0.34859675036927623, + "grad_norm": 0.24606135487556458, + "learning_rate": 0.00017686907254588002, + "loss": 0.3125, + "step": 944 + }, + { + "epoch": 0.3489660265878877, + "grad_norm": 0.2303576022386551, + "learning_rate": 0.00017684443897031656, + "loss": 0.2952, + "step": 945 + }, + { + "epoch": 0.34933530280649927, + "grad_norm": 0.2659030556678772, + "learning_rate": 0.00017681980539475305, + "loss": 0.3666, + "step": 946 + }, + { + "epoch": 0.34970457902511076, + "grad_norm": 0.26473742723464966, + "learning_rate": 0.00017679517181918956, + "loss": 0.2862, + "step": 947 + }, + { + "epoch": 0.3500738552437223, + "grad_norm": 0.2867063283920288, + "learning_rate": 0.00017677053824362605, + "loss": 0.3609, + "step": 948 + }, + { + "epoch": 0.3504431314623338, + "grad_norm": 0.28919199109077454, + "learning_rate": 0.00017674590466806257, + "loss": 0.3697, + "step": 949 + }, + { + "epoch": 0.35081240768094535, + "grad_norm": 0.2240419089794159, + "learning_rate": 0.00017672127109249908, + "loss": 0.2322, + "step": 950 + }, + { + "epoch": 0.35081240768094535, + "eval_loss": 7.87774658203125, + "eval_runtime": 6.9235, + "eval_samples_per_second": 7.222, + "eval_steps_per_second": 1.011, + "step": 950 + }, + { + "epoch": 0.3511816838995569, + "grad_norm": 0.22913576662540436, + "learning_rate": 0.0001766966375169356, + "loss": 0.2739, + "step": 951 + }, + { + "epoch": 0.3515509601181684, + "grad_norm": 0.23535776138305664, + "learning_rate": 0.00017667200394137208, + "loss": 0.3111, + "step": 952 + }, + { + "epoch": 0.35192023633677993, + "grad_norm": 0.295166015625, + "learning_rate": 0.0001766473703658086, + "loss": 0.3545, + "step": 953 + }, + { + "epoch": 0.3522895125553914, + "grad_norm": 0.28357383608818054, + "learning_rate": 0.0001766227367902451, + "loss": 0.2866, + "step": 954 + }, + { + "epoch": 0.352658788774003, + "grad_norm": 0.2677425742149353, + "learning_rate": 0.00017659810321468163, + "loss": 0.26, + "step": 955 + }, + { + "epoch": 0.35302806499261447, + "grad_norm": 0.32977399230003357, + "learning_rate": 0.0001765734696391181, + "loss": 0.3042, + "step": 956 + }, + { + "epoch": 0.353397341211226, + "grad_norm": 0.37018856406211853, + "learning_rate": 0.00017654883606355463, + "loss": 0.3685, + "step": 957 + }, + { + "epoch": 0.3537666174298375, + "grad_norm": 0.258654922246933, + "learning_rate": 0.00017652420248799114, + "loss": 0.3164, + "step": 958 + }, + { + "epoch": 0.35413589364844905, + "grad_norm": 0.2643733620643616, + "learning_rate": 0.00017649956891242766, + "loss": 0.3167, + "step": 959 + }, + { + "epoch": 0.35450516986706054, + "grad_norm": 0.24846959114074707, + "learning_rate": 0.00017647493533686414, + "loss": 0.3181, + "step": 960 + }, + { + "epoch": 0.3548744460856721, + "grad_norm": 0.33625632524490356, + "learning_rate": 0.00017645030176130066, + "loss": 0.3182, + "step": 961 + }, + { + "epoch": 0.3552437223042836, + "grad_norm": 0.20629143714904785, + "learning_rate": 0.00017642566818573715, + "loss": 0.2426, + "step": 962 + }, + { + "epoch": 0.35561299852289513, + "grad_norm": 0.23089516162872314, + "learning_rate": 0.0001764010346101737, + "loss": 0.2483, + "step": 963 + }, + { + "epoch": 0.3559822747415066, + "grad_norm": 0.3318641185760498, + "learning_rate": 0.00017637640103461018, + "loss": 0.3954, + "step": 964 + }, + { + "epoch": 0.35635155096011817, + "grad_norm": 0.29646244645118713, + "learning_rate": 0.0001763517674590467, + "loss": 0.3125, + "step": 965 + }, + { + "epoch": 0.3567208271787297, + "grad_norm": 0.26911357045173645, + "learning_rate": 0.00017632713388348318, + "loss": 0.3163, + "step": 966 + }, + { + "epoch": 0.3570901033973412, + "grad_norm": 0.23117204010486603, + "learning_rate": 0.0001763025003079197, + "loss": 0.2709, + "step": 967 + }, + { + "epoch": 0.35745937961595275, + "grad_norm": 0.30560117959976196, + "learning_rate": 0.0001762778667323562, + "loss": 0.3618, + "step": 968 + }, + { + "epoch": 0.35782865583456425, + "grad_norm": 0.2935681641101837, + "learning_rate": 0.00017625323315679272, + "loss": 0.2761, + "step": 969 + }, + { + "epoch": 0.3581979320531758, + "grad_norm": 0.2757806181907654, + "learning_rate": 0.0001762285995812292, + "loss": 0.321, + "step": 970 + }, + { + "epoch": 0.3585672082717873, + "grad_norm": 0.2714608907699585, + "learning_rate": 0.00017620396600566572, + "loss": 0.3255, + "step": 971 + }, + { + "epoch": 0.35893648449039883, + "grad_norm": 0.26465290784835815, + "learning_rate": 0.00017617933243010224, + "loss": 0.3035, + "step": 972 + }, + { + "epoch": 0.3593057607090103, + "grad_norm": 0.30062395334243774, + "learning_rate": 0.00017615469885453875, + "loss": 0.3602, + "step": 973 + }, + { + "epoch": 0.35967503692762187, + "grad_norm": 0.2546748220920563, + "learning_rate": 0.00017613006527897524, + "loss": 0.294, + "step": 974 + }, + { + "epoch": 0.36004431314623336, + "grad_norm": 0.3131730854511261, + "learning_rate": 0.00017610543170341176, + "loss": 0.3925, + "step": 975 + }, + { + "epoch": 0.3604135893648449, + "grad_norm": 0.2644570767879486, + "learning_rate": 0.00017608079812784824, + "loss": 0.3392, + "step": 976 + }, + { + "epoch": 0.3607828655834564, + "grad_norm": 0.4097766876220703, + "learning_rate": 0.00017605616455228479, + "loss": 0.3498, + "step": 977 + }, + { + "epoch": 0.36115214180206795, + "grad_norm": 0.3088406026363373, + "learning_rate": 0.00017603153097672127, + "loss": 0.3661, + "step": 978 + }, + { + "epoch": 0.36152141802067944, + "grad_norm": 0.26827898621559143, + "learning_rate": 0.0001760068974011578, + "loss": 0.299, + "step": 979 + }, + { + "epoch": 0.361890694239291, + "grad_norm": 0.27683645486831665, + "learning_rate": 0.00017598226382559428, + "loss": 0.3588, + "step": 980 + }, + { + "epoch": 0.36225997045790254, + "grad_norm": 0.2762688100337982, + "learning_rate": 0.0001759576302500308, + "loss": 0.3253, + "step": 981 + }, + { + "epoch": 0.362629246676514, + "grad_norm": 0.2833753526210785, + "learning_rate": 0.0001759329966744673, + "loss": 0.3304, + "step": 982 + }, + { + "epoch": 0.3629985228951256, + "grad_norm": 0.27478983998298645, + "learning_rate": 0.00017590836309890382, + "loss": 0.3439, + "step": 983 + }, + { + "epoch": 0.36336779911373707, + "grad_norm": 0.24711984395980835, + "learning_rate": 0.0001758837295233403, + "loss": 0.3076, + "step": 984 + }, + { + "epoch": 0.3637370753323486, + "grad_norm": 0.30375272035598755, + "learning_rate": 0.00017585909594777682, + "loss": 0.4099, + "step": 985 + }, + { + "epoch": 0.3641063515509601, + "grad_norm": 0.2154420167207718, + "learning_rate": 0.00017583446237221334, + "loss": 0.2473, + "step": 986 + }, + { + "epoch": 0.36447562776957165, + "grad_norm": 0.2740940451622009, + "learning_rate": 0.00017580982879664985, + "loss": 0.3073, + "step": 987 + }, + { + "epoch": 0.36484490398818314, + "grad_norm": 0.24222302436828613, + "learning_rate": 0.00017578519522108634, + "loss": 0.3279, + "step": 988 + }, + { + "epoch": 0.3652141802067947, + "grad_norm": 0.2698388397693634, + "learning_rate": 0.00017576056164552285, + "loss": 0.3287, + "step": 989 + }, + { + "epoch": 0.3655834564254062, + "grad_norm": 0.25645673274993896, + "learning_rate": 0.00017573592806995937, + "loss": 0.3127, + "step": 990 + }, + { + "epoch": 0.36595273264401773, + "grad_norm": 0.24756230413913727, + "learning_rate": 0.00017571129449439588, + "loss": 0.3367, + "step": 991 + }, + { + "epoch": 0.3663220088626292, + "grad_norm": 0.2556271553039551, + "learning_rate": 0.00017568666091883237, + "loss": 0.3353, + "step": 992 + }, + { + "epoch": 0.36669128508124077, + "grad_norm": 0.2676128149032593, + "learning_rate": 0.00017566202734326888, + "loss": 0.3359, + "step": 993 + }, + { + "epoch": 0.3670605612998523, + "grad_norm": 0.28313693404197693, + "learning_rate": 0.00017563739376770537, + "loss": 0.2929, + "step": 994 + }, + { + "epoch": 0.3674298375184638, + "grad_norm": 0.3573448657989502, + "learning_rate": 0.0001756127601921419, + "loss": 0.3631, + "step": 995 + }, + { + "epoch": 0.36779911373707536, + "grad_norm": 0.284042626619339, + "learning_rate": 0.0001755881266165784, + "loss": 0.3091, + "step": 996 + }, + { + "epoch": 0.36816838995568685, + "grad_norm": 0.23255212604999542, + "learning_rate": 0.00017556349304101492, + "loss": 0.2946, + "step": 997 + }, + { + "epoch": 0.3685376661742984, + "grad_norm": 0.26934102177619934, + "learning_rate": 0.0001755388594654514, + "loss": 0.3628, + "step": 998 + }, + { + "epoch": 0.3689069423929099, + "grad_norm": 0.2808159291744232, + "learning_rate": 0.00017551422588988792, + "loss": 0.3469, + "step": 999 + }, + { + "epoch": 0.36927621861152143, + "grad_norm": 0.3583812117576599, + "learning_rate": 0.00017548959231432443, + "loss": 0.4157, + "step": 1000 + }, + { + "epoch": 0.36927621861152143, + "eval_loss": 7.464180946350098, + "eval_runtime": 6.9168, + "eval_samples_per_second": 7.229, + "eval_steps_per_second": 1.012, + "step": 1000 + }, + { + "epoch": 0.3696454948301329, + "grad_norm": 0.7096746563911438, + "learning_rate": 0.00017546495873876095, + "loss": 0.3453, + "step": 1001 + }, + { + "epoch": 0.3700147710487445, + "grad_norm": 0.31353241205215454, + "learning_rate": 0.00017544032516319743, + "loss": 0.3416, + "step": 1002 + }, + { + "epoch": 0.37038404726735596, + "grad_norm": 0.2600584030151367, + "learning_rate": 0.00017541569158763395, + "loss": 0.3038, + "step": 1003 + }, + { + "epoch": 0.3707533234859675, + "grad_norm": 0.26631131768226624, + "learning_rate": 0.00017539105801207046, + "loss": 0.3615, + "step": 1004 + }, + { + "epoch": 0.371122599704579, + "grad_norm": 0.22836722433567047, + "learning_rate": 0.00017536642443650698, + "loss": 0.2716, + "step": 1005 + }, + { + "epoch": 0.37149187592319055, + "grad_norm": 0.25219273567199707, + "learning_rate": 0.00017534179086094347, + "loss": 0.2751, + "step": 1006 + }, + { + "epoch": 0.37186115214180204, + "grad_norm": 0.2883814573287964, + "learning_rate": 0.00017531715728537998, + "loss": 0.3988, + "step": 1007 + }, + { + "epoch": 0.3722304283604136, + "grad_norm": 0.2607775330543518, + "learning_rate": 0.00017529252370981647, + "loss": 0.3016, + "step": 1008 + }, + { + "epoch": 0.37259970457902514, + "grad_norm": 0.40350931882858276, + "learning_rate": 0.000175267890134253, + "loss": 0.372, + "step": 1009 + }, + { + "epoch": 0.37296898079763663, + "grad_norm": 0.23411189019680023, + "learning_rate": 0.0001752432565586895, + "loss": 0.257, + "step": 1010 + }, + { + "epoch": 0.3733382570162482, + "grad_norm": 0.3108779191970825, + "learning_rate": 0.000175218622983126, + "loss": 0.3267, + "step": 1011 + }, + { + "epoch": 0.37370753323485967, + "grad_norm": 0.279238760471344, + "learning_rate": 0.0001751939894075625, + "loss": 0.3266, + "step": 1012 + }, + { + "epoch": 0.3740768094534712, + "grad_norm": 0.32407793402671814, + "learning_rate": 0.00017516935583199901, + "loss": 0.3454, + "step": 1013 + }, + { + "epoch": 0.3744460856720827, + "grad_norm": 0.3369502127170563, + "learning_rate": 0.00017514472225643553, + "loss": 0.361, + "step": 1014 + }, + { + "epoch": 0.37481536189069425, + "grad_norm": 0.27311286330223083, + "learning_rate": 0.00017512008868087204, + "loss": 0.3495, + "step": 1015 + }, + { + "epoch": 0.37518463810930575, + "grad_norm": 0.28632447123527527, + "learning_rate": 0.00017509545510530853, + "loss": 0.3453, + "step": 1016 + }, + { + "epoch": 0.3755539143279173, + "grad_norm": 0.3392500579357147, + "learning_rate": 0.00017507082152974505, + "loss": 0.3044, + "step": 1017 + }, + { + "epoch": 0.3759231905465288, + "grad_norm": 0.30238479375839233, + "learning_rate": 0.00017504618795418156, + "loss": 0.384, + "step": 1018 + }, + { + "epoch": 0.37629246676514033, + "grad_norm": 0.24523282051086426, + "learning_rate": 0.00017502155437861808, + "loss": 0.2907, + "step": 1019 + }, + { + "epoch": 0.3766617429837518, + "grad_norm": 0.23749764263629913, + "learning_rate": 0.00017499692080305456, + "loss": 0.2591, + "step": 1020 + }, + { + "epoch": 0.37703101920236337, + "grad_norm": 0.2930164039134979, + "learning_rate": 0.00017497228722749108, + "loss": 0.4052, + "step": 1021 + }, + { + "epoch": 0.37740029542097486, + "grad_norm": 0.23216955363750458, + "learning_rate": 0.0001749476536519276, + "loss": 0.2865, + "step": 1022 + }, + { + "epoch": 0.3777695716395864, + "grad_norm": 0.4129747152328491, + "learning_rate": 0.0001749230200763641, + "loss": 0.3857, + "step": 1023 + }, + { + "epoch": 0.37813884785819796, + "grad_norm": 0.2881711423397064, + "learning_rate": 0.0001748983865008006, + "loss": 0.3683, + "step": 1024 + }, + { + "epoch": 0.37850812407680945, + "grad_norm": 0.21952371299266815, + "learning_rate": 0.0001748737529252371, + "loss": 0.2133, + "step": 1025 + }, + { + "epoch": 0.378877400295421, + "grad_norm": 0.49470216035842896, + "learning_rate": 0.0001748491193496736, + "loss": 0.3781, + "step": 1026 + }, + { + "epoch": 0.3792466765140325, + "grad_norm": 0.25917941331863403, + "learning_rate": 0.00017482448577411014, + "loss": 0.3027, + "step": 1027 + }, + { + "epoch": 0.37961595273264404, + "grad_norm": 0.27126753330230713, + "learning_rate": 0.00017479985219854663, + "loss": 0.265, + "step": 1028 + }, + { + "epoch": 0.3799852289512555, + "grad_norm": 0.24823777377605438, + "learning_rate": 0.00017477521862298314, + "loss": 0.3188, + "step": 1029 + }, + { + "epoch": 0.3803545051698671, + "grad_norm": 0.26010334491729736, + "learning_rate": 0.00017475058504741963, + "loss": 0.3203, + "step": 1030 + }, + { + "epoch": 0.38072378138847857, + "grad_norm": 0.3931683599948883, + "learning_rate": 0.00017472595147185614, + "loss": 0.2839, + "step": 1031 + }, + { + "epoch": 0.3810930576070901, + "grad_norm": 0.2431991845369339, + "learning_rate": 0.00017470131789629266, + "loss": 0.277, + "step": 1032 + }, + { + "epoch": 0.3814623338257016, + "grad_norm": 0.2657429575920105, + "learning_rate": 0.00017467668432072917, + "loss": 0.2949, + "step": 1033 + }, + { + "epoch": 0.38183161004431315, + "grad_norm": 0.22521480917930603, + "learning_rate": 0.00017465205074516566, + "loss": 0.2693, + "step": 1034 + }, + { + "epoch": 0.38220088626292464, + "grad_norm": 0.2713657319545746, + "learning_rate": 0.00017462741716960217, + "loss": 0.3346, + "step": 1035 + }, + { + "epoch": 0.3825701624815362, + "grad_norm": 0.2508034408092499, + "learning_rate": 0.0001746027835940387, + "loss": 0.2649, + "step": 1036 + }, + { + "epoch": 0.3829394387001477, + "grad_norm": 0.2952634394168854, + "learning_rate": 0.0001745781500184752, + "loss": 0.29, + "step": 1037 + }, + { + "epoch": 0.38330871491875923, + "grad_norm": 0.3666415512561798, + "learning_rate": 0.0001745535164429117, + "loss": 0.3123, + "step": 1038 + }, + { + "epoch": 0.3836779911373708, + "grad_norm": 0.292267769575119, + "learning_rate": 0.0001745288828673482, + "loss": 0.351, + "step": 1039 + }, + { + "epoch": 0.38404726735598227, + "grad_norm": 0.2579355835914612, + "learning_rate": 0.0001745042492917847, + "loss": 0.33, + "step": 1040 + }, + { + "epoch": 0.3844165435745938, + "grad_norm": 0.2539235055446625, + "learning_rate": 0.00017447961571622123, + "loss": 0.3655, + "step": 1041 + }, + { + "epoch": 0.3847858197932053, + "grad_norm": 0.2959385812282562, + "learning_rate": 0.00017445498214065772, + "loss": 0.3529, + "step": 1042 + }, + { + "epoch": 0.38515509601181686, + "grad_norm": 0.26736995577812195, + "learning_rate": 0.00017443034856509424, + "loss": 0.284, + "step": 1043 + }, + { + "epoch": 0.38552437223042835, + "grad_norm": 0.33904993534088135, + "learning_rate": 0.00017440571498953072, + "loss": 0.3737, + "step": 1044 + }, + { + "epoch": 0.3858936484490399, + "grad_norm": 0.2756654620170593, + "learning_rate": 0.00017438108141396724, + "loss": 0.289, + "step": 1045 + }, + { + "epoch": 0.3862629246676514, + "grad_norm": 0.3539923131465912, + "learning_rate": 0.00017435644783840375, + "loss": 0.3493, + "step": 1046 + }, + { + "epoch": 0.38663220088626293, + "grad_norm": 0.3212818503379822, + "learning_rate": 0.00017433181426284027, + "loss": 0.4307, + "step": 1047 + }, + { + "epoch": 0.3870014771048744, + "grad_norm": 0.3098317086696625, + "learning_rate": 0.00017430718068727676, + "loss": 0.3549, + "step": 1048 + }, + { + "epoch": 0.387370753323486, + "grad_norm": 0.27969104051589966, + "learning_rate": 0.00017428254711171327, + "loss": 0.3617, + "step": 1049 + }, + { + "epoch": 0.38774002954209746, + "grad_norm": 0.256176620721817, + "learning_rate": 0.00017425791353614978, + "loss": 0.3358, + "step": 1050 + }, + { + "epoch": 0.38774002954209746, + "eval_loss": 7.4618754386901855, + "eval_runtime": 6.9904, + "eval_samples_per_second": 7.153, + "eval_steps_per_second": 1.001, + "step": 1050 + }, + { + "epoch": 0.388109305760709, + "grad_norm": 0.29004257917404175, + "learning_rate": 0.0001742332799605863, + "loss": 0.326, + "step": 1051 + }, + { + "epoch": 0.38847858197932056, + "grad_norm": 0.3104065954685211, + "learning_rate": 0.0001742086463850228, + "loss": 0.3108, + "step": 1052 + }, + { + "epoch": 0.38884785819793205, + "grad_norm": 0.2983158528804779, + "learning_rate": 0.0001741840128094593, + "loss": 0.3464, + "step": 1053 + }, + { + "epoch": 0.3892171344165436, + "grad_norm": 0.2994527518749237, + "learning_rate": 0.00017415937923389582, + "loss": 0.3258, + "step": 1054 + }, + { + "epoch": 0.3895864106351551, + "grad_norm": 0.23926031589508057, + "learning_rate": 0.00017413474565833233, + "loss": 0.2604, + "step": 1055 + }, + { + "epoch": 0.38995568685376664, + "grad_norm": 0.23964358866214752, + "learning_rate": 0.00017411011208276882, + "loss": 0.2867, + "step": 1056 + }, + { + "epoch": 0.39032496307237813, + "grad_norm": 0.24859659373760223, + "learning_rate": 0.00017408547850720533, + "loss": 0.2971, + "step": 1057 + }, + { + "epoch": 0.3906942392909897, + "grad_norm": 0.2759004831314087, + "learning_rate": 0.00017406084493164182, + "loss": 0.3298, + "step": 1058 + }, + { + "epoch": 0.39106351550960117, + "grad_norm": 0.24251145124435425, + "learning_rate": 0.00017403621135607836, + "loss": 0.3124, + "step": 1059 + }, + { + "epoch": 0.3914327917282127, + "grad_norm": 0.3219092786312103, + "learning_rate": 0.00017401157778051485, + "loss": 0.3647, + "step": 1060 + }, + { + "epoch": 0.3918020679468242, + "grad_norm": 0.43054139614105225, + "learning_rate": 0.00017398694420495136, + "loss": 0.3504, + "step": 1061 + }, + { + "epoch": 0.39217134416543575, + "grad_norm": 0.316385418176651, + "learning_rate": 0.00017396231062938785, + "loss": 0.297, + "step": 1062 + }, + { + "epoch": 0.39254062038404725, + "grad_norm": 0.38464751839637756, + "learning_rate": 0.00017393767705382437, + "loss": 0.3474, + "step": 1063 + }, + { + "epoch": 0.3929098966026588, + "grad_norm": 0.24074231088161469, + "learning_rate": 0.00017391304347826088, + "loss": 0.3232, + "step": 1064 + }, + { + "epoch": 0.3932791728212703, + "grad_norm": 0.2971631586551666, + "learning_rate": 0.0001738884099026974, + "loss": 0.342, + "step": 1065 + }, + { + "epoch": 0.39364844903988183, + "grad_norm": 0.27659279108047485, + "learning_rate": 0.00017386377632713388, + "loss": 0.2993, + "step": 1066 + }, + { + "epoch": 0.3940177252584934, + "grad_norm": 0.2785486876964569, + "learning_rate": 0.0001738391427515704, + "loss": 0.3568, + "step": 1067 + }, + { + "epoch": 0.39438700147710487, + "grad_norm": 0.2273043692111969, + "learning_rate": 0.0001738145091760069, + "loss": 0.255, + "step": 1068 + }, + { + "epoch": 0.3947562776957164, + "grad_norm": 0.2827133238315582, + "learning_rate": 0.00017378987560044343, + "loss": 0.3308, + "step": 1069 + }, + { + "epoch": 0.3951255539143279, + "grad_norm": 0.2558094263076782, + "learning_rate": 0.00017376524202487992, + "loss": 0.3266, + "step": 1070 + }, + { + "epoch": 0.39549483013293946, + "grad_norm": 0.29753121733665466, + "learning_rate": 0.00017374060844931643, + "loss": 0.343, + "step": 1071 + }, + { + "epoch": 0.39586410635155095, + "grad_norm": 0.3100808262825012, + "learning_rate": 0.00017371597487375292, + "loss": 0.4055, + "step": 1072 + }, + { + "epoch": 0.3962333825701625, + "grad_norm": 0.2667597532272339, + "learning_rate": 0.00017369134129818946, + "loss": 0.3037, + "step": 1073 + }, + { + "epoch": 0.396602658788774, + "grad_norm": 0.3185896873474121, + "learning_rate": 0.00017366670772262595, + "loss": 0.2997, + "step": 1074 + }, + { + "epoch": 0.39697193500738553, + "grad_norm": 0.23661619424819946, + "learning_rate": 0.00017364207414706246, + "loss": 0.3162, + "step": 1075 + }, + { + "epoch": 0.397341211225997, + "grad_norm": 0.2815224230289459, + "learning_rate": 0.00017361744057149895, + "loss": 0.328, + "step": 1076 + }, + { + "epoch": 0.3977104874446086, + "grad_norm": 0.2627623379230499, + "learning_rate": 0.00017359280699593546, + "loss": 0.3111, + "step": 1077 + }, + { + "epoch": 0.39807976366322007, + "grad_norm": 0.24790215492248535, + "learning_rate": 0.00017356817342037198, + "loss": 0.3156, + "step": 1078 + }, + { + "epoch": 0.3984490398818316, + "grad_norm": 0.2516396641731262, + "learning_rate": 0.0001735435398448085, + "loss": 0.3509, + "step": 1079 + }, + { + "epoch": 0.3988183161004431, + "grad_norm": 0.332731157541275, + "learning_rate": 0.00017351890626924498, + "loss": 0.4127, + "step": 1080 + }, + { + "epoch": 0.39918759231905465, + "grad_norm": 0.2754536271095276, + "learning_rate": 0.0001734942726936815, + "loss": 0.3578, + "step": 1081 + }, + { + "epoch": 0.3995568685376662, + "grad_norm": 0.22805437445640564, + "learning_rate": 0.000173469639118118, + "loss": 0.346, + "step": 1082 + }, + { + "epoch": 0.3999261447562777, + "grad_norm": 0.27076783776283264, + "learning_rate": 0.00017344500554255452, + "loss": 0.2866, + "step": 1083 + }, + { + "epoch": 0.40029542097488924, + "grad_norm": 0.21836122870445251, + "learning_rate": 0.000173420371966991, + "loss": 0.2891, + "step": 1084 + }, + { + "epoch": 0.40066469719350073, + "grad_norm": 0.2988778054714203, + "learning_rate": 0.00017339573839142753, + "loss": 0.269, + "step": 1085 + }, + { + "epoch": 0.4010339734121123, + "grad_norm": 0.246245875954628, + "learning_rate": 0.00017337110481586401, + "loss": 0.2964, + "step": 1086 + }, + { + "epoch": 0.40140324963072377, + "grad_norm": 0.23227769136428833, + "learning_rate": 0.00017334647124030056, + "loss": 0.3229, + "step": 1087 + }, + { + "epoch": 0.4017725258493353, + "grad_norm": 0.36826738715171814, + "learning_rate": 0.00017332183766473704, + "loss": 0.3475, + "step": 1088 + }, + { + "epoch": 0.4021418020679468, + "grad_norm": 0.3728565573692322, + "learning_rate": 0.00017329720408917356, + "loss": 0.2872, + "step": 1089 + }, + { + "epoch": 0.40251107828655835, + "grad_norm": 0.29593944549560547, + "learning_rate": 0.00017327257051361005, + "loss": 0.3795, + "step": 1090 + }, + { + "epoch": 0.40288035450516985, + "grad_norm": 0.2308139204978943, + "learning_rate": 0.00017324793693804656, + "loss": 0.2661, + "step": 1091 + }, + { + "epoch": 0.4032496307237814, + "grad_norm": 0.28743091225624084, + "learning_rate": 0.00017322330336248307, + "loss": 0.2545, + "step": 1092 + }, + { + "epoch": 0.4036189069423929, + "grad_norm": 0.2686830759048462, + "learning_rate": 0.0001731986697869196, + "loss": 0.3562, + "step": 1093 + }, + { + "epoch": 0.40398818316100443, + "grad_norm": 0.2505398690700531, + "learning_rate": 0.00017317403621135608, + "loss": 0.3467, + "step": 1094 + }, + { + "epoch": 0.404357459379616, + "grad_norm": 0.24357803165912628, + "learning_rate": 0.0001731494026357926, + "loss": 0.2837, + "step": 1095 + }, + { + "epoch": 0.40472673559822747, + "grad_norm": 0.24741888046264648, + "learning_rate": 0.0001731247690602291, + "loss": 0.3394, + "step": 1096 + }, + { + "epoch": 0.405096011816839, + "grad_norm": 0.24446335434913635, + "learning_rate": 0.00017310013548466562, + "loss": 0.2673, + "step": 1097 + }, + { + "epoch": 0.4054652880354505, + "grad_norm": 0.25031301379203796, + "learning_rate": 0.0001730755019091021, + "loss": 0.3087, + "step": 1098 + }, + { + "epoch": 0.40583456425406206, + "grad_norm": 0.2642655074596405, + "learning_rate": 0.00017305086833353862, + "loss": 0.3304, + "step": 1099 + }, + { + "epoch": 0.40620384047267355, + "grad_norm": 0.2880112826824188, + "learning_rate": 0.00017302623475797514, + "loss": 0.3797, + "step": 1100 + }, + { + "epoch": 0.40620384047267355, + "eval_loss": 7.5619730949401855, + "eval_runtime": 6.926, + "eval_samples_per_second": 7.219, + "eval_steps_per_second": 1.011, + "step": 1100 + }, + { + "epoch": 0.4065731166912851, + "grad_norm": 0.3128634989261627, + "learning_rate": 0.00017300160118241165, + "loss": 0.3683, + "step": 1101 + }, + { + "epoch": 0.4069423929098966, + "grad_norm": 0.2598084509372711, + "learning_rate": 0.00017297696760684814, + "loss": 0.2975, + "step": 1102 + }, + { + "epoch": 0.40731166912850814, + "grad_norm": 0.32353758811950684, + "learning_rate": 0.00017295233403128465, + "loss": 0.448, + "step": 1103 + }, + { + "epoch": 0.4076809453471196, + "grad_norm": 0.24262599647045135, + "learning_rate": 0.00017292770045572114, + "loss": 0.2952, + "step": 1104 + }, + { + "epoch": 0.4080502215657312, + "grad_norm": 0.2655535042285919, + "learning_rate": 0.00017290306688015768, + "loss": 0.333, + "step": 1105 + }, + { + "epoch": 0.40841949778434267, + "grad_norm": 0.28923526406288147, + "learning_rate": 0.00017287843330459417, + "loss": 0.3254, + "step": 1106 + }, + { + "epoch": 0.4087887740029542, + "grad_norm": 0.2606164813041687, + "learning_rate": 0.00017285379972903069, + "loss": 0.3045, + "step": 1107 + }, + { + "epoch": 0.4091580502215657, + "grad_norm": 0.291080504655838, + "learning_rate": 0.00017282916615346717, + "loss": 0.3853, + "step": 1108 + }, + { + "epoch": 0.40952732644017725, + "grad_norm": 0.2605729103088379, + "learning_rate": 0.0001728045325779037, + "loss": 0.3411, + "step": 1109 + }, + { + "epoch": 0.4098966026587888, + "grad_norm": 0.27798473834991455, + "learning_rate": 0.0001727798990023402, + "loss": 0.3176, + "step": 1110 + }, + { + "epoch": 0.4102658788774003, + "grad_norm": 0.3642788529396057, + "learning_rate": 0.00017275526542677672, + "loss": 0.4039, + "step": 1111 + }, + { + "epoch": 0.41063515509601184, + "grad_norm": 0.31448784470558167, + "learning_rate": 0.0001727306318512132, + "loss": 0.3842, + "step": 1112 + }, + { + "epoch": 0.41100443131462333, + "grad_norm": 0.2117696851491928, + "learning_rate": 0.00017270599827564972, + "loss": 0.2436, + "step": 1113 + }, + { + "epoch": 0.4113737075332349, + "grad_norm": 0.27614179253578186, + "learning_rate": 0.00017268136470008623, + "loss": 0.3084, + "step": 1114 + }, + { + "epoch": 0.41174298375184637, + "grad_norm": 0.2514500617980957, + "learning_rate": 0.00017265673112452275, + "loss": 0.3129, + "step": 1115 + }, + { + "epoch": 0.4121122599704579, + "grad_norm": 0.23883867263793945, + "learning_rate": 0.00017263209754895924, + "loss": 0.3045, + "step": 1116 + }, + { + "epoch": 0.4124815361890694, + "grad_norm": 0.2858302891254425, + "learning_rate": 0.00017260746397339575, + "loss": 0.3281, + "step": 1117 + }, + { + "epoch": 0.41285081240768096, + "grad_norm": 0.25097280740737915, + "learning_rate": 0.00017258283039783224, + "loss": 0.2868, + "step": 1118 + }, + { + "epoch": 0.41322008862629245, + "grad_norm": 0.26380279660224915, + "learning_rate": 0.00017255819682226878, + "loss": 0.3225, + "step": 1119 + }, + { + "epoch": 0.413589364844904, + "grad_norm": 0.2811550199985504, + "learning_rate": 0.00017253356324670527, + "loss": 0.4731, + "step": 1120 + }, + { + "epoch": 0.4139586410635155, + "grad_norm": 0.33904004096984863, + "learning_rate": 0.00017250892967114178, + "loss": 0.3722, + "step": 1121 + }, + { + "epoch": 0.41432791728212703, + "grad_norm": 0.2014738917350769, + "learning_rate": 0.00017248429609557827, + "loss": 0.2375, + "step": 1122 + }, + { + "epoch": 0.4146971935007385, + "grad_norm": 0.29208263754844666, + "learning_rate": 0.00017245966252001478, + "loss": 0.3307, + "step": 1123 + }, + { + "epoch": 0.4150664697193501, + "grad_norm": 0.40991997718811035, + "learning_rate": 0.0001724350289444513, + "loss": 0.3067, + "step": 1124 + }, + { + "epoch": 0.4154357459379616, + "grad_norm": 0.2646535336971283, + "learning_rate": 0.00017241039536888781, + "loss": 0.3595, + "step": 1125 + }, + { + "epoch": 0.4158050221565731, + "grad_norm": 0.34228771924972534, + "learning_rate": 0.0001723857617933243, + "loss": 0.3349, + "step": 1126 + }, + { + "epoch": 0.41617429837518466, + "grad_norm": 0.27039459347724915, + "learning_rate": 0.00017236112821776082, + "loss": 0.3742, + "step": 1127 + }, + { + "epoch": 0.41654357459379615, + "grad_norm": 0.25759157538414, + "learning_rate": 0.00017233649464219733, + "loss": 0.3126, + "step": 1128 + }, + { + "epoch": 0.4169128508124077, + "grad_norm": 0.2705003023147583, + "learning_rate": 0.00017231186106663385, + "loss": 0.3492, + "step": 1129 + }, + { + "epoch": 0.4172821270310192, + "grad_norm": 0.24222534894943237, + "learning_rate": 0.00017228722749107033, + "loss": 0.2684, + "step": 1130 + }, + { + "epoch": 0.41765140324963074, + "grad_norm": 0.23254096508026123, + "learning_rate": 0.00017226259391550685, + "loss": 0.3002, + "step": 1131 + }, + { + "epoch": 0.41802067946824223, + "grad_norm": 0.27737656235694885, + "learning_rate": 0.00017223796033994336, + "loss": 0.3719, + "step": 1132 + }, + { + "epoch": 0.4183899556868538, + "grad_norm": 0.2727561891078949, + "learning_rate": 0.00017221332676437988, + "loss": 0.2941, + "step": 1133 + }, + { + "epoch": 0.41875923190546527, + "grad_norm": 0.25847339630126953, + "learning_rate": 0.00017218869318881636, + "loss": 0.2982, + "step": 1134 + }, + { + "epoch": 0.4191285081240768, + "grad_norm": 0.22166021168231964, + "learning_rate": 0.00017216405961325288, + "loss": 0.3039, + "step": 1135 + }, + { + "epoch": 0.4194977843426883, + "grad_norm": 0.2369915246963501, + "learning_rate": 0.00017213942603768937, + "loss": 0.308, + "step": 1136 + }, + { + "epoch": 0.41986706056129985, + "grad_norm": 0.25311577320098877, + "learning_rate": 0.0001721147924621259, + "loss": 0.2804, + "step": 1137 + }, + { + "epoch": 0.4202363367799114, + "grad_norm": 0.23553772270679474, + "learning_rate": 0.0001720901588865624, + "loss": 0.2643, + "step": 1138 + }, + { + "epoch": 0.4206056129985229, + "grad_norm": 0.21996288001537323, + "learning_rate": 0.0001720655253109989, + "loss": 0.2876, + "step": 1139 + }, + { + "epoch": 0.42097488921713444, + "grad_norm": 0.24668976664543152, + "learning_rate": 0.0001720408917354354, + "loss": 0.3179, + "step": 1140 + }, + { + "epoch": 0.42134416543574593, + "grad_norm": 0.25266775488853455, + "learning_rate": 0.0001720162581598719, + "loss": 0.3313, + "step": 1141 + }, + { + "epoch": 0.4217134416543575, + "grad_norm": 0.2202993929386139, + "learning_rate": 0.00017199162458430843, + "loss": 0.2843, + "step": 1142 + }, + { + "epoch": 0.42208271787296897, + "grad_norm": 0.29777970910072327, + "learning_rate": 0.00017196699100874494, + "loss": 0.3675, + "step": 1143 + }, + { + "epoch": 0.4224519940915805, + "grad_norm": 0.31485676765441895, + "learning_rate": 0.00017194235743318143, + "loss": 0.4229, + "step": 1144 + }, + { + "epoch": 0.422821270310192, + "grad_norm": 0.3259516656398773, + "learning_rate": 0.00017191772385761794, + "loss": 0.4381, + "step": 1145 + }, + { + "epoch": 0.42319054652880356, + "grad_norm": 0.2696915566921234, + "learning_rate": 0.00017189309028205446, + "loss": 0.3293, + "step": 1146 + }, + { + "epoch": 0.42355982274741505, + "grad_norm": 0.2776342034339905, + "learning_rate": 0.00017186845670649097, + "loss": 0.3528, + "step": 1147 + }, + { + "epoch": 0.4239290989660266, + "grad_norm": 0.2947021722793579, + "learning_rate": 0.00017184382313092746, + "loss": 0.3264, + "step": 1148 + }, + { + "epoch": 0.4242983751846381, + "grad_norm": 0.27383801341056824, + "learning_rate": 0.00017181918955536398, + "loss": 0.2956, + "step": 1149 + }, + { + "epoch": 0.42466765140324964, + "grad_norm": 0.31496670842170715, + "learning_rate": 0.00017179455597980046, + "loss": 0.3529, + "step": 1150 + }, + { + "epoch": 0.42466765140324964, + "eval_loss": 7.777125835418701, + "eval_runtime": 6.911, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 1.013, + "step": 1150 + }, + { + "epoch": 0.4250369276218611, + "grad_norm": 0.2928325831890106, + "learning_rate": 0.000171769922404237, + "loss": 0.3187, + "step": 1151 + }, + { + "epoch": 0.4254062038404727, + "grad_norm": 0.30881592631340027, + "learning_rate": 0.0001717452888286735, + "loss": 0.4277, + "step": 1152 + }, + { + "epoch": 0.4257754800590842, + "grad_norm": 0.2803179621696472, + "learning_rate": 0.00017172065525311, + "loss": 0.3318, + "step": 1153 + }, + { + "epoch": 0.4261447562776957, + "grad_norm": 0.2890618145465851, + "learning_rate": 0.0001716960216775465, + "loss": 0.338, + "step": 1154 + }, + { + "epoch": 0.42651403249630726, + "grad_norm": 0.3008515536785126, + "learning_rate": 0.000171671388101983, + "loss": 0.3157, + "step": 1155 + }, + { + "epoch": 0.42688330871491875, + "grad_norm": 0.25320613384246826, + "learning_rate": 0.00017164675452641952, + "loss": 0.3105, + "step": 1156 + }, + { + "epoch": 0.4272525849335303, + "grad_norm": 0.28372466564178467, + "learning_rate": 0.00017162212095085604, + "loss": 0.3352, + "step": 1157 + }, + { + "epoch": 0.4276218611521418, + "grad_norm": 0.297025203704834, + "learning_rate": 0.00017159748737529253, + "loss": 0.355, + "step": 1158 + }, + { + "epoch": 0.42799113737075334, + "grad_norm": 0.2884141802787781, + "learning_rate": 0.00017157285379972904, + "loss": 0.3281, + "step": 1159 + }, + { + "epoch": 0.42836041358936483, + "grad_norm": 0.26834312081336975, + "learning_rate": 0.00017154822022416556, + "loss": 0.3018, + "step": 1160 + }, + { + "epoch": 0.4287296898079764, + "grad_norm": 0.3533369302749634, + "learning_rate": 0.00017152358664860207, + "loss": 0.4066, + "step": 1161 + }, + { + "epoch": 0.42909896602658787, + "grad_norm": 0.27890342473983765, + "learning_rate": 0.00017149895307303856, + "loss": 0.3471, + "step": 1162 + }, + { + "epoch": 0.4294682422451994, + "grad_norm": 0.29080912470817566, + "learning_rate": 0.00017147431949747507, + "loss": 0.2714, + "step": 1163 + }, + { + "epoch": 0.4298375184638109, + "grad_norm": 0.2491619884967804, + "learning_rate": 0.0001714496859219116, + "loss": 0.2917, + "step": 1164 + }, + { + "epoch": 0.43020679468242246, + "grad_norm": 0.2970031499862671, + "learning_rate": 0.00017142505234634807, + "loss": 0.4354, + "step": 1165 + }, + { + "epoch": 0.43057607090103395, + "grad_norm": 0.2576717138290405, + "learning_rate": 0.0001714004187707846, + "loss": 0.2915, + "step": 1166 + }, + { + "epoch": 0.4309453471196455, + "grad_norm": 0.27182736992836, + "learning_rate": 0.00017137578519522108, + "loss": 0.4206, + "step": 1167 + }, + { + "epoch": 0.43131462333825704, + "grad_norm": 0.2675734758377075, + "learning_rate": 0.0001713511516196576, + "loss": 0.2971, + "step": 1168 + }, + { + "epoch": 0.43168389955686853, + "grad_norm": 0.323015958070755, + "learning_rate": 0.0001713265180440941, + "loss": 0.3479, + "step": 1169 + }, + { + "epoch": 0.4320531757754801, + "grad_norm": 0.26693296432495117, + "learning_rate": 0.00017130188446853062, + "loss": 0.3054, + "step": 1170 + }, + { + "epoch": 0.4324224519940916, + "grad_norm": 0.2482832968235016, + "learning_rate": 0.0001712772508929671, + "loss": 0.3146, + "step": 1171 + }, + { + "epoch": 0.4327917282127031, + "grad_norm": 0.2422439455986023, + "learning_rate": 0.00017125261731740362, + "loss": 0.3323, + "step": 1172 + }, + { + "epoch": 0.4331610044313146, + "grad_norm": 0.2577032744884491, + "learning_rate": 0.00017122798374184014, + "loss": 0.2903, + "step": 1173 + }, + { + "epoch": 0.43353028064992616, + "grad_norm": 0.25987499952316284, + "learning_rate": 0.00017120335016627665, + "loss": 0.2985, + "step": 1174 + }, + { + "epoch": 0.43389955686853765, + "grad_norm": 0.28118860721588135, + "learning_rate": 0.00017117871659071314, + "loss": 0.2808, + "step": 1175 + }, + { + "epoch": 0.4342688330871492, + "grad_norm": 0.3168046474456787, + "learning_rate": 0.00017115408301514965, + "loss": 0.3289, + "step": 1176 + }, + { + "epoch": 0.4346381093057607, + "grad_norm": 0.258706271648407, + "learning_rate": 0.00017112944943958614, + "loss": 0.3294, + "step": 1177 + }, + { + "epoch": 0.43500738552437224, + "grad_norm": 0.3282741606235504, + "learning_rate": 0.00017110481586402268, + "loss": 0.4288, + "step": 1178 + }, + { + "epoch": 0.43537666174298373, + "grad_norm": 0.23919552564620972, + "learning_rate": 0.00017108018228845917, + "loss": 0.2921, + "step": 1179 + }, + { + "epoch": 0.4357459379615953, + "grad_norm": 0.2555493712425232, + "learning_rate": 0.00017105554871289569, + "loss": 0.3424, + "step": 1180 + }, + { + "epoch": 0.43611521418020677, + "grad_norm": 0.2584797441959381, + "learning_rate": 0.00017103091513733217, + "loss": 0.3236, + "step": 1181 + }, + { + "epoch": 0.4364844903988183, + "grad_norm": 0.24993477761745453, + "learning_rate": 0.0001710062815617687, + "loss": 0.3287, + "step": 1182 + }, + { + "epoch": 0.43685376661742986, + "grad_norm": 0.24919241666793823, + "learning_rate": 0.0001709816479862052, + "loss": 0.284, + "step": 1183 + }, + { + "epoch": 0.43722304283604135, + "grad_norm": 0.2976304292678833, + "learning_rate": 0.00017095701441064172, + "loss": 0.3668, + "step": 1184 + }, + { + "epoch": 0.4375923190546529, + "grad_norm": 0.3170109987258911, + "learning_rate": 0.0001709323808350782, + "loss": 0.3178, + "step": 1185 + }, + { + "epoch": 0.4379615952732644, + "grad_norm": 0.28933364152908325, + "learning_rate": 0.00017090774725951472, + "loss": 0.2637, + "step": 1186 + }, + { + "epoch": 0.43833087149187594, + "grad_norm": 0.3028348982334137, + "learning_rate": 0.00017088311368395123, + "loss": 0.2952, + "step": 1187 + }, + { + "epoch": 0.43870014771048743, + "grad_norm": 0.27760207653045654, + "learning_rate": 0.00017085848010838775, + "loss": 0.3097, + "step": 1188 + }, + { + "epoch": 0.439069423929099, + "grad_norm": 0.37352538108825684, + "learning_rate": 0.00017083384653282424, + "loss": 0.3629, + "step": 1189 + }, + { + "epoch": 0.43943870014771047, + "grad_norm": 0.2567557096481323, + "learning_rate": 0.00017080921295726075, + "loss": 0.3121, + "step": 1190 + }, + { + "epoch": 0.439807976366322, + "grad_norm": 0.28638720512390137, + "learning_rate": 0.00017078457938169726, + "loss": 0.3835, + "step": 1191 + }, + { + "epoch": 0.4401772525849335, + "grad_norm": 0.29230207204818726, + "learning_rate": 0.00017075994580613378, + "loss": 0.3215, + "step": 1192 + }, + { + "epoch": 0.44054652880354506, + "grad_norm": 0.26492372155189514, + "learning_rate": 0.00017073531223057027, + "loss": 0.2988, + "step": 1193 + }, + { + "epoch": 0.44091580502215655, + "grad_norm": 0.2999979555606842, + "learning_rate": 0.00017071067865500678, + "loss": 0.3492, + "step": 1194 + }, + { + "epoch": 0.4412850812407681, + "grad_norm": 0.34691399335861206, + "learning_rate": 0.00017068604507944327, + "loss": 0.4072, + "step": 1195 + }, + { + "epoch": 0.44165435745937964, + "grad_norm": 0.28073757886886597, + "learning_rate": 0.0001706614115038798, + "loss": 0.3387, + "step": 1196 + }, + { + "epoch": 0.44202363367799113, + "grad_norm": 0.22701922059059143, + "learning_rate": 0.0001706367779283163, + "loss": 0.2705, + "step": 1197 + }, + { + "epoch": 0.4423929098966027, + "grad_norm": 0.27518054842948914, + "learning_rate": 0.0001706121443527528, + "loss": 0.3067, + "step": 1198 + }, + { + "epoch": 0.4427621861152142, + "grad_norm": 0.2714622914791107, + "learning_rate": 0.0001705875107771893, + "loss": 0.3193, + "step": 1199 + }, + { + "epoch": 0.4431314623338257, + "grad_norm": 0.26023775339126587, + "learning_rate": 0.00017056287720162582, + "loss": 0.3281, + "step": 1200 + }, + { + "epoch": 0.4431314623338257, + "eval_loss": 7.751384258270264, + "eval_runtime": 6.9764, + "eval_samples_per_second": 7.167, + "eval_steps_per_second": 1.003, + "step": 1200 + }, + { + "epoch": 0.4435007385524372, + "grad_norm": 0.2871365249156952, + "learning_rate": 0.00017053824362606233, + "loss": 0.3565, + "step": 1201 + }, + { + "epoch": 0.44387001477104876, + "grad_norm": 0.26998981833457947, + "learning_rate": 0.00017051361005049884, + "loss": 0.2731, + "step": 1202 + }, + { + "epoch": 0.44423929098966025, + "grad_norm": 0.26470690965652466, + "learning_rate": 0.00017048897647493533, + "loss": 0.3532, + "step": 1203 + }, + { + "epoch": 0.4446085672082718, + "grad_norm": 0.22583994269371033, + "learning_rate": 0.00017046434289937185, + "loss": 0.2973, + "step": 1204 + }, + { + "epoch": 0.4449778434268833, + "grad_norm": 0.2860575318336487, + "learning_rate": 0.00017043970932380836, + "loss": 0.3424, + "step": 1205 + }, + { + "epoch": 0.44534711964549484, + "grad_norm": 0.34147146344184875, + "learning_rate": 0.00017041507574824488, + "loss": 0.3809, + "step": 1206 + }, + { + "epoch": 0.44571639586410633, + "grad_norm": 0.2634851932525635, + "learning_rate": 0.00017039044217268136, + "loss": 0.278, + "step": 1207 + }, + { + "epoch": 0.4460856720827179, + "grad_norm": 0.2344469130039215, + "learning_rate": 0.00017036580859711788, + "loss": 0.2946, + "step": 1208 + }, + { + "epoch": 0.44645494830132937, + "grad_norm": 0.24788957834243774, + "learning_rate": 0.00017034117502155437, + "loss": 0.3056, + "step": 1209 + }, + { + "epoch": 0.4468242245199409, + "grad_norm": 0.2634503245353699, + "learning_rate": 0.0001703165414459909, + "loss": 0.2877, + "step": 1210 + }, + { + "epoch": 0.44719350073855246, + "grad_norm": 0.2738768458366394, + "learning_rate": 0.0001702919078704274, + "loss": 0.3548, + "step": 1211 + }, + { + "epoch": 0.44756277695716395, + "grad_norm": 0.2772246301174164, + "learning_rate": 0.0001702672742948639, + "loss": 0.3213, + "step": 1212 + }, + { + "epoch": 0.4479320531757755, + "grad_norm": 0.30349859595298767, + "learning_rate": 0.0001702426407193004, + "loss": 0.2714, + "step": 1213 + }, + { + "epoch": 0.448301329394387, + "grad_norm": 0.2149275839328766, + "learning_rate": 0.0001702180071437369, + "loss": 0.2455, + "step": 1214 + }, + { + "epoch": 0.44867060561299854, + "grad_norm": 0.2335432767868042, + "learning_rate": 0.00017019337356817343, + "loss": 0.2561, + "step": 1215 + }, + { + "epoch": 0.44903988183161003, + "grad_norm": 0.28139686584472656, + "learning_rate": 0.00017016873999260994, + "loss": 0.3466, + "step": 1216 + }, + { + "epoch": 0.4494091580502216, + "grad_norm": 0.2793072462081909, + "learning_rate": 0.00017014410641704643, + "loss": 0.3467, + "step": 1217 + }, + { + "epoch": 0.44977843426883307, + "grad_norm": 0.25835534930229187, + "learning_rate": 0.00017011947284148294, + "loss": 0.2509, + "step": 1218 + }, + { + "epoch": 0.4501477104874446, + "grad_norm": 0.3196823298931122, + "learning_rate": 0.00017009483926591946, + "loss": 0.3333, + "step": 1219 + }, + { + "epoch": 0.4505169867060561, + "grad_norm": 0.2686567008495331, + "learning_rate": 0.00017007020569035597, + "loss": 0.2736, + "step": 1220 + }, + { + "epoch": 0.45088626292466766, + "grad_norm": 0.2681311368942261, + "learning_rate": 0.00017004557211479246, + "loss": 0.3051, + "step": 1221 + }, + { + "epoch": 0.45125553914327915, + "grad_norm": 0.21180082857608795, + "learning_rate": 0.00017002093853922897, + "loss": 0.2801, + "step": 1222 + }, + { + "epoch": 0.4516248153618907, + "grad_norm": 0.25265827775001526, + "learning_rate": 0.0001699963049636655, + "loss": 0.323, + "step": 1223 + }, + { + "epoch": 0.4519940915805022, + "grad_norm": 0.31457677483558655, + "learning_rate": 0.000169971671388102, + "loss": 0.325, + "step": 1224 + }, + { + "epoch": 0.45236336779911374, + "grad_norm": 0.2467675656080246, + "learning_rate": 0.0001699470378125385, + "loss": 0.3362, + "step": 1225 + }, + { + "epoch": 0.4527326440177253, + "grad_norm": 0.280852347612381, + "learning_rate": 0.000169922404236975, + "loss": 0.3199, + "step": 1226 + }, + { + "epoch": 0.4531019202363368, + "grad_norm": 0.26614290475845337, + "learning_rate": 0.0001698977706614115, + "loss": 0.3253, + "step": 1227 + }, + { + "epoch": 0.4534711964549483, + "grad_norm": 0.2899266183376312, + "learning_rate": 0.00016987313708584804, + "loss": 0.4359, + "step": 1228 + }, + { + "epoch": 0.4538404726735598, + "grad_norm": 0.2477445900440216, + "learning_rate": 0.00016984850351028452, + "loss": 0.2913, + "step": 1229 + }, + { + "epoch": 0.45420974889217136, + "grad_norm": 0.2792622148990631, + "learning_rate": 0.00016982386993472104, + "loss": 0.3694, + "step": 1230 + }, + { + "epoch": 0.45457902511078285, + "grad_norm": 0.3972775936126709, + "learning_rate": 0.00016979923635915753, + "loss": 0.3341, + "step": 1231 + }, + { + "epoch": 0.4549483013293944, + "grad_norm": 0.2875741124153137, + "learning_rate": 0.00016977460278359404, + "loss": 0.3153, + "step": 1232 + }, + { + "epoch": 0.4553175775480059, + "grad_norm": 0.28061237931251526, + "learning_rate": 0.00016974996920803055, + "loss": 0.3025, + "step": 1233 + }, + { + "epoch": 0.45568685376661744, + "grad_norm": 0.2716069221496582, + "learning_rate": 0.00016972533563246707, + "loss": 0.3034, + "step": 1234 + }, + { + "epoch": 0.45605612998522893, + "grad_norm": 0.3086186349391937, + "learning_rate": 0.00016970070205690356, + "loss": 0.3375, + "step": 1235 + }, + { + "epoch": 0.4564254062038405, + "grad_norm": 0.308723509311676, + "learning_rate": 0.00016967606848134007, + "loss": 0.3474, + "step": 1236 + }, + { + "epoch": 0.45679468242245197, + "grad_norm": 0.25116440653800964, + "learning_rate": 0.00016965143490577659, + "loss": 0.2796, + "step": 1237 + }, + { + "epoch": 0.4571639586410635, + "grad_norm": 0.2542695999145508, + "learning_rate": 0.0001696268013302131, + "loss": 0.2726, + "step": 1238 + }, + { + "epoch": 0.45753323485967506, + "grad_norm": 0.27403321862220764, + "learning_rate": 0.0001696021677546496, + "loss": 0.2938, + "step": 1239 + }, + { + "epoch": 0.45790251107828656, + "grad_norm": 0.2657535970211029, + "learning_rate": 0.0001695775341790861, + "loss": 0.2798, + "step": 1240 + }, + { + "epoch": 0.4582717872968981, + "grad_norm": 0.328223317861557, + "learning_rate": 0.0001695529006035226, + "loss": 0.3668, + "step": 1241 + }, + { + "epoch": 0.4586410635155096, + "grad_norm": 0.24663668870925903, + "learning_rate": 0.00016952826702795913, + "loss": 0.2734, + "step": 1242 + }, + { + "epoch": 0.45901033973412114, + "grad_norm": 0.2590343952178955, + "learning_rate": 0.00016950363345239562, + "loss": 0.3411, + "step": 1243 + }, + { + "epoch": 0.45937961595273263, + "grad_norm": 0.20424900949001312, + "learning_rate": 0.00016947899987683213, + "loss": 0.2356, + "step": 1244 + }, + { + "epoch": 0.4597488921713442, + "grad_norm": 0.26161104440689087, + "learning_rate": 0.00016945436630126862, + "loss": 0.3106, + "step": 1245 + }, + { + "epoch": 0.4601181683899557, + "grad_norm": 0.23681941628456116, + "learning_rate": 0.00016942973272570514, + "loss": 0.2687, + "step": 1246 + }, + { + "epoch": 0.4604874446085672, + "grad_norm": 0.2350742220878601, + "learning_rate": 0.00016940509915014165, + "loss": 0.2732, + "step": 1247 + }, + { + "epoch": 0.4608567208271787, + "grad_norm": 0.2512291669845581, + "learning_rate": 0.00016938046557457817, + "loss": 0.2685, + "step": 1248 + }, + { + "epoch": 0.46122599704579026, + "grad_norm": 0.26542362570762634, + "learning_rate": 0.00016935583199901465, + "loss": 0.3173, + "step": 1249 + }, + { + "epoch": 0.46159527326440175, + "grad_norm": 0.27273064851760864, + "learning_rate": 0.00016933119842345117, + "loss": 0.3127, + "step": 1250 + }, + { + "epoch": 0.46159527326440175, + "eval_loss": 7.39493465423584, + "eval_runtime": 6.9106, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 1.013, + "step": 1250 + }, + { + "epoch": 0.4619645494830133, + "grad_norm": 0.3303544521331787, + "learning_rate": 0.00016930656484788768, + "loss": 0.3305, + "step": 1251 + }, + { + "epoch": 0.4623338257016248, + "grad_norm": 0.22199586033821106, + "learning_rate": 0.0001692819312723242, + "loss": 0.2634, + "step": 1252 + }, + { + "epoch": 0.46270310192023634, + "grad_norm": 0.33486780524253845, + "learning_rate": 0.00016925729769676068, + "loss": 0.3972, + "step": 1253 + }, + { + "epoch": 0.4630723781388479, + "grad_norm": 0.2760350704193115, + "learning_rate": 0.0001692326641211972, + "loss": 0.307, + "step": 1254 + }, + { + "epoch": 0.4634416543574594, + "grad_norm": 0.23962976038455963, + "learning_rate": 0.0001692080305456337, + "loss": 0.2504, + "step": 1255 + }, + { + "epoch": 0.4638109305760709, + "grad_norm": 0.2577246129512787, + "learning_rate": 0.00016918339697007023, + "loss": 0.3133, + "step": 1256 + }, + { + "epoch": 0.4641802067946824, + "grad_norm": 0.2570315897464752, + "learning_rate": 0.00016915876339450672, + "loss": 0.3261, + "step": 1257 + }, + { + "epoch": 0.46454948301329396, + "grad_norm": 0.2468554526567459, + "learning_rate": 0.00016913412981894323, + "loss": 0.2834, + "step": 1258 + }, + { + "epoch": 0.46491875923190545, + "grad_norm": 0.2781396508216858, + "learning_rate": 0.00016910949624337972, + "loss": 0.3497, + "step": 1259 + }, + { + "epoch": 0.465288035450517, + "grad_norm": 0.3397352397441864, + "learning_rate": 0.00016908486266781623, + "loss": 0.3958, + "step": 1260 + }, + { + "epoch": 0.4656573116691285, + "grad_norm": 0.27221816778182983, + "learning_rate": 0.00016906022909225275, + "loss": 0.2756, + "step": 1261 + }, + { + "epoch": 0.46602658788774004, + "grad_norm": 0.2758025527000427, + "learning_rate": 0.00016903559551668926, + "loss": 0.322, + "step": 1262 + }, + { + "epoch": 0.46639586410635153, + "grad_norm": 0.3043409585952759, + "learning_rate": 0.00016901096194112575, + "loss": 0.4234, + "step": 1263 + }, + { + "epoch": 0.4667651403249631, + "grad_norm": 0.26296183466911316, + "learning_rate": 0.00016898632836556226, + "loss": 0.3945, + "step": 1264 + }, + { + "epoch": 0.46713441654357457, + "grad_norm": 0.24974465370178223, + "learning_rate": 0.00016896169478999878, + "loss": 0.3135, + "step": 1265 + }, + { + "epoch": 0.4675036927621861, + "grad_norm": 0.2953634262084961, + "learning_rate": 0.0001689370612144353, + "loss": 0.2772, + "step": 1266 + }, + { + "epoch": 0.4678729689807976, + "grad_norm": 0.2357940524816513, + "learning_rate": 0.00016891242763887178, + "loss": 0.2797, + "step": 1267 + }, + { + "epoch": 0.46824224519940916, + "grad_norm": 0.2741159200668335, + "learning_rate": 0.0001688877940633083, + "loss": 0.3144, + "step": 1268 + }, + { + "epoch": 0.4686115214180207, + "grad_norm": 0.2766612470149994, + "learning_rate": 0.0001688631604877448, + "loss": 0.3581, + "step": 1269 + }, + { + "epoch": 0.4689807976366322, + "grad_norm": 0.23679442703723907, + "learning_rate": 0.00016883852691218133, + "loss": 0.2801, + "step": 1270 + }, + { + "epoch": 0.46935007385524374, + "grad_norm": 0.30123576521873474, + "learning_rate": 0.0001688138933366178, + "loss": 0.3753, + "step": 1271 + }, + { + "epoch": 0.46971935007385524, + "grad_norm": 0.24590067565441132, + "learning_rate": 0.00016878925976105433, + "loss": 0.3017, + "step": 1272 + }, + { + "epoch": 0.4700886262924668, + "grad_norm": 0.2648707926273346, + "learning_rate": 0.00016876462618549081, + "loss": 0.2945, + "step": 1273 + }, + { + "epoch": 0.4704579025110783, + "grad_norm": 0.24729423224925995, + "learning_rate": 0.00016873999260992736, + "loss": 0.353, + "step": 1274 + }, + { + "epoch": 0.4708271787296898, + "grad_norm": 0.2910615801811218, + "learning_rate": 0.00016871535903436384, + "loss": 0.3363, + "step": 1275 + }, + { + "epoch": 0.4711964549483013, + "grad_norm": 0.31637582182884216, + "learning_rate": 0.00016869072545880036, + "loss": 0.3962, + "step": 1276 + }, + { + "epoch": 0.47156573116691286, + "grad_norm": 0.2796592116355896, + "learning_rate": 0.00016866609188323685, + "loss": 0.3269, + "step": 1277 + }, + { + "epoch": 0.47193500738552435, + "grad_norm": 0.2645513117313385, + "learning_rate": 0.00016864145830767336, + "loss": 0.3378, + "step": 1278 + }, + { + "epoch": 0.4723042836041359, + "grad_norm": 0.2613782584667206, + "learning_rate": 0.00016861682473210988, + "loss": 0.2831, + "step": 1279 + }, + { + "epoch": 0.4726735598227474, + "grad_norm": 0.24296632409095764, + "learning_rate": 0.0001685921911565464, + "loss": 0.3206, + "step": 1280 + }, + { + "epoch": 0.47304283604135894, + "grad_norm": 0.32532069087028503, + "learning_rate": 0.00016856755758098288, + "loss": 0.3794, + "step": 1281 + }, + { + "epoch": 0.4734121122599705, + "grad_norm": 0.2513275742530823, + "learning_rate": 0.0001685429240054194, + "loss": 0.3105, + "step": 1282 + }, + { + "epoch": 0.473781388478582, + "grad_norm": 0.2619151473045349, + "learning_rate": 0.0001685182904298559, + "loss": 0.3661, + "step": 1283 + }, + { + "epoch": 0.4741506646971935, + "grad_norm": 0.2510347068309784, + "learning_rate": 0.00016849365685429242, + "loss": 0.2862, + "step": 1284 + }, + { + "epoch": 0.474519940915805, + "grad_norm": 0.26665735244750977, + "learning_rate": 0.0001684690232787289, + "loss": 0.3256, + "step": 1285 + }, + { + "epoch": 0.47488921713441656, + "grad_norm": 0.2659134268760681, + "learning_rate": 0.00016844438970316542, + "loss": 0.3083, + "step": 1286 + }, + { + "epoch": 0.47525849335302806, + "grad_norm": 0.32094913721084595, + "learning_rate": 0.0001684197561276019, + "loss": 0.3608, + "step": 1287 + }, + { + "epoch": 0.4756277695716396, + "grad_norm": 0.30373165011405945, + "learning_rate": 0.00016839512255203845, + "loss": 0.3524, + "step": 1288 + }, + { + "epoch": 0.4759970457902511, + "grad_norm": 0.2945308983325958, + "learning_rate": 0.00016837048897647494, + "loss": 0.3217, + "step": 1289 + }, + { + "epoch": 0.47636632200886264, + "grad_norm": 0.28923529386520386, + "learning_rate": 0.00016834585540091146, + "loss": 0.3181, + "step": 1290 + }, + { + "epoch": 0.47673559822747413, + "grad_norm": 0.3239431381225586, + "learning_rate": 0.00016832122182534794, + "loss": 0.3679, + "step": 1291 + }, + { + "epoch": 0.4771048744460857, + "grad_norm": 0.3583320379257202, + "learning_rate": 0.00016829658824978446, + "loss": 0.4003, + "step": 1292 + }, + { + "epoch": 0.4774741506646972, + "grad_norm": 0.30283451080322266, + "learning_rate": 0.00016827195467422097, + "loss": 0.3614, + "step": 1293 + }, + { + "epoch": 0.4778434268833087, + "grad_norm": 0.256317138671875, + "learning_rate": 0.0001682473210986575, + "loss": 0.2705, + "step": 1294 + }, + { + "epoch": 0.4782127031019202, + "grad_norm": 0.23529498279094696, + "learning_rate": 0.00016822268752309397, + "loss": 0.2352, + "step": 1295 + }, + { + "epoch": 0.47858197932053176, + "grad_norm": 0.27355632185935974, + "learning_rate": 0.0001681980539475305, + "loss": 0.2822, + "step": 1296 + }, + { + "epoch": 0.4789512555391433, + "grad_norm": 0.2640261948108673, + "learning_rate": 0.000168173420371967, + "loss": 0.3602, + "step": 1297 + }, + { + "epoch": 0.4793205317577548, + "grad_norm": 0.25632980465888977, + "learning_rate": 0.00016814878679640352, + "loss": 0.3629, + "step": 1298 + }, + { + "epoch": 0.47968980797636634, + "grad_norm": 0.2685964107513428, + "learning_rate": 0.00016812415322084, + "loss": 0.2968, + "step": 1299 + }, + { + "epoch": 0.48005908419497784, + "grad_norm": 0.2813291549682617, + "learning_rate": 0.00016809951964527652, + "loss": 0.3835, + "step": 1300 + }, + { + "epoch": 0.48005908419497784, + "eval_loss": 7.488663196563721, + "eval_runtime": 7.3029, + "eval_samples_per_second": 6.847, + "eval_steps_per_second": 0.959, + "step": 1300 + }, + { + "epoch": 0.4804283604135894, + "grad_norm": 0.2676311433315277, + "learning_rate": 0.00016807488606971303, + "loss": 0.3187, + "step": 1301 + }, + { + "epoch": 0.4807976366322009, + "grad_norm": 0.3175662159919739, + "learning_rate": 0.00016805025249414955, + "loss": 0.3378, + "step": 1302 + }, + { + "epoch": 0.4811669128508124, + "grad_norm": 0.2842338979244232, + "learning_rate": 0.00016802561891858604, + "loss": 0.3032, + "step": 1303 + }, + { + "epoch": 0.4815361890694239, + "grad_norm": 0.29982811212539673, + "learning_rate": 0.00016800098534302255, + "loss": 0.2599, + "step": 1304 + }, + { + "epoch": 0.48190546528803546, + "grad_norm": 0.23763129115104675, + "learning_rate": 0.00016797635176745904, + "loss": 0.2985, + "step": 1305 + }, + { + "epoch": 0.48227474150664695, + "grad_norm": 0.25424662232398987, + "learning_rate": 0.00016795171819189558, + "loss": 0.2342, + "step": 1306 + }, + { + "epoch": 0.4826440177252585, + "grad_norm": 0.22310347855091095, + "learning_rate": 0.00016792708461633207, + "loss": 0.2614, + "step": 1307 + }, + { + "epoch": 0.48301329394387, + "grad_norm": 0.27328959107398987, + "learning_rate": 0.00016790245104076858, + "loss": 0.3158, + "step": 1308 + }, + { + "epoch": 0.48338257016248154, + "grad_norm": 0.237890362739563, + "learning_rate": 0.00016787781746520507, + "loss": 0.2611, + "step": 1309 + }, + { + "epoch": 0.48375184638109303, + "grad_norm": 0.3651958703994751, + "learning_rate": 0.00016785318388964159, + "loss": 0.3792, + "step": 1310 + }, + { + "epoch": 0.4841211225997046, + "grad_norm": 0.28348222374916077, + "learning_rate": 0.0001678285503140781, + "loss": 0.295, + "step": 1311 + }, + { + "epoch": 0.4844903988183161, + "grad_norm": 0.30662021040916443, + "learning_rate": 0.00016780391673851461, + "loss": 0.3451, + "step": 1312 + }, + { + "epoch": 0.4848596750369276, + "grad_norm": 0.38783174753189087, + "learning_rate": 0.0001677792831629511, + "loss": 0.3828, + "step": 1313 + }, + { + "epoch": 0.48522895125553916, + "grad_norm": 0.32809388637542725, + "learning_rate": 0.00016775464958738762, + "loss": 0.3543, + "step": 1314 + }, + { + "epoch": 0.48559822747415066, + "grad_norm": 0.24442286789417267, + "learning_rate": 0.00016773001601182413, + "loss": 0.3106, + "step": 1315 + }, + { + "epoch": 0.4859675036927622, + "grad_norm": 0.2712607979774475, + "learning_rate": 0.00016770538243626065, + "loss": 0.2838, + "step": 1316 + }, + { + "epoch": 0.4863367799113737, + "grad_norm": 0.29960665106773376, + "learning_rate": 0.00016768074886069713, + "loss": 0.3182, + "step": 1317 + }, + { + "epoch": 0.48670605612998524, + "grad_norm": 0.25534406304359436, + "learning_rate": 0.00016765611528513365, + "loss": 0.2832, + "step": 1318 + }, + { + "epoch": 0.48707533234859673, + "grad_norm": 0.2968490719795227, + "learning_rate": 0.00016763148170957014, + "loss": 0.3066, + "step": 1319 + }, + { + "epoch": 0.4874446085672083, + "grad_norm": 0.26333171129226685, + "learning_rate": 0.00016760684813400668, + "loss": 0.2881, + "step": 1320 + }, + { + "epoch": 0.4878138847858198, + "grad_norm": 0.2727160155773163, + "learning_rate": 0.00016758221455844317, + "loss": 0.326, + "step": 1321 + }, + { + "epoch": 0.4881831610044313, + "grad_norm": 0.24175716936588287, + "learning_rate": 0.00016755758098287968, + "loss": 0.2599, + "step": 1322 + }, + { + "epoch": 0.4885524372230428, + "grad_norm": 0.2926419675350189, + "learning_rate": 0.00016753294740731617, + "loss": 0.2591, + "step": 1323 + }, + { + "epoch": 0.48892171344165436, + "grad_norm": 0.31337377429008484, + "learning_rate": 0.00016750831383175268, + "loss": 0.3711, + "step": 1324 + }, + { + "epoch": 0.48929098966026585, + "grad_norm": 0.25357192754745483, + "learning_rate": 0.0001674836802561892, + "loss": 0.3118, + "step": 1325 + }, + { + "epoch": 0.4896602658788774, + "grad_norm": 0.31705743074417114, + "learning_rate": 0.0001674590466806257, + "loss": 0.3255, + "step": 1326 + }, + { + "epoch": 0.49002954209748895, + "grad_norm": 0.2725938856601715, + "learning_rate": 0.0001674344131050622, + "loss": 0.2789, + "step": 1327 + }, + { + "epoch": 0.49039881831610044, + "grad_norm": 0.2607617676258087, + "learning_rate": 0.0001674097795294987, + "loss": 0.3223, + "step": 1328 + }, + { + "epoch": 0.490768094534712, + "grad_norm": 0.2654040455818176, + "learning_rate": 0.00016738514595393523, + "loss": 0.2909, + "step": 1329 + }, + { + "epoch": 0.4911373707533235, + "grad_norm": 0.25885656476020813, + "learning_rate": 0.00016736051237837174, + "loss": 0.3221, + "step": 1330 + }, + { + "epoch": 0.491506646971935, + "grad_norm": 0.31475913524627686, + "learning_rate": 0.00016733587880280823, + "loss": 0.3329, + "step": 1331 + }, + { + "epoch": 0.4918759231905465, + "grad_norm": 0.2200312316417694, + "learning_rate": 0.00016731124522724474, + "loss": 0.2283, + "step": 1332 + }, + { + "epoch": 0.49224519940915806, + "grad_norm": 0.24004139006137848, + "learning_rate": 0.00016728661165168126, + "loss": 0.2704, + "step": 1333 + }, + { + "epoch": 0.49261447562776955, + "grad_norm": 0.23220740258693695, + "learning_rate": 0.00016726197807611777, + "loss": 0.3007, + "step": 1334 + }, + { + "epoch": 0.4929837518463811, + "grad_norm": 0.3104247450828552, + "learning_rate": 0.00016723734450055426, + "loss": 0.3588, + "step": 1335 + }, + { + "epoch": 0.4933530280649926, + "grad_norm": 0.274080365896225, + "learning_rate": 0.00016721271092499078, + "loss": 0.3235, + "step": 1336 + }, + { + "epoch": 0.49372230428360414, + "grad_norm": 0.2597687542438507, + "learning_rate": 0.00016718807734942726, + "loss": 0.3379, + "step": 1337 + }, + { + "epoch": 0.49409158050221563, + "grad_norm": 0.21632221341133118, + "learning_rate": 0.0001671634437738638, + "loss": 0.2512, + "step": 1338 + }, + { + "epoch": 0.4944608567208272, + "grad_norm": 0.23573407530784607, + "learning_rate": 0.0001671388101983003, + "loss": 0.2958, + "step": 1339 + }, + { + "epoch": 0.4948301329394387, + "grad_norm": 0.28679823875427246, + "learning_rate": 0.0001671141766227368, + "loss": 0.3575, + "step": 1340 + }, + { + "epoch": 0.4951994091580502, + "grad_norm": 0.3272477388381958, + "learning_rate": 0.0001670895430471733, + "loss": 0.3146, + "step": 1341 + }, + { + "epoch": 0.49556868537666177, + "grad_norm": 0.24512146413326263, + "learning_rate": 0.0001670649094716098, + "loss": 0.2997, + "step": 1342 + }, + { + "epoch": 0.49593796159527326, + "grad_norm": 0.3463704288005829, + "learning_rate": 0.00016704027589604632, + "loss": 0.3276, + "step": 1343 + }, + { + "epoch": 0.4963072378138848, + "grad_norm": 0.25755786895751953, + "learning_rate": 0.00016701564232048284, + "loss": 0.3109, + "step": 1344 + }, + { + "epoch": 0.4966765140324963, + "grad_norm": 0.2736072242259979, + "learning_rate": 0.00016699100874491933, + "loss": 0.3559, + "step": 1345 + }, + { + "epoch": 0.49704579025110784, + "grad_norm": 0.2861804962158203, + "learning_rate": 0.00016696637516935584, + "loss": 0.3149, + "step": 1346 + }, + { + "epoch": 0.49741506646971934, + "grad_norm": 0.2922269403934479, + "learning_rate": 0.00016694174159379236, + "loss": 0.3081, + "step": 1347 + }, + { + "epoch": 0.4977843426883309, + "grad_norm": 0.27395495772361755, + "learning_rate": 0.00016691710801822887, + "loss": 0.3482, + "step": 1348 + }, + { + "epoch": 0.4981536189069424, + "grad_norm": 0.2672346532344818, + "learning_rate": 0.00016689247444266536, + "loss": 0.346, + "step": 1349 + }, + { + "epoch": 0.4985228951255539, + "grad_norm": 0.2659207284450531, + "learning_rate": 0.00016686784086710187, + "loss": 0.2834, + "step": 1350 + }, + { + "epoch": 0.4985228951255539, + "eval_loss": 7.398162841796875, + "eval_runtime": 6.9093, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 1.013, + "step": 1350 + }, + { + "epoch": 0.4988921713441654, + "grad_norm": 0.24694380164146423, + "learning_rate": 0.00016684320729153836, + "loss": 0.2718, + "step": 1351 + }, + { + "epoch": 0.49926144756277696, + "grad_norm": 0.2784233093261719, + "learning_rate": 0.0001668185737159749, + "loss": 0.3167, + "step": 1352 + }, + { + "epoch": 0.49963072378138845, + "grad_norm": 0.2302161157131195, + "learning_rate": 0.0001667939401404114, + "loss": 0.2343, + "step": 1353 + }, + { + "epoch": 0.5, + "grad_norm": 0.2306922972202301, + "learning_rate": 0.0001667693065648479, + "loss": 0.2871, + "step": 1354 + }, + { + "epoch": 0.5003692762186115, + "grad_norm": 0.3154774308204651, + "learning_rate": 0.0001667446729892844, + "loss": 0.3623, + "step": 1355 + }, + { + "epoch": 0.5007385524372231, + "grad_norm": 0.24718604981899261, + "learning_rate": 0.0001667200394137209, + "loss": 0.2609, + "step": 1356 + }, + { + "epoch": 0.5011078286558346, + "grad_norm": 0.3553144931793213, + "learning_rate": 0.00016669540583815742, + "loss": 0.3936, + "step": 1357 + }, + { + "epoch": 0.5014771048744461, + "grad_norm": 0.2789725363254547, + "learning_rate": 0.00016667077226259394, + "loss": 0.3274, + "step": 1358 + }, + { + "epoch": 0.5018463810930576, + "grad_norm": 0.26267123222351074, + "learning_rate": 0.00016664613868703042, + "loss": 0.3165, + "step": 1359 + }, + { + "epoch": 0.5022156573116692, + "grad_norm": 0.2878930866718292, + "learning_rate": 0.00016662150511146694, + "loss": 0.3026, + "step": 1360 + }, + { + "epoch": 0.5025849335302807, + "grad_norm": 0.2872017025947571, + "learning_rate": 0.00016659687153590345, + "loss": 0.3307, + "step": 1361 + }, + { + "epoch": 0.5029542097488922, + "grad_norm": 0.3103313744068146, + "learning_rate": 0.00016657223796033997, + "loss": 0.3132, + "step": 1362 + }, + { + "epoch": 0.5033234859675036, + "grad_norm": 0.25116729736328125, + "learning_rate": 0.00016654760438477645, + "loss": 0.2859, + "step": 1363 + }, + { + "epoch": 0.5036927621861153, + "grad_norm": 0.270654559135437, + "learning_rate": 0.00016652297080921297, + "loss": 0.3255, + "step": 1364 + }, + { + "epoch": 0.5040620384047267, + "grad_norm": 0.29739823937416077, + "learning_rate": 0.00016649833723364948, + "loss": 0.3094, + "step": 1365 + }, + { + "epoch": 0.5044313146233382, + "grad_norm": 0.28871089220046997, + "learning_rate": 0.000166473703658086, + "loss": 0.3509, + "step": 1366 + }, + { + "epoch": 0.5048005908419497, + "grad_norm": 0.3003438711166382, + "learning_rate": 0.00016644907008252249, + "loss": 0.3189, + "step": 1367 + }, + { + "epoch": 0.5051698670605613, + "grad_norm": 0.2714230716228485, + "learning_rate": 0.000166424436506959, + "loss": 0.3492, + "step": 1368 + }, + { + "epoch": 0.5055391432791728, + "grad_norm": 0.29401907324790955, + "learning_rate": 0.0001663998029313955, + "loss": 0.3582, + "step": 1369 + }, + { + "epoch": 0.5059084194977843, + "grad_norm": 0.28125712275505066, + "learning_rate": 0.00016637516935583203, + "loss": 0.2723, + "step": 1370 + }, + { + "epoch": 0.5062776957163959, + "grad_norm": 0.29865872859954834, + "learning_rate": 0.00016635053578026852, + "loss": 0.3086, + "step": 1371 + }, + { + "epoch": 0.5066469719350074, + "grad_norm": 0.43557608127593994, + "learning_rate": 0.00016632590220470503, + "loss": 0.3785, + "step": 1372 + }, + { + "epoch": 0.5070162481536189, + "grad_norm": 0.27842384576797485, + "learning_rate": 0.00016630126862914152, + "loss": 0.2983, + "step": 1373 + }, + { + "epoch": 0.5073855243722304, + "grad_norm": 0.2520846426486969, + "learning_rate": 0.00016627663505357803, + "loss": 0.2765, + "step": 1374 + }, + { + "epoch": 0.507754800590842, + "grad_norm": 0.2655579149723053, + "learning_rate": 0.00016625200147801455, + "loss": 0.3274, + "step": 1375 + }, + { + "epoch": 0.5081240768094535, + "grad_norm": 0.2557450532913208, + "learning_rate": 0.00016622736790245106, + "loss": 0.3128, + "step": 1376 + }, + { + "epoch": 0.508493353028065, + "grad_norm": 0.33803847432136536, + "learning_rate": 0.00016620273432688755, + "loss": 0.3637, + "step": 1377 + }, + { + "epoch": 0.5088626292466765, + "grad_norm": 0.26982131600379944, + "learning_rate": 0.00016617810075132407, + "loss": 0.2702, + "step": 1378 + }, + { + "epoch": 0.5092319054652881, + "grad_norm": 0.2546871304512024, + "learning_rate": 0.00016615346717576058, + "loss": 0.3006, + "step": 1379 + }, + { + "epoch": 0.5096011816838996, + "grad_norm": 0.29123106598854065, + "learning_rate": 0.0001661288336001971, + "loss": 0.3127, + "step": 1380 + }, + { + "epoch": 0.509970457902511, + "grad_norm": 0.2689873278141022, + "learning_rate": 0.00016610420002463358, + "loss": 0.3355, + "step": 1381 + }, + { + "epoch": 0.5103397341211225, + "grad_norm": 0.24545596539974213, + "learning_rate": 0.0001660795664490701, + "loss": 0.2783, + "step": 1382 + }, + { + "epoch": 0.5107090103397341, + "grad_norm": 0.2998077869415283, + "learning_rate": 0.00016605493287350658, + "loss": 0.3234, + "step": 1383 + }, + { + "epoch": 0.5110782865583456, + "grad_norm": 0.2883042097091675, + "learning_rate": 0.00016603029929794313, + "loss": 0.3669, + "step": 1384 + }, + { + "epoch": 0.5114475627769571, + "grad_norm": 0.26299551129341125, + "learning_rate": 0.00016600566572237961, + "loss": 0.3222, + "step": 1385 + }, + { + "epoch": 0.5118168389955687, + "grad_norm": 0.18907879292964935, + "learning_rate": 0.00016598103214681613, + "loss": 0.2374, + "step": 1386 + }, + { + "epoch": 0.5121861152141802, + "grad_norm": 0.2673446536064148, + "learning_rate": 0.00016595639857125262, + "loss": 0.3083, + "step": 1387 + }, + { + "epoch": 0.5125553914327917, + "grad_norm": 0.22434107959270477, + "learning_rate": 0.00016593176499568913, + "loss": 0.2698, + "step": 1388 + }, + { + "epoch": 0.5129246676514032, + "grad_norm": 0.3749803900718689, + "learning_rate": 0.00016590713142012565, + "loss": 0.3189, + "step": 1389 + }, + { + "epoch": 0.5132939438700148, + "grad_norm": 0.26632341742515564, + "learning_rate": 0.00016588249784456216, + "loss": 0.3021, + "step": 1390 + }, + { + "epoch": 0.5136632200886263, + "grad_norm": 0.2152630239725113, + "learning_rate": 0.00016585786426899865, + "loss": 0.2645, + "step": 1391 + }, + { + "epoch": 0.5140324963072378, + "grad_norm": 0.21173277497291565, + "learning_rate": 0.00016583323069343516, + "loss": 0.258, + "step": 1392 + }, + { + "epoch": 0.5144017725258493, + "grad_norm": 0.29508668184280396, + "learning_rate": 0.00016580859711787168, + "loss": 0.3911, + "step": 1393 + }, + { + "epoch": 0.5147710487444609, + "grad_norm": 0.2779857814311981, + "learning_rate": 0.0001657839635423082, + "loss": 0.2846, + "step": 1394 + }, + { + "epoch": 0.5151403249630724, + "grad_norm": 0.29273658990859985, + "learning_rate": 0.00016575932996674468, + "loss": 0.2948, + "step": 1395 + }, + { + "epoch": 0.5155096011816839, + "grad_norm": 0.22535477578639984, + "learning_rate": 0.0001657346963911812, + "loss": 0.2356, + "step": 1396 + }, + { + "epoch": 0.5158788774002954, + "grad_norm": 0.2937314808368683, + "learning_rate": 0.00016571006281561768, + "loss": 0.2979, + "step": 1397 + }, + { + "epoch": 0.516248153618907, + "grad_norm": 0.2740054726600647, + "learning_rate": 0.0001656854292400542, + "loss": 0.3457, + "step": 1398 + }, + { + "epoch": 0.5166174298375185, + "grad_norm": 0.25646868348121643, + "learning_rate": 0.0001656607956644907, + "loss": 0.3055, + "step": 1399 + }, + { + "epoch": 0.51698670605613, + "grad_norm": 0.31118524074554443, + "learning_rate": 0.0001656361620889272, + "loss": 0.2801, + "step": 1400 + }, + { + "epoch": 0.51698670605613, + "eval_loss": 7.630523204803467, + "eval_runtime": 7.1442, + "eval_samples_per_second": 6.999, + "eval_steps_per_second": 0.98, + "step": 1400 + }, + { + "epoch": 0.5173559822747416, + "grad_norm": 0.25323861837387085, + "learning_rate": 0.0001656115285133637, + "loss": 0.2247, + "step": 1401 + }, + { + "epoch": 0.517725258493353, + "grad_norm": 0.27337586879730225, + "learning_rate": 0.00016558689493780023, + "loss": 0.3012, + "step": 1402 + }, + { + "epoch": 0.5180945347119645, + "grad_norm": 0.29918622970581055, + "learning_rate": 0.00016556226136223674, + "loss": 0.3654, + "step": 1403 + }, + { + "epoch": 0.518463810930576, + "grad_norm": 0.28812846541404724, + "learning_rate": 0.00016553762778667323, + "loss": 0.2936, + "step": 1404 + }, + { + "epoch": 0.5188330871491876, + "grad_norm": 0.28900471329689026, + "learning_rate": 0.00016551299421110974, + "loss": 0.3158, + "step": 1405 + }, + { + "epoch": 0.5192023633677991, + "grad_norm": 0.3591167628765106, + "learning_rate": 0.00016548836063554626, + "loss": 0.3306, + "step": 1406 + }, + { + "epoch": 0.5195716395864106, + "grad_norm": 0.217594176530838, + "learning_rate": 0.00016546372705998277, + "loss": 0.2415, + "step": 1407 + }, + { + "epoch": 0.5199409158050221, + "grad_norm": 0.2799479365348816, + "learning_rate": 0.00016543909348441926, + "loss": 0.3172, + "step": 1408 + }, + { + "epoch": 0.5203101920236337, + "grad_norm": 0.2554279565811157, + "learning_rate": 0.00016541445990885578, + "loss": 0.2626, + "step": 1409 + }, + { + "epoch": 0.5206794682422452, + "grad_norm": 0.32111266255378723, + "learning_rate": 0.00016538982633329226, + "loss": 0.355, + "step": 1410 + }, + { + "epoch": 0.5210487444608567, + "grad_norm": 0.23784315586090088, + "learning_rate": 0.0001653651927577288, + "loss": 0.2723, + "step": 1411 + }, + { + "epoch": 0.5214180206794683, + "grad_norm": 0.3516959547996521, + "learning_rate": 0.0001653405591821653, + "loss": 0.3849, + "step": 1412 + }, + { + "epoch": 0.5217872968980798, + "grad_norm": 0.2798328697681427, + "learning_rate": 0.0001653159256066018, + "loss": 0.3199, + "step": 1413 + }, + { + "epoch": 0.5221565731166913, + "grad_norm": 0.31100359559059143, + "learning_rate": 0.0001652912920310383, + "loss": 0.3359, + "step": 1414 + }, + { + "epoch": 0.5225258493353028, + "grad_norm": 0.33902430534362793, + "learning_rate": 0.0001652666584554748, + "loss": 0.3947, + "step": 1415 + }, + { + "epoch": 0.5228951255539144, + "grad_norm": 0.28545519709587097, + "learning_rate": 0.00016524202487991132, + "loss": 0.3378, + "step": 1416 + }, + { + "epoch": 0.5232644017725259, + "grad_norm": 0.2658042311668396, + "learning_rate": 0.00016521739130434784, + "loss": 0.2703, + "step": 1417 + }, + { + "epoch": 0.5236336779911374, + "grad_norm": 0.25140413641929626, + "learning_rate": 0.00016519275772878433, + "loss": 0.3354, + "step": 1418 + }, + { + "epoch": 0.5240029542097489, + "grad_norm": 0.2862764596939087, + "learning_rate": 0.00016516812415322084, + "loss": 0.3194, + "step": 1419 + }, + { + "epoch": 0.5243722304283605, + "grad_norm": 0.25336530804634094, + "learning_rate": 0.00016514349057765736, + "loss": 0.2817, + "step": 1420 + }, + { + "epoch": 0.524741506646972, + "grad_norm": 0.224054217338562, + "learning_rate": 0.00016511885700209387, + "loss": 0.2857, + "step": 1421 + }, + { + "epoch": 0.5251107828655834, + "grad_norm": 0.2458321750164032, + "learning_rate": 0.00016509422342653036, + "loss": 0.3282, + "step": 1422 + }, + { + "epoch": 0.5254800590841949, + "grad_norm": 0.32071107625961304, + "learning_rate": 0.00016506958985096687, + "loss": 0.2915, + "step": 1423 + }, + { + "epoch": 0.5258493353028065, + "grad_norm": 0.26842883229255676, + "learning_rate": 0.00016504495627540336, + "loss": 0.3211, + "step": 1424 + }, + { + "epoch": 0.526218611521418, + "grad_norm": 0.24505074322223663, + "learning_rate": 0.0001650203226998399, + "loss": 0.2817, + "step": 1425 + }, + { + "epoch": 0.5265878877400295, + "grad_norm": 0.22624437510967255, + "learning_rate": 0.0001649956891242764, + "loss": 0.2614, + "step": 1426 + }, + { + "epoch": 0.5269571639586411, + "grad_norm": 0.23325221240520477, + "learning_rate": 0.0001649710555487129, + "loss": 0.2975, + "step": 1427 + }, + { + "epoch": 0.5273264401772526, + "grad_norm": 0.2249109148979187, + "learning_rate": 0.0001649464219731494, + "loss": 0.2915, + "step": 1428 + }, + { + "epoch": 0.5276957163958641, + "grad_norm": 0.25846782326698303, + "learning_rate": 0.0001649217883975859, + "loss": 0.3215, + "step": 1429 + }, + { + "epoch": 0.5280649926144756, + "grad_norm": 0.27339687943458557, + "learning_rate": 0.00016489715482202242, + "loss": 0.293, + "step": 1430 + }, + { + "epoch": 0.5284342688330872, + "grad_norm": 0.2816579043865204, + "learning_rate": 0.00016487252124645894, + "loss": 0.3071, + "step": 1431 + }, + { + "epoch": 0.5288035450516987, + "grad_norm": 0.31145527958869934, + "learning_rate": 0.00016484788767089542, + "loss": 0.3224, + "step": 1432 + }, + { + "epoch": 0.5291728212703102, + "grad_norm": 0.2712482213973999, + "learning_rate": 0.00016482325409533194, + "loss": 0.3486, + "step": 1433 + }, + { + "epoch": 0.5295420974889217, + "grad_norm": 0.2737232744693756, + "learning_rate": 0.00016479862051976845, + "loss": 0.2881, + "step": 1434 + }, + { + "epoch": 0.5299113737075333, + "grad_norm": 0.28899168968200684, + "learning_rate": 0.00016477398694420497, + "loss": 0.405, + "step": 1435 + }, + { + "epoch": 0.5302806499261448, + "grad_norm": 0.22227564454078674, + "learning_rate": 0.00016474935336864145, + "loss": 0.2282, + "step": 1436 + }, + { + "epoch": 0.5306499261447563, + "grad_norm": 0.2697335183620453, + "learning_rate": 0.00016472471979307797, + "loss": 0.335, + "step": 1437 + }, + { + "epoch": 0.5310192023633677, + "grad_norm": 0.2573060095310211, + "learning_rate": 0.00016470008621751448, + "loss": 0.3003, + "step": 1438 + }, + { + "epoch": 0.5313884785819794, + "grad_norm": 0.2943517565727234, + "learning_rate": 0.000164675452641951, + "loss": 0.3621, + "step": 1439 + }, + { + "epoch": 0.5317577548005908, + "grad_norm": 0.27101999521255493, + "learning_rate": 0.00016465081906638749, + "loss": 0.329, + "step": 1440 + }, + { + "epoch": 0.5321270310192023, + "grad_norm": 0.28076571226119995, + "learning_rate": 0.000164626185490824, + "loss": 0.3078, + "step": 1441 + }, + { + "epoch": 0.5324963072378139, + "grad_norm": 0.2468804568052292, + "learning_rate": 0.0001646015519152605, + "loss": 0.3182, + "step": 1442 + }, + { + "epoch": 0.5328655834564254, + "grad_norm": 0.2434876263141632, + "learning_rate": 0.00016457691833969703, + "loss": 0.3101, + "step": 1443 + }, + { + "epoch": 0.5332348596750369, + "grad_norm": 0.40741223096847534, + "learning_rate": 0.00016455228476413352, + "loss": 0.3806, + "step": 1444 + }, + { + "epoch": 0.5336041358936484, + "grad_norm": 0.23966705799102783, + "learning_rate": 0.00016452765118857003, + "loss": 0.3279, + "step": 1445 + }, + { + "epoch": 0.53397341211226, + "grad_norm": 0.24010653793811798, + "learning_rate": 0.00016450301761300652, + "loss": 0.3505, + "step": 1446 + }, + { + "epoch": 0.5343426883308715, + "grad_norm": 0.2255295217037201, + "learning_rate": 0.00016447838403744303, + "loss": 0.2629, + "step": 1447 + }, + { + "epoch": 0.534711964549483, + "grad_norm": 0.2610571086406708, + "learning_rate": 0.00016445375046187955, + "loss": 0.3171, + "step": 1448 + }, + { + "epoch": 0.5350812407680945, + "grad_norm": 0.26715385913848877, + "learning_rate": 0.00016442911688631606, + "loss": 0.29, + "step": 1449 + }, + { + "epoch": 0.5354505169867061, + "grad_norm": 0.234164759516716, + "learning_rate": 0.00016440448331075255, + "loss": 0.304, + "step": 1450 + }, + { + "epoch": 0.5354505169867061, + "eval_loss": 7.701912879943848, + "eval_runtime": 6.9519, + "eval_samples_per_second": 7.192, + "eval_steps_per_second": 1.007, + "step": 1450 + }, + { + "epoch": 0.5358197932053176, + "grad_norm": 0.2456762194633484, + "learning_rate": 0.00016437984973518907, + "loss": 0.3061, + "step": 1451 + }, + { + "epoch": 0.5361890694239291, + "grad_norm": 0.305461049079895, + "learning_rate": 0.00016435521615962558, + "loss": 0.3871, + "step": 1452 + }, + { + "epoch": 0.5365583456425406, + "grad_norm": 0.2389150708913803, + "learning_rate": 0.0001643305825840621, + "loss": 0.2685, + "step": 1453 + }, + { + "epoch": 0.5369276218611522, + "grad_norm": 0.27864402532577515, + "learning_rate": 0.00016430594900849858, + "loss": 0.2708, + "step": 1454 + }, + { + "epoch": 0.5372968980797637, + "grad_norm": 0.22496536374092102, + "learning_rate": 0.0001642813154329351, + "loss": 0.2803, + "step": 1455 + }, + { + "epoch": 0.5376661742983752, + "grad_norm": 0.276080459356308, + "learning_rate": 0.00016425668185737158, + "loss": 0.301, + "step": 1456 + }, + { + "epoch": 0.5380354505169868, + "grad_norm": 0.27176433801651, + "learning_rate": 0.00016423204828180813, + "loss": 0.3324, + "step": 1457 + }, + { + "epoch": 0.5384047267355982, + "grad_norm": 0.27836737036705017, + "learning_rate": 0.00016420741470624461, + "loss": 0.3494, + "step": 1458 + }, + { + "epoch": 0.5387740029542097, + "grad_norm": 0.35179686546325684, + "learning_rate": 0.00016418278113068113, + "loss": 0.3527, + "step": 1459 + }, + { + "epoch": 0.5391432791728212, + "grad_norm": 0.2354753315448761, + "learning_rate": 0.00016415814755511762, + "loss": 0.2813, + "step": 1460 + }, + { + "epoch": 0.5395125553914328, + "grad_norm": 0.314908891916275, + "learning_rate": 0.00016413351397955413, + "loss": 0.3097, + "step": 1461 + }, + { + "epoch": 0.5398818316100443, + "grad_norm": 0.20056059956550598, + "learning_rate": 0.00016410888040399065, + "loss": 0.2323, + "step": 1462 + }, + { + "epoch": 0.5402511078286558, + "grad_norm": 0.34253835678100586, + "learning_rate": 0.00016408424682842716, + "loss": 0.3779, + "step": 1463 + }, + { + "epoch": 0.5406203840472673, + "grad_norm": 0.24463196098804474, + "learning_rate": 0.00016405961325286365, + "loss": 0.2747, + "step": 1464 + }, + { + "epoch": 0.5409896602658789, + "grad_norm": 0.31938499212265015, + "learning_rate": 0.00016403497967730016, + "loss": 0.3504, + "step": 1465 + }, + { + "epoch": 0.5413589364844904, + "grad_norm": 0.2998482882976532, + "learning_rate": 0.00016401034610173668, + "loss": 0.3043, + "step": 1466 + }, + { + "epoch": 0.5417282127031019, + "grad_norm": 0.2368236780166626, + "learning_rate": 0.0001639857125261732, + "loss": 0.2681, + "step": 1467 + }, + { + "epoch": 0.5420974889217134, + "grad_norm": 0.26847004890441895, + "learning_rate": 0.00016396107895060968, + "loss": 0.2902, + "step": 1468 + }, + { + "epoch": 0.542466765140325, + "grad_norm": 0.22659504413604736, + "learning_rate": 0.0001639364453750462, + "loss": 0.262, + "step": 1469 + }, + { + "epoch": 0.5428360413589365, + "grad_norm": 0.24348299205303192, + "learning_rate": 0.0001639118117994827, + "loss": 0.3225, + "step": 1470 + }, + { + "epoch": 0.543205317577548, + "grad_norm": 0.24891531467437744, + "learning_rate": 0.00016388717822391922, + "loss": 0.2754, + "step": 1471 + }, + { + "epoch": 0.5435745937961596, + "grad_norm": 0.24874362349510193, + "learning_rate": 0.0001638625446483557, + "loss": 0.2692, + "step": 1472 + }, + { + "epoch": 0.5439438700147711, + "grad_norm": 0.33797869086265564, + "learning_rate": 0.00016383791107279222, + "loss": 0.3725, + "step": 1473 + }, + { + "epoch": 0.5443131462333826, + "grad_norm": 0.26838093996047974, + "learning_rate": 0.0001638132774972287, + "loss": 0.3148, + "step": 1474 + }, + { + "epoch": 0.544682422451994, + "grad_norm": 0.26082298159599304, + "learning_rate": 0.00016378864392166525, + "loss": 0.3255, + "step": 1475 + }, + { + "epoch": 0.5450516986706057, + "grad_norm": 0.3179328739643097, + "learning_rate": 0.00016376401034610174, + "loss": 0.4443, + "step": 1476 + }, + { + "epoch": 0.5454209748892171, + "grad_norm": 0.24166199564933777, + "learning_rate": 0.00016373937677053826, + "loss": 0.2579, + "step": 1477 + }, + { + "epoch": 0.5457902511078286, + "grad_norm": 0.27253440022468567, + "learning_rate": 0.00016371474319497474, + "loss": 0.3323, + "step": 1478 + }, + { + "epoch": 0.5461595273264401, + "grad_norm": 0.2972589433193207, + "learning_rate": 0.00016369010961941126, + "loss": 0.3995, + "step": 1479 + }, + { + "epoch": 0.5465288035450517, + "grad_norm": 0.29314813017845154, + "learning_rate": 0.00016366547604384777, + "loss": 0.3511, + "step": 1480 + }, + { + "epoch": 0.5468980797636632, + "grad_norm": 0.34684857726097107, + "learning_rate": 0.0001636408424682843, + "loss": 0.3263, + "step": 1481 + }, + { + "epoch": 0.5472673559822747, + "grad_norm": 0.2844546139240265, + "learning_rate": 0.00016361620889272078, + "loss": 0.3029, + "step": 1482 + }, + { + "epoch": 0.5476366322008862, + "grad_norm": 0.22748489677906036, + "learning_rate": 0.0001635915753171573, + "loss": 0.2925, + "step": 1483 + }, + { + "epoch": 0.5480059084194978, + "grad_norm": 0.37643754482269287, + "learning_rate": 0.0001635669417415938, + "loss": 0.3789, + "step": 1484 + }, + { + "epoch": 0.5483751846381093, + "grad_norm": 0.2671006917953491, + "learning_rate": 0.00016354230816603032, + "loss": 0.3162, + "step": 1485 + }, + { + "epoch": 0.5487444608567208, + "grad_norm": 0.21860508620738983, + "learning_rate": 0.0001635176745904668, + "loss": 0.3222, + "step": 1486 + }, + { + "epoch": 0.5491137370753324, + "grad_norm": 0.25474813580513, + "learning_rate": 0.00016349304101490332, + "loss": 0.3523, + "step": 1487 + }, + { + "epoch": 0.5494830132939439, + "grad_norm": 0.2932925522327423, + "learning_rate": 0.0001634684074393398, + "loss": 0.3841, + "step": 1488 + }, + { + "epoch": 0.5498522895125554, + "grad_norm": 0.3073911964893341, + "learning_rate": 0.00016344377386377635, + "loss": 0.3135, + "step": 1489 + }, + { + "epoch": 0.5502215657311669, + "grad_norm": 0.26409900188446045, + "learning_rate": 0.00016341914028821284, + "loss": 0.2958, + "step": 1490 + }, + { + "epoch": 0.5505908419497785, + "grad_norm": 0.30042722821235657, + "learning_rate": 0.00016339450671264935, + "loss": 0.3642, + "step": 1491 + }, + { + "epoch": 0.55096011816839, + "grad_norm": 0.23854683339595795, + "learning_rate": 0.00016336987313708584, + "loss": 0.2773, + "step": 1492 + }, + { + "epoch": 0.5513293943870015, + "grad_norm": 0.24511685967445374, + "learning_rate": 0.00016334523956152236, + "loss": 0.2742, + "step": 1493 + }, + { + "epoch": 0.551698670605613, + "grad_norm": 0.2707538604736328, + "learning_rate": 0.00016332060598595887, + "loss": 0.3189, + "step": 1494 + }, + { + "epoch": 0.5520679468242246, + "grad_norm": 0.2984696328639984, + "learning_rate": 0.00016329597241039538, + "loss": 0.3296, + "step": 1495 + }, + { + "epoch": 0.552437223042836, + "grad_norm": 0.27879297733306885, + "learning_rate": 0.00016327133883483187, + "loss": 0.3541, + "step": 1496 + }, + { + "epoch": 0.5528064992614475, + "grad_norm": 0.30997321009635925, + "learning_rate": 0.0001632467052592684, + "loss": 0.3105, + "step": 1497 + }, + { + "epoch": 0.553175775480059, + "grad_norm": 0.20996759831905365, + "learning_rate": 0.0001632220716837049, + "loss": 0.2851, + "step": 1498 + }, + { + "epoch": 0.5535450516986706, + "grad_norm": 0.3022238314151764, + "learning_rate": 0.00016319743810814142, + "loss": 0.3536, + "step": 1499 + }, + { + "epoch": 0.5539143279172821, + "grad_norm": 0.23744630813598633, + "learning_rate": 0.0001631728045325779, + "loss": 0.2845, + "step": 1500 + }, + { + "epoch": 0.5539143279172821, + "eval_loss": 7.584290981292725, + "eval_runtime": 7.2209, + "eval_samples_per_second": 6.924, + "eval_steps_per_second": 0.969, + "step": 1500 + }, + { + "epoch": 0.5542836041358936, + "grad_norm": 0.25254857540130615, + "learning_rate": 0.00016314817095701442, + "loss": 0.2986, + "step": 1501 + }, + { + "epoch": 0.5546528803545052, + "grad_norm": 0.28746384382247925, + "learning_rate": 0.00016312353738145093, + "loss": 0.3605, + "step": 1502 + }, + { + "epoch": 0.5550221565731167, + "grad_norm": 0.24880008399486542, + "learning_rate": 0.00016309890380588745, + "loss": 0.3112, + "step": 1503 + }, + { + "epoch": 0.5553914327917282, + "grad_norm": 0.24837081134319305, + "learning_rate": 0.00016307427023032393, + "loss": 0.3164, + "step": 1504 + }, + { + "epoch": 0.5557607090103397, + "grad_norm": 0.2272336632013321, + "learning_rate": 0.00016304963665476045, + "loss": 0.2693, + "step": 1505 + }, + { + "epoch": 0.5561299852289513, + "grad_norm": 0.308358371257782, + "learning_rate": 0.00016302500307919694, + "loss": 0.3081, + "step": 1506 + }, + { + "epoch": 0.5564992614475628, + "grad_norm": 0.2964613437652588, + "learning_rate": 0.00016300036950363348, + "loss": 0.3034, + "step": 1507 + }, + { + "epoch": 0.5568685376661743, + "grad_norm": 0.25197067856788635, + "learning_rate": 0.00016297573592806997, + "loss": 0.3219, + "step": 1508 + }, + { + "epoch": 0.5572378138847858, + "grad_norm": 0.22385966777801514, + "learning_rate": 0.00016295110235250648, + "loss": 0.2852, + "step": 1509 + }, + { + "epoch": 0.5576070901033974, + "grad_norm": 0.30940085649490356, + "learning_rate": 0.00016292646877694297, + "loss": 0.3938, + "step": 1510 + }, + { + "epoch": 0.5579763663220089, + "grad_norm": 0.3045515716075897, + "learning_rate": 0.00016290183520137948, + "loss": 0.3998, + "step": 1511 + }, + { + "epoch": 0.5583456425406204, + "grad_norm": 0.2571142315864563, + "learning_rate": 0.000162877201625816, + "loss": 0.3032, + "step": 1512 + }, + { + "epoch": 0.558714918759232, + "grad_norm": 0.2966245412826538, + "learning_rate": 0.0001628525680502525, + "loss": 0.3265, + "step": 1513 + }, + { + "epoch": 0.5590841949778435, + "grad_norm": 0.3071720600128174, + "learning_rate": 0.000162827934474689, + "loss": 0.3125, + "step": 1514 + }, + { + "epoch": 0.5594534711964549, + "grad_norm": 0.26687899231910706, + "learning_rate": 0.00016280330089912551, + "loss": 0.3205, + "step": 1515 + }, + { + "epoch": 0.5598227474150664, + "grad_norm": 0.3195459246635437, + "learning_rate": 0.00016277866732356203, + "loss": 0.3185, + "step": 1516 + }, + { + "epoch": 0.560192023633678, + "grad_norm": 0.25656718015670776, + "learning_rate": 0.00016275403374799854, + "loss": 0.3054, + "step": 1517 + }, + { + "epoch": 0.5605612998522895, + "grad_norm": 0.2774629592895508, + "learning_rate": 0.00016272940017243503, + "loss": 0.349, + "step": 1518 + }, + { + "epoch": 0.560930576070901, + "grad_norm": 0.31901049613952637, + "learning_rate": 0.00016270476659687155, + "loss": 0.3302, + "step": 1519 + }, + { + "epoch": 0.5612998522895125, + "grad_norm": 0.2996314764022827, + "learning_rate": 0.00016268013302130803, + "loss": 0.3756, + "step": 1520 + }, + { + "epoch": 0.5616691285081241, + "grad_norm": 0.25130292773246765, + "learning_rate": 0.00016265549944574458, + "loss": 0.2793, + "step": 1521 + }, + { + "epoch": 0.5620384047267356, + "grad_norm": 0.22529804706573486, + "learning_rate": 0.00016263086587018106, + "loss": 0.2827, + "step": 1522 + }, + { + "epoch": 0.5624076809453471, + "grad_norm": 0.2514728903770447, + "learning_rate": 0.00016260623229461758, + "loss": 0.3101, + "step": 1523 + }, + { + "epoch": 0.5627769571639586, + "grad_norm": 0.2736479938030243, + "learning_rate": 0.00016258159871905406, + "loss": 0.3247, + "step": 1524 + }, + { + "epoch": 0.5631462333825702, + "grad_norm": 0.2479754388332367, + "learning_rate": 0.00016255696514349058, + "loss": 0.3041, + "step": 1525 + }, + { + "epoch": 0.5635155096011817, + "grad_norm": 0.29206737875938416, + "learning_rate": 0.0001625323315679271, + "loss": 0.3814, + "step": 1526 + }, + { + "epoch": 0.5638847858197932, + "grad_norm": 0.2652639150619507, + "learning_rate": 0.0001625076979923636, + "loss": 0.3101, + "step": 1527 + }, + { + "epoch": 0.5642540620384048, + "grad_norm": 0.2730245888233185, + "learning_rate": 0.0001624830644168001, + "loss": 0.3134, + "step": 1528 + }, + { + "epoch": 0.5646233382570163, + "grad_norm": 0.27171245217323303, + "learning_rate": 0.0001624584308412366, + "loss": 0.3354, + "step": 1529 + }, + { + "epoch": 0.5649926144756278, + "grad_norm": 0.2858797013759613, + "learning_rate": 0.00016243379726567313, + "loss": 0.3172, + "step": 1530 + }, + { + "epoch": 0.5653618906942393, + "grad_norm": 0.31379011273384094, + "learning_rate": 0.00016240916369010964, + "loss": 0.2986, + "step": 1531 + }, + { + "epoch": 0.5657311669128509, + "grad_norm": 0.2694772779941559, + "learning_rate": 0.00016238453011454613, + "loss": 0.3502, + "step": 1532 + }, + { + "epoch": 0.5661004431314623, + "grad_norm": 0.26037195324897766, + "learning_rate": 0.00016235989653898264, + "loss": 0.3147, + "step": 1533 + }, + { + "epoch": 0.5664697193500738, + "grad_norm": 0.24806667864322662, + "learning_rate": 0.00016233526296341913, + "loss": 0.3207, + "step": 1534 + }, + { + "epoch": 0.5668389955686853, + "grad_norm": 0.2972624897956848, + "learning_rate": 0.00016231062938785567, + "loss": 0.3217, + "step": 1535 + }, + { + "epoch": 0.5672082717872969, + "grad_norm": 0.3553450405597687, + "learning_rate": 0.00016228599581229216, + "loss": 0.3584, + "step": 1536 + }, + { + "epoch": 0.5675775480059084, + "grad_norm": 0.28341180086135864, + "learning_rate": 0.00016226136223672867, + "loss": 0.2427, + "step": 1537 + }, + { + "epoch": 0.5679468242245199, + "grad_norm": 0.26211366057395935, + "learning_rate": 0.00016223672866116516, + "loss": 0.3499, + "step": 1538 + }, + { + "epoch": 0.5683161004431314, + "grad_norm": 0.31121179461479187, + "learning_rate": 0.0001622120950856017, + "loss": 0.3725, + "step": 1539 + }, + { + "epoch": 0.568685376661743, + "grad_norm": 0.4344642460346222, + "learning_rate": 0.0001621874615100382, + "loss": 0.326, + "step": 1540 + }, + { + "epoch": 0.5690546528803545, + "grad_norm": 0.29905596375465393, + "learning_rate": 0.0001621628279344747, + "loss": 0.293, + "step": 1541 + }, + { + "epoch": 0.569423929098966, + "grad_norm": 0.29579880833625793, + "learning_rate": 0.0001621381943589112, + "loss": 0.3516, + "step": 1542 + }, + { + "epoch": 0.5697932053175776, + "grad_norm": 0.2735360264778137, + "learning_rate": 0.0001621135607833477, + "loss": 0.2921, + "step": 1543 + }, + { + "epoch": 0.5701624815361891, + "grad_norm": 0.25425660610198975, + "learning_rate": 0.00016208892720778422, + "loss": 0.2833, + "step": 1544 + }, + { + "epoch": 0.5705317577548006, + "grad_norm": 0.2781763970851898, + "learning_rate": 0.00016206429363222074, + "loss": 0.3237, + "step": 1545 + }, + { + "epoch": 0.5709010339734121, + "grad_norm": 0.2914276421070099, + "learning_rate": 0.00016203966005665722, + "loss": 0.2976, + "step": 1546 + }, + { + "epoch": 0.5712703101920237, + "grad_norm": 0.24221573770046234, + "learning_rate": 0.00016201502648109374, + "loss": 0.2981, + "step": 1547 + }, + { + "epoch": 0.5716395864106352, + "grad_norm": 0.25619933009147644, + "learning_rate": 0.00016199039290553025, + "loss": 0.2642, + "step": 1548 + }, + { + "epoch": 0.5720088626292467, + "grad_norm": 0.22263039648532867, + "learning_rate": 0.00016196575932996677, + "loss": 0.2634, + "step": 1549 + }, + { + "epoch": 0.5723781388478582, + "grad_norm": 0.24372972548007965, + "learning_rate": 0.00016194112575440326, + "loss": 0.2598, + "step": 1550 + }, + { + "epoch": 0.5723781388478582, + "eval_loss": 7.577460765838623, + "eval_runtime": 6.9609, + "eval_samples_per_second": 7.183, + "eval_steps_per_second": 1.006, + "step": 1550 + }, + { + "epoch": 0.5727474150664698, + "grad_norm": 0.3471795916557312, + "learning_rate": 0.00016191649217883977, + "loss": 0.3954, + "step": 1551 + }, + { + "epoch": 0.5731166912850812, + "grad_norm": 0.30651941895484924, + "learning_rate": 0.00016189185860327626, + "loss": 0.3109, + "step": 1552 + }, + { + "epoch": 0.5734859675036927, + "grad_norm": 0.2919563949108124, + "learning_rate": 0.0001618672250277128, + "loss": 0.3407, + "step": 1553 + }, + { + "epoch": 0.5738552437223042, + "grad_norm": 0.27599087357521057, + "learning_rate": 0.0001618425914521493, + "loss": 0.3454, + "step": 1554 + }, + { + "epoch": 0.5742245199409158, + "grad_norm": 0.2686636447906494, + "learning_rate": 0.0001618179578765858, + "loss": 0.2549, + "step": 1555 + }, + { + "epoch": 0.5745937961595273, + "grad_norm": 0.28241464495658875, + "learning_rate": 0.0001617933243010223, + "loss": 0.3041, + "step": 1556 + }, + { + "epoch": 0.5749630723781388, + "grad_norm": 0.2569475769996643, + "learning_rate": 0.0001617686907254588, + "loss": 0.2424, + "step": 1557 + }, + { + "epoch": 0.5753323485967504, + "grad_norm": 0.27386388182640076, + "learning_rate": 0.00016174405714989532, + "loss": 0.2694, + "step": 1558 + }, + { + "epoch": 0.5757016248153619, + "grad_norm": 0.31360316276550293, + "learning_rate": 0.00016171942357433183, + "loss": 0.407, + "step": 1559 + }, + { + "epoch": 0.5760709010339734, + "grad_norm": 0.2870629131793976, + "learning_rate": 0.00016169478999876832, + "loss": 0.3341, + "step": 1560 + }, + { + "epoch": 0.5764401772525849, + "grad_norm": 0.3812291920185089, + "learning_rate": 0.00016167015642320484, + "loss": 0.3468, + "step": 1561 + }, + { + "epoch": 0.5768094534711965, + "grad_norm": 0.3150885999202728, + "learning_rate": 0.00016164552284764135, + "loss": 0.3751, + "step": 1562 + }, + { + "epoch": 0.577178729689808, + "grad_norm": 0.24707911908626556, + "learning_rate": 0.00016162088927207786, + "loss": 0.3401, + "step": 1563 + }, + { + "epoch": 0.5775480059084195, + "grad_norm": 0.29155275225639343, + "learning_rate": 0.00016159625569651435, + "loss": 0.3009, + "step": 1564 + }, + { + "epoch": 0.577917282127031, + "grad_norm": 0.25705769658088684, + "learning_rate": 0.00016157162212095087, + "loss": 0.3462, + "step": 1565 + }, + { + "epoch": 0.5782865583456426, + "grad_norm": 0.30340880155563354, + "learning_rate": 0.00016154698854538735, + "loss": 0.3675, + "step": 1566 + }, + { + "epoch": 0.5786558345642541, + "grad_norm": 0.253224641084671, + "learning_rate": 0.0001615223549698239, + "loss": 0.3403, + "step": 1567 + }, + { + "epoch": 0.5790251107828656, + "grad_norm": 0.28197672963142395, + "learning_rate": 0.00016149772139426038, + "loss": 0.344, + "step": 1568 + }, + { + "epoch": 0.579394387001477, + "grad_norm": 0.24765484035015106, + "learning_rate": 0.0001614730878186969, + "loss": 0.2801, + "step": 1569 + }, + { + "epoch": 0.5797636632200887, + "grad_norm": 0.2447032928466797, + "learning_rate": 0.00016144845424313339, + "loss": 0.3265, + "step": 1570 + }, + { + "epoch": 0.5801329394387001, + "grad_norm": 0.2501904368400574, + "learning_rate": 0.0001614238206675699, + "loss": 0.3256, + "step": 1571 + }, + { + "epoch": 0.5805022156573116, + "grad_norm": 0.31298375129699707, + "learning_rate": 0.00016139918709200642, + "loss": 0.3564, + "step": 1572 + }, + { + "epoch": 0.5808714918759232, + "grad_norm": 0.2798749506473541, + "learning_rate": 0.00016137455351644293, + "loss": 0.3501, + "step": 1573 + }, + { + "epoch": 0.5812407680945347, + "grad_norm": 0.4332028031349182, + "learning_rate": 0.00016134991994087942, + "loss": 0.2997, + "step": 1574 + }, + { + "epoch": 0.5816100443131462, + "grad_norm": 0.3094131350517273, + "learning_rate": 0.00016132528636531593, + "loss": 0.4367, + "step": 1575 + }, + { + "epoch": 0.5819793205317577, + "grad_norm": 0.3054462969303131, + "learning_rate": 0.00016130065278975245, + "loss": 0.3358, + "step": 1576 + }, + { + "epoch": 0.5823485967503693, + "grad_norm": 0.2707670331001282, + "learning_rate": 0.00016127601921418896, + "loss": 0.3029, + "step": 1577 + }, + { + "epoch": 0.5827178729689808, + "grad_norm": 0.28120335936546326, + "learning_rate": 0.00016125138563862545, + "loss": 0.4153, + "step": 1578 + }, + { + "epoch": 0.5830871491875923, + "grad_norm": 0.3150355815887451, + "learning_rate": 0.00016122675206306196, + "loss": 0.3302, + "step": 1579 + }, + { + "epoch": 0.5834564254062038, + "grad_norm": 0.25576528906822205, + "learning_rate": 0.00016120211848749848, + "loss": 0.3051, + "step": 1580 + }, + { + "epoch": 0.5838257016248154, + "grad_norm": 0.25515562295913696, + "learning_rate": 0.000161177484911935, + "loss": 0.2969, + "step": 1581 + }, + { + "epoch": 0.5841949778434269, + "grad_norm": 0.2506538927555084, + "learning_rate": 0.00016115285133637148, + "loss": 0.3224, + "step": 1582 + }, + { + "epoch": 0.5845642540620384, + "grad_norm": 0.26098111271858215, + "learning_rate": 0.000161128217760808, + "loss": 0.2768, + "step": 1583 + }, + { + "epoch": 0.5849335302806499, + "grad_norm": 0.36777541041374207, + "learning_rate": 0.00016110358418524448, + "loss": 0.3458, + "step": 1584 + }, + { + "epoch": 0.5853028064992615, + "grad_norm": 0.22332556545734406, + "learning_rate": 0.00016107895060968102, + "loss": 0.2818, + "step": 1585 + }, + { + "epoch": 0.585672082717873, + "grad_norm": 0.25412026047706604, + "learning_rate": 0.0001610543170341175, + "loss": 0.2891, + "step": 1586 + }, + { + "epoch": 0.5860413589364845, + "grad_norm": 0.2506200969219208, + "learning_rate": 0.00016102968345855403, + "loss": 0.3109, + "step": 1587 + }, + { + "epoch": 0.5864106351550961, + "grad_norm": 0.27151158452033997, + "learning_rate": 0.00016100504988299051, + "loss": 0.3396, + "step": 1588 + }, + { + "epoch": 0.5867799113737076, + "grad_norm": 0.249680295586586, + "learning_rate": 0.00016098041630742703, + "loss": 0.2661, + "step": 1589 + }, + { + "epoch": 0.587149187592319, + "grad_norm": 0.22947055101394653, + "learning_rate": 0.00016095578273186354, + "loss": 0.2872, + "step": 1590 + }, + { + "epoch": 0.5875184638109305, + "grad_norm": 0.2988998293876648, + "learning_rate": 0.00016093114915630006, + "loss": 0.2945, + "step": 1591 + }, + { + "epoch": 0.5878877400295421, + "grad_norm": 0.21638168394565582, + "learning_rate": 0.00016090651558073655, + "loss": 0.2489, + "step": 1592 + }, + { + "epoch": 0.5882570162481536, + "grad_norm": 0.25234416127204895, + "learning_rate": 0.00016088188200517306, + "loss": 0.3016, + "step": 1593 + }, + { + "epoch": 0.5886262924667651, + "grad_norm": 0.30782338976860046, + "learning_rate": 0.00016085724842960957, + "loss": 0.369, + "step": 1594 + }, + { + "epoch": 0.5889955686853766, + "grad_norm": 0.3398365080356598, + "learning_rate": 0.0001608326148540461, + "loss": 0.3804, + "step": 1595 + }, + { + "epoch": 0.5893648449039882, + "grad_norm": 0.2947826385498047, + "learning_rate": 0.00016080798127848258, + "loss": 0.2909, + "step": 1596 + }, + { + "epoch": 0.5897341211225997, + "grad_norm": 0.2904197573661804, + "learning_rate": 0.0001607833477029191, + "loss": 0.2965, + "step": 1597 + }, + { + "epoch": 0.5901033973412112, + "grad_norm": 0.25692451000213623, + "learning_rate": 0.00016075871412735558, + "loss": 0.2643, + "step": 1598 + }, + { + "epoch": 0.5904726735598228, + "grad_norm": 0.2462122142314911, + "learning_rate": 0.00016073408055179212, + "loss": 0.2687, + "step": 1599 + }, + { + "epoch": 0.5908419497784343, + "grad_norm": 0.2902405858039856, + "learning_rate": 0.0001607094469762286, + "loss": 0.3225, + "step": 1600 + }, + { + "epoch": 0.5908419497784343, + "eval_loss": 7.725745677947998, + "eval_runtime": 6.9093, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 1.013, + "step": 1600 + }, + { + "epoch": 0.5912112259970458, + "grad_norm": 0.28028663992881775, + "learning_rate": 0.00016068481340066512, + "loss": 0.3112, + "step": 1601 + }, + { + "epoch": 0.5915805022156573, + "grad_norm": 0.28242892026901245, + "learning_rate": 0.0001606601798251016, + "loss": 0.3653, + "step": 1602 + }, + { + "epoch": 0.5919497784342689, + "grad_norm": 0.3082188367843628, + "learning_rate": 0.00016063554624953813, + "loss": 0.3598, + "step": 1603 + }, + { + "epoch": 0.5923190546528804, + "grad_norm": 0.2695845365524292, + "learning_rate": 0.00016061091267397464, + "loss": 0.3038, + "step": 1604 + }, + { + "epoch": 0.5926883308714919, + "grad_norm": 0.22577038407325745, + "learning_rate": 0.00016058627909841115, + "loss": 0.2611, + "step": 1605 + }, + { + "epoch": 0.5930576070901034, + "grad_norm": 0.266353577375412, + "learning_rate": 0.00016056164552284764, + "loss": 0.2783, + "step": 1606 + }, + { + "epoch": 0.593426883308715, + "grad_norm": 0.22079575061798096, + "learning_rate": 0.00016053701194728416, + "loss": 0.2678, + "step": 1607 + }, + { + "epoch": 0.5937961595273265, + "grad_norm": 0.3183462619781494, + "learning_rate": 0.00016051237837172067, + "loss": 0.3863, + "step": 1608 + }, + { + "epoch": 0.5941654357459379, + "grad_norm": 0.3241240680217743, + "learning_rate": 0.00016048774479615719, + "loss": 0.3921, + "step": 1609 + }, + { + "epoch": 0.5945347119645494, + "grad_norm": 0.23680950701236725, + "learning_rate": 0.00016046311122059367, + "loss": 0.2652, + "step": 1610 + }, + { + "epoch": 0.594903988183161, + "grad_norm": 0.3423929512500763, + "learning_rate": 0.0001604384776450302, + "loss": 0.2955, + "step": 1611 + }, + { + "epoch": 0.5952732644017725, + "grad_norm": 0.27786555886268616, + "learning_rate": 0.0001604138440694667, + "loss": 0.3342, + "step": 1612 + }, + { + "epoch": 0.595642540620384, + "grad_norm": 0.24552151560783386, + "learning_rate": 0.00016038921049390322, + "loss": 0.2197, + "step": 1613 + }, + { + "epoch": 0.5960118168389956, + "grad_norm": 0.28928321599960327, + "learning_rate": 0.0001603645769183397, + "loss": 0.3443, + "step": 1614 + }, + { + "epoch": 0.5963810930576071, + "grad_norm": 0.2790432274341583, + "learning_rate": 0.00016033994334277622, + "loss": 0.3423, + "step": 1615 + }, + { + "epoch": 0.5967503692762186, + "grad_norm": 0.24902157485485077, + "learning_rate": 0.0001603153097672127, + "loss": 0.2578, + "step": 1616 + }, + { + "epoch": 0.5971196454948301, + "grad_norm": 0.30040276050567627, + "learning_rate": 0.00016029067619164925, + "loss": 0.3296, + "step": 1617 + }, + { + "epoch": 0.5974889217134417, + "grad_norm": 0.39967480301856995, + "learning_rate": 0.00016026604261608574, + "loss": 0.3878, + "step": 1618 + }, + { + "epoch": 0.5978581979320532, + "grad_norm": 0.2774800658226013, + "learning_rate": 0.00016024140904052225, + "loss": 0.3351, + "step": 1619 + }, + { + "epoch": 0.5982274741506647, + "grad_norm": 0.29975268244743347, + "learning_rate": 0.00016021677546495874, + "loss": 0.2985, + "step": 1620 + }, + { + "epoch": 0.5985967503692762, + "grad_norm": 0.2890987992286682, + "learning_rate": 0.00016019214188939525, + "loss": 0.3485, + "step": 1621 + }, + { + "epoch": 0.5989660265878878, + "grad_norm": 0.2500419020652771, + "learning_rate": 0.00016016750831383177, + "loss": 0.2974, + "step": 1622 + }, + { + "epoch": 0.5993353028064993, + "grad_norm": 0.29374146461486816, + "learning_rate": 0.00016014287473826828, + "loss": 0.3147, + "step": 1623 + }, + { + "epoch": 0.5997045790251108, + "grad_norm": 0.27955377101898193, + "learning_rate": 0.00016011824116270477, + "loss": 0.3526, + "step": 1624 + }, + { + "epoch": 0.6000738552437223, + "grad_norm": 0.24037732183933258, + "learning_rate": 0.00016009360758714128, + "loss": 0.2908, + "step": 1625 + }, + { + "epoch": 0.6004431314623339, + "grad_norm": 0.3245445787906647, + "learning_rate": 0.0001600689740115778, + "loss": 0.3156, + "step": 1626 + }, + { + "epoch": 0.6008124076809453, + "grad_norm": 0.2321922332048416, + "learning_rate": 0.00016004434043601431, + "loss": 0.3069, + "step": 1627 + }, + { + "epoch": 0.6011816838995568, + "grad_norm": 0.24052996933460236, + "learning_rate": 0.0001600197068604508, + "loss": 0.276, + "step": 1628 + }, + { + "epoch": 0.6015509601181684, + "grad_norm": 0.3128090798854828, + "learning_rate": 0.0001599950732848873, + "loss": 0.305, + "step": 1629 + }, + { + "epoch": 0.6019202363367799, + "grad_norm": 0.2942683696746826, + "learning_rate": 0.0001599704397093238, + "loss": 0.3878, + "step": 1630 + }, + { + "epoch": 0.6022895125553914, + "grad_norm": 0.26211774349212646, + "learning_rate": 0.00015994580613376032, + "loss": 0.3189, + "step": 1631 + }, + { + "epoch": 0.6026587887740029, + "grad_norm": 0.2302979677915573, + "learning_rate": 0.00015992117255819683, + "loss": 0.308, + "step": 1632 + }, + { + "epoch": 0.6030280649926145, + "grad_norm": 0.3037758767604828, + "learning_rate": 0.00015989653898263332, + "loss": 0.3291, + "step": 1633 + }, + { + "epoch": 0.603397341211226, + "grad_norm": 0.27956220507621765, + "learning_rate": 0.00015987190540706984, + "loss": 0.3041, + "step": 1634 + }, + { + "epoch": 0.6037666174298375, + "grad_norm": 0.32024672627449036, + "learning_rate": 0.00015984727183150635, + "loss": 0.3534, + "step": 1635 + }, + { + "epoch": 0.604135893648449, + "grad_norm": 0.28189992904663086, + "learning_rate": 0.00015982263825594286, + "loss": 0.3031, + "step": 1636 + }, + { + "epoch": 0.6045051698670606, + "grad_norm": 0.28293630480766296, + "learning_rate": 0.00015979800468037935, + "loss": 0.3481, + "step": 1637 + }, + { + "epoch": 0.6048744460856721, + "grad_norm": 0.2572139501571655, + "learning_rate": 0.00015977337110481587, + "loss": 0.3264, + "step": 1638 + }, + { + "epoch": 0.6052437223042836, + "grad_norm": 0.31883084774017334, + "learning_rate": 0.00015974873752925238, + "loss": 0.3567, + "step": 1639 + }, + { + "epoch": 0.6056129985228951, + "grad_norm": 0.2630270719528198, + "learning_rate": 0.0001597241039536889, + "loss": 0.3067, + "step": 1640 + }, + { + "epoch": 0.6059822747415067, + "grad_norm": 0.26433902978897095, + "learning_rate": 0.00015969947037812538, + "loss": 0.2705, + "step": 1641 + }, + { + "epoch": 0.6063515509601182, + "grad_norm": 0.24868634343147278, + "learning_rate": 0.0001596748368025619, + "loss": 0.2459, + "step": 1642 + }, + { + "epoch": 0.6067208271787297, + "grad_norm": 0.23801346123218536, + "learning_rate": 0.00015965020322699839, + "loss": 0.2754, + "step": 1643 + }, + { + "epoch": 0.6070901033973413, + "grad_norm": 0.3024817109107971, + "learning_rate": 0.00015962556965143493, + "loss": 0.3128, + "step": 1644 + }, + { + "epoch": 0.6074593796159528, + "grad_norm": 0.36035439372062683, + "learning_rate": 0.00015960093607587141, + "loss": 0.394, + "step": 1645 + }, + { + "epoch": 0.6078286558345642, + "grad_norm": 0.2865716218948364, + "learning_rate": 0.00015957630250030793, + "loss": 0.2864, + "step": 1646 + }, + { + "epoch": 0.6081979320531757, + "grad_norm": 0.26630154252052307, + "learning_rate": 0.00015955166892474442, + "loss": 0.2804, + "step": 1647 + }, + { + "epoch": 0.6085672082717873, + "grad_norm": 0.2386620044708252, + "learning_rate": 0.00015952703534918093, + "loss": 0.2886, + "step": 1648 + }, + { + "epoch": 0.6089364844903988, + "grad_norm": 0.2616189122200012, + "learning_rate": 0.00015950240177361745, + "loss": 0.3118, + "step": 1649 + }, + { + "epoch": 0.6093057607090103, + "grad_norm": 0.39563339948654175, + "learning_rate": 0.00015947776819805396, + "loss": 0.3295, + "step": 1650 + }, + { + "epoch": 0.6093057607090103, + "eval_loss": 7.679358959197998, + "eval_runtime": 6.8992, + "eval_samples_per_second": 7.247, + "eval_steps_per_second": 1.015, + "step": 1650 + }, + { + "epoch": 0.6096750369276218, + "grad_norm": 0.2697533667087555, + "learning_rate": 0.00015945313462249045, + "loss": 0.3141, + "step": 1651 + }, + { + "epoch": 0.6100443131462334, + "grad_norm": 0.26666852831840515, + "learning_rate": 0.00015942850104692696, + "loss": 0.2939, + "step": 1652 + }, + { + "epoch": 0.6104135893648449, + "grad_norm": 0.2730031907558441, + "learning_rate": 0.00015940386747136348, + "loss": 0.3265, + "step": 1653 + }, + { + "epoch": 0.6107828655834564, + "grad_norm": 0.32101598381996155, + "learning_rate": 0.0001593792338958, + "loss": 0.3363, + "step": 1654 + }, + { + "epoch": 0.6111521418020679, + "grad_norm": 0.2991955280303955, + "learning_rate": 0.00015935460032023648, + "loss": 0.3828, + "step": 1655 + }, + { + "epoch": 0.6115214180206795, + "grad_norm": 0.2822822630405426, + "learning_rate": 0.000159329966744673, + "loss": 0.3249, + "step": 1656 + }, + { + "epoch": 0.611890694239291, + "grad_norm": 0.37978971004486084, + "learning_rate": 0.00015930533316910948, + "loss": 0.4379, + "step": 1657 + }, + { + "epoch": 0.6122599704579025, + "grad_norm": 0.34900692105293274, + "learning_rate": 0.00015928069959354602, + "loss": 0.306, + "step": 1658 + }, + { + "epoch": 0.6126292466765141, + "grad_norm": 0.27820655703544617, + "learning_rate": 0.0001592560660179825, + "loss": 0.3656, + "step": 1659 + }, + { + "epoch": 0.6129985228951256, + "grad_norm": 0.24872632324695587, + "learning_rate": 0.00015923143244241903, + "loss": 0.3069, + "step": 1660 + }, + { + "epoch": 0.6133677991137371, + "grad_norm": 0.2411656379699707, + "learning_rate": 0.0001592067988668555, + "loss": 0.2904, + "step": 1661 + }, + { + "epoch": 0.6137370753323486, + "grad_norm": 0.21913909912109375, + "learning_rate": 0.00015918216529129203, + "loss": 0.2627, + "step": 1662 + }, + { + "epoch": 0.6141063515509602, + "grad_norm": 0.26816707849502563, + "learning_rate": 0.00015915753171572854, + "loss": 0.3545, + "step": 1663 + }, + { + "epoch": 0.6144756277695717, + "grad_norm": 0.24315379559993744, + "learning_rate": 0.00015913289814016506, + "loss": 0.2705, + "step": 1664 + }, + { + "epoch": 0.6148449039881831, + "grad_norm": 0.23667661845684052, + "learning_rate": 0.00015910826456460154, + "loss": 0.2961, + "step": 1665 + }, + { + "epoch": 0.6152141802067946, + "grad_norm": 0.23767057061195374, + "learning_rate": 0.00015908363098903806, + "loss": 0.2797, + "step": 1666 + }, + { + "epoch": 0.6155834564254062, + "grad_norm": 0.25796979665756226, + "learning_rate": 0.00015905899741347457, + "loss": 0.3263, + "step": 1667 + }, + { + "epoch": 0.6159527326440177, + "grad_norm": 0.26543229818344116, + "learning_rate": 0.0001590343638379111, + "loss": 0.3181, + "step": 1668 + }, + { + "epoch": 0.6163220088626292, + "grad_norm": 0.2296472191810608, + "learning_rate": 0.00015900973026234758, + "loss": 0.2685, + "step": 1669 + }, + { + "epoch": 0.6166912850812407, + "grad_norm": 0.23241066932678223, + "learning_rate": 0.0001589850966867841, + "loss": 0.2973, + "step": 1670 + }, + { + "epoch": 0.6170605612998523, + "grad_norm": 0.2954165041446686, + "learning_rate": 0.0001589604631112206, + "loss": 0.3634, + "step": 1671 + }, + { + "epoch": 0.6174298375184638, + "grad_norm": 0.2365238070487976, + "learning_rate": 0.00015893582953565712, + "loss": 0.3054, + "step": 1672 + }, + { + "epoch": 0.6177991137370753, + "grad_norm": 0.24412816762924194, + "learning_rate": 0.0001589111959600936, + "loss": 0.2947, + "step": 1673 + }, + { + "epoch": 0.6181683899556869, + "grad_norm": 0.25636664032936096, + "learning_rate": 0.00015888656238453012, + "loss": 0.2834, + "step": 1674 + }, + { + "epoch": 0.6185376661742984, + "grad_norm": 0.26511502265930176, + "learning_rate": 0.0001588619288089666, + "loss": 0.3177, + "step": 1675 + }, + { + "epoch": 0.6189069423929099, + "grad_norm": 0.24035991728305817, + "learning_rate": 0.00015883729523340315, + "loss": 0.2643, + "step": 1676 + }, + { + "epoch": 0.6192762186115214, + "grad_norm": 0.29163262248039246, + "learning_rate": 0.00015881266165783964, + "loss": 0.326, + "step": 1677 + }, + { + "epoch": 0.619645494830133, + "grad_norm": 0.2584598958492279, + "learning_rate": 0.00015878802808227615, + "loss": 0.2878, + "step": 1678 + }, + { + "epoch": 0.6200147710487445, + "grad_norm": 0.2711755931377411, + "learning_rate": 0.00015876339450671264, + "loss": 0.32, + "step": 1679 + }, + { + "epoch": 0.620384047267356, + "grad_norm": 0.25643742084503174, + "learning_rate": 0.00015873876093114916, + "loss": 0.3047, + "step": 1680 + }, + { + "epoch": 0.6207533234859675, + "grad_norm": 0.26823490858078003, + "learning_rate": 0.00015871412735558567, + "loss": 0.2823, + "step": 1681 + }, + { + "epoch": 0.6211225997045791, + "grad_norm": 0.2102544605731964, + "learning_rate": 0.00015868949378002219, + "loss": 0.2125, + "step": 1682 + }, + { + "epoch": 0.6214918759231906, + "grad_norm": 0.26037541031837463, + "learning_rate": 0.00015866486020445867, + "loss": 0.2749, + "step": 1683 + }, + { + "epoch": 0.621861152141802, + "grad_norm": 0.25930219888687134, + "learning_rate": 0.0001586402266288952, + "loss": 0.2868, + "step": 1684 + }, + { + "epoch": 0.6222304283604135, + "grad_norm": 0.19488012790679932, + "learning_rate": 0.0001586155930533317, + "loss": 0.2056, + "step": 1685 + }, + { + "epoch": 0.6225997045790251, + "grad_norm": 0.3012772798538208, + "learning_rate": 0.00015859095947776822, + "loss": 0.3411, + "step": 1686 + }, + { + "epoch": 0.6229689807976366, + "grad_norm": 0.2384556084871292, + "learning_rate": 0.0001585663259022047, + "loss": 0.3161, + "step": 1687 + }, + { + "epoch": 0.6233382570162481, + "grad_norm": 0.2903664708137512, + "learning_rate": 0.00015854169232664122, + "loss": 0.3017, + "step": 1688 + }, + { + "epoch": 0.6237075332348597, + "grad_norm": 0.274304062128067, + "learning_rate": 0.0001585170587510777, + "loss": 0.3225, + "step": 1689 + }, + { + "epoch": 0.6240768094534712, + "grad_norm": 0.2844051420688629, + "learning_rate": 0.00015849242517551425, + "loss": 0.338, + "step": 1690 + }, + { + "epoch": 0.6244460856720827, + "grad_norm": 0.3104688227176666, + "learning_rate": 0.00015846779159995074, + "loss": 0.3159, + "step": 1691 + }, + { + "epoch": 0.6248153618906942, + "grad_norm": 0.2754247784614563, + "learning_rate": 0.00015844315802438725, + "loss": 0.3268, + "step": 1692 + }, + { + "epoch": 0.6251846381093058, + "grad_norm": 0.26543259620666504, + "learning_rate": 0.00015841852444882374, + "loss": 0.3151, + "step": 1693 + }, + { + "epoch": 0.6255539143279173, + "grad_norm": 0.255329966545105, + "learning_rate": 0.00015839389087326025, + "loss": 0.3317, + "step": 1694 + }, + { + "epoch": 0.6259231905465288, + "grad_norm": 0.2582108676433563, + "learning_rate": 0.00015836925729769677, + "loss": 0.276, + "step": 1695 + }, + { + "epoch": 0.6262924667651403, + "grad_norm": 0.290130078792572, + "learning_rate": 0.00015834462372213328, + "loss": 0.3395, + "step": 1696 + }, + { + "epoch": 0.6266617429837519, + "grad_norm": 0.306317001581192, + "learning_rate": 0.00015831999014656977, + "loss": 0.4085, + "step": 1697 + }, + { + "epoch": 0.6270310192023634, + "grad_norm": 0.23540732264518738, + "learning_rate": 0.00015829535657100628, + "loss": 0.2819, + "step": 1698 + }, + { + "epoch": 0.6274002954209749, + "grad_norm": 0.24510380625724792, + "learning_rate": 0.0001582707229954428, + "loss": 0.2969, + "step": 1699 + }, + { + "epoch": 0.6277695716395865, + "grad_norm": 0.27169322967529297, + "learning_rate": 0.0001582460894198793, + "loss": 0.2972, + "step": 1700 + }, + { + "epoch": 0.6277695716395865, + "eval_loss": 7.810720920562744, + "eval_runtime": 6.9777, + "eval_samples_per_second": 7.166, + "eval_steps_per_second": 1.003, + "step": 1700 + }, + { + "epoch": 0.628138847858198, + "grad_norm": 0.2641982138156891, + "learning_rate": 0.0001582214558443158, + "loss": 0.3621, + "step": 1701 + }, + { + "epoch": 0.6285081240768094, + "grad_norm": 0.3042442500591278, + "learning_rate": 0.00015819682226875232, + "loss": 0.3457, + "step": 1702 + }, + { + "epoch": 0.6288774002954209, + "grad_norm": 0.2665266990661621, + "learning_rate": 0.0001581721886931888, + "loss": 0.3098, + "step": 1703 + }, + { + "epoch": 0.6292466765140325, + "grad_norm": 0.280387282371521, + "learning_rate": 0.00015814755511762534, + "loss": 0.3669, + "step": 1704 + }, + { + "epoch": 0.629615952732644, + "grad_norm": 0.2697155177593231, + "learning_rate": 0.00015812292154206183, + "loss": 0.3213, + "step": 1705 + }, + { + "epoch": 0.6299852289512555, + "grad_norm": 0.26276448369026184, + "learning_rate": 0.00015809828796649835, + "loss": 0.3133, + "step": 1706 + }, + { + "epoch": 0.630354505169867, + "grad_norm": 0.27231404185295105, + "learning_rate": 0.00015807365439093483, + "loss": 0.3616, + "step": 1707 + }, + { + "epoch": 0.6307237813884786, + "grad_norm": 0.24662147462368011, + "learning_rate": 0.00015804902081537135, + "loss": 0.3452, + "step": 1708 + }, + { + "epoch": 0.6310930576070901, + "grad_norm": 0.33165276050567627, + "learning_rate": 0.00015802438723980786, + "loss": 0.3416, + "step": 1709 + }, + { + "epoch": 0.6314623338257016, + "grad_norm": 0.2354043424129486, + "learning_rate": 0.00015799975366424438, + "loss": 0.2267, + "step": 1710 + }, + { + "epoch": 0.6318316100443131, + "grad_norm": 0.2776791453361511, + "learning_rate": 0.00015797512008868087, + "loss": 0.3393, + "step": 1711 + }, + { + "epoch": 0.6322008862629247, + "grad_norm": 0.3800466060638428, + "learning_rate": 0.00015795048651311738, + "loss": 0.3819, + "step": 1712 + }, + { + "epoch": 0.6325701624815362, + "grad_norm": 0.2617228031158447, + "learning_rate": 0.0001579258529375539, + "loss": 0.3103, + "step": 1713 + }, + { + "epoch": 0.6329394387001477, + "grad_norm": 0.2480565905570984, + "learning_rate": 0.0001579012193619904, + "loss": 0.2875, + "step": 1714 + }, + { + "epoch": 0.6333087149187593, + "grad_norm": 0.28185197710990906, + "learning_rate": 0.0001578765857864269, + "loss": 0.317, + "step": 1715 + }, + { + "epoch": 0.6336779911373708, + "grad_norm": 0.26127704977989197, + "learning_rate": 0.0001578519522108634, + "loss": 0.2977, + "step": 1716 + }, + { + "epoch": 0.6340472673559823, + "grad_norm": 0.2643066644668579, + "learning_rate": 0.00015782731863529993, + "loss": 0.3278, + "step": 1717 + }, + { + "epoch": 0.6344165435745938, + "grad_norm": 0.22714141011238098, + "learning_rate": 0.00015780268505973644, + "loss": 0.2515, + "step": 1718 + }, + { + "epoch": 0.6347858197932054, + "grad_norm": 0.28244608640670776, + "learning_rate": 0.00015777805148417293, + "loss": 0.4071, + "step": 1719 + }, + { + "epoch": 0.6351550960118169, + "grad_norm": 0.288053423166275, + "learning_rate": 0.00015775341790860944, + "loss": 0.2916, + "step": 1720 + }, + { + "epoch": 0.6355243722304283, + "grad_norm": 0.2766154706478119, + "learning_rate": 0.00015772878433304593, + "loss": 0.275, + "step": 1721 + }, + { + "epoch": 0.6358936484490398, + "grad_norm": 0.22157230973243713, + "learning_rate": 0.00015770415075748247, + "loss": 0.2854, + "step": 1722 + }, + { + "epoch": 0.6362629246676514, + "grad_norm": 0.26390278339385986, + "learning_rate": 0.00015767951718191896, + "loss": 0.2573, + "step": 1723 + }, + { + "epoch": 0.6366322008862629, + "grad_norm": 0.27419012784957886, + "learning_rate": 0.00015765488360635547, + "loss": 0.2505, + "step": 1724 + }, + { + "epoch": 0.6370014771048744, + "grad_norm": 0.26276707649230957, + "learning_rate": 0.00015763025003079196, + "loss": 0.2617, + "step": 1725 + }, + { + "epoch": 0.6373707533234859, + "grad_norm": 0.27408209443092346, + "learning_rate": 0.00015760561645522848, + "loss": 0.3057, + "step": 1726 + }, + { + "epoch": 0.6377400295420975, + "grad_norm": 0.27617862820625305, + "learning_rate": 0.000157580982879665, + "loss": 0.2774, + "step": 1727 + }, + { + "epoch": 0.638109305760709, + "grad_norm": 0.24700725078582764, + "learning_rate": 0.0001575563493041015, + "loss": 0.2843, + "step": 1728 + }, + { + "epoch": 0.6384785819793205, + "grad_norm": 0.2565922439098358, + "learning_rate": 0.000157531715728538, + "loss": 0.2945, + "step": 1729 + }, + { + "epoch": 0.6388478581979321, + "grad_norm": 0.2334291636943817, + "learning_rate": 0.0001575070821529745, + "loss": 0.2492, + "step": 1730 + }, + { + "epoch": 0.6392171344165436, + "grad_norm": 0.2669878900051117, + "learning_rate": 0.00015748244857741102, + "loss": 0.291, + "step": 1731 + }, + { + "epoch": 0.6395864106351551, + "grad_norm": 0.22900605201721191, + "learning_rate": 0.00015745781500184754, + "loss": 0.3107, + "step": 1732 + }, + { + "epoch": 0.6399556868537666, + "grad_norm": 0.22954991459846497, + "learning_rate": 0.00015743318142628403, + "loss": 0.2906, + "step": 1733 + }, + { + "epoch": 0.6403249630723782, + "grad_norm": 0.26370540261268616, + "learning_rate": 0.00015740854785072054, + "loss": 0.3347, + "step": 1734 + }, + { + "epoch": 0.6406942392909897, + "grad_norm": 0.2963898181915283, + "learning_rate": 0.00015738391427515703, + "loss": 0.3506, + "step": 1735 + }, + { + "epoch": 0.6410635155096012, + "grad_norm": 0.2627878785133362, + "learning_rate": 0.00015735928069959357, + "loss": 0.2814, + "step": 1736 + }, + { + "epoch": 0.6414327917282127, + "grad_norm": 0.3097379505634308, + "learning_rate": 0.00015733464712403006, + "loss": 0.3201, + "step": 1737 + }, + { + "epoch": 0.6418020679468243, + "grad_norm": 0.2512352764606476, + "learning_rate": 0.00015731001354846657, + "loss": 0.3169, + "step": 1738 + }, + { + "epoch": 0.6421713441654358, + "grad_norm": 0.2890385687351227, + "learning_rate": 0.00015728537997290306, + "loss": 0.3851, + "step": 1739 + }, + { + "epoch": 0.6425406203840472, + "grad_norm": 0.27635079622268677, + "learning_rate": 0.00015726074639733957, + "loss": 0.2884, + "step": 1740 + }, + { + "epoch": 0.6429098966026587, + "grad_norm": 0.3102495074272156, + "learning_rate": 0.0001572361128217761, + "loss": 0.2637, + "step": 1741 + }, + { + "epoch": 0.6432791728212703, + "grad_norm": 0.25071051716804504, + "learning_rate": 0.0001572114792462126, + "loss": 0.2525, + "step": 1742 + }, + { + "epoch": 0.6436484490398818, + "grad_norm": 0.24957087635993958, + "learning_rate": 0.0001571868456706491, + "loss": 0.2798, + "step": 1743 + }, + { + "epoch": 0.6440177252584933, + "grad_norm": 0.21964524686336517, + "learning_rate": 0.0001571622120950856, + "loss": 0.2733, + "step": 1744 + }, + { + "epoch": 0.6443870014771049, + "grad_norm": 0.2593156397342682, + "learning_rate": 0.00015713757851952212, + "loss": 0.317, + "step": 1745 + }, + { + "epoch": 0.6447562776957164, + "grad_norm": 0.28958824276924133, + "learning_rate": 0.00015711294494395863, + "loss": 0.3376, + "step": 1746 + }, + { + "epoch": 0.6451255539143279, + "grad_norm": 0.27648741006851196, + "learning_rate": 0.00015708831136839512, + "loss": 0.3388, + "step": 1747 + }, + { + "epoch": 0.6454948301329394, + "grad_norm": 0.3194526731967926, + "learning_rate": 0.00015706367779283164, + "loss": 0.2858, + "step": 1748 + }, + { + "epoch": 0.645864106351551, + "grad_norm": 0.286007285118103, + "learning_rate": 0.00015703904421726815, + "loss": 0.2798, + "step": 1749 + }, + { + "epoch": 0.6462333825701625, + "grad_norm": 0.2699161767959595, + "learning_rate": 0.00015701441064170467, + "loss": 0.3103, + "step": 1750 + }, + { + "epoch": 0.6462333825701625, + "eval_loss": 7.696447372436523, + "eval_runtime": 6.9116, + "eval_samples_per_second": 7.234, + "eval_steps_per_second": 1.013, + "step": 1750 + }, + { + "epoch": 0.646602658788774, + "grad_norm": 0.25919443368911743, + "learning_rate": 0.00015698977706614115, + "loss": 0.2738, + "step": 1751 + }, + { + "epoch": 0.6469719350073855, + "grad_norm": 0.2967652976512909, + "learning_rate": 0.00015696514349057767, + "loss": 0.3301, + "step": 1752 + }, + { + "epoch": 0.6473412112259971, + "grad_norm": 0.2523612976074219, + "learning_rate": 0.00015694050991501416, + "loss": 0.2841, + "step": 1753 + }, + { + "epoch": 0.6477104874446086, + "grad_norm": 0.2625032365322113, + "learning_rate": 0.0001569158763394507, + "loss": 0.2361, + "step": 1754 + }, + { + "epoch": 0.6480797636632201, + "grad_norm": 0.2238159477710724, + "learning_rate": 0.00015689124276388718, + "loss": 0.2929, + "step": 1755 + }, + { + "epoch": 0.6484490398818316, + "grad_norm": 0.27077212929725647, + "learning_rate": 0.0001568666091883237, + "loss": 0.3009, + "step": 1756 + }, + { + "epoch": 0.6488183161004432, + "grad_norm": 0.2594720125198364, + "learning_rate": 0.0001568419756127602, + "loss": 0.2809, + "step": 1757 + }, + { + "epoch": 0.6491875923190547, + "grad_norm": 0.27531465888023376, + "learning_rate": 0.0001568173420371967, + "loss": 0.346, + "step": 1758 + }, + { + "epoch": 0.6495568685376661, + "grad_norm": 0.2527509927749634, + "learning_rate": 0.00015679270846163322, + "loss": 0.2993, + "step": 1759 + }, + { + "epoch": 0.6499261447562777, + "grad_norm": 0.26535680890083313, + "learning_rate": 0.00015676807488606973, + "loss": 0.2919, + "step": 1760 + }, + { + "epoch": 0.6502954209748892, + "grad_norm": 0.24140724539756775, + "learning_rate": 0.00015674344131050622, + "loss": 0.2407, + "step": 1761 + }, + { + "epoch": 0.6506646971935007, + "grad_norm": 0.2445399910211563, + "learning_rate": 0.00015671880773494273, + "loss": 0.2705, + "step": 1762 + }, + { + "epoch": 0.6510339734121122, + "grad_norm": 0.2483196258544922, + "learning_rate": 0.00015669417415937925, + "loss": 0.288, + "step": 1763 + }, + { + "epoch": 0.6514032496307238, + "grad_norm": 0.2521629333496094, + "learning_rate": 0.00015666954058381576, + "loss": 0.2672, + "step": 1764 + }, + { + "epoch": 0.6517725258493353, + "grad_norm": 0.2859017252922058, + "learning_rate": 0.00015664490700825225, + "loss": 0.309, + "step": 1765 + }, + { + "epoch": 0.6521418020679468, + "grad_norm": 0.438293993473053, + "learning_rate": 0.00015662027343268876, + "loss": 0.3031, + "step": 1766 + }, + { + "epoch": 0.6525110782865583, + "grad_norm": 0.3088909089565277, + "learning_rate": 0.00015659563985712525, + "loss": 0.3222, + "step": 1767 + }, + { + "epoch": 0.6528803545051699, + "grad_norm": 0.2559799253940582, + "learning_rate": 0.0001565710062815618, + "loss": 0.3217, + "step": 1768 + }, + { + "epoch": 0.6532496307237814, + "grad_norm": 0.2933116555213928, + "learning_rate": 0.00015654637270599828, + "loss": 0.3746, + "step": 1769 + }, + { + "epoch": 0.6536189069423929, + "grad_norm": 0.23837897181510925, + "learning_rate": 0.0001565217391304348, + "loss": 0.2463, + "step": 1770 + }, + { + "epoch": 0.6539881831610044, + "grad_norm": 0.27254509925842285, + "learning_rate": 0.00015649710555487128, + "loss": 0.3111, + "step": 1771 + }, + { + "epoch": 0.654357459379616, + "grad_norm": 0.27168703079223633, + "learning_rate": 0.0001564724719793078, + "loss": 0.3005, + "step": 1772 + }, + { + "epoch": 0.6547267355982275, + "grad_norm": 0.26153063774108887, + "learning_rate": 0.0001564478384037443, + "loss": 0.3392, + "step": 1773 + }, + { + "epoch": 0.655096011816839, + "grad_norm": 0.2863750159740448, + "learning_rate": 0.00015642320482818083, + "loss": 0.3396, + "step": 1774 + }, + { + "epoch": 0.6554652880354506, + "grad_norm": 0.27763259410858154, + "learning_rate": 0.00015639857125261731, + "loss": 0.3179, + "step": 1775 + }, + { + "epoch": 0.6558345642540621, + "grad_norm": 0.3287683129310608, + "learning_rate": 0.00015637393767705383, + "loss": 0.3262, + "step": 1776 + }, + { + "epoch": 0.6562038404726735, + "grad_norm": 0.26164475083351135, + "learning_rate": 0.00015634930410149034, + "loss": 0.2889, + "step": 1777 + }, + { + "epoch": 0.656573116691285, + "grad_norm": 0.27468714118003845, + "learning_rate": 0.00015632467052592686, + "loss": 0.2375, + "step": 1778 + }, + { + "epoch": 0.6569423929098966, + "grad_norm": 0.2969311773777008, + "learning_rate": 0.00015630003695036335, + "loss": 0.3131, + "step": 1779 + }, + { + "epoch": 0.6573116691285081, + "grad_norm": 0.27483540773391724, + "learning_rate": 0.00015627540337479986, + "loss": 0.3385, + "step": 1780 + }, + { + "epoch": 0.6576809453471196, + "grad_norm": 0.2782421410083771, + "learning_rate": 0.00015625076979923638, + "loss": 0.294, + "step": 1781 + }, + { + "epoch": 0.6580502215657311, + "grad_norm": 0.20360851287841797, + "learning_rate": 0.0001562261362236729, + "loss": 0.2229, + "step": 1782 + }, + { + "epoch": 0.6584194977843427, + "grad_norm": 0.30043482780456543, + "learning_rate": 0.00015620150264810938, + "loss": 0.3588, + "step": 1783 + }, + { + "epoch": 0.6587887740029542, + "grad_norm": 0.24038229882717133, + "learning_rate": 0.0001561768690725459, + "loss": 0.2515, + "step": 1784 + }, + { + "epoch": 0.6591580502215657, + "grad_norm": 0.3807814121246338, + "learning_rate": 0.00015615223549698238, + "loss": 0.3943, + "step": 1785 + }, + { + "epoch": 0.6595273264401772, + "grad_norm": 0.24281558394432068, + "learning_rate": 0.00015612760192141892, + "loss": 0.2655, + "step": 1786 + }, + { + "epoch": 0.6598966026587888, + "grad_norm": 0.2623486816883087, + "learning_rate": 0.0001561029683458554, + "loss": 0.2938, + "step": 1787 + }, + { + "epoch": 0.6602658788774003, + "grad_norm": 0.2717920243740082, + "learning_rate": 0.00015607833477029192, + "loss": 0.322, + "step": 1788 + }, + { + "epoch": 0.6606351550960118, + "grad_norm": 0.23475198447704315, + "learning_rate": 0.0001560537011947284, + "loss": 0.2536, + "step": 1789 + }, + { + "epoch": 0.6610044313146234, + "grad_norm": 0.27559682726860046, + "learning_rate": 0.00015602906761916493, + "loss": 0.2858, + "step": 1790 + }, + { + "epoch": 0.6613737075332349, + "grad_norm": 0.21105653047561646, + "learning_rate": 0.00015600443404360144, + "loss": 0.2316, + "step": 1791 + }, + { + "epoch": 0.6617429837518464, + "grad_norm": 0.27383849024772644, + "learning_rate": 0.00015597980046803796, + "loss": 0.3268, + "step": 1792 + }, + { + "epoch": 0.6621122599704579, + "grad_norm": 0.3014169931411743, + "learning_rate": 0.00015595516689247444, + "loss": 0.3186, + "step": 1793 + }, + { + "epoch": 0.6624815361890695, + "grad_norm": 0.25147905945777893, + "learning_rate": 0.00015593053331691096, + "loss": 0.2811, + "step": 1794 + }, + { + "epoch": 0.662850812407681, + "grad_norm": 0.22348275780677795, + "learning_rate": 0.00015590589974134747, + "loss": 0.2961, + "step": 1795 + }, + { + "epoch": 0.6632200886262924, + "grad_norm": 0.28043776750564575, + "learning_rate": 0.000155881266165784, + "loss": 0.3095, + "step": 1796 + }, + { + "epoch": 0.6635893648449039, + "grad_norm": 0.23538918793201447, + "learning_rate": 0.00015585663259022047, + "loss": 0.2687, + "step": 1797 + }, + { + "epoch": 0.6639586410635155, + "grad_norm": 0.2427215427160263, + "learning_rate": 0.000155831999014657, + "loss": 0.296, + "step": 1798 + }, + { + "epoch": 0.664327917282127, + "grad_norm": 0.2987094819545746, + "learning_rate": 0.00015580736543909348, + "loss": 0.3549, + "step": 1799 + }, + { + "epoch": 0.6646971935007385, + "grad_norm": 0.24537645280361176, + "learning_rate": 0.00015578273186353002, + "loss": 0.2612, + "step": 1800 + }, + { + "epoch": 0.6646971935007385, + "eval_loss": 7.6551289558410645, + "eval_runtime": 6.9285, + "eval_samples_per_second": 7.217, + "eval_steps_per_second": 1.01, + "step": 1800 + }, + { + "epoch": 0.6650664697193501, + "grad_norm": 0.2557753026485443, + "learning_rate": 0.0001557580982879665, + "loss": 0.3255, + "step": 1801 + }, + { + "epoch": 0.6654357459379616, + "grad_norm": 0.25785231590270996, + "learning_rate": 0.00015573346471240302, + "loss": 0.3149, + "step": 1802 + }, + { + "epoch": 0.6658050221565731, + "grad_norm": 0.2791072726249695, + "learning_rate": 0.0001557088311368395, + "loss": 0.3244, + "step": 1803 + }, + { + "epoch": 0.6661742983751846, + "grad_norm": 0.2409910410642624, + "learning_rate": 0.00015568419756127602, + "loss": 0.2987, + "step": 1804 + }, + { + "epoch": 0.6665435745937962, + "grad_norm": 0.2809560298919678, + "learning_rate": 0.00015565956398571254, + "loss": 0.3389, + "step": 1805 + }, + { + "epoch": 0.6669128508124077, + "grad_norm": 0.23236386477947235, + "learning_rate": 0.00015563493041014905, + "loss": 0.2797, + "step": 1806 + }, + { + "epoch": 0.6672821270310192, + "grad_norm": 0.2688330113887787, + "learning_rate": 0.00015561029683458554, + "loss": 0.3032, + "step": 1807 + }, + { + "epoch": 0.6676514032496307, + "grad_norm": 0.28646624088287354, + "learning_rate": 0.00015558566325902205, + "loss": 0.3359, + "step": 1808 + }, + { + "epoch": 0.6680206794682423, + "grad_norm": 0.22623562812805176, + "learning_rate": 0.00015556102968345857, + "loss": 0.2482, + "step": 1809 + }, + { + "epoch": 0.6683899556868538, + "grad_norm": 0.295561283826828, + "learning_rate": 0.00015553639610789508, + "loss": 0.3356, + "step": 1810 + }, + { + "epoch": 0.6687592319054653, + "grad_norm": 0.33545178174972534, + "learning_rate": 0.00015551176253233157, + "loss": 0.3779, + "step": 1811 + }, + { + "epoch": 0.6691285081240768, + "grad_norm": 0.3177444338798523, + "learning_rate": 0.00015548712895676809, + "loss": 0.3916, + "step": 1812 + }, + { + "epoch": 0.6694977843426884, + "grad_norm": 0.24064241349697113, + "learning_rate": 0.0001554624953812046, + "loss": 0.2888, + "step": 1813 + }, + { + "epoch": 0.6698670605612999, + "grad_norm": 0.2905915677547455, + "learning_rate": 0.00015543786180564111, + "loss": 0.3267, + "step": 1814 + }, + { + "epoch": 0.6702363367799113, + "grad_norm": 0.3154178559780121, + "learning_rate": 0.0001554132282300776, + "loss": 0.3112, + "step": 1815 + }, + { + "epoch": 0.670605612998523, + "grad_norm": 0.2267937809228897, + "learning_rate": 0.00015538859465451412, + "loss": 0.2232, + "step": 1816 + }, + { + "epoch": 0.6709748892171344, + "grad_norm": 0.2581126391887665, + "learning_rate": 0.0001553639610789506, + "loss": 0.2733, + "step": 1817 + }, + { + "epoch": 0.6713441654357459, + "grad_norm": 0.2659561336040497, + "learning_rate": 0.00015533932750338715, + "loss": 0.3261, + "step": 1818 + }, + { + "epoch": 0.6717134416543574, + "grad_norm": 0.2620655596256256, + "learning_rate": 0.00015531469392782363, + "loss": 0.2835, + "step": 1819 + }, + { + "epoch": 0.672082717872969, + "grad_norm": 0.2914316654205322, + "learning_rate": 0.00015529006035226015, + "loss": 0.2674, + "step": 1820 + }, + { + "epoch": 0.6724519940915805, + "grad_norm": 0.20766432583332062, + "learning_rate": 0.00015526542677669664, + "loss": 0.2444, + "step": 1821 + }, + { + "epoch": 0.672821270310192, + "grad_norm": 0.27259644865989685, + "learning_rate": 0.00015524079320113315, + "loss": 0.3258, + "step": 1822 + }, + { + "epoch": 0.6731905465288035, + "grad_norm": 0.2548483908176422, + "learning_rate": 0.00015521615962556967, + "loss": 0.2727, + "step": 1823 + }, + { + "epoch": 0.6735598227474151, + "grad_norm": 0.26370954513549805, + "learning_rate": 0.00015519152605000618, + "loss": 0.3016, + "step": 1824 + }, + { + "epoch": 0.6739290989660266, + "grad_norm": 0.31123995780944824, + "learning_rate": 0.00015516689247444267, + "loss": 0.3826, + "step": 1825 + }, + { + "epoch": 0.6742983751846381, + "grad_norm": 0.23884528875350952, + "learning_rate": 0.00015514225889887918, + "loss": 0.2662, + "step": 1826 + }, + { + "epoch": 0.6746676514032496, + "grad_norm": 0.2541447579860687, + "learning_rate": 0.0001551176253233157, + "loss": 0.332, + "step": 1827 + }, + { + "epoch": 0.6750369276218612, + "grad_norm": 0.327722430229187, + "learning_rate": 0.0001550929917477522, + "loss": 0.3716, + "step": 1828 + }, + { + "epoch": 0.6754062038404727, + "grad_norm": 0.36113235354423523, + "learning_rate": 0.0001550683581721887, + "loss": 0.3383, + "step": 1829 + }, + { + "epoch": 0.6757754800590842, + "grad_norm": 0.237142413854599, + "learning_rate": 0.00015504372459662521, + "loss": 0.2609, + "step": 1830 + }, + { + "epoch": 0.6761447562776958, + "grad_norm": 0.28000253438949585, + "learning_rate": 0.0001550190910210617, + "loss": 0.3386, + "step": 1831 + }, + { + "epoch": 0.6765140324963073, + "grad_norm": 0.29142963886260986, + "learning_rate": 0.00015499445744549824, + "loss": 0.3199, + "step": 1832 + }, + { + "epoch": 0.6768833087149188, + "grad_norm": 0.23117280006408691, + "learning_rate": 0.00015496982386993473, + "loss": 0.2953, + "step": 1833 + }, + { + "epoch": 0.6772525849335302, + "grad_norm": 0.30192527174949646, + "learning_rate": 0.00015494519029437125, + "loss": 0.35, + "step": 1834 + }, + { + "epoch": 0.6776218611521418, + "grad_norm": 0.21648409962654114, + "learning_rate": 0.00015492055671880773, + "loss": 0.2627, + "step": 1835 + }, + { + "epoch": 0.6779911373707533, + "grad_norm": 0.2667943239212036, + "learning_rate": 0.00015489592314324425, + "loss": 0.2751, + "step": 1836 + }, + { + "epoch": 0.6783604135893648, + "grad_norm": 0.24076758325099945, + "learning_rate": 0.00015487128956768076, + "loss": 0.2922, + "step": 1837 + }, + { + "epoch": 0.6787296898079763, + "grad_norm": 0.2872014045715332, + "learning_rate": 0.00015484665599211728, + "loss": 0.316, + "step": 1838 + }, + { + "epoch": 0.6790989660265879, + "grad_norm": 0.3662479519844055, + "learning_rate": 0.00015482202241655376, + "loss": 0.3519, + "step": 1839 + }, + { + "epoch": 0.6794682422451994, + "grad_norm": 0.2328277975320816, + "learning_rate": 0.00015479738884099028, + "loss": 0.2464, + "step": 1840 + }, + { + "epoch": 0.6798375184638109, + "grad_norm": 0.26773300766944885, + "learning_rate": 0.0001547727552654268, + "loss": 0.2887, + "step": 1841 + }, + { + "epoch": 0.6802067946824224, + "grad_norm": 0.25428494811058044, + "learning_rate": 0.0001547481216898633, + "loss": 0.3154, + "step": 1842 + }, + { + "epoch": 0.680576070901034, + "grad_norm": 0.26065242290496826, + "learning_rate": 0.0001547234881142998, + "loss": 0.3024, + "step": 1843 + }, + { + "epoch": 0.6809453471196455, + "grad_norm": 0.26216748356819153, + "learning_rate": 0.0001546988545387363, + "loss": 0.3627, + "step": 1844 + }, + { + "epoch": 0.681314623338257, + "grad_norm": 0.2938605844974518, + "learning_rate": 0.0001546742209631728, + "loss": 0.3385, + "step": 1845 + }, + { + "epoch": 0.6816838995568686, + "grad_norm": 0.26682284474372864, + "learning_rate": 0.00015464958738760934, + "loss": 0.3064, + "step": 1846 + }, + { + "epoch": 0.6820531757754801, + "grad_norm": 0.2826003134250641, + "learning_rate": 0.00015462495381204583, + "loss": 0.3037, + "step": 1847 + }, + { + "epoch": 0.6824224519940916, + "grad_norm": 0.30669429898262024, + "learning_rate": 0.00015460032023648234, + "loss": 0.3476, + "step": 1848 + }, + { + "epoch": 0.6827917282127031, + "grad_norm": 0.3237963616847992, + "learning_rate": 0.00015457568666091883, + "loss": 0.3374, + "step": 1849 + }, + { + "epoch": 0.6831610044313147, + "grad_norm": 0.3182128667831421, + "learning_rate": 0.00015455105308535537, + "loss": 0.314, + "step": 1850 + }, + { + "epoch": 0.6831610044313147, + "eval_loss": 7.930963039398193, + "eval_runtime": 7.3627, + "eval_samples_per_second": 6.791, + "eval_steps_per_second": 0.951, + "step": 1850 + }, + { + "epoch": 0.6835302806499262, + "grad_norm": 0.3203182816505432, + "learning_rate": 0.00015452641950979186, + "loss": 0.3334, + "step": 1851 + }, + { + "epoch": 0.6838995568685377, + "grad_norm": 0.29283472895622253, + "learning_rate": 0.00015450178593422837, + "loss": 0.3043, + "step": 1852 + }, + { + "epoch": 0.6842688330871491, + "grad_norm": 0.2428922951221466, + "learning_rate": 0.00015447715235866486, + "loss": 0.2704, + "step": 1853 + }, + { + "epoch": 0.6846381093057607, + "grad_norm": 0.21880777180194855, + "learning_rate": 0.00015445251878310138, + "loss": 0.2545, + "step": 1854 + }, + { + "epoch": 0.6850073855243722, + "grad_norm": 0.24377134442329407, + "learning_rate": 0.0001544278852075379, + "loss": 0.2423, + "step": 1855 + }, + { + "epoch": 0.6853766617429837, + "grad_norm": 0.2727199196815491, + "learning_rate": 0.0001544032516319744, + "loss": 0.2929, + "step": 1856 + }, + { + "epoch": 0.6857459379615952, + "grad_norm": 0.29561924934387207, + "learning_rate": 0.0001543786180564109, + "loss": 0.2936, + "step": 1857 + }, + { + "epoch": 0.6861152141802068, + "grad_norm": 0.3117547631263733, + "learning_rate": 0.0001543539844808474, + "loss": 0.3429, + "step": 1858 + }, + { + "epoch": 0.6864844903988183, + "grad_norm": 0.3964634835720062, + "learning_rate": 0.00015432935090528392, + "loss": 0.3367, + "step": 1859 + }, + { + "epoch": 0.6868537666174298, + "grad_norm": 0.26625800132751465, + "learning_rate": 0.00015430471732972044, + "loss": 0.2842, + "step": 1860 + }, + { + "epoch": 0.6872230428360414, + "grad_norm": 0.39090195298194885, + "learning_rate": 0.00015428008375415692, + "loss": 0.3909, + "step": 1861 + }, + { + "epoch": 0.6875923190546529, + "grad_norm": 0.3051859736442566, + "learning_rate": 0.0001542554501785934, + "loss": 0.3583, + "step": 1862 + }, + { + "epoch": 0.6879615952732644, + "grad_norm": 0.2676551342010498, + "learning_rate": 0.00015423081660302993, + "loss": 0.2992, + "step": 1863 + }, + { + "epoch": 0.6883308714918759, + "grad_norm": 0.24766811728477478, + "learning_rate": 0.00015420618302746644, + "loss": 0.2666, + "step": 1864 + }, + { + "epoch": 0.6887001477104875, + "grad_norm": 0.23327337205410004, + "learning_rate": 0.00015418154945190295, + "loss": 0.2751, + "step": 1865 + }, + { + "epoch": 0.689069423929099, + "grad_norm": 0.2999024987220764, + "learning_rate": 0.00015415691587633944, + "loss": 0.3483, + "step": 1866 + }, + { + "epoch": 0.6894387001477105, + "grad_norm": 0.2706744372844696, + "learning_rate": 0.00015413228230077596, + "loss": 0.2884, + "step": 1867 + }, + { + "epoch": 0.689807976366322, + "grad_norm": 0.29461121559143066, + "learning_rate": 0.00015410764872521247, + "loss": 0.2937, + "step": 1868 + }, + { + "epoch": 0.6901772525849336, + "grad_norm": 0.2835221290588379, + "learning_rate": 0.00015408301514964899, + "loss": 0.3166, + "step": 1869 + }, + { + "epoch": 0.6905465288035451, + "grad_norm": 0.24878744781017303, + "learning_rate": 0.00015405838157408547, + "loss": 0.2965, + "step": 1870 + }, + { + "epoch": 0.6909158050221565, + "grad_norm": 0.21990624070167542, + "learning_rate": 0.000154033747998522, + "loss": 0.2832, + "step": 1871 + }, + { + "epoch": 0.691285081240768, + "grad_norm": 0.26045215129852295, + "learning_rate": 0.00015400911442295848, + "loss": 0.268, + "step": 1872 + }, + { + "epoch": 0.6916543574593796, + "grad_norm": 0.26063981652259827, + "learning_rate": 0.00015398448084739502, + "loss": 0.2871, + "step": 1873 + }, + { + "epoch": 0.6920236336779911, + "grad_norm": 0.3178841769695282, + "learning_rate": 0.0001539598472718315, + "loss": 0.2983, + "step": 1874 + }, + { + "epoch": 0.6923929098966026, + "grad_norm": 0.29066354036331177, + "learning_rate": 0.00015393521369626802, + "loss": 0.2816, + "step": 1875 + }, + { + "epoch": 0.6927621861152142, + "grad_norm": 0.2316959798336029, + "learning_rate": 0.0001539105801207045, + "loss": 0.2898, + "step": 1876 + }, + { + "epoch": 0.6931314623338257, + "grad_norm": 0.24668775498867035, + "learning_rate": 0.00015388594654514102, + "loss": 0.2907, + "step": 1877 + }, + { + "epoch": 0.6935007385524372, + "grad_norm": 0.30770760774612427, + "learning_rate": 0.00015386131296957754, + "loss": 0.3196, + "step": 1878 + }, + { + "epoch": 0.6938700147710487, + "grad_norm": 0.2858896851539612, + "learning_rate": 0.00015383667939401405, + "loss": 0.3035, + "step": 1879 + }, + { + "epoch": 0.6942392909896603, + "grad_norm": 0.26638948917388916, + "learning_rate": 0.00015381204581845054, + "loss": 0.3285, + "step": 1880 + }, + { + "epoch": 0.6946085672082718, + "grad_norm": 0.2781909108161926, + "learning_rate": 0.00015378741224288705, + "loss": 0.3001, + "step": 1881 + }, + { + "epoch": 0.6949778434268833, + "grad_norm": 0.3784567713737488, + "learning_rate": 0.00015376277866732357, + "loss": 0.2608, + "step": 1882 + }, + { + "epoch": 0.6953471196454948, + "grad_norm": 0.27729710936546326, + "learning_rate": 0.00015373814509176008, + "loss": 0.2792, + "step": 1883 + }, + { + "epoch": 0.6957163958641064, + "grad_norm": 0.27593815326690674, + "learning_rate": 0.00015371351151619657, + "loss": 0.3021, + "step": 1884 + }, + { + "epoch": 0.6960856720827179, + "grad_norm": 0.2881922721862793, + "learning_rate": 0.00015368887794063309, + "loss": 0.3478, + "step": 1885 + }, + { + "epoch": 0.6964549483013294, + "grad_norm": 0.22911010682582855, + "learning_rate": 0.0001536642443650696, + "loss": 0.2812, + "step": 1886 + }, + { + "epoch": 0.696824224519941, + "grad_norm": 0.236148402094841, + "learning_rate": 0.00015363961078950611, + "loss": 0.2425, + "step": 1887 + }, + { + "epoch": 0.6971935007385525, + "grad_norm": 0.2774393558502197, + "learning_rate": 0.0001536149772139426, + "loss": 0.3115, + "step": 1888 + }, + { + "epoch": 0.697562776957164, + "grad_norm": 0.3710113763809204, + "learning_rate": 0.00015359034363837912, + "loss": 0.4274, + "step": 1889 + }, + { + "epoch": 0.6979320531757754, + "grad_norm": 0.29781535267829895, + "learning_rate": 0.0001535657100628156, + "loss": 0.3126, + "step": 1890 + }, + { + "epoch": 0.698301329394387, + "grad_norm": 0.26671102643013, + "learning_rate": 0.00015354107648725215, + "loss": 0.2502, + "step": 1891 + }, + { + "epoch": 0.6986706056129985, + "grad_norm": 0.23077552020549774, + "learning_rate": 0.00015351644291168863, + "loss": 0.2645, + "step": 1892 + }, + { + "epoch": 0.69903988183161, + "grad_norm": 0.3586418330669403, + "learning_rate": 0.00015349180933612515, + "loss": 0.3416, + "step": 1893 + }, + { + "epoch": 0.6994091580502215, + "grad_norm": 0.319751501083374, + "learning_rate": 0.00015346717576056164, + "loss": 0.3491, + "step": 1894 + }, + { + "epoch": 0.6997784342688331, + "grad_norm": 0.27810370922088623, + "learning_rate": 0.00015344254218499815, + "loss": 0.3334, + "step": 1895 + }, + { + "epoch": 0.7001477104874446, + "grad_norm": 0.3138943016529083, + "learning_rate": 0.00015341790860943466, + "loss": 0.2844, + "step": 1896 + }, + { + "epoch": 0.7005169867060561, + "grad_norm": 0.28959372639656067, + "learning_rate": 0.00015339327503387118, + "loss": 0.3187, + "step": 1897 + }, + { + "epoch": 0.7008862629246676, + "grad_norm": 0.2932811379432678, + "learning_rate": 0.00015336864145830767, + "loss": 0.3367, + "step": 1898 + }, + { + "epoch": 0.7012555391432792, + "grad_norm": 0.27436012029647827, + "learning_rate": 0.00015334400788274418, + "loss": 0.2903, + "step": 1899 + }, + { + "epoch": 0.7016248153618907, + "grad_norm": 0.29513734579086304, + "learning_rate": 0.0001533193743071807, + "loss": 0.3383, + "step": 1900 + }, + { + "epoch": 0.7016248153618907, + "eval_loss": 8.000016212463379, + "eval_runtime": 6.9202, + "eval_samples_per_second": 7.225, + "eval_steps_per_second": 1.012, + "step": 1900 + }, + { + "epoch": 0.7019940915805022, + "grad_norm": 0.20082442462444305, + "learning_rate": 0.0001532947407316172, + "loss": 0.2266, + "step": 1901 + }, + { + "epoch": 0.7023633677991138, + "grad_norm": 0.2526630461215973, + "learning_rate": 0.0001532701071560537, + "loss": 0.3116, + "step": 1902 + }, + { + "epoch": 0.7027326440177253, + "grad_norm": 0.2321329265832901, + "learning_rate": 0.0001532454735804902, + "loss": 0.2803, + "step": 1903 + }, + { + "epoch": 0.7031019202363368, + "grad_norm": 0.3030351400375366, + "learning_rate": 0.0001532208400049267, + "loss": 0.2949, + "step": 1904 + }, + { + "epoch": 0.7034711964549483, + "grad_norm": 0.2566744089126587, + "learning_rate": 0.00015319620642936324, + "loss": 0.3465, + "step": 1905 + }, + { + "epoch": 0.7038404726735599, + "grad_norm": 0.24831126630306244, + "learning_rate": 0.00015317157285379973, + "loss": 0.2717, + "step": 1906 + }, + { + "epoch": 0.7042097488921714, + "grad_norm": 0.32666292786598206, + "learning_rate": 0.00015314693927823624, + "loss": 0.3174, + "step": 1907 + }, + { + "epoch": 0.7045790251107829, + "grad_norm": 0.27467700839042664, + "learning_rate": 0.00015312230570267273, + "loss": 0.2974, + "step": 1908 + }, + { + "epoch": 0.7049483013293943, + "grad_norm": 0.2645398676395416, + "learning_rate": 0.00015309767212710925, + "loss": 0.3119, + "step": 1909 + }, + { + "epoch": 0.705317577548006, + "grad_norm": 0.24054807424545288, + "learning_rate": 0.00015307303855154576, + "loss": 0.268, + "step": 1910 + }, + { + "epoch": 0.7056868537666174, + "grad_norm": 0.25734832882881165, + "learning_rate": 0.00015304840497598228, + "loss": 0.3304, + "step": 1911 + }, + { + "epoch": 0.7060561299852289, + "grad_norm": 0.3223969042301178, + "learning_rate": 0.00015302377140041876, + "loss": 0.2801, + "step": 1912 + }, + { + "epoch": 0.7064254062038404, + "grad_norm": 0.24556152522563934, + "learning_rate": 0.00015299913782485528, + "loss": 0.2931, + "step": 1913 + }, + { + "epoch": 0.706794682422452, + "grad_norm": 0.2814628481864929, + "learning_rate": 0.0001529745042492918, + "loss": 0.3642, + "step": 1914 + }, + { + "epoch": 0.7071639586410635, + "grad_norm": 0.23099385201931, + "learning_rate": 0.0001529498706737283, + "loss": 0.2702, + "step": 1915 + }, + { + "epoch": 0.707533234859675, + "grad_norm": 0.2592359483242035, + "learning_rate": 0.0001529252370981648, + "loss": 0.3008, + "step": 1916 + }, + { + "epoch": 0.7079025110782866, + "grad_norm": 0.2055014818906784, + "learning_rate": 0.0001529006035226013, + "loss": 0.1914, + "step": 1917 + }, + { + "epoch": 0.7082717872968981, + "grad_norm": 0.31321951746940613, + "learning_rate": 0.00015287596994703782, + "loss": 0.3372, + "step": 1918 + }, + { + "epoch": 0.7086410635155096, + "grad_norm": 0.2668575644493103, + "learning_rate": 0.00015285133637147434, + "loss": 0.3299, + "step": 1919 + }, + { + "epoch": 0.7090103397341211, + "grad_norm": 0.2725284695625305, + "learning_rate": 0.00015282670279591083, + "loss": 0.3452, + "step": 1920 + }, + { + "epoch": 0.7093796159527327, + "grad_norm": 0.2637563645839691, + "learning_rate": 0.00015280206922034734, + "loss": 0.2695, + "step": 1921 + }, + { + "epoch": 0.7097488921713442, + "grad_norm": 0.47600701451301575, + "learning_rate": 0.00015277743564478383, + "loss": 0.3969, + "step": 1922 + }, + { + "epoch": 0.7101181683899557, + "grad_norm": 0.248932883143425, + "learning_rate": 0.00015275280206922037, + "loss": 0.2939, + "step": 1923 + }, + { + "epoch": 0.7104874446085672, + "grad_norm": 0.28896889090538025, + "learning_rate": 0.00015272816849365686, + "loss": 0.3607, + "step": 1924 + }, + { + "epoch": 0.7108567208271788, + "grad_norm": 0.2557620704174042, + "learning_rate": 0.00015270353491809337, + "loss": 0.303, + "step": 1925 + }, + { + "epoch": 0.7112259970457903, + "grad_norm": 0.22804628312587738, + "learning_rate": 0.00015267890134252986, + "loss": 0.2284, + "step": 1926 + }, + { + "epoch": 0.7115952732644018, + "grad_norm": 0.3195030987262726, + "learning_rate": 0.00015265426776696637, + "loss": 0.305, + "step": 1927 + }, + { + "epoch": 0.7119645494830132, + "grad_norm": 0.2265772819519043, + "learning_rate": 0.0001526296341914029, + "loss": 0.2997, + "step": 1928 + }, + { + "epoch": 0.7123338257016248, + "grad_norm": 0.28301918506622314, + "learning_rate": 0.0001526050006158394, + "loss": 0.2683, + "step": 1929 + }, + { + "epoch": 0.7127031019202363, + "grad_norm": 0.24847456812858582, + "learning_rate": 0.0001525803670402759, + "loss": 0.2302, + "step": 1930 + }, + { + "epoch": 0.7130723781388478, + "grad_norm": 0.22580763697624207, + "learning_rate": 0.0001525557334647124, + "loss": 0.2706, + "step": 1931 + }, + { + "epoch": 0.7134416543574594, + "grad_norm": 0.2580067813396454, + "learning_rate": 0.00015253109988914892, + "loss": 0.328, + "step": 1932 + }, + { + "epoch": 0.7138109305760709, + "grad_norm": 0.28023049235343933, + "learning_rate": 0.00015250646631358544, + "loss": 0.3128, + "step": 1933 + }, + { + "epoch": 0.7141802067946824, + "grad_norm": 0.2643038034439087, + "learning_rate": 0.00015248183273802192, + "loss": 0.2881, + "step": 1934 + }, + { + "epoch": 0.7145494830132939, + "grad_norm": 0.3427876830101013, + "learning_rate": 0.00015245719916245844, + "loss": 0.3543, + "step": 1935 + }, + { + "epoch": 0.7149187592319055, + "grad_norm": 0.24903441965579987, + "learning_rate": 0.00015243256558689493, + "loss": 0.2699, + "step": 1936 + }, + { + "epoch": 0.715288035450517, + "grad_norm": 0.2503966689109802, + "learning_rate": 0.00015240793201133147, + "loss": 0.3025, + "step": 1937 + }, + { + "epoch": 0.7156573116691285, + "grad_norm": 0.24341844022274017, + "learning_rate": 0.00015238329843576795, + "loss": 0.2936, + "step": 1938 + }, + { + "epoch": 0.71602658788774, + "grad_norm": 0.2609136700630188, + "learning_rate": 0.00015235866486020447, + "loss": 0.3149, + "step": 1939 + }, + { + "epoch": 0.7163958641063516, + "grad_norm": 0.2578326165676117, + "learning_rate": 0.00015233403128464096, + "loss": 0.2373, + "step": 1940 + }, + { + "epoch": 0.7167651403249631, + "grad_norm": 0.4430208206176758, + "learning_rate": 0.00015230939770907747, + "loss": 0.3028, + "step": 1941 + }, + { + "epoch": 0.7171344165435746, + "grad_norm": 0.46359118819236755, + "learning_rate": 0.00015228476413351399, + "loss": 0.324, + "step": 1942 + }, + { + "epoch": 0.7175036927621861, + "grad_norm": 0.22760091722011566, + "learning_rate": 0.0001522601305579505, + "loss": 0.2711, + "step": 1943 + }, + { + "epoch": 0.7178729689807977, + "grad_norm": 0.2597522735595703, + "learning_rate": 0.000152235496982387, + "loss": 0.2714, + "step": 1944 + }, + { + "epoch": 0.7182422451994092, + "grad_norm": 0.2737117409706116, + "learning_rate": 0.0001522108634068235, + "loss": 0.2807, + "step": 1945 + }, + { + "epoch": 0.7186115214180206, + "grad_norm": 0.23758068680763245, + "learning_rate": 0.00015218622983126002, + "loss": 0.2532, + "step": 1946 + }, + { + "epoch": 0.7189807976366323, + "grad_norm": 0.21985715627670288, + "learning_rate": 0.00015216159625569653, + "loss": 0.2391, + "step": 1947 + }, + { + "epoch": 0.7193500738552437, + "grad_norm": 0.24311186373233795, + "learning_rate": 0.00015213696268013302, + "loss": 0.2866, + "step": 1948 + }, + { + "epoch": 0.7197193500738552, + "grad_norm": 0.25463172793388367, + "learning_rate": 0.00015211232910456953, + "loss": 0.2412, + "step": 1949 + }, + { + "epoch": 0.7200886262924667, + "grad_norm": 0.3111552596092224, + "learning_rate": 0.00015208769552900605, + "loss": 0.3422, + "step": 1950 + }, + { + "epoch": 0.7200886262924667, + "eval_loss": 7.996081352233887, + "eval_runtime": 6.9155, + "eval_samples_per_second": 7.23, + "eval_steps_per_second": 1.012, + "step": 1950 + }, + { + "epoch": 0.7204579025110783, + "grad_norm": 0.2615399956703186, + "learning_rate": 0.00015206306195344256, + "loss": 0.2899, + "step": 1951 + }, + { + "epoch": 0.7208271787296898, + "grad_norm": 0.3296869397163391, + "learning_rate": 0.00015203842837787905, + "loss": 0.365, + "step": 1952 + }, + { + "epoch": 0.7211964549483013, + "grad_norm": 0.2782537639141083, + "learning_rate": 0.00015201379480231557, + "loss": 0.2702, + "step": 1953 + }, + { + "epoch": 0.7215657311669128, + "grad_norm": 0.24647271633148193, + "learning_rate": 0.00015198916122675205, + "loss": 0.2959, + "step": 1954 + }, + { + "epoch": 0.7219350073855244, + "grad_norm": 0.24780161678791046, + "learning_rate": 0.0001519645276511886, + "loss": 0.2877, + "step": 1955 + }, + { + "epoch": 0.7223042836041359, + "grad_norm": 0.3499109447002411, + "learning_rate": 0.00015193989407562508, + "loss": 0.3406, + "step": 1956 + }, + { + "epoch": 0.7226735598227474, + "grad_norm": 0.23967693746089935, + "learning_rate": 0.0001519152605000616, + "loss": 0.294, + "step": 1957 + }, + { + "epoch": 0.7230428360413589, + "grad_norm": 0.26585423946380615, + "learning_rate": 0.00015189062692449808, + "loss": 0.2784, + "step": 1958 + }, + { + "epoch": 0.7234121122599705, + "grad_norm": 0.23592844605445862, + "learning_rate": 0.0001518659933489346, + "loss": 0.2414, + "step": 1959 + }, + { + "epoch": 0.723781388478582, + "grad_norm": 0.2599862813949585, + "learning_rate": 0.00015184135977337111, + "loss": 0.2966, + "step": 1960 + }, + { + "epoch": 0.7241506646971935, + "grad_norm": 0.2153013050556183, + "learning_rate": 0.00015181672619780763, + "loss": 0.2381, + "step": 1961 + }, + { + "epoch": 0.7245199409158051, + "grad_norm": 0.29517412185668945, + "learning_rate": 0.00015179209262224412, + "loss": 0.2947, + "step": 1962 + }, + { + "epoch": 0.7248892171344166, + "grad_norm": 0.25670620799064636, + "learning_rate": 0.00015176745904668063, + "loss": 0.3096, + "step": 1963 + }, + { + "epoch": 0.725258493353028, + "grad_norm": 0.2682307958602905, + "learning_rate": 0.00015174282547111715, + "loss": 0.2946, + "step": 1964 + }, + { + "epoch": 0.7256277695716395, + "grad_norm": 0.23364810645580292, + "learning_rate": 0.00015171819189555366, + "loss": 0.2644, + "step": 1965 + }, + { + "epoch": 0.7259970457902511, + "grad_norm": 0.27229735255241394, + "learning_rate": 0.00015169355831999015, + "loss": 0.2476, + "step": 1966 + }, + { + "epoch": 0.7263663220088626, + "grad_norm": 0.269174724817276, + "learning_rate": 0.00015166892474442666, + "loss": 0.281, + "step": 1967 + }, + { + "epoch": 0.7267355982274741, + "grad_norm": 0.3210710287094116, + "learning_rate": 0.00015164429116886315, + "loss": 0.3539, + "step": 1968 + }, + { + "epoch": 0.7271048744460856, + "grad_norm": 0.24845993518829346, + "learning_rate": 0.0001516196575932997, + "loss": 0.2917, + "step": 1969 + }, + { + "epoch": 0.7274741506646972, + "grad_norm": 0.29046398401260376, + "learning_rate": 0.00015159502401773618, + "loss": 0.3288, + "step": 1970 + }, + { + "epoch": 0.7278434268833087, + "grad_norm": 0.25246894359588623, + "learning_rate": 0.0001515703904421727, + "loss": 0.2566, + "step": 1971 + }, + { + "epoch": 0.7282127031019202, + "grad_norm": 0.2584327757358551, + "learning_rate": 0.00015154575686660918, + "loss": 0.2749, + "step": 1972 + }, + { + "epoch": 0.7285819793205317, + "grad_norm": 0.3046245574951172, + "learning_rate": 0.0001515211232910457, + "loss": 0.3336, + "step": 1973 + }, + { + "epoch": 0.7289512555391433, + "grad_norm": 0.2278825044631958, + "learning_rate": 0.0001514964897154822, + "loss": 0.2511, + "step": 1974 + }, + { + "epoch": 0.7293205317577548, + "grad_norm": 0.261961966753006, + "learning_rate": 0.00015147185613991873, + "loss": 0.291, + "step": 1975 + }, + { + "epoch": 0.7296898079763663, + "grad_norm": 0.2599422037601471, + "learning_rate": 0.0001514472225643552, + "loss": 0.2766, + "step": 1976 + }, + { + "epoch": 0.7300590841949779, + "grad_norm": 0.2589248716831207, + "learning_rate": 0.00015142258898879173, + "loss": 0.3228, + "step": 1977 + }, + { + "epoch": 0.7304283604135894, + "grad_norm": 0.2898035943508148, + "learning_rate": 0.00015139795541322824, + "loss": 0.3309, + "step": 1978 + }, + { + "epoch": 0.7307976366322009, + "grad_norm": 0.26384422183036804, + "learning_rate": 0.00015137332183766476, + "loss": 0.3, + "step": 1979 + }, + { + "epoch": 0.7311669128508124, + "grad_norm": 0.29273176193237305, + "learning_rate": 0.00015134868826210124, + "loss": 0.2844, + "step": 1980 + }, + { + "epoch": 0.731536189069424, + "grad_norm": 0.2838720381259918, + "learning_rate": 0.00015132405468653776, + "loss": 0.2886, + "step": 1981 + }, + { + "epoch": 0.7319054652880355, + "grad_norm": 0.2766261696815491, + "learning_rate": 0.00015129942111097427, + "loss": 0.3023, + "step": 1982 + }, + { + "epoch": 0.732274741506647, + "grad_norm": 0.3501611351966858, + "learning_rate": 0.0001512747875354108, + "loss": 0.4103, + "step": 1983 + }, + { + "epoch": 0.7326440177252584, + "grad_norm": 0.24306529760360718, + "learning_rate": 0.00015125015395984728, + "loss": 0.2838, + "step": 1984 + }, + { + "epoch": 0.73301329394387, + "grad_norm": 0.2450806200504303, + "learning_rate": 0.0001512255203842838, + "loss": 0.2644, + "step": 1985 + }, + { + "epoch": 0.7333825701624815, + "grad_norm": 0.2865354120731354, + "learning_rate": 0.00015120088680872028, + "loss": 0.2988, + "step": 1986 + }, + { + "epoch": 0.733751846381093, + "grad_norm": 0.29581406712532043, + "learning_rate": 0.00015117625323315682, + "loss": 0.3422, + "step": 1987 + }, + { + "epoch": 0.7341211225997046, + "grad_norm": 0.26844993233680725, + "learning_rate": 0.0001511516196575933, + "loss": 0.3085, + "step": 1988 + }, + { + "epoch": 0.7344903988183161, + "grad_norm": 0.26844078302383423, + "learning_rate": 0.00015112698608202982, + "loss": 0.297, + "step": 1989 + }, + { + "epoch": 0.7348596750369276, + "grad_norm": 0.25834715366363525, + "learning_rate": 0.0001511023525064663, + "loss": 0.2784, + "step": 1990 + }, + { + "epoch": 0.7352289512555391, + "grad_norm": 0.2743387222290039, + "learning_rate": 0.00015107771893090282, + "loss": 0.2552, + "step": 1991 + }, + { + "epoch": 0.7355982274741507, + "grad_norm": 0.27867287397384644, + "learning_rate": 0.00015105308535533934, + "loss": 0.2899, + "step": 1992 + }, + { + "epoch": 0.7359675036927622, + "grad_norm": 0.26340252161026, + "learning_rate": 0.00015102845177977585, + "loss": 0.3289, + "step": 1993 + }, + { + "epoch": 0.7363367799113737, + "grad_norm": 0.28883883357048035, + "learning_rate": 0.00015100381820421234, + "loss": 0.3463, + "step": 1994 + }, + { + "epoch": 0.7367060561299852, + "grad_norm": 0.262492835521698, + "learning_rate": 0.00015097918462864886, + "loss": 0.2479, + "step": 1995 + }, + { + "epoch": 0.7370753323485968, + "grad_norm": 0.23756761848926544, + "learning_rate": 0.00015095455105308537, + "loss": 0.2492, + "step": 1996 + }, + { + "epoch": 0.7374446085672083, + "grad_norm": 0.26545631885528564, + "learning_rate": 0.00015092991747752188, + "loss": 0.2632, + "step": 1997 + }, + { + "epoch": 0.7378138847858198, + "grad_norm": 0.2796971797943115, + "learning_rate": 0.00015090528390195837, + "loss": 0.2927, + "step": 1998 + }, + { + "epoch": 0.7381831610044313, + "grad_norm": 0.43843719363212585, + "learning_rate": 0.0001508806503263949, + "loss": 0.3697, + "step": 1999 + }, + { + "epoch": 0.7385524372230429, + "grad_norm": 0.23755738139152527, + "learning_rate": 0.00015085601675083137, + "loss": 0.3061, + "step": 2000 + }, + { + "epoch": 0.7385524372230429, + "eval_loss": 7.900362968444824, + "eval_runtime": 6.904, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 1.014, + "step": 2000 + }, + { + "epoch": 0.7389217134416544, + "grad_norm": 0.23689517378807068, + "learning_rate": 0.00015083138317526792, + "loss": 0.247, + "step": 2001 + }, + { + "epoch": 0.7392909896602659, + "grad_norm": 0.3469773828983307, + "learning_rate": 0.0001508067495997044, + "loss": 0.309, + "step": 2002 + }, + { + "epoch": 0.7396602658788775, + "grad_norm": 0.3081546425819397, + "learning_rate": 0.00015078211602414092, + "loss": 0.3322, + "step": 2003 + }, + { + "epoch": 0.740029542097489, + "grad_norm": 0.2908842861652374, + "learning_rate": 0.0001507574824485774, + "loss": 0.296, + "step": 2004 + }, + { + "epoch": 0.7403988183161004, + "grad_norm": 0.24879853427410126, + "learning_rate": 0.00015073284887301392, + "loss": 0.2666, + "step": 2005 + }, + { + "epoch": 0.7407680945347119, + "grad_norm": 0.22988645732402802, + "learning_rate": 0.00015070821529745043, + "loss": 0.3283, + "step": 2006 + }, + { + "epoch": 0.7411373707533235, + "grad_norm": 0.25225698947906494, + "learning_rate": 0.00015068358172188695, + "loss": 0.2819, + "step": 2007 + }, + { + "epoch": 0.741506646971935, + "grad_norm": 0.24615471065044403, + "learning_rate": 0.00015065894814632344, + "loss": 0.256, + "step": 2008 + }, + { + "epoch": 0.7418759231905465, + "grad_norm": 0.32443609833717346, + "learning_rate": 0.00015063431457075995, + "loss": 0.3155, + "step": 2009 + }, + { + "epoch": 0.742245199409158, + "grad_norm": 0.2325717657804489, + "learning_rate": 0.00015060968099519647, + "loss": 0.2138, + "step": 2010 + }, + { + "epoch": 0.7426144756277696, + "grad_norm": 0.24997131526470184, + "learning_rate": 0.00015058504741963298, + "loss": 0.2569, + "step": 2011 + }, + { + "epoch": 0.7429837518463811, + "grad_norm": 0.2671299874782562, + "learning_rate": 0.00015056041384406947, + "loss": 0.2955, + "step": 2012 + }, + { + "epoch": 0.7433530280649926, + "grad_norm": 0.24666881561279297, + "learning_rate": 0.00015053578026850598, + "loss": 0.2555, + "step": 2013 + }, + { + "epoch": 0.7437223042836041, + "grad_norm": 0.2629302740097046, + "learning_rate": 0.00015051114669294247, + "loss": 0.3519, + "step": 2014 + }, + { + "epoch": 0.7440915805022157, + "grad_norm": 0.2534876763820648, + "learning_rate": 0.000150486513117379, + "loss": 0.3121, + "step": 2015 + }, + { + "epoch": 0.7444608567208272, + "grad_norm": 0.29296138882637024, + "learning_rate": 0.0001504618795418155, + "loss": 0.3549, + "step": 2016 + }, + { + "epoch": 0.7448301329394387, + "grad_norm": 0.2535209357738495, + "learning_rate": 0.00015043724596625201, + "loss": 0.2397, + "step": 2017 + }, + { + "epoch": 0.7451994091580503, + "grad_norm": 0.2866876721382141, + "learning_rate": 0.0001504126123906885, + "loss": 0.3401, + "step": 2018 + }, + { + "epoch": 0.7455686853766618, + "grad_norm": 0.24192002415657043, + "learning_rate": 0.00015038797881512502, + "loss": 0.2662, + "step": 2019 + }, + { + "epoch": 0.7459379615952733, + "grad_norm": 0.301190584897995, + "learning_rate": 0.00015036334523956153, + "loss": 0.3286, + "step": 2020 + }, + { + "epoch": 0.7463072378138847, + "grad_norm": 0.2765111029148102, + "learning_rate": 0.00015033871166399805, + "loss": 0.2948, + "step": 2021 + }, + { + "epoch": 0.7466765140324964, + "grad_norm": 0.43534228205680847, + "learning_rate": 0.00015031407808843453, + "loss": 0.3063, + "step": 2022 + }, + { + "epoch": 0.7470457902511078, + "grad_norm": 0.20948593318462372, + "learning_rate": 0.00015028944451287105, + "loss": 0.2803, + "step": 2023 + }, + { + "epoch": 0.7474150664697193, + "grad_norm": 0.322803795337677, + "learning_rate": 0.00015026481093730756, + "loss": 0.2965, + "step": 2024 + }, + { + "epoch": 0.7477843426883308, + "grad_norm": 0.2427653670310974, + "learning_rate": 0.00015024017736174408, + "loss": 0.2352, + "step": 2025 + }, + { + "epoch": 0.7481536189069424, + "grad_norm": 0.24558594822883606, + "learning_rate": 0.00015021554378618057, + "loss": 0.2696, + "step": 2026 + }, + { + "epoch": 0.7485228951255539, + "grad_norm": 0.3039640486240387, + "learning_rate": 0.00015019091021061708, + "loss": 0.3477, + "step": 2027 + }, + { + "epoch": 0.7488921713441654, + "grad_norm": 0.24833515286445618, + "learning_rate": 0.0001501662766350536, + "loss": 0.2667, + "step": 2028 + }, + { + "epoch": 0.7492614475627769, + "grad_norm": 0.27917686104774475, + "learning_rate": 0.0001501416430594901, + "loss": 0.2726, + "step": 2029 + }, + { + "epoch": 0.7496307237813885, + "grad_norm": 0.28609761595726013, + "learning_rate": 0.0001501170094839266, + "loss": 0.3221, + "step": 2030 + }, + { + "epoch": 0.75, + "grad_norm": 0.271697461605072, + "learning_rate": 0.0001500923759083631, + "loss": 0.3188, + "step": 2031 + }, + { + "epoch": 0.7503692762186115, + "grad_norm": 0.28350380063056946, + "learning_rate": 0.0001500677423327996, + "loss": 0.3403, + "step": 2032 + }, + { + "epoch": 0.7507385524372231, + "grad_norm": 0.2460940182209015, + "learning_rate": 0.00015004310875723614, + "loss": 0.2783, + "step": 2033 + }, + { + "epoch": 0.7511078286558346, + "grad_norm": 0.22866520285606384, + "learning_rate": 0.00015001847518167263, + "loss": 0.2679, + "step": 2034 + }, + { + "epoch": 0.7514771048744461, + "grad_norm": 0.2501647174358368, + "learning_rate": 0.00014999384160610914, + "loss": 0.3316, + "step": 2035 + }, + { + "epoch": 0.7518463810930576, + "grad_norm": 0.2611762285232544, + "learning_rate": 0.00014996920803054563, + "loss": 0.3115, + "step": 2036 + }, + { + "epoch": 0.7522156573116692, + "grad_norm": 0.19983458518981934, + "learning_rate": 0.00014994457445498214, + "loss": 0.1849, + "step": 2037 + }, + { + "epoch": 0.7525849335302807, + "grad_norm": 0.22143511474132538, + "learning_rate": 0.00014991994087941866, + "loss": 0.2844, + "step": 2038 + }, + { + "epoch": 0.7529542097488922, + "grad_norm": 0.3073621392250061, + "learning_rate": 0.00014989530730385517, + "loss": 0.3233, + "step": 2039 + }, + { + "epoch": 0.7533234859675036, + "grad_norm": 0.2384214997291565, + "learning_rate": 0.00014987067372829166, + "loss": 0.2756, + "step": 2040 + }, + { + "epoch": 0.7536927621861153, + "grad_norm": 0.33154869079589844, + "learning_rate": 0.00014984604015272818, + "loss": 0.3898, + "step": 2041 + }, + { + "epoch": 0.7540620384047267, + "grad_norm": 0.33897361159324646, + "learning_rate": 0.0001498214065771647, + "loss": 0.3679, + "step": 2042 + }, + { + "epoch": 0.7544313146233382, + "grad_norm": 0.28146255016326904, + "learning_rate": 0.0001497967730016012, + "loss": 0.309, + "step": 2043 + }, + { + "epoch": 0.7548005908419497, + "grad_norm": 0.37559834122657776, + "learning_rate": 0.0001497721394260377, + "loss": 0.2566, + "step": 2044 + }, + { + "epoch": 0.7551698670605613, + "grad_norm": 0.3806197941303253, + "learning_rate": 0.0001497475058504742, + "loss": 0.3526, + "step": 2045 + }, + { + "epoch": 0.7555391432791728, + "grad_norm": 0.32080894708633423, + "learning_rate": 0.0001497228722749107, + "loss": 0.4106, + "step": 2046 + }, + { + "epoch": 0.7559084194977843, + "grad_norm": 0.2688358724117279, + "learning_rate": 0.00014969823869934724, + "loss": 0.3455, + "step": 2047 + }, + { + "epoch": 0.7562776957163959, + "grad_norm": 0.2761474847793579, + "learning_rate": 0.00014967360512378372, + "loss": 0.3371, + "step": 2048 + }, + { + "epoch": 0.7566469719350074, + "grad_norm": 0.332131952047348, + "learning_rate": 0.00014964897154822024, + "loss": 0.3111, + "step": 2049 + }, + { + "epoch": 0.7570162481536189, + "grad_norm": 0.27654901146888733, + "learning_rate": 0.00014962433797265673, + "loss": 0.3222, + "step": 2050 + }, + { + "epoch": 0.7570162481536189, + "eval_loss": 7.79747200012207, + "eval_runtime": 6.9152, + "eval_samples_per_second": 7.23, + "eval_steps_per_second": 1.012, + "step": 2050 + }, + { + "epoch": 0.7573855243722304, + "grad_norm": 0.2376328557729721, + "learning_rate": 0.00014959970439709324, + "loss": 0.2786, + "step": 2051 + }, + { + "epoch": 0.757754800590842, + "grad_norm": 0.33290034532546997, + "learning_rate": 0.00014957507082152976, + "loss": 0.3812, + "step": 2052 + }, + { + "epoch": 0.7581240768094535, + "grad_norm": 0.3380466103553772, + "learning_rate": 0.00014955043724596627, + "loss": 0.3922, + "step": 2053 + }, + { + "epoch": 0.758493353028065, + "grad_norm": 0.2667270004749298, + "learning_rate": 0.00014952580367040276, + "loss": 0.2551, + "step": 2054 + }, + { + "epoch": 0.7588626292466765, + "grad_norm": 0.275036096572876, + "learning_rate": 0.00014950117009483927, + "loss": 0.3162, + "step": 2055 + }, + { + "epoch": 0.7592319054652881, + "grad_norm": 0.2665283977985382, + "learning_rate": 0.0001494765365192758, + "loss": 0.3079, + "step": 2056 + }, + { + "epoch": 0.7596011816838996, + "grad_norm": 0.3038942813873291, + "learning_rate": 0.0001494519029437123, + "loss": 0.3456, + "step": 2057 + }, + { + "epoch": 0.759970457902511, + "grad_norm": 0.2251385897397995, + "learning_rate": 0.0001494272693681488, + "loss": 0.2552, + "step": 2058 + }, + { + "epoch": 0.7603397341211225, + "grad_norm": 0.2992597818374634, + "learning_rate": 0.0001494026357925853, + "loss": 0.3288, + "step": 2059 + }, + { + "epoch": 0.7607090103397341, + "grad_norm": 0.23066000640392303, + "learning_rate": 0.00014937800221702182, + "loss": 0.2828, + "step": 2060 + }, + { + "epoch": 0.7610782865583456, + "grad_norm": 0.2907067537307739, + "learning_rate": 0.00014935336864145833, + "loss": 0.3657, + "step": 2061 + }, + { + "epoch": 0.7614475627769571, + "grad_norm": 0.23587049543857574, + "learning_rate": 0.00014932873506589482, + "loss": 0.2835, + "step": 2062 + }, + { + "epoch": 0.7618168389955687, + "grad_norm": 0.29575085639953613, + "learning_rate": 0.00014930410149033134, + "loss": 0.3106, + "step": 2063 + }, + { + "epoch": 0.7621861152141802, + "grad_norm": 0.2859674096107483, + "learning_rate": 0.00014927946791476782, + "loss": 0.2587, + "step": 2064 + }, + { + "epoch": 0.7625553914327917, + "grad_norm": 0.2703956663608551, + "learning_rate": 0.00014925483433920436, + "loss": 0.3192, + "step": 2065 + }, + { + "epoch": 0.7629246676514032, + "grad_norm": 0.20116522908210754, + "learning_rate": 0.00014923020076364085, + "loss": 0.2078, + "step": 2066 + }, + { + "epoch": 0.7632939438700148, + "grad_norm": 0.34929388761520386, + "learning_rate": 0.00014920556718807737, + "loss": 0.3238, + "step": 2067 + }, + { + "epoch": 0.7636632200886263, + "grad_norm": 0.23505470156669617, + "learning_rate": 0.00014918093361251385, + "loss": 0.2836, + "step": 2068 + }, + { + "epoch": 0.7640324963072378, + "grad_norm": 0.2591880261898041, + "learning_rate": 0.00014915630003695037, + "loss": 0.2759, + "step": 2069 + }, + { + "epoch": 0.7644017725258493, + "grad_norm": 0.29266080260276794, + "learning_rate": 0.00014913166646138688, + "loss": 0.2957, + "step": 2070 + }, + { + "epoch": 0.7647710487444609, + "grad_norm": 0.28563541173934937, + "learning_rate": 0.0001491070328858234, + "loss": 0.2908, + "step": 2071 + }, + { + "epoch": 0.7651403249630724, + "grad_norm": 0.25894370675086975, + "learning_rate": 0.00014908239931025989, + "loss": 0.3064, + "step": 2072 + }, + { + "epoch": 0.7655096011816839, + "grad_norm": 0.3111742436885834, + "learning_rate": 0.0001490577657346964, + "loss": 0.3351, + "step": 2073 + }, + { + "epoch": 0.7658788774002954, + "grad_norm": 0.2422233372926712, + "learning_rate": 0.00014903313215913292, + "loss": 0.2745, + "step": 2074 + }, + { + "epoch": 0.766248153618907, + "grad_norm": 0.2636813223361969, + "learning_rate": 0.00014900849858356943, + "loss": 0.2684, + "step": 2075 + }, + { + "epoch": 0.7666174298375185, + "grad_norm": 0.26755109429359436, + "learning_rate": 0.00014898386500800592, + "loss": 0.2942, + "step": 2076 + }, + { + "epoch": 0.76698670605613, + "grad_norm": 0.22762826085090637, + "learning_rate": 0.00014895923143244243, + "loss": 0.2567, + "step": 2077 + }, + { + "epoch": 0.7673559822747416, + "grad_norm": 0.3391478359699249, + "learning_rate": 0.00014893459785687892, + "loss": 0.3031, + "step": 2078 + }, + { + "epoch": 0.767725258493353, + "grad_norm": 0.3535606265068054, + "learning_rate": 0.00014890996428131546, + "loss": 0.3177, + "step": 2079 + }, + { + "epoch": 0.7680945347119645, + "grad_norm": 0.3040603995323181, + "learning_rate": 0.00014888533070575195, + "loss": 0.3597, + "step": 2080 + }, + { + "epoch": 0.768463810930576, + "grad_norm": 0.25987836718559265, + "learning_rate": 0.00014886069713018846, + "loss": 0.2966, + "step": 2081 + }, + { + "epoch": 0.7688330871491876, + "grad_norm": 0.2888466715812683, + "learning_rate": 0.00014883606355462495, + "loss": 0.358, + "step": 2082 + }, + { + "epoch": 0.7692023633677991, + "grad_norm": 0.28685078024864197, + "learning_rate": 0.00014881142997906147, + "loss": 0.3041, + "step": 2083 + }, + { + "epoch": 0.7695716395864106, + "grad_norm": 0.297453910112381, + "learning_rate": 0.00014878679640349798, + "loss": 0.3497, + "step": 2084 + }, + { + "epoch": 0.7699409158050221, + "grad_norm": 0.2508475184440613, + "learning_rate": 0.0001487621628279345, + "loss": 0.275, + "step": 2085 + }, + { + "epoch": 0.7703101920236337, + "grad_norm": 0.27948445081710815, + "learning_rate": 0.00014873752925237098, + "loss": 0.2854, + "step": 2086 + }, + { + "epoch": 0.7706794682422452, + "grad_norm": 0.3069690763950348, + "learning_rate": 0.0001487128956768075, + "loss": 0.3543, + "step": 2087 + }, + { + "epoch": 0.7710487444608567, + "grad_norm": 0.2778802216053009, + "learning_rate": 0.000148688262101244, + "loss": 0.3575, + "step": 2088 + }, + { + "epoch": 0.7714180206794683, + "grad_norm": 0.2758817970752716, + "learning_rate": 0.00014866362852568053, + "loss": 0.2942, + "step": 2089 + }, + { + "epoch": 0.7717872968980798, + "grad_norm": 0.2673196494579315, + "learning_rate": 0.00014863899495011701, + "loss": 0.2925, + "step": 2090 + }, + { + "epoch": 0.7721565731166913, + "grad_norm": 0.2939143180847168, + "learning_rate": 0.00014861436137455353, + "loss": 0.4404, + "step": 2091 + }, + { + "epoch": 0.7725258493353028, + "grad_norm": 0.30269473791122437, + "learning_rate": 0.00014858972779899004, + "loss": 0.2935, + "step": 2092 + }, + { + "epoch": 0.7728951255539144, + "grad_norm": 0.3150918781757355, + "learning_rate": 0.00014856509422342653, + "loss": 0.3337, + "step": 2093 + }, + { + "epoch": 0.7732644017725259, + "grad_norm": 0.2703327536582947, + "learning_rate": 0.00014854046064786305, + "loss": 0.2761, + "step": 2094 + }, + { + "epoch": 0.7736336779911374, + "grad_norm": 0.25147607922554016, + "learning_rate": 0.00014851582707229953, + "loss": 0.2901, + "step": 2095 + }, + { + "epoch": 0.7740029542097489, + "grad_norm": 0.25832656025886536, + "learning_rate": 0.00014849119349673605, + "loss": 0.3272, + "step": 2096 + }, + { + "epoch": 0.7743722304283605, + "grad_norm": 0.28416287899017334, + "learning_rate": 0.00014846655992117256, + "loss": 0.3412, + "step": 2097 + }, + { + "epoch": 0.774741506646972, + "grad_norm": 0.269577294588089, + "learning_rate": 0.00014844192634560908, + "loss": 0.3273, + "step": 2098 + }, + { + "epoch": 0.7751107828655834, + "grad_norm": 0.26677989959716797, + "learning_rate": 0.00014841729277004556, + "loss": 0.2765, + "step": 2099 + }, + { + "epoch": 0.7754800590841949, + "grad_norm": 0.3046884834766388, + "learning_rate": 0.00014839265919448208, + "loss": 0.3358, + "step": 2100 + }, + { + "epoch": 0.7754800590841949, + "eval_loss": 7.7904486656188965, + "eval_runtime": 6.9074, + "eval_samples_per_second": 7.239, + "eval_steps_per_second": 1.013, + "step": 2100 + }, + { + "epoch": 0.7758493353028065, + "grad_norm": 0.32372692227363586, + "learning_rate": 0.0001483680256189186, + "loss": 0.2806, + "step": 2101 + }, + { + "epoch": 0.776218611521418, + "grad_norm": 0.3086666166782379, + "learning_rate": 0.0001483433920433551, + "loss": 0.3144, + "step": 2102 + }, + { + "epoch": 0.7765878877400295, + "grad_norm": 0.253476083278656, + "learning_rate": 0.0001483187584677916, + "loss": 0.2316, + "step": 2103 + }, + { + "epoch": 0.7769571639586411, + "grad_norm": 0.26472607254981995, + "learning_rate": 0.0001482941248922281, + "loss": 0.3283, + "step": 2104 + }, + { + "epoch": 0.7773264401772526, + "grad_norm": 0.31606829166412354, + "learning_rate": 0.0001482694913166646, + "loss": 0.2816, + "step": 2105 + }, + { + "epoch": 0.7776957163958641, + "grad_norm": 0.3165651857852936, + "learning_rate": 0.00014824485774110114, + "loss": 0.2556, + "step": 2106 + }, + { + "epoch": 0.7780649926144756, + "grad_norm": 0.22081001102924347, + "learning_rate": 0.00014822022416553763, + "loss": 0.2786, + "step": 2107 + }, + { + "epoch": 0.7784342688330872, + "grad_norm": 0.26553067564964294, + "learning_rate": 0.00014819559058997414, + "loss": 0.3258, + "step": 2108 + }, + { + "epoch": 0.7788035450516987, + "grad_norm": 0.2481728196144104, + "learning_rate": 0.00014817095701441063, + "loss": 0.283, + "step": 2109 + }, + { + "epoch": 0.7791728212703102, + "grad_norm": 0.2762134373188019, + "learning_rate": 0.00014814632343884714, + "loss": 0.3222, + "step": 2110 + }, + { + "epoch": 0.7795420974889217, + "grad_norm": 0.29372474551200867, + "learning_rate": 0.00014812168986328366, + "loss": 0.2539, + "step": 2111 + }, + { + "epoch": 0.7799113737075333, + "grad_norm": 0.2638779878616333, + "learning_rate": 0.00014809705628772017, + "loss": 0.2647, + "step": 2112 + }, + { + "epoch": 0.7802806499261448, + "grad_norm": 0.3754814863204956, + "learning_rate": 0.00014807242271215666, + "loss": 0.365, + "step": 2113 + }, + { + "epoch": 0.7806499261447563, + "grad_norm": 0.23102910816669464, + "learning_rate": 0.00014804778913659318, + "loss": 0.2769, + "step": 2114 + }, + { + "epoch": 0.7810192023633677, + "grad_norm": 0.25621116161346436, + "learning_rate": 0.0001480231555610297, + "loss": 0.2736, + "step": 2115 + }, + { + "epoch": 0.7813884785819794, + "grad_norm": 0.291114866733551, + "learning_rate": 0.0001479985219854662, + "loss": 0.3318, + "step": 2116 + }, + { + "epoch": 0.7817577548005908, + "grad_norm": 0.28566306829452515, + "learning_rate": 0.0001479738884099027, + "loss": 0.3169, + "step": 2117 + }, + { + "epoch": 0.7821270310192023, + "grad_norm": 0.30823439359664917, + "learning_rate": 0.0001479492548343392, + "loss": 0.2947, + "step": 2118 + }, + { + "epoch": 0.7824963072378139, + "grad_norm": 0.2687258720397949, + "learning_rate": 0.00014792462125877572, + "loss": 0.2864, + "step": 2119 + }, + { + "epoch": 0.7828655834564254, + "grad_norm": 0.29986774921417236, + "learning_rate": 0.00014789998768321224, + "loss": 0.2763, + "step": 2120 + }, + { + "epoch": 0.7832348596750369, + "grad_norm": 0.30100172758102417, + "learning_rate": 0.00014787535410764872, + "loss": 0.3313, + "step": 2121 + }, + { + "epoch": 0.7836041358936484, + "grad_norm": 0.3481224477291107, + "learning_rate": 0.00014785072053208524, + "loss": 0.2784, + "step": 2122 + }, + { + "epoch": 0.78397341211226, + "grad_norm": 0.33992093801498413, + "learning_rate": 0.00014782608695652173, + "loss": 0.4055, + "step": 2123 + }, + { + "epoch": 0.7843426883308715, + "grad_norm": 0.23579098284244537, + "learning_rate": 0.00014780145338095827, + "loss": 0.2214, + "step": 2124 + }, + { + "epoch": 0.784711964549483, + "grad_norm": 0.2900677025318146, + "learning_rate": 0.00014777681980539476, + "loss": 0.326, + "step": 2125 + }, + { + "epoch": 0.7850812407680945, + "grad_norm": 0.24997259676456451, + "learning_rate": 0.00014775218622983127, + "loss": 0.2683, + "step": 2126 + }, + { + "epoch": 0.7854505169867061, + "grad_norm": 0.26356208324432373, + "learning_rate": 0.00014772755265426776, + "loss": 0.2918, + "step": 2127 + }, + { + "epoch": 0.7858197932053176, + "grad_norm": 0.22333121299743652, + "learning_rate": 0.00014770291907870427, + "loss": 0.2312, + "step": 2128 + }, + { + "epoch": 0.7861890694239291, + "grad_norm": 0.3746260702610016, + "learning_rate": 0.0001476782855031408, + "loss": 0.3064, + "step": 2129 + }, + { + "epoch": 0.7865583456425406, + "grad_norm": 0.2506990134716034, + "learning_rate": 0.0001476536519275773, + "loss": 0.2707, + "step": 2130 + }, + { + "epoch": 0.7869276218611522, + "grad_norm": 0.2704800069332123, + "learning_rate": 0.0001476290183520138, + "loss": 0.3092, + "step": 2131 + }, + { + "epoch": 0.7872968980797637, + "grad_norm": 0.23799307644367218, + "learning_rate": 0.0001476043847764503, + "loss": 0.2781, + "step": 2132 + }, + { + "epoch": 0.7876661742983752, + "grad_norm": 0.30044493079185486, + "learning_rate": 0.00014757975120088682, + "loss": 0.3363, + "step": 2133 + }, + { + "epoch": 0.7880354505169868, + "grad_norm": 0.30415135622024536, + "learning_rate": 0.00014755511762532333, + "loss": 0.3287, + "step": 2134 + }, + { + "epoch": 0.7884047267355982, + "grad_norm": 0.3181655704975128, + "learning_rate": 0.00014753048404975982, + "loss": 0.3598, + "step": 2135 + }, + { + "epoch": 0.7887740029542097, + "grad_norm": 0.24187974631786346, + "learning_rate": 0.00014750585047419634, + "loss": 0.274, + "step": 2136 + }, + { + "epoch": 0.7891432791728212, + "grad_norm": 0.31295740604400635, + "learning_rate": 0.00014748121689863282, + "loss": 0.3033, + "step": 2137 + }, + { + "epoch": 0.7895125553914328, + "grad_norm": 0.208034947514534, + "learning_rate": 0.00014745658332306936, + "loss": 0.237, + "step": 2138 + }, + { + "epoch": 0.7898818316100443, + "grad_norm": 0.22629675269126892, + "learning_rate": 0.00014743194974750585, + "loss": 0.2557, + "step": 2139 + }, + { + "epoch": 0.7902511078286558, + "grad_norm": 0.23466813564300537, + "learning_rate": 0.00014740731617194237, + "loss": 0.2408, + "step": 2140 + }, + { + "epoch": 0.7906203840472673, + "grad_norm": 0.3371754288673401, + "learning_rate": 0.00014738268259637885, + "loss": 0.3724, + "step": 2141 + }, + { + "epoch": 0.7909896602658789, + "grad_norm": 0.4127272665500641, + "learning_rate": 0.00014735804902081537, + "loss": 0.3469, + "step": 2142 + }, + { + "epoch": 0.7913589364844904, + "grad_norm": 0.31296002864837646, + "learning_rate": 0.00014733341544525188, + "loss": 0.3521, + "step": 2143 + }, + { + "epoch": 0.7917282127031019, + "grad_norm": 0.29797351360321045, + "learning_rate": 0.0001473087818696884, + "loss": 0.3174, + "step": 2144 + }, + { + "epoch": 0.7920974889217134, + "grad_norm": 0.24863722920417786, + "learning_rate": 0.00014728414829412489, + "loss": 0.2757, + "step": 2145 + }, + { + "epoch": 0.792466765140325, + "grad_norm": 0.29867783188819885, + "learning_rate": 0.0001472595147185614, + "loss": 0.2692, + "step": 2146 + }, + { + "epoch": 0.7928360413589365, + "grad_norm": 0.24853657186031342, + "learning_rate": 0.00014723488114299791, + "loss": 0.2388, + "step": 2147 + }, + { + "epoch": 0.793205317577548, + "grad_norm": 0.2765767276287079, + "learning_rate": 0.00014721024756743443, + "loss": 0.2697, + "step": 2148 + }, + { + "epoch": 0.7935745937961596, + "grad_norm": 0.29328471422195435, + "learning_rate": 0.00014718561399187092, + "loss": 0.2615, + "step": 2149 + }, + { + "epoch": 0.7939438700147711, + "grad_norm": 0.3026326298713684, + "learning_rate": 0.00014716098041630743, + "loss": 0.3113, + "step": 2150 + }, + { + "epoch": 0.7939438700147711, + "eval_loss": 7.828985691070557, + "eval_runtime": 6.9202, + "eval_samples_per_second": 7.225, + "eval_steps_per_second": 1.012, + "step": 2150 + }, + { + "epoch": 0.7943131462333826, + "grad_norm": 0.25493964552879333, + "learning_rate": 0.00014713634684074392, + "loss": 0.2717, + "step": 2151 + }, + { + "epoch": 0.794682422451994, + "grad_norm": 0.3321656286716461, + "learning_rate": 0.00014711171326518046, + "loss": 0.3246, + "step": 2152 + }, + { + "epoch": 0.7950516986706057, + "grad_norm": 0.31174641847610474, + "learning_rate": 0.00014708707968961695, + "loss": 0.3177, + "step": 2153 + }, + { + "epoch": 0.7954209748892171, + "grad_norm": 0.3485448658466339, + "learning_rate": 0.00014706244611405346, + "loss": 0.3839, + "step": 2154 + }, + { + "epoch": 0.7957902511078286, + "grad_norm": 0.36434996128082275, + "learning_rate": 0.00014703781253848995, + "loss": 0.307, + "step": 2155 + }, + { + "epoch": 0.7961595273264401, + "grad_norm": 0.2779589295387268, + "learning_rate": 0.00014701317896292647, + "loss": 0.2874, + "step": 2156 + }, + { + "epoch": 0.7965288035450517, + "grad_norm": 0.2685941457748413, + "learning_rate": 0.00014698854538736298, + "loss": 0.365, + "step": 2157 + }, + { + "epoch": 0.7968980797636632, + "grad_norm": 0.29712870717048645, + "learning_rate": 0.0001469639118117995, + "loss": 0.3361, + "step": 2158 + }, + { + "epoch": 0.7972673559822747, + "grad_norm": 0.28480052947998047, + "learning_rate": 0.00014693927823623598, + "loss": 0.2941, + "step": 2159 + }, + { + "epoch": 0.7976366322008862, + "grad_norm": 0.3198976218700409, + "learning_rate": 0.0001469146446606725, + "loss": 0.308, + "step": 2160 + }, + { + "epoch": 0.7980059084194978, + "grad_norm": 0.2628573477268219, + "learning_rate": 0.000146890011085109, + "loss": 0.3338, + "step": 2161 + }, + { + "epoch": 0.7983751846381093, + "grad_norm": 0.2805553674697876, + "learning_rate": 0.00014686537750954553, + "loss": 0.33, + "step": 2162 + }, + { + "epoch": 0.7987444608567208, + "grad_norm": 0.27368399500846863, + "learning_rate": 0.00014684074393398201, + "loss": 0.2332, + "step": 2163 + }, + { + "epoch": 0.7991137370753324, + "grad_norm": 0.26735299825668335, + "learning_rate": 0.00014681611035841853, + "loss": 0.2686, + "step": 2164 + }, + { + "epoch": 0.7994830132939439, + "grad_norm": 0.2762780487537384, + "learning_rate": 0.00014679147678285504, + "loss": 0.329, + "step": 2165 + }, + { + "epoch": 0.7998522895125554, + "grad_norm": 0.226070836186409, + "learning_rate": 0.00014676684320729156, + "loss": 0.2878, + "step": 2166 + }, + { + "epoch": 0.8002215657311669, + "grad_norm": 0.2762400805950165, + "learning_rate": 0.00014674220963172805, + "loss": 0.3049, + "step": 2167 + }, + { + "epoch": 0.8005908419497785, + "grad_norm": 0.24946331977844238, + "learning_rate": 0.00014671757605616456, + "loss": 0.2936, + "step": 2168 + }, + { + "epoch": 0.80096011816839, + "grad_norm": 0.27430617809295654, + "learning_rate": 0.00014669294248060105, + "loss": 0.2892, + "step": 2169 + }, + { + "epoch": 0.8013293943870015, + "grad_norm": 0.2512323260307312, + "learning_rate": 0.0001466683089050376, + "loss": 0.3157, + "step": 2170 + }, + { + "epoch": 0.801698670605613, + "grad_norm": 0.281456857919693, + "learning_rate": 0.00014664367532947408, + "loss": 0.2917, + "step": 2171 + }, + { + "epoch": 0.8020679468242246, + "grad_norm": 0.2193906605243683, + "learning_rate": 0.0001466190417539106, + "loss": 0.2574, + "step": 2172 + }, + { + "epoch": 0.802437223042836, + "grad_norm": 0.24633151292800903, + "learning_rate": 0.00014659440817834708, + "loss": 0.3216, + "step": 2173 + }, + { + "epoch": 0.8028064992614475, + "grad_norm": 0.29831722378730774, + "learning_rate": 0.0001465697746027836, + "loss": 0.3443, + "step": 2174 + }, + { + "epoch": 0.803175775480059, + "grad_norm": 0.34024351835250854, + "learning_rate": 0.0001465451410272201, + "loss": 0.3164, + "step": 2175 + }, + { + "epoch": 0.8035450516986706, + "grad_norm": 0.21228238940238953, + "learning_rate": 0.00014652050745165662, + "loss": 0.2824, + "step": 2176 + }, + { + "epoch": 0.8039143279172821, + "grad_norm": 0.27609455585479736, + "learning_rate": 0.0001464958738760931, + "loss": 0.2675, + "step": 2177 + }, + { + "epoch": 0.8042836041358936, + "grad_norm": 0.2908269464969635, + "learning_rate": 0.00014647124030052962, + "loss": 0.3119, + "step": 2178 + }, + { + "epoch": 0.8046528803545052, + "grad_norm": 0.27265864610671997, + "learning_rate": 0.00014644660672496614, + "loss": 0.2827, + "step": 2179 + }, + { + "epoch": 0.8050221565731167, + "grad_norm": 0.2475481629371643, + "learning_rate": 0.00014642197314940265, + "loss": 0.2944, + "step": 2180 + }, + { + "epoch": 0.8053914327917282, + "grad_norm": 0.31200331449508667, + "learning_rate": 0.00014639733957383914, + "loss": 0.3448, + "step": 2181 + }, + { + "epoch": 0.8057607090103397, + "grad_norm": 0.22231833636760712, + "learning_rate": 0.00014637270599827566, + "loss": 0.2419, + "step": 2182 + }, + { + "epoch": 0.8061299852289513, + "grad_norm": 0.29735052585601807, + "learning_rate": 0.00014634807242271214, + "loss": 0.3147, + "step": 2183 + }, + { + "epoch": 0.8064992614475628, + "grad_norm": 0.25813624262809753, + "learning_rate": 0.00014632343884714869, + "loss": 0.2767, + "step": 2184 + }, + { + "epoch": 0.8068685376661743, + "grad_norm": 0.24348226189613342, + "learning_rate": 0.00014629880527158517, + "loss": 0.2406, + "step": 2185 + }, + { + "epoch": 0.8072378138847858, + "grad_norm": 0.27798759937286377, + "learning_rate": 0.0001462741716960217, + "loss": 0.2864, + "step": 2186 + }, + { + "epoch": 0.8076070901033974, + "grad_norm": 0.22812728583812714, + "learning_rate": 0.00014624953812045818, + "loss": 0.2787, + "step": 2187 + }, + { + "epoch": 0.8079763663220089, + "grad_norm": 0.3694125711917877, + "learning_rate": 0.0001462249045448947, + "loss": 0.3523, + "step": 2188 + }, + { + "epoch": 0.8083456425406204, + "grad_norm": 0.31433844566345215, + "learning_rate": 0.0001462002709693312, + "loss": 0.3086, + "step": 2189 + }, + { + "epoch": 0.808714918759232, + "grad_norm": 0.26247870922088623, + "learning_rate": 0.00014617563739376772, + "loss": 0.2405, + "step": 2190 + }, + { + "epoch": 0.8090841949778435, + "grad_norm": 0.21467307209968567, + "learning_rate": 0.0001461510038182042, + "loss": 0.2133, + "step": 2191 + }, + { + "epoch": 0.8094534711964549, + "grad_norm": 0.24723170697689056, + "learning_rate": 0.00014612637024264072, + "loss": 0.2694, + "step": 2192 + }, + { + "epoch": 0.8098227474150664, + "grad_norm": 0.25856903195381165, + "learning_rate": 0.00014610173666707724, + "loss": 0.2413, + "step": 2193 + }, + { + "epoch": 0.810192023633678, + "grad_norm": 0.25353342294692993, + "learning_rate": 0.00014607710309151375, + "loss": 0.3116, + "step": 2194 + }, + { + "epoch": 0.8105612998522895, + "grad_norm": 0.28898367285728455, + "learning_rate": 0.00014605246951595024, + "loss": 0.2921, + "step": 2195 + }, + { + "epoch": 0.810930576070901, + "grad_norm": 0.280935674905777, + "learning_rate": 0.00014602783594038675, + "loss": 0.3404, + "step": 2196 + }, + { + "epoch": 0.8112998522895125, + "grad_norm": 0.3128315210342407, + "learning_rate": 0.00014600320236482327, + "loss": 0.337, + "step": 2197 + }, + { + "epoch": 0.8116691285081241, + "grad_norm": 0.234243243932724, + "learning_rate": 0.00014597856878925978, + "loss": 0.2407, + "step": 2198 + }, + { + "epoch": 0.8120384047267356, + "grad_norm": 0.2224903255701065, + "learning_rate": 0.00014595393521369627, + "loss": 0.2266, + "step": 2199 + }, + { + "epoch": 0.8124076809453471, + "grad_norm": 0.31893783807754517, + "learning_rate": 0.00014592930163813278, + "loss": 0.3517, + "step": 2200 + }, + { + "epoch": 0.8124076809453471, + "eval_loss": 7.929659843444824, + "eval_runtime": 6.9144, + "eval_samples_per_second": 7.231, + "eval_steps_per_second": 1.012, + "step": 2200 + }, + { + "epoch": 0.8127769571639586, + "grad_norm": 0.29524412751197815, + "learning_rate": 0.00014590466806256927, + "loss": 0.254, + "step": 2201 + }, + { + "epoch": 0.8131462333825702, + "grad_norm": 0.21941427886486053, + "learning_rate": 0.0001458800344870058, + "loss": 0.2397, + "step": 2202 + }, + { + "epoch": 0.8135155096011817, + "grad_norm": 0.20782116055488586, + "learning_rate": 0.0001458554009114423, + "loss": 0.2523, + "step": 2203 + }, + { + "epoch": 0.8138847858197932, + "grad_norm": 0.32415536046028137, + "learning_rate": 0.00014583076733587882, + "loss": 0.2459, + "step": 2204 + }, + { + "epoch": 0.8142540620384048, + "grad_norm": 0.3032081127166748, + "learning_rate": 0.0001458061337603153, + "loss": 0.35, + "step": 2205 + }, + { + "epoch": 0.8146233382570163, + "grad_norm": 0.26680999994277954, + "learning_rate": 0.00014578150018475182, + "loss": 0.3346, + "step": 2206 + }, + { + "epoch": 0.8149926144756278, + "grad_norm": 0.25674813985824585, + "learning_rate": 0.00014575686660918833, + "loss": 0.2769, + "step": 2207 + }, + { + "epoch": 0.8153618906942393, + "grad_norm": 0.2930457592010498, + "learning_rate": 0.00014573223303362485, + "loss": 0.3358, + "step": 2208 + }, + { + "epoch": 0.8157311669128509, + "grad_norm": 0.30175071954727173, + "learning_rate": 0.00014570759945806133, + "loss": 0.3, + "step": 2209 + }, + { + "epoch": 0.8161004431314623, + "grad_norm": 0.23800230026245117, + "learning_rate": 0.00014568296588249785, + "loss": 0.2782, + "step": 2210 + }, + { + "epoch": 0.8164697193500738, + "grad_norm": 0.22994910180568695, + "learning_rate": 0.00014565833230693436, + "loss": 0.2469, + "step": 2211 + }, + { + "epoch": 0.8168389955686853, + "grad_norm": 0.2990546226501465, + "learning_rate": 0.00014563369873137088, + "loss": 0.3121, + "step": 2212 + }, + { + "epoch": 0.8172082717872969, + "grad_norm": 0.28766703605651855, + "learning_rate": 0.00014560906515580737, + "loss": 0.3175, + "step": 2213 + }, + { + "epoch": 0.8175775480059084, + "grad_norm": 0.26653093099594116, + "learning_rate": 0.00014558443158024388, + "loss": 0.3094, + "step": 2214 + }, + { + "epoch": 0.8179468242245199, + "grad_norm": 0.3461751639842987, + "learning_rate": 0.00014555979800468037, + "loss": 0.3309, + "step": 2215 + }, + { + "epoch": 0.8183161004431314, + "grad_norm": 0.23062889277935028, + "learning_rate": 0.0001455351644291169, + "loss": 0.2488, + "step": 2216 + }, + { + "epoch": 0.818685376661743, + "grad_norm": 0.3258536159992218, + "learning_rate": 0.0001455105308535534, + "loss": 0.2635, + "step": 2217 + }, + { + "epoch": 0.8190546528803545, + "grad_norm": 0.23981325328350067, + "learning_rate": 0.0001454858972779899, + "loss": 0.287, + "step": 2218 + }, + { + "epoch": 0.819423929098966, + "grad_norm": 0.2609763741493225, + "learning_rate": 0.0001454612637024264, + "loss": 0.3032, + "step": 2219 + }, + { + "epoch": 0.8197932053175776, + "grad_norm": 0.3009994924068451, + "learning_rate": 0.00014543663012686291, + "loss": 0.3265, + "step": 2220 + }, + { + "epoch": 0.8201624815361891, + "grad_norm": 0.2277035117149353, + "learning_rate": 0.00014541199655129943, + "loss": 0.2801, + "step": 2221 + }, + { + "epoch": 0.8205317577548006, + "grad_norm": 0.24814511835575104, + "learning_rate": 0.00014538736297573594, + "loss": 0.2501, + "step": 2222 + }, + { + "epoch": 0.8209010339734121, + "grad_norm": 0.2699408233165741, + "learning_rate": 0.00014536272940017243, + "loss": 0.2923, + "step": 2223 + }, + { + "epoch": 0.8212703101920237, + "grad_norm": 0.39552244544029236, + "learning_rate": 0.00014533809582460895, + "loss": 0.2887, + "step": 2224 + }, + { + "epoch": 0.8216395864106352, + "grad_norm": 0.25451284646987915, + "learning_rate": 0.00014531346224904546, + "loss": 0.2626, + "step": 2225 + }, + { + "epoch": 0.8220088626292467, + "grad_norm": 0.2528842091560364, + "learning_rate": 0.00014528882867348198, + "loss": 0.2951, + "step": 2226 + }, + { + "epoch": 0.8223781388478582, + "grad_norm": 0.2601645886898041, + "learning_rate": 0.00014526419509791846, + "loss": 0.2874, + "step": 2227 + }, + { + "epoch": 0.8227474150664698, + "grad_norm": 0.2379964292049408, + "learning_rate": 0.00014523956152235498, + "loss": 0.2562, + "step": 2228 + }, + { + "epoch": 0.8231166912850812, + "grad_norm": 0.23886893689632416, + "learning_rate": 0.0001452149279467915, + "loss": 0.2385, + "step": 2229 + }, + { + "epoch": 0.8234859675036927, + "grad_norm": 0.2497236579656601, + "learning_rate": 0.000145190294371228, + "loss": 0.2974, + "step": 2230 + }, + { + "epoch": 0.8238552437223042, + "grad_norm": 0.28172174096107483, + "learning_rate": 0.0001451656607956645, + "loss": 0.3276, + "step": 2231 + }, + { + "epoch": 0.8242245199409158, + "grad_norm": 0.28043049573898315, + "learning_rate": 0.000145141027220101, + "loss": 0.2736, + "step": 2232 + }, + { + "epoch": 0.8245937961595273, + "grad_norm": 0.20221546292304993, + "learning_rate": 0.0001451163936445375, + "loss": 0.2095, + "step": 2233 + }, + { + "epoch": 0.8249630723781388, + "grad_norm": 0.27188798785209656, + "learning_rate": 0.00014509176006897404, + "loss": 0.2577, + "step": 2234 + }, + { + "epoch": 0.8253323485967504, + "grad_norm": 0.2584059238433838, + "learning_rate": 0.00014506712649341053, + "loss": 0.3137, + "step": 2235 + }, + { + "epoch": 0.8257016248153619, + "grad_norm": 0.3068910539150238, + "learning_rate": 0.00014504249291784704, + "loss": 0.3392, + "step": 2236 + }, + { + "epoch": 0.8260709010339734, + "grad_norm": 0.3119933009147644, + "learning_rate": 0.00014501785934228353, + "loss": 0.3674, + "step": 2237 + }, + { + "epoch": 0.8264401772525849, + "grad_norm": 0.3034285306930542, + "learning_rate": 0.00014499322576672004, + "loss": 0.2988, + "step": 2238 + }, + { + "epoch": 0.8268094534711965, + "grad_norm": 0.35521337389945984, + "learning_rate": 0.00014496859219115656, + "loss": 0.3353, + "step": 2239 + }, + { + "epoch": 0.827178729689808, + "grad_norm": 0.23530933260917664, + "learning_rate": 0.00014494395861559307, + "loss": 0.2914, + "step": 2240 + }, + { + "epoch": 0.8275480059084195, + "grad_norm": 0.25276270508766174, + "learning_rate": 0.00014491932504002956, + "loss": 0.2663, + "step": 2241 + }, + { + "epoch": 0.827917282127031, + "grad_norm": 0.2694617807865143, + "learning_rate": 0.00014489469146446607, + "loss": 0.3339, + "step": 2242 + }, + { + "epoch": 0.8282865583456426, + "grad_norm": 0.2752767503261566, + "learning_rate": 0.0001448700578889026, + "loss": 0.308, + "step": 2243 + }, + { + "epoch": 0.8286558345642541, + "grad_norm": 0.24133189022541046, + "learning_rate": 0.0001448454243133391, + "loss": 0.3134, + "step": 2244 + }, + { + "epoch": 0.8290251107828656, + "grad_norm": 0.28474217653274536, + "learning_rate": 0.0001448207907377756, + "loss": 0.3143, + "step": 2245 + }, + { + "epoch": 0.829394387001477, + "grad_norm": 0.22678597271442413, + "learning_rate": 0.0001447961571622121, + "loss": 0.2509, + "step": 2246 + }, + { + "epoch": 0.8297636632200887, + "grad_norm": 0.2652204632759094, + "learning_rate": 0.0001447715235866486, + "loss": 0.2922, + "step": 2247 + }, + { + "epoch": 0.8301329394387001, + "grad_norm": 0.22950397431850433, + "learning_rate": 0.00014474689001108513, + "loss": 0.2744, + "step": 2248 + }, + { + "epoch": 0.8305022156573116, + "grad_norm": 0.25417378544807434, + "learning_rate": 0.00014472225643552162, + "loss": 0.3053, + "step": 2249 + }, + { + "epoch": 0.8308714918759232, + "grad_norm": 0.23306229710578918, + "learning_rate": 0.00014469762285995814, + "loss": 0.2824, + "step": 2250 + }, + { + "epoch": 0.8308714918759232, + "eval_loss": 8.08752727508545, + "eval_runtime": 6.9183, + "eval_samples_per_second": 7.227, + "eval_steps_per_second": 1.012, + "step": 2250 + }, + { + "epoch": 0.8312407680945347, + "grad_norm": 0.31129640340805054, + "learning_rate": 0.00014467298928439462, + "loss": 0.3866, + "step": 2251 + }, + { + "epoch": 0.8316100443131462, + "grad_norm": 0.27730610966682434, + "learning_rate": 0.00014464835570883114, + "loss": 0.3523, + "step": 2252 + }, + { + "epoch": 0.8319793205317577, + "grad_norm": 0.24633419513702393, + "learning_rate": 0.00014462372213326765, + "loss": 0.3021, + "step": 2253 + }, + { + "epoch": 0.8323485967503693, + "grad_norm": 0.22300489246845245, + "learning_rate": 0.00014459908855770417, + "loss": 0.2443, + "step": 2254 + }, + { + "epoch": 0.8327178729689808, + "grad_norm": 0.26045602560043335, + "learning_rate": 0.00014457445498214066, + "loss": 0.2526, + "step": 2255 + }, + { + "epoch": 0.8330871491875923, + "grad_norm": 0.24008513987064362, + "learning_rate": 0.00014454982140657717, + "loss": 0.252, + "step": 2256 + }, + { + "epoch": 0.8334564254062038, + "grad_norm": 0.23904789984226227, + "learning_rate": 0.00014452518783101369, + "loss": 0.2855, + "step": 2257 + }, + { + "epoch": 0.8338257016248154, + "grad_norm": 0.309533953666687, + "learning_rate": 0.0001445005542554502, + "loss": 0.3245, + "step": 2258 + }, + { + "epoch": 0.8341949778434269, + "grad_norm": 0.3368018567562103, + "learning_rate": 0.0001444759206798867, + "loss": 0.3244, + "step": 2259 + }, + { + "epoch": 0.8345642540620384, + "grad_norm": 0.2544728219509125, + "learning_rate": 0.0001444512871043232, + "loss": 0.2675, + "step": 2260 + }, + { + "epoch": 0.8349335302806499, + "grad_norm": 0.31026560068130493, + "learning_rate": 0.00014442665352875972, + "loss": 0.3542, + "step": 2261 + }, + { + "epoch": 0.8353028064992615, + "grad_norm": 0.2643549144268036, + "learning_rate": 0.00014440201995319623, + "loss": 0.2471, + "step": 2262 + }, + { + "epoch": 0.835672082717873, + "grad_norm": 0.2330327033996582, + "learning_rate": 0.00014437738637763272, + "loss": 0.2672, + "step": 2263 + }, + { + "epoch": 0.8360413589364845, + "grad_norm": 0.2380456179380417, + "learning_rate": 0.00014435275280206923, + "loss": 0.2217, + "step": 2264 + }, + { + "epoch": 0.8364106351550961, + "grad_norm": 0.2590167224407196, + "learning_rate": 0.00014432811922650572, + "loss": 0.2839, + "step": 2265 + }, + { + "epoch": 0.8367799113737076, + "grad_norm": 0.24498246610164642, + "learning_rate": 0.00014430348565094226, + "loss": 0.2569, + "step": 2266 + }, + { + "epoch": 0.837149187592319, + "grad_norm": 0.2888021469116211, + "learning_rate": 0.00014427885207537875, + "loss": 0.3086, + "step": 2267 + }, + { + "epoch": 0.8375184638109305, + "grad_norm": 0.2462167590856552, + "learning_rate": 0.00014425421849981526, + "loss": 0.2815, + "step": 2268 + }, + { + "epoch": 0.8378877400295421, + "grad_norm": 0.22932159900665283, + "learning_rate": 0.00014422958492425175, + "loss": 0.2684, + "step": 2269 + }, + { + "epoch": 0.8382570162481536, + "grad_norm": 0.2545166015625, + "learning_rate": 0.00014420495134868827, + "loss": 0.3159, + "step": 2270 + }, + { + "epoch": 0.8386262924667651, + "grad_norm": 0.25789421796798706, + "learning_rate": 0.00014418031777312478, + "loss": 0.2687, + "step": 2271 + }, + { + "epoch": 0.8389955686853766, + "grad_norm": 0.26509973406791687, + "learning_rate": 0.0001441556841975613, + "loss": 0.2555, + "step": 2272 + }, + { + "epoch": 0.8393648449039882, + "grad_norm": 0.28650057315826416, + "learning_rate": 0.00014413105062199778, + "loss": 0.2938, + "step": 2273 + }, + { + "epoch": 0.8397341211225997, + "grad_norm": 0.24216364324092865, + "learning_rate": 0.0001441064170464343, + "loss": 0.2711, + "step": 2274 + }, + { + "epoch": 0.8401033973412112, + "grad_norm": 0.257546067237854, + "learning_rate": 0.0001440817834708708, + "loss": 0.2777, + "step": 2275 + }, + { + "epoch": 0.8404726735598228, + "grad_norm": 0.2521352469921112, + "learning_rate": 0.00014405714989530733, + "loss": 0.2697, + "step": 2276 + }, + { + "epoch": 0.8408419497784343, + "grad_norm": 0.24463944137096405, + "learning_rate": 0.00014403251631974382, + "loss": 0.2816, + "step": 2277 + }, + { + "epoch": 0.8412112259970458, + "grad_norm": 0.241760715842247, + "learning_rate": 0.00014400788274418033, + "loss": 0.2574, + "step": 2278 + }, + { + "epoch": 0.8415805022156573, + "grad_norm": 0.31199437379837036, + "learning_rate": 0.00014398324916861682, + "loss": 0.3265, + "step": 2279 + }, + { + "epoch": 0.8419497784342689, + "grad_norm": 0.41356080770492554, + "learning_rate": 0.00014395861559305336, + "loss": 0.3583, + "step": 2280 + }, + { + "epoch": 0.8423190546528804, + "grad_norm": 0.28970956802368164, + "learning_rate": 0.00014393398201748985, + "loss": 0.3452, + "step": 2281 + }, + { + "epoch": 0.8426883308714919, + "grad_norm": 0.2849578559398651, + "learning_rate": 0.00014390934844192636, + "loss": 0.2644, + "step": 2282 + }, + { + "epoch": 0.8430576070901034, + "grad_norm": 0.263704389333725, + "learning_rate": 0.00014388471486636285, + "loss": 0.3247, + "step": 2283 + }, + { + "epoch": 0.843426883308715, + "grad_norm": 0.2463897466659546, + "learning_rate": 0.00014386008129079936, + "loss": 0.2921, + "step": 2284 + }, + { + "epoch": 0.8437961595273265, + "grad_norm": 0.22011560201644897, + "learning_rate": 0.00014383544771523588, + "loss": 0.2413, + "step": 2285 + }, + { + "epoch": 0.8441654357459379, + "grad_norm": 0.24087125062942505, + "learning_rate": 0.0001438108141396724, + "loss": 0.2607, + "step": 2286 + }, + { + "epoch": 0.8445347119645494, + "grad_norm": 0.29487496614456177, + "learning_rate": 0.00014378618056410888, + "loss": 0.2941, + "step": 2287 + }, + { + "epoch": 0.844903988183161, + "grad_norm": 0.22979193925857544, + "learning_rate": 0.0001437615469885454, + "loss": 0.2815, + "step": 2288 + }, + { + "epoch": 0.8452732644017725, + "grad_norm": 0.26312023401260376, + "learning_rate": 0.0001437369134129819, + "loss": 0.3177, + "step": 2289 + }, + { + "epoch": 0.845642540620384, + "grad_norm": 0.2808831036090851, + "learning_rate": 0.00014371227983741842, + "loss": 0.2733, + "step": 2290 + }, + { + "epoch": 0.8460118168389956, + "grad_norm": 0.2607739269733429, + "learning_rate": 0.0001436876462618549, + "loss": 0.2843, + "step": 2291 + }, + { + "epoch": 0.8463810930576071, + "grad_norm": 0.25114959478378296, + "learning_rate": 0.00014366301268629143, + "loss": 0.2732, + "step": 2292 + }, + { + "epoch": 0.8467503692762186, + "grad_norm": 0.26519036293029785, + "learning_rate": 0.00014363837911072794, + "loss": 0.2922, + "step": 2293 + }, + { + "epoch": 0.8471196454948301, + "grad_norm": 0.3005210757255554, + "learning_rate": 0.00014361374553516446, + "loss": 0.3429, + "step": 2294 + }, + { + "epoch": 0.8474889217134417, + "grad_norm": 0.31273654103279114, + "learning_rate": 0.00014358911195960094, + "loss": 0.3218, + "step": 2295 + }, + { + "epoch": 0.8478581979320532, + "grad_norm": 0.28099071979522705, + "learning_rate": 0.00014356447838403746, + "loss": 0.3233, + "step": 2296 + }, + { + "epoch": 0.8482274741506647, + "grad_norm": 0.2438063770532608, + "learning_rate": 0.00014353984480847395, + "loss": 0.2597, + "step": 2297 + }, + { + "epoch": 0.8485967503692762, + "grad_norm": 0.31336653232574463, + "learning_rate": 0.0001435152112329105, + "loss": 0.3208, + "step": 2298 + }, + { + "epoch": 0.8489660265878878, + "grad_norm": 0.2622755765914917, + "learning_rate": 0.00014349057765734697, + "loss": 0.3206, + "step": 2299 + }, + { + "epoch": 0.8493353028064993, + "grad_norm": 0.2713054120540619, + "learning_rate": 0.0001434659440817835, + "loss": 0.3138, + "step": 2300 + }, + { + "epoch": 0.8493353028064993, + "eval_loss": 8.21534538269043, + "eval_runtime": 6.9167, + "eval_samples_per_second": 7.229, + "eval_steps_per_second": 1.012, + "step": 2300 + }, + { + "epoch": 0.8497045790251108, + "grad_norm": 0.34781596064567566, + "learning_rate": 0.00014344131050621998, + "loss": 0.3756, + "step": 2301 + }, + { + "epoch": 0.8500738552437223, + "grad_norm": 0.2935175895690918, + "learning_rate": 0.0001434166769306565, + "loss": 0.3744, + "step": 2302 + }, + { + "epoch": 0.8504431314623339, + "grad_norm": 0.31657060980796814, + "learning_rate": 0.000143392043355093, + "loss": 0.3078, + "step": 2303 + }, + { + "epoch": 0.8508124076809453, + "grad_norm": 0.29735085368156433, + "learning_rate": 0.00014336740977952952, + "loss": 0.2331, + "step": 2304 + }, + { + "epoch": 0.8511816838995568, + "grad_norm": 0.2540007531642914, + "learning_rate": 0.000143342776203966, + "loss": 0.2605, + "step": 2305 + }, + { + "epoch": 0.8515509601181684, + "grad_norm": 0.3064590096473694, + "learning_rate": 0.00014331814262840252, + "loss": 0.3189, + "step": 2306 + }, + { + "epoch": 0.8519202363367799, + "grad_norm": 0.2287900745868683, + "learning_rate": 0.00014329350905283904, + "loss": 0.2469, + "step": 2307 + }, + { + "epoch": 0.8522895125553914, + "grad_norm": 0.2744688093662262, + "learning_rate": 0.00014326887547727555, + "loss": 0.3216, + "step": 2308 + }, + { + "epoch": 0.8526587887740029, + "grad_norm": 0.23625966906547546, + "learning_rate": 0.00014324424190171204, + "loss": 0.2695, + "step": 2309 + }, + { + "epoch": 0.8530280649926145, + "grad_norm": 0.31987616419792175, + "learning_rate": 0.00014321960832614855, + "loss": 0.2625, + "step": 2310 + }, + { + "epoch": 0.853397341211226, + "grad_norm": 0.2772734463214874, + "learning_rate": 0.00014319497475058504, + "loss": 0.3243, + "step": 2311 + }, + { + "epoch": 0.8537666174298375, + "grad_norm": 0.26590102910995483, + "learning_rate": 0.00014317034117502158, + "loss": 0.2881, + "step": 2312 + }, + { + "epoch": 0.854135893648449, + "grad_norm": 0.2178795337677002, + "learning_rate": 0.00014314570759945807, + "loss": 0.2568, + "step": 2313 + }, + { + "epoch": 0.8545051698670606, + "grad_norm": 0.30304235219955444, + "learning_rate": 0.00014312107402389459, + "loss": 0.3313, + "step": 2314 + }, + { + "epoch": 0.8548744460856721, + "grad_norm": 0.26502808928489685, + "learning_rate": 0.00014309644044833107, + "loss": 0.3302, + "step": 2315 + }, + { + "epoch": 0.8552437223042836, + "grad_norm": 0.28301286697387695, + "learning_rate": 0.0001430718068727676, + "loss": 0.2883, + "step": 2316 + }, + { + "epoch": 0.8556129985228951, + "grad_norm": 0.2148820459842682, + "learning_rate": 0.0001430471732972041, + "loss": 0.2613, + "step": 2317 + }, + { + "epoch": 0.8559822747415067, + "grad_norm": 0.32251638174057007, + "learning_rate": 0.00014302253972164062, + "loss": 0.2765, + "step": 2318 + }, + { + "epoch": 0.8563515509601182, + "grad_norm": 0.24901001155376434, + "learning_rate": 0.0001429979061460771, + "loss": 0.2988, + "step": 2319 + }, + { + "epoch": 0.8567208271787297, + "grad_norm": 0.2393132746219635, + "learning_rate": 0.00014297327257051362, + "loss": 0.2341, + "step": 2320 + }, + { + "epoch": 0.8570901033973413, + "grad_norm": 0.2816013991832733, + "learning_rate": 0.00014294863899495013, + "loss": 0.3013, + "step": 2321 + }, + { + "epoch": 0.8574593796159528, + "grad_norm": 0.22826966643333435, + "learning_rate": 0.00014292400541938665, + "loss": 0.2397, + "step": 2322 + }, + { + "epoch": 0.8578286558345642, + "grad_norm": 0.24233514070510864, + "learning_rate": 0.00014289937184382314, + "loss": 0.2762, + "step": 2323 + }, + { + "epoch": 0.8581979320531757, + "grad_norm": 0.2514684498310089, + "learning_rate": 0.00014287473826825965, + "loss": 0.2672, + "step": 2324 + }, + { + "epoch": 0.8585672082717873, + "grad_norm": 0.33941999077796936, + "learning_rate": 0.00014285010469269614, + "loss": 0.2427, + "step": 2325 + }, + { + "epoch": 0.8589364844903988, + "grad_norm": 0.26198750734329224, + "learning_rate": 0.00014282547111713265, + "loss": 0.3104, + "step": 2326 + }, + { + "epoch": 0.8593057607090103, + "grad_norm": 0.2418547421693802, + "learning_rate": 0.00014280083754156917, + "loss": 0.2611, + "step": 2327 + }, + { + "epoch": 0.8596750369276218, + "grad_norm": 0.3034150004386902, + "learning_rate": 0.00014277620396600566, + "loss": 0.3401, + "step": 2328 + }, + { + "epoch": 0.8600443131462334, + "grad_norm": 0.24259145557880402, + "learning_rate": 0.00014275157039044217, + "loss": 0.2604, + "step": 2329 + }, + { + "epoch": 0.8604135893648449, + "grad_norm": 0.25469255447387695, + "learning_rate": 0.00014272693681487868, + "loss": 0.286, + "step": 2330 + }, + { + "epoch": 0.8607828655834564, + "grad_norm": 0.30289265513420105, + "learning_rate": 0.0001427023032393152, + "loss": 0.3408, + "step": 2331 + }, + { + "epoch": 0.8611521418020679, + "grad_norm": 0.2945447862148285, + "learning_rate": 0.0001426776696637517, + "loss": 0.3414, + "step": 2332 + }, + { + "epoch": 0.8615214180206795, + "grad_norm": 0.21897754073143005, + "learning_rate": 0.0001426530360881882, + "loss": 0.2083, + "step": 2333 + }, + { + "epoch": 0.861890694239291, + "grad_norm": 0.3115282952785492, + "learning_rate": 0.00014262840251262472, + "loss": 0.3034, + "step": 2334 + }, + { + "epoch": 0.8622599704579025, + "grad_norm": 0.22728799283504486, + "learning_rate": 0.00014260376893706123, + "loss": 0.2601, + "step": 2335 + }, + { + "epoch": 0.8626292466765141, + "grad_norm": 0.2540660500526428, + "learning_rate": 0.00014257913536149772, + "loss": 0.2887, + "step": 2336 + }, + { + "epoch": 0.8629985228951256, + "grad_norm": 0.32656875252723694, + "learning_rate": 0.00014255450178593423, + "loss": 0.367, + "step": 2337 + }, + { + "epoch": 0.8633677991137371, + "grad_norm": 0.2668778598308563, + "learning_rate": 0.00014252986821037072, + "loss": 0.2458, + "step": 2338 + }, + { + "epoch": 0.8637370753323486, + "grad_norm": 0.23212186992168427, + "learning_rate": 0.00014250523463480726, + "loss": 0.2629, + "step": 2339 + }, + { + "epoch": 0.8641063515509602, + "grad_norm": 0.2713010609149933, + "learning_rate": 0.00014248060105924375, + "loss": 0.284, + "step": 2340 + }, + { + "epoch": 0.8644756277695717, + "grad_norm": 0.24616140127182007, + "learning_rate": 0.00014245596748368026, + "loss": 0.3017, + "step": 2341 + }, + { + "epoch": 0.8648449039881831, + "grad_norm": 0.26833686232566833, + "learning_rate": 0.00014243133390811675, + "loss": 0.3257, + "step": 2342 + }, + { + "epoch": 0.8652141802067946, + "grad_norm": 0.3410303294658661, + "learning_rate": 0.00014240670033255327, + "loss": 0.2896, + "step": 2343 + }, + { + "epoch": 0.8655834564254062, + "grad_norm": 0.24566622078418732, + "learning_rate": 0.00014238206675698978, + "loss": 0.248, + "step": 2344 + }, + { + "epoch": 0.8659527326440177, + "grad_norm": 0.21404078602790833, + "learning_rate": 0.0001423574331814263, + "loss": 0.2429, + "step": 2345 + }, + { + "epoch": 0.8663220088626292, + "grad_norm": 0.29214203357696533, + "learning_rate": 0.00014233279960586278, + "loss": 0.3101, + "step": 2346 + }, + { + "epoch": 0.8666912850812407, + "grad_norm": 0.33117231726646423, + "learning_rate": 0.0001423081660302993, + "loss": 0.2753, + "step": 2347 + }, + { + "epoch": 0.8670605612998523, + "grad_norm": 0.24536119401454926, + "learning_rate": 0.0001422835324547358, + "loss": 0.2933, + "step": 2348 + }, + { + "epoch": 0.8674298375184638, + "grad_norm": 0.2663596570491791, + "learning_rate": 0.00014225889887917233, + "loss": 0.2914, + "step": 2349 + }, + { + "epoch": 0.8677991137370753, + "grad_norm": 0.2503300607204437, + "learning_rate": 0.00014223426530360881, + "loss": 0.2491, + "step": 2350 + }, + { + "epoch": 0.8677991137370753, + "eval_loss": 8.249739646911621, + "eval_runtime": 6.9172, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 1.012, + "step": 2350 + }, + { + "epoch": 0.8681683899556869, + "grad_norm": 0.2165781557559967, + "learning_rate": 0.00014220963172804533, + "loss": 0.264, + "step": 2351 + }, + { + "epoch": 0.8685376661742984, + "grad_norm": 0.24416819214820862, + "learning_rate": 0.00014218499815248182, + "loss": 0.2588, + "step": 2352 + }, + { + "epoch": 0.8689069423929099, + "grad_norm": 0.2668074071407318, + "learning_rate": 0.00014216036457691836, + "loss": 0.312, + "step": 2353 + }, + { + "epoch": 0.8692762186115214, + "grad_norm": 0.35705941915512085, + "learning_rate": 0.00014213573100135485, + "loss": 0.3288, + "step": 2354 + }, + { + "epoch": 0.869645494830133, + "grad_norm": 0.24707238376140594, + "learning_rate": 0.00014211109742579136, + "loss": 0.2508, + "step": 2355 + }, + { + "epoch": 0.8700147710487445, + "grad_norm": 0.224946990609169, + "learning_rate": 0.00014208646385022785, + "loss": 0.2401, + "step": 2356 + }, + { + "epoch": 0.870384047267356, + "grad_norm": 0.2506362199783325, + "learning_rate": 0.00014206183027466436, + "loss": 0.2481, + "step": 2357 + }, + { + "epoch": 0.8707533234859675, + "grad_norm": 0.31596046686172485, + "learning_rate": 0.00014203719669910088, + "loss": 0.2812, + "step": 2358 + }, + { + "epoch": 0.8711225997045791, + "grad_norm": 0.21078452467918396, + "learning_rate": 0.0001420125631235374, + "loss": 0.2187, + "step": 2359 + }, + { + "epoch": 0.8714918759231906, + "grad_norm": 0.2589664161205292, + "learning_rate": 0.00014198792954797388, + "loss": 0.2932, + "step": 2360 + }, + { + "epoch": 0.871861152141802, + "grad_norm": 0.19735625386238098, + "learning_rate": 0.0001419632959724104, + "loss": 0.2307, + "step": 2361 + }, + { + "epoch": 0.8722304283604135, + "grad_norm": 0.2740657329559326, + "learning_rate": 0.0001419386623968469, + "loss": 0.2729, + "step": 2362 + }, + { + "epoch": 0.8725997045790251, + "grad_norm": 0.2743399143218994, + "learning_rate": 0.00014191402882128342, + "loss": 0.2399, + "step": 2363 + }, + { + "epoch": 0.8729689807976366, + "grad_norm": 0.25327354669570923, + "learning_rate": 0.0001418893952457199, + "loss": 0.2613, + "step": 2364 + }, + { + "epoch": 0.8733382570162481, + "grad_norm": 0.269949734210968, + "learning_rate": 0.00014186476167015643, + "loss": 0.2703, + "step": 2365 + }, + { + "epoch": 0.8737075332348597, + "grad_norm": 0.26192906498908997, + "learning_rate": 0.00014184012809459294, + "loss": 0.2633, + "step": 2366 + }, + { + "epoch": 0.8740768094534712, + "grad_norm": 0.25830745697021484, + "learning_rate": 0.00014181549451902946, + "loss": 0.2422, + "step": 2367 + }, + { + "epoch": 0.8744460856720827, + "grad_norm": 0.25160229206085205, + "learning_rate": 0.00014179086094346594, + "loss": 0.2789, + "step": 2368 + }, + { + "epoch": 0.8748153618906942, + "grad_norm": 0.27158504724502563, + "learning_rate": 0.00014176622736790246, + "loss": 0.3141, + "step": 2369 + }, + { + "epoch": 0.8751846381093058, + "grad_norm": 0.24612627923488617, + "learning_rate": 0.00014174159379233894, + "loss": 0.2326, + "step": 2370 + }, + { + "epoch": 0.8755539143279173, + "grad_norm": 0.35965976119041443, + "learning_rate": 0.0001417169602167755, + "loss": 0.2916, + "step": 2371 + }, + { + "epoch": 0.8759231905465288, + "grad_norm": 0.2359815090894699, + "learning_rate": 0.00014169232664121197, + "loss": 0.29, + "step": 2372 + }, + { + "epoch": 0.8762924667651403, + "grad_norm": 0.22954469919204712, + "learning_rate": 0.0001416676930656485, + "loss": 0.2327, + "step": 2373 + }, + { + "epoch": 0.8766617429837519, + "grad_norm": 0.29345446825027466, + "learning_rate": 0.00014164305949008498, + "loss": 0.296, + "step": 2374 + }, + { + "epoch": 0.8770310192023634, + "grad_norm": 0.3087807297706604, + "learning_rate": 0.0001416184259145215, + "loss": 0.3297, + "step": 2375 + }, + { + "epoch": 0.8774002954209749, + "grad_norm": 0.27971455454826355, + "learning_rate": 0.000141593792338958, + "loss": 0.3198, + "step": 2376 + }, + { + "epoch": 0.8777695716395865, + "grad_norm": 0.3473670184612274, + "learning_rate": 0.00014156915876339452, + "loss": 0.2965, + "step": 2377 + }, + { + "epoch": 0.878138847858198, + "grad_norm": 0.2651292383670807, + "learning_rate": 0.000141544525187831, + "loss": 0.2456, + "step": 2378 + }, + { + "epoch": 0.8785081240768094, + "grad_norm": 0.28210899233818054, + "learning_rate": 0.00014151989161226752, + "loss": 0.3266, + "step": 2379 + }, + { + "epoch": 0.8788774002954209, + "grad_norm": 0.25558584928512573, + "learning_rate": 0.00014149525803670404, + "loss": 0.2594, + "step": 2380 + }, + { + "epoch": 0.8792466765140325, + "grad_norm": 0.21020004153251648, + "learning_rate": 0.00014147062446114055, + "loss": 0.256, + "step": 2381 + }, + { + "epoch": 0.879615952732644, + "grad_norm": 0.34587594866752625, + "learning_rate": 0.00014144599088557704, + "loss": 0.3241, + "step": 2382 + }, + { + "epoch": 0.8799852289512555, + "grad_norm": 0.303607702255249, + "learning_rate": 0.00014142135731001355, + "loss": 0.2834, + "step": 2383 + }, + { + "epoch": 0.880354505169867, + "grad_norm": 0.2405773252248764, + "learning_rate": 0.00014139672373445004, + "loss": 0.2921, + "step": 2384 + }, + { + "epoch": 0.8807237813884786, + "grad_norm": 0.2870854437351227, + "learning_rate": 0.00014137209015888658, + "loss": 0.2654, + "step": 2385 + }, + { + "epoch": 0.8810930576070901, + "grad_norm": 0.26790541410446167, + "learning_rate": 0.00014134745658332307, + "loss": 0.259, + "step": 2386 + }, + { + "epoch": 0.8814623338257016, + "grad_norm": 0.2772853672504425, + "learning_rate": 0.00014132282300775959, + "loss": 0.3069, + "step": 2387 + }, + { + "epoch": 0.8818316100443131, + "grad_norm": 0.28792527318000793, + "learning_rate": 0.00014129818943219607, + "loss": 0.2541, + "step": 2388 + }, + { + "epoch": 0.8822008862629247, + "grad_norm": 0.2280530333518982, + "learning_rate": 0.0001412735558566326, + "loss": 0.2447, + "step": 2389 + }, + { + "epoch": 0.8825701624815362, + "grad_norm": 0.33198174834251404, + "learning_rate": 0.0001412489222810691, + "loss": 0.2978, + "step": 2390 + }, + { + "epoch": 0.8829394387001477, + "grad_norm": 0.30945661664009094, + "learning_rate": 0.00014122428870550562, + "loss": 0.3398, + "step": 2391 + }, + { + "epoch": 0.8833087149187593, + "grad_norm": 0.27216172218322754, + "learning_rate": 0.0001411996551299421, + "loss": 0.2522, + "step": 2392 + }, + { + "epoch": 0.8836779911373708, + "grad_norm": 0.2583320140838623, + "learning_rate": 0.00014117502155437862, + "loss": 0.2537, + "step": 2393 + }, + { + "epoch": 0.8840472673559823, + "grad_norm": 0.26584291458129883, + "learning_rate": 0.00014115038797881513, + "loss": 0.2749, + "step": 2394 + }, + { + "epoch": 0.8844165435745938, + "grad_norm": 0.2255135029554367, + "learning_rate": 0.00014112575440325165, + "loss": 0.2749, + "step": 2395 + }, + { + "epoch": 0.8847858197932054, + "grad_norm": 0.2680929899215698, + "learning_rate": 0.00014110112082768814, + "loss": 0.3037, + "step": 2396 + }, + { + "epoch": 0.8851550960118169, + "grad_norm": 0.26319649815559387, + "learning_rate": 0.00014107648725212465, + "loss": 0.285, + "step": 2397 + }, + { + "epoch": 0.8855243722304283, + "grad_norm": 0.31821325421333313, + "learning_rate": 0.00014105185367656116, + "loss": 0.3745, + "step": 2398 + }, + { + "epoch": 0.8858936484490398, + "grad_norm": 0.289172500371933, + "learning_rate": 0.00014102722010099768, + "loss": 0.3399, + "step": 2399 + }, + { + "epoch": 0.8862629246676514, + "grad_norm": 0.22171613574028015, + "learning_rate": 0.00014100258652543417, + "loss": 0.2426, + "step": 2400 + }, + { + "epoch": 0.8862629246676514, + "eval_loss": 8.158534049987793, + "eval_runtime": 6.9213, + "eval_samples_per_second": 7.224, + "eval_steps_per_second": 1.011, + "step": 2400 + }, + { + "epoch": 0.8866322008862629, + "grad_norm": 0.28381603956222534, + "learning_rate": 0.00014097795294987068, + "loss": 0.3243, + "step": 2401 + }, + { + "epoch": 0.8870014771048744, + "grad_norm": 0.2581366300582886, + "learning_rate": 0.00014095331937430717, + "loss": 0.253, + "step": 2402 + }, + { + "epoch": 0.8873707533234859, + "grad_norm": 0.24550338089466095, + "learning_rate": 0.0001409286857987437, + "loss": 0.2489, + "step": 2403 + }, + { + "epoch": 0.8877400295420975, + "grad_norm": 0.2996337413787842, + "learning_rate": 0.0001409040522231802, + "loss": 0.3234, + "step": 2404 + }, + { + "epoch": 0.888109305760709, + "grad_norm": 0.29904499650001526, + "learning_rate": 0.0001408794186476167, + "loss": 0.3064, + "step": 2405 + }, + { + "epoch": 0.8884785819793205, + "grad_norm": 0.2337605506181717, + "learning_rate": 0.0001408547850720532, + "loss": 0.2252, + "step": 2406 + }, + { + "epoch": 0.8888478581979321, + "grad_norm": 0.2514759302139282, + "learning_rate": 0.00014083015149648972, + "loss": 0.2693, + "step": 2407 + }, + { + "epoch": 0.8892171344165436, + "grad_norm": 0.21925853192806244, + "learning_rate": 0.00014080551792092623, + "loss": 0.2755, + "step": 2408 + }, + { + "epoch": 0.8895864106351551, + "grad_norm": 0.27189895510673523, + "learning_rate": 0.00014078088434536274, + "loss": 0.2721, + "step": 2409 + }, + { + "epoch": 0.8899556868537666, + "grad_norm": 0.2939663529396057, + "learning_rate": 0.00014075625076979923, + "loss": 0.2852, + "step": 2410 + }, + { + "epoch": 0.8903249630723782, + "grad_norm": 0.26466068625450134, + "learning_rate": 0.00014073161719423575, + "loss": 0.2798, + "step": 2411 + }, + { + "epoch": 0.8906942392909897, + "grad_norm": 0.24543273448944092, + "learning_rate": 0.00014070698361867226, + "loss": 0.256, + "step": 2412 + }, + { + "epoch": 0.8910635155096012, + "grad_norm": 0.24231398105621338, + "learning_rate": 0.00014068235004310878, + "loss": 0.2416, + "step": 2413 + }, + { + "epoch": 0.8914327917282127, + "grad_norm": 0.332803875207901, + "learning_rate": 0.00014065771646754526, + "loss": 0.2865, + "step": 2414 + }, + { + "epoch": 0.8918020679468243, + "grad_norm": 0.2219744771718979, + "learning_rate": 0.00014063308289198178, + "loss": 0.2357, + "step": 2415 + }, + { + "epoch": 0.8921713441654358, + "grad_norm": 0.2557753920555115, + "learning_rate": 0.00014060844931641827, + "loss": 0.273, + "step": 2416 + }, + { + "epoch": 0.8925406203840472, + "grad_norm": 0.302416056394577, + "learning_rate": 0.0001405838157408548, + "loss": 0.2947, + "step": 2417 + }, + { + "epoch": 0.8929098966026587, + "grad_norm": 0.23148083686828613, + "learning_rate": 0.0001405591821652913, + "loss": 0.2527, + "step": 2418 + }, + { + "epoch": 0.8932791728212703, + "grad_norm": 0.23304253816604614, + "learning_rate": 0.0001405345485897278, + "loss": 0.2793, + "step": 2419 + }, + { + "epoch": 0.8936484490398818, + "grad_norm": 0.2863101363182068, + "learning_rate": 0.0001405099150141643, + "loss": 0.3062, + "step": 2420 + }, + { + "epoch": 0.8940177252584933, + "grad_norm": 0.26595038175582886, + "learning_rate": 0.0001404852814386008, + "loss": 0.2826, + "step": 2421 + }, + { + "epoch": 0.8943870014771049, + "grad_norm": 0.24915006756782532, + "learning_rate": 0.00014046064786303733, + "loss": 0.241, + "step": 2422 + }, + { + "epoch": 0.8947562776957164, + "grad_norm": 0.2529863715171814, + "learning_rate": 0.00014043601428747384, + "loss": 0.2786, + "step": 2423 + }, + { + "epoch": 0.8951255539143279, + "grad_norm": 0.2361615151166916, + "learning_rate": 0.00014041138071191033, + "loss": 0.2602, + "step": 2424 + }, + { + "epoch": 0.8954948301329394, + "grad_norm": 0.30985426902770996, + "learning_rate": 0.00014038674713634684, + "loss": 0.2843, + "step": 2425 + }, + { + "epoch": 0.895864106351551, + "grad_norm": 0.20042794942855835, + "learning_rate": 0.00014036211356078336, + "loss": 0.2196, + "step": 2426 + }, + { + "epoch": 0.8962333825701625, + "grad_norm": 0.21849043667316437, + "learning_rate": 0.00014033747998521987, + "loss": 0.2326, + "step": 2427 + }, + { + "epoch": 0.896602658788774, + "grad_norm": 0.27464914321899414, + "learning_rate": 0.00014031284640965636, + "loss": 0.3062, + "step": 2428 + }, + { + "epoch": 0.8969719350073855, + "grad_norm": 0.35907840728759766, + "learning_rate": 0.00014028821283409287, + "loss": 0.3423, + "step": 2429 + }, + { + "epoch": 0.8973412112259971, + "grad_norm": 0.25261354446411133, + "learning_rate": 0.0001402635792585294, + "loss": 0.2649, + "step": 2430 + }, + { + "epoch": 0.8977104874446086, + "grad_norm": 0.2667701244354248, + "learning_rate": 0.0001402389456829659, + "loss": 0.2803, + "step": 2431 + }, + { + "epoch": 0.8980797636632201, + "grad_norm": 0.25323063135147095, + "learning_rate": 0.0001402143121074024, + "loss": 0.255, + "step": 2432 + }, + { + "epoch": 0.8984490398818316, + "grad_norm": 0.2831653654575348, + "learning_rate": 0.0001401896785318389, + "loss": 0.2913, + "step": 2433 + }, + { + "epoch": 0.8988183161004432, + "grad_norm": 0.23538748919963837, + "learning_rate": 0.0001401650449562754, + "loss": 0.2557, + "step": 2434 + }, + { + "epoch": 0.8991875923190547, + "grad_norm": 0.26506251096725464, + "learning_rate": 0.00014014041138071194, + "loss": 0.2313, + "step": 2435 + }, + { + "epoch": 0.8995568685376661, + "grad_norm": 0.24521896243095398, + "learning_rate": 0.00014011577780514842, + "loss": 0.2766, + "step": 2436 + }, + { + "epoch": 0.8999261447562777, + "grad_norm": 0.25493261218070984, + "learning_rate": 0.00014009114422958494, + "loss": 0.2539, + "step": 2437 + }, + { + "epoch": 0.9002954209748892, + "grad_norm": 0.2945232093334198, + "learning_rate": 0.00014006651065402143, + "loss": 0.3322, + "step": 2438 + }, + { + "epoch": 0.9006646971935007, + "grad_norm": 0.21371427178382874, + "learning_rate": 0.00014004187707845794, + "loss": 0.2165, + "step": 2439 + }, + { + "epoch": 0.9010339734121122, + "grad_norm": 0.27180060744285583, + "learning_rate": 0.00014001724350289445, + "loss": 0.2572, + "step": 2440 + }, + { + "epoch": 0.9014032496307238, + "grad_norm": 0.3545316457748413, + "learning_rate": 0.00013999260992733097, + "loss": 0.3267, + "step": 2441 + }, + { + "epoch": 0.9017725258493353, + "grad_norm": 0.23242874443531036, + "learning_rate": 0.00013996797635176746, + "loss": 0.2364, + "step": 2442 + }, + { + "epoch": 0.9021418020679468, + "grad_norm": 0.28296172618865967, + "learning_rate": 0.00013994334277620397, + "loss": 0.3141, + "step": 2443 + }, + { + "epoch": 0.9025110782865583, + "grad_norm": 0.29769816994667053, + "learning_rate": 0.00013991870920064049, + "loss": 0.3688, + "step": 2444 + }, + { + "epoch": 0.9028803545051699, + "grad_norm": 0.2693830132484436, + "learning_rate": 0.000139894075625077, + "loss": 0.2902, + "step": 2445 + }, + { + "epoch": 0.9032496307237814, + "grad_norm": 0.27015113830566406, + "learning_rate": 0.0001398694420495135, + "loss": 0.26, + "step": 2446 + }, + { + "epoch": 0.9036189069423929, + "grad_norm": 0.3325751721858978, + "learning_rate": 0.00013984480847395, + "loss": 0.3425, + "step": 2447 + }, + { + "epoch": 0.9039881831610044, + "grad_norm": 0.24994274973869324, + "learning_rate": 0.0001398201748983865, + "loss": 0.3008, + "step": 2448 + }, + { + "epoch": 0.904357459379616, + "grad_norm": 0.23159727454185486, + "learning_rate": 0.00013979554132282303, + "loss": 0.2487, + "step": 2449 + }, + { + "epoch": 0.9047267355982275, + "grad_norm": 0.24654342234134674, + "learning_rate": 0.00013977090774725952, + "loss": 0.3395, + "step": 2450 + }, + { + "epoch": 0.9047267355982275, + "eval_loss": 8.296114921569824, + "eval_runtime": 6.9244, + "eval_samples_per_second": 7.221, + "eval_steps_per_second": 1.011, + "step": 2450 + }, + { + "epoch": 0.905096011816839, + "grad_norm": 0.2912880480289459, + "learning_rate": 0.00013974627417169603, + "loss": 0.2958, + "step": 2451 + }, + { + "epoch": 0.9054652880354506, + "grad_norm": 0.24700932204723358, + "learning_rate": 0.00013972164059613252, + "loss": 0.3166, + "step": 2452 + }, + { + "epoch": 0.9058345642540621, + "grad_norm": 0.23382841050624847, + "learning_rate": 0.00013969700702056904, + "loss": 0.2644, + "step": 2453 + }, + { + "epoch": 0.9062038404726735, + "grad_norm": 0.3532010316848755, + "learning_rate": 0.00013967237344500555, + "loss": 0.3257, + "step": 2454 + }, + { + "epoch": 0.906573116691285, + "grad_norm": 0.22511905431747437, + "learning_rate": 0.00013964773986944207, + "loss": 0.2228, + "step": 2455 + }, + { + "epoch": 0.9069423929098966, + "grad_norm": 0.25986629724502563, + "learning_rate": 0.00013962310629387855, + "loss": 0.2898, + "step": 2456 + }, + { + "epoch": 0.9073116691285081, + "grad_norm": 0.21695490181446075, + "learning_rate": 0.00013959847271831507, + "loss": 0.2555, + "step": 2457 + }, + { + "epoch": 0.9076809453471196, + "grad_norm": 0.32843661308288574, + "learning_rate": 0.00013957383914275158, + "loss": 0.3191, + "step": 2458 + }, + { + "epoch": 0.9080502215657311, + "grad_norm": 0.27170658111572266, + "learning_rate": 0.0001395492055671881, + "loss": 0.2873, + "step": 2459 + }, + { + "epoch": 0.9084194977843427, + "grad_norm": 0.24585942924022675, + "learning_rate": 0.00013952457199162458, + "loss": 0.2608, + "step": 2460 + }, + { + "epoch": 0.9087887740029542, + "grad_norm": 0.2840253710746765, + "learning_rate": 0.0001394999384160611, + "loss": 0.3466, + "step": 2461 + }, + { + "epoch": 0.9091580502215657, + "grad_norm": 0.24581150710582733, + "learning_rate": 0.0001394753048404976, + "loss": 0.2663, + "step": 2462 + }, + { + "epoch": 0.9095273264401772, + "grad_norm": 0.25381600856781006, + "learning_rate": 0.00013945067126493413, + "loss": 0.3024, + "step": 2463 + }, + { + "epoch": 0.9098966026587888, + "grad_norm": 0.2941172420978546, + "learning_rate": 0.00013942603768937062, + "loss": 0.2724, + "step": 2464 + }, + { + "epoch": 0.9102658788774003, + "grad_norm": 0.2804733216762543, + "learning_rate": 0.00013940140411380713, + "loss": 0.254, + "step": 2465 + }, + { + "epoch": 0.9106351550960118, + "grad_norm": 0.303742915391922, + "learning_rate": 0.00013937677053824362, + "loss": 0.3894, + "step": 2466 + }, + { + "epoch": 0.9110044313146234, + "grad_norm": 0.3048993647098541, + "learning_rate": 0.00013935213696268013, + "loss": 0.3628, + "step": 2467 + }, + { + "epoch": 0.9113737075332349, + "grad_norm": 0.23515570163726807, + "learning_rate": 0.00013932750338711665, + "loss": 0.2361, + "step": 2468 + }, + { + "epoch": 0.9117429837518464, + "grad_norm": 0.25261393189430237, + "learning_rate": 0.00013930286981155316, + "loss": 0.2796, + "step": 2469 + }, + { + "epoch": 0.9121122599704579, + "grad_norm": 0.27003705501556396, + "learning_rate": 0.00013927823623598965, + "loss": 0.2978, + "step": 2470 + }, + { + "epoch": 0.9124815361890695, + "grad_norm": 0.308946430683136, + "learning_rate": 0.00013925360266042616, + "loss": 0.2673, + "step": 2471 + }, + { + "epoch": 0.912850812407681, + "grad_norm": 0.31486308574676514, + "learning_rate": 0.00013922896908486268, + "loss": 0.3437, + "step": 2472 + }, + { + "epoch": 0.9132200886262924, + "grad_norm": 0.2895006835460663, + "learning_rate": 0.0001392043355092992, + "loss": 0.2956, + "step": 2473 + }, + { + "epoch": 0.9135893648449039, + "grad_norm": 0.2763524651527405, + "learning_rate": 0.00013917970193373568, + "loss": 0.3122, + "step": 2474 + }, + { + "epoch": 0.9139586410635155, + "grad_norm": 0.2455618977546692, + "learning_rate": 0.0001391550683581722, + "loss": 0.2933, + "step": 2475 + }, + { + "epoch": 0.914327917282127, + "grad_norm": 0.2520411014556885, + "learning_rate": 0.0001391304347826087, + "loss": 0.2194, + "step": 2476 + }, + { + "epoch": 0.9146971935007385, + "grad_norm": 0.2278340458869934, + "learning_rate": 0.00013910580120704523, + "loss": 0.2667, + "step": 2477 + }, + { + "epoch": 0.9150664697193501, + "grad_norm": 0.31450703740119934, + "learning_rate": 0.0001390811676314817, + "loss": 0.4058, + "step": 2478 + }, + { + "epoch": 0.9154357459379616, + "grad_norm": 0.26386645436286926, + "learning_rate": 0.00013905653405591823, + "loss": 0.3052, + "step": 2479 + }, + { + "epoch": 0.9158050221565731, + "grad_norm": 0.21750874817371368, + "learning_rate": 0.00013903190048035471, + "loss": 0.2374, + "step": 2480 + }, + { + "epoch": 0.9161742983751846, + "grad_norm": 0.22775046527385712, + "learning_rate": 0.00013900726690479126, + "loss": 0.2431, + "step": 2481 + }, + { + "epoch": 0.9165435745937962, + "grad_norm": 0.2875349223613739, + "learning_rate": 0.00013898263332922774, + "loss": 0.2921, + "step": 2482 + }, + { + "epoch": 0.9169128508124077, + "grad_norm": 0.24682042002677917, + "learning_rate": 0.00013895799975366426, + "loss": 0.2657, + "step": 2483 + }, + { + "epoch": 0.9172821270310192, + "grad_norm": 0.29444950819015503, + "learning_rate": 0.00013893336617810075, + "loss": 0.2957, + "step": 2484 + }, + { + "epoch": 0.9176514032496307, + "grad_norm": 0.3001859486103058, + "learning_rate": 0.00013890873260253726, + "loss": 0.3206, + "step": 2485 + }, + { + "epoch": 0.9180206794682423, + "grad_norm": 0.31281018257141113, + "learning_rate": 0.00013888409902697378, + "loss": 0.3484, + "step": 2486 + }, + { + "epoch": 0.9183899556868538, + "grad_norm": 0.2582842707633972, + "learning_rate": 0.0001388594654514103, + "loss": 0.2899, + "step": 2487 + }, + { + "epoch": 0.9187592319054653, + "grad_norm": 0.2088260054588318, + "learning_rate": 0.00013883483187584678, + "loss": 0.2136, + "step": 2488 + }, + { + "epoch": 0.9191285081240768, + "grad_norm": 0.25716960430145264, + "learning_rate": 0.0001388101983002833, + "loss": 0.2934, + "step": 2489 + }, + { + "epoch": 0.9194977843426884, + "grad_norm": 0.27651864290237427, + "learning_rate": 0.0001387855647247198, + "loss": 0.3078, + "step": 2490 + }, + { + "epoch": 0.9198670605612999, + "grad_norm": 0.2961086630821228, + "learning_rate": 0.00013876093114915632, + "loss": 0.2836, + "step": 2491 + }, + { + "epoch": 0.9202363367799113, + "grad_norm": 0.23901313543319702, + "learning_rate": 0.0001387362975735928, + "loss": 0.2503, + "step": 2492 + }, + { + "epoch": 0.920605612998523, + "grad_norm": 0.31725743412971497, + "learning_rate": 0.00013871166399802932, + "loss": 0.323, + "step": 2493 + }, + { + "epoch": 0.9209748892171344, + "grad_norm": 0.29879850149154663, + "learning_rate": 0.0001386870304224658, + "loss": 0.3329, + "step": 2494 + }, + { + "epoch": 0.9213441654357459, + "grad_norm": 0.2684386074542999, + "learning_rate": 0.00013866239684690235, + "loss": 0.2954, + "step": 2495 + }, + { + "epoch": 0.9217134416543574, + "grad_norm": 0.2850632071495056, + "learning_rate": 0.00013863776327133884, + "loss": 0.3001, + "step": 2496 + }, + { + "epoch": 0.922082717872969, + "grad_norm": 0.24028350412845612, + "learning_rate": 0.00013861312969577536, + "loss": 0.2548, + "step": 2497 + }, + { + "epoch": 0.9224519940915805, + "grad_norm": 0.31391140818595886, + "learning_rate": 0.00013858849612021184, + "loss": 0.3123, + "step": 2498 + }, + { + "epoch": 0.922821270310192, + "grad_norm": 0.27869704365730286, + "learning_rate": 0.00013856386254464836, + "loss": 0.296, + "step": 2499 + }, + { + "epoch": 0.9231905465288035, + "grad_norm": 0.22903719544410706, + "learning_rate": 0.00013853922896908487, + "loss": 0.2567, + "step": 2500 + }, + { + "epoch": 0.9231905465288035, + "eval_loss": 8.136713981628418, + "eval_runtime": 6.9138, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 1.012, + "step": 2500 + }, + { + "epoch": 0.9235598227474151, + "grad_norm": 0.3191649317741394, + "learning_rate": 0.0001385145953935214, + "loss": 0.3628, + "step": 2501 + }, + { + "epoch": 0.9239290989660266, + "grad_norm": 0.2711998522281647, + "learning_rate": 0.00013848996181795787, + "loss": 0.3074, + "step": 2502 + }, + { + "epoch": 0.9242983751846381, + "grad_norm": 0.3006347417831421, + "learning_rate": 0.0001384653282423944, + "loss": 0.316, + "step": 2503 + }, + { + "epoch": 0.9246676514032496, + "grad_norm": 0.2184845358133316, + "learning_rate": 0.0001384406946668309, + "loss": 0.2497, + "step": 2504 + }, + { + "epoch": 0.9250369276218612, + "grad_norm": 0.27696430683135986, + "learning_rate": 0.00013841606109126742, + "loss": 0.3123, + "step": 2505 + }, + { + "epoch": 0.9254062038404727, + "grad_norm": 0.2045396864414215, + "learning_rate": 0.0001383914275157039, + "loss": 0.2584, + "step": 2506 + }, + { + "epoch": 0.9257754800590842, + "grad_norm": 0.2868804931640625, + "learning_rate": 0.00013836679394014042, + "loss": 0.2875, + "step": 2507 + }, + { + "epoch": 0.9261447562776958, + "grad_norm": 0.2394094616174698, + "learning_rate": 0.00013834216036457694, + "loss": 0.2687, + "step": 2508 + }, + { + "epoch": 0.9265140324963073, + "grad_norm": 0.2483847290277481, + "learning_rate": 0.00013831752678901345, + "loss": 0.2155, + "step": 2509 + }, + { + "epoch": 0.9268833087149188, + "grad_norm": 0.3035317361354828, + "learning_rate": 0.00013829289321344994, + "loss": 0.3255, + "step": 2510 + }, + { + "epoch": 0.9272525849335302, + "grad_norm": 0.2944867014884949, + "learning_rate": 0.00013826825963788645, + "loss": 0.2651, + "step": 2511 + }, + { + "epoch": 0.9276218611521418, + "grad_norm": 0.35688772797584534, + "learning_rate": 0.00013824362606232294, + "loss": 0.2968, + "step": 2512 + }, + { + "epoch": 0.9279911373707533, + "grad_norm": 0.35703906416893005, + "learning_rate": 0.00013821899248675948, + "loss": 0.3139, + "step": 2513 + }, + { + "epoch": 0.9283604135893648, + "grad_norm": 0.2110690176486969, + "learning_rate": 0.00013819435891119597, + "loss": 0.2522, + "step": 2514 + }, + { + "epoch": 0.9287296898079763, + "grad_norm": 0.28055527806282043, + "learning_rate": 0.00013816972533563248, + "loss": 0.3106, + "step": 2515 + }, + { + "epoch": 0.9290989660265879, + "grad_norm": 0.2880934774875641, + "learning_rate": 0.00013814509176006897, + "loss": 0.2762, + "step": 2516 + }, + { + "epoch": 0.9294682422451994, + "grad_norm": 0.24046504497528076, + "learning_rate": 0.00013812045818450549, + "loss": 0.2528, + "step": 2517 + }, + { + "epoch": 0.9298375184638109, + "grad_norm": 0.28930288553237915, + "learning_rate": 0.000138095824608942, + "loss": 0.2856, + "step": 2518 + }, + { + "epoch": 0.9302067946824224, + "grad_norm": 0.26080793142318726, + "learning_rate": 0.00013807119103337851, + "loss": 0.2386, + "step": 2519 + }, + { + "epoch": 0.930576070901034, + "grad_norm": 0.2434677928686142, + "learning_rate": 0.000138046557457815, + "loss": 0.2476, + "step": 2520 + }, + { + "epoch": 0.9309453471196455, + "grad_norm": 0.2883527874946594, + "learning_rate": 0.00013802192388225152, + "loss": 0.2973, + "step": 2521 + }, + { + "epoch": 0.931314623338257, + "grad_norm": 0.2541818618774414, + "learning_rate": 0.00013799729030668803, + "loss": 0.2492, + "step": 2522 + }, + { + "epoch": 0.9316838995568686, + "grad_norm": 0.3094305992126465, + "learning_rate": 0.00013797265673112455, + "loss": 0.3449, + "step": 2523 + }, + { + "epoch": 0.9320531757754801, + "grad_norm": 0.2514682114124298, + "learning_rate": 0.00013794802315556103, + "loss": 0.2705, + "step": 2524 + }, + { + "epoch": 0.9324224519940916, + "grad_norm": 0.3077158033847809, + "learning_rate": 0.00013792338957999755, + "loss": 0.2679, + "step": 2525 + }, + { + "epoch": 0.9327917282127031, + "grad_norm": 0.24246780574321747, + "learning_rate": 0.00013789875600443404, + "loss": 0.2212, + "step": 2526 + }, + { + "epoch": 0.9331610044313147, + "grad_norm": 0.22519810497760773, + "learning_rate": 0.00013787412242887058, + "loss": 0.2192, + "step": 2527 + }, + { + "epoch": 0.9335302806499262, + "grad_norm": 0.2912737727165222, + "learning_rate": 0.00013784948885330707, + "loss": 0.3607, + "step": 2528 + }, + { + "epoch": 0.9338995568685377, + "grad_norm": 0.3519744575023651, + "learning_rate": 0.00013782485527774358, + "loss": 0.3043, + "step": 2529 + }, + { + "epoch": 0.9342688330871491, + "grad_norm": 0.2391892522573471, + "learning_rate": 0.00013780022170218007, + "loss": 0.268, + "step": 2530 + }, + { + "epoch": 0.9346381093057607, + "grad_norm": 0.23374566435813904, + "learning_rate": 0.00013777558812661658, + "loss": 0.2537, + "step": 2531 + }, + { + "epoch": 0.9350073855243722, + "grad_norm": 0.2581256628036499, + "learning_rate": 0.0001377509545510531, + "loss": 0.2849, + "step": 2532 + }, + { + "epoch": 0.9353766617429837, + "grad_norm": 0.2182261347770691, + "learning_rate": 0.0001377263209754896, + "loss": 0.2147, + "step": 2533 + }, + { + "epoch": 0.9357459379615952, + "grad_norm": 0.28320491313934326, + "learning_rate": 0.0001377016873999261, + "loss": 0.2858, + "step": 2534 + }, + { + "epoch": 0.9361152141802068, + "grad_norm": 0.2754431664943695, + "learning_rate": 0.0001376770538243626, + "loss": 0.291, + "step": 2535 + }, + { + "epoch": 0.9364844903988183, + "grad_norm": 0.20113760232925415, + "learning_rate": 0.00013765242024879913, + "loss": 0.2366, + "step": 2536 + }, + { + "epoch": 0.9368537666174298, + "grad_norm": 0.2871677279472351, + "learning_rate": 0.00013762778667323564, + "loss": 0.3197, + "step": 2537 + }, + { + "epoch": 0.9372230428360414, + "grad_norm": 0.21958333253860474, + "learning_rate": 0.00013760315309767213, + "loss": 0.2588, + "step": 2538 + }, + { + "epoch": 0.9375923190546529, + "grad_norm": 0.2792249321937561, + "learning_rate": 0.00013757851952210864, + "loss": 0.2547, + "step": 2539 + }, + { + "epoch": 0.9379615952732644, + "grad_norm": 0.289400577545166, + "learning_rate": 0.00013755388594654516, + "loss": 0.3681, + "step": 2540 + }, + { + "epoch": 0.9383308714918759, + "grad_norm": 0.26338911056518555, + "learning_rate": 0.00013752925237098167, + "loss": 0.3124, + "step": 2541 + }, + { + "epoch": 0.9387001477104875, + "grad_norm": 0.26968058943748474, + "learning_rate": 0.00013750461879541816, + "loss": 0.2832, + "step": 2542 + }, + { + "epoch": 0.939069423929099, + "grad_norm": 0.2359062284231186, + "learning_rate": 0.00013747998521985468, + "loss": 0.2711, + "step": 2543 + }, + { + "epoch": 0.9394387001477105, + "grad_norm": 0.2957818806171417, + "learning_rate": 0.00013745535164429116, + "loss": 0.3163, + "step": 2544 + }, + { + "epoch": 0.939807976366322, + "grad_norm": 0.28414493799209595, + "learning_rate": 0.0001374307180687277, + "loss": 0.3197, + "step": 2545 + }, + { + "epoch": 0.9401772525849336, + "grad_norm": 0.23775461316108704, + "learning_rate": 0.0001374060844931642, + "loss": 0.3031, + "step": 2546 + }, + { + "epoch": 0.9405465288035451, + "grad_norm": 0.282515287399292, + "learning_rate": 0.0001373814509176007, + "loss": 0.3199, + "step": 2547 + }, + { + "epoch": 0.9409158050221565, + "grad_norm": 0.3067757487297058, + "learning_rate": 0.0001373568173420372, + "loss": 0.303, + "step": 2548 + }, + { + "epoch": 0.941285081240768, + "grad_norm": 0.2592032849788666, + "learning_rate": 0.0001373321837664737, + "loss": 0.3091, + "step": 2549 + }, + { + "epoch": 0.9416543574593796, + "grad_norm": 0.2740531861782074, + "learning_rate": 0.00013730755019091022, + "loss": 0.3491, + "step": 2550 + }, + { + "epoch": 0.9416543574593796, + "eval_loss": 8.338347434997559, + "eval_runtime": 6.917, + "eval_samples_per_second": 7.229, + "eval_steps_per_second": 1.012, + "step": 2550 + }, + { + "epoch": 0.9420236336779911, + "grad_norm": 0.23710212111473083, + "learning_rate": 0.00013728291661534674, + "loss": 0.2385, + "step": 2551 + }, + { + "epoch": 0.9423929098966026, + "grad_norm": 0.2704252004623413, + "learning_rate": 0.00013725828303978323, + "loss": 0.2872, + "step": 2552 + }, + { + "epoch": 0.9427621861152142, + "grad_norm": 0.38992467522621155, + "learning_rate": 0.00013723364946421974, + "loss": 0.2908, + "step": 2553 + }, + { + "epoch": 0.9431314623338257, + "grad_norm": 0.20440521836280823, + "learning_rate": 0.00013720901588865626, + "loss": 0.2312, + "step": 2554 + }, + { + "epoch": 0.9435007385524372, + "grad_norm": 0.21918311715126038, + "learning_rate": 0.00013718438231309277, + "loss": 0.2922, + "step": 2555 + }, + { + "epoch": 0.9438700147710487, + "grad_norm": 0.19745555520057678, + "learning_rate": 0.00013715974873752926, + "loss": 0.2204, + "step": 2556 + }, + { + "epoch": 0.9442392909896603, + "grad_norm": 0.30187833309173584, + "learning_rate": 0.00013713511516196575, + "loss": 0.3148, + "step": 2557 + }, + { + "epoch": 0.9446085672082718, + "grad_norm": 0.3005145192146301, + "learning_rate": 0.00013711048158640226, + "loss": 0.2945, + "step": 2558 + }, + { + "epoch": 0.9449778434268833, + "grad_norm": 0.42977726459503174, + "learning_rate": 0.00013708584801083878, + "loss": 0.398, + "step": 2559 + }, + { + "epoch": 0.9453471196454948, + "grad_norm": 0.19661004841327667, + "learning_rate": 0.0001370612144352753, + "loss": 0.1928, + "step": 2560 + }, + { + "epoch": 0.9457163958641064, + "grad_norm": 0.2427736222743988, + "learning_rate": 0.00013703658085971178, + "loss": 0.2328, + "step": 2561 + }, + { + "epoch": 0.9460856720827179, + "grad_norm": 0.21797865629196167, + "learning_rate": 0.0001370119472841483, + "loss": 0.2399, + "step": 2562 + }, + { + "epoch": 0.9464549483013294, + "grad_norm": 0.27254238724708557, + "learning_rate": 0.0001369873137085848, + "loss": 0.3376, + "step": 2563 + }, + { + "epoch": 0.946824224519941, + "grad_norm": 0.3058348000049591, + "learning_rate": 0.00013696268013302132, + "loss": 0.328, + "step": 2564 + }, + { + "epoch": 0.9471935007385525, + "grad_norm": 0.24107080698013306, + "learning_rate": 0.0001369380465574578, + "loss": 0.2568, + "step": 2565 + }, + { + "epoch": 0.947562776957164, + "grad_norm": 0.2434549182653427, + "learning_rate": 0.00013691341298189432, + "loss": 0.2457, + "step": 2566 + }, + { + "epoch": 0.9479320531757754, + "grad_norm": 0.2889346778392792, + "learning_rate": 0.00013688877940633084, + "loss": 0.3098, + "step": 2567 + }, + { + "epoch": 0.948301329394387, + "grad_norm": 0.19707129895687103, + "learning_rate": 0.00013686414583076735, + "loss": 0.2321, + "step": 2568 + }, + { + "epoch": 0.9486706056129985, + "grad_norm": 0.23561494052410126, + "learning_rate": 0.00013683951225520384, + "loss": 0.2987, + "step": 2569 + }, + { + "epoch": 0.94903988183161, + "grad_norm": 0.3076520264148712, + "learning_rate": 0.00013681487867964035, + "loss": 0.2606, + "step": 2570 + }, + { + "epoch": 0.9494091580502215, + "grad_norm": 0.3640258312225342, + "learning_rate": 0.00013679024510407684, + "loss": 0.3542, + "step": 2571 + }, + { + "epoch": 0.9497784342688331, + "grad_norm": 0.23354092240333557, + "learning_rate": 0.00013676561152851338, + "loss": 0.2384, + "step": 2572 + }, + { + "epoch": 0.9501477104874446, + "grad_norm": 0.3421535789966583, + "learning_rate": 0.00013674097795294987, + "loss": 0.3742, + "step": 2573 + }, + { + "epoch": 0.9505169867060561, + "grad_norm": 0.2915743291378021, + "learning_rate": 0.00013671634437738639, + "loss": 0.2659, + "step": 2574 + }, + { + "epoch": 0.9508862629246676, + "grad_norm": 0.24567344784736633, + "learning_rate": 0.00013669171080182287, + "loss": 0.2573, + "step": 2575 + }, + { + "epoch": 0.9512555391432792, + "grad_norm": 0.2579054832458496, + "learning_rate": 0.0001366670772262594, + "loss": 0.2993, + "step": 2576 + }, + { + "epoch": 0.9516248153618907, + "grad_norm": 0.20793886482715607, + "learning_rate": 0.0001366424436506959, + "loss": 0.2192, + "step": 2577 + }, + { + "epoch": 0.9519940915805022, + "grad_norm": 0.30044519901275635, + "learning_rate": 0.00013661781007513242, + "loss": 0.353, + "step": 2578 + }, + { + "epoch": 0.9523633677991138, + "grad_norm": 0.28049206733703613, + "learning_rate": 0.0001365931764995689, + "loss": 0.3352, + "step": 2579 + }, + { + "epoch": 0.9527326440177253, + "grad_norm": 0.2630428969860077, + "learning_rate": 0.00013656854292400542, + "loss": 0.2663, + "step": 2580 + }, + { + "epoch": 0.9531019202363368, + "grad_norm": 0.256254106760025, + "learning_rate": 0.00013654390934844193, + "loss": 0.276, + "step": 2581 + }, + { + "epoch": 0.9534711964549483, + "grad_norm": 0.24607379734516144, + "learning_rate": 0.00013651927577287845, + "loss": 0.2354, + "step": 2582 + }, + { + "epoch": 0.9538404726735599, + "grad_norm": 0.2805701494216919, + "learning_rate": 0.00013649464219731494, + "loss": 0.2811, + "step": 2583 + }, + { + "epoch": 0.9542097488921714, + "grad_norm": 0.27809494733810425, + "learning_rate": 0.00013647000862175145, + "loss": 0.2683, + "step": 2584 + }, + { + "epoch": 0.9545790251107829, + "grad_norm": 0.2555201053619385, + "learning_rate": 0.00013644537504618794, + "loss": 0.2717, + "step": 2585 + }, + { + "epoch": 0.9549483013293943, + "grad_norm": 0.3575626015663147, + "learning_rate": 0.00013642074147062448, + "loss": 0.2788, + "step": 2586 + }, + { + "epoch": 0.955317577548006, + "grad_norm": 0.23265214264392853, + "learning_rate": 0.00013639610789506097, + "loss": 0.2655, + "step": 2587 + }, + { + "epoch": 0.9556868537666174, + "grad_norm": 0.2332039624452591, + "learning_rate": 0.00013637147431949748, + "loss": 0.2471, + "step": 2588 + }, + { + "epoch": 0.9560561299852289, + "grad_norm": 0.2927175760269165, + "learning_rate": 0.00013634684074393397, + "loss": 0.3053, + "step": 2589 + }, + { + "epoch": 0.9564254062038404, + "grad_norm": 0.2403823435306549, + "learning_rate": 0.00013632220716837049, + "loss": 0.2718, + "step": 2590 + }, + { + "epoch": 0.956794682422452, + "grad_norm": 0.20419754087924957, + "learning_rate": 0.000136297573592807, + "loss": 0.2004, + "step": 2591 + }, + { + "epoch": 0.9571639586410635, + "grad_norm": 0.3135280907154083, + "learning_rate": 0.00013627294001724351, + "loss": 0.2851, + "step": 2592 + }, + { + "epoch": 0.957533234859675, + "grad_norm": 0.2785232961177826, + "learning_rate": 0.00013624830644168, + "loss": 0.3191, + "step": 2593 + }, + { + "epoch": 0.9579025110782866, + "grad_norm": 0.22876116633415222, + "learning_rate": 0.00013622367286611652, + "loss": 0.2315, + "step": 2594 + }, + { + "epoch": 0.9582717872968981, + "grad_norm": 0.24172428250312805, + "learning_rate": 0.00013619903929055303, + "loss": 0.2228, + "step": 2595 + }, + { + "epoch": 0.9586410635155096, + "grad_norm": 0.29304224252700806, + "learning_rate": 0.00013617440571498955, + "loss": 0.332, + "step": 2596 + }, + { + "epoch": 0.9590103397341211, + "grad_norm": 0.2573486566543579, + "learning_rate": 0.00013614977213942603, + "loss": 0.2674, + "step": 2597 + }, + { + "epoch": 0.9593796159527327, + "grad_norm": 0.2802794277667999, + "learning_rate": 0.00013612513856386255, + "loss": 0.2914, + "step": 2598 + }, + { + "epoch": 0.9597488921713442, + "grad_norm": 0.2769867777824402, + "learning_rate": 0.00013610050498829906, + "loss": 0.3208, + "step": 2599 + }, + { + "epoch": 0.9601181683899557, + "grad_norm": 0.33963873982429504, + "learning_rate": 0.00013607587141273558, + "loss": 0.3719, + "step": 2600 + }, + { + "epoch": 0.9601181683899557, + "eval_loss": 8.255655288696289, + "eval_runtime": 6.9159, + "eval_samples_per_second": 7.23, + "eval_steps_per_second": 1.012, + "step": 2600 + }, + { + "epoch": 0.9604874446085672, + "grad_norm": 0.30288827419281006, + "learning_rate": 0.00013605123783717206, + "loss": 0.354, + "step": 2601 + }, + { + "epoch": 0.9608567208271788, + "grad_norm": 0.32560470700263977, + "learning_rate": 0.00013602660426160858, + "loss": 0.3282, + "step": 2602 + }, + { + "epoch": 0.9612259970457903, + "grad_norm": 0.26978743076324463, + "learning_rate": 0.00013600197068604507, + "loss": 0.2828, + "step": 2603 + }, + { + "epoch": 0.9615952732644018, + "grad_norm": 0.30333012342453003, + "learning_rate": 0.0001359773371104816, + "loss": 0.261, + "step": 2604 + }, + { + "epoch": 0.9619645494830132, + "grad_norm": 0.2831343710422516, + "learning_rate": 0.0001359527035349181, + "loss": 0.2706, + "step": 2605 + }, + { + "epoch": 0.9623338257016248, + "grad_norm": 0.2685483992099762, + "learning_rate": 0.0001359280699593546, + "loss": 0.329, + "step": 2606 + }, + { + "epoch": 0.9627031019202363, + "grad_norm": 0.23163548111915588, + "learning_rate": 0.0001359034363837911, + "loss": 0.2656, + "step": 2607 + }, + { + "epoch": 0.9630723781388478, + "grad_norm": 0.2846479117870331, + "learning_rate": 0.0001358788028082276, + "loss": 0.3334, + "step": 2608 + }, + { + "epoch": 0.9634416543574594, + "grad_norm": 0.2757445275783539, + "learning_rate": 0.00013585416923266413, + "loss": 0.274, + "step": 2609 + }, + { + "epoch": 0.9638109305760709, + "grad_norm": 0.2637721002101898, + "learning_rate": 0.00013582953565710064, + "loss": 0.2839, + "step": 2610 + }, + { + "epoch": 0.9641802067946824, + "grad_norm": 0.2603969871997833, + "learning_rate": 0.00013580490208153713, + "loss": 0.2926, + "step": 2611 + }, + { + "epoch": 0.9645494830132939, + "grad_norm": 0.31915032863616943, + "learning_rate": 0.00013578026850597364, + "loss": 0.257, + "step": 2612 + }, + { + "epoch": 0.9649187592319055, + "grad_norm": 0.22478729486465454, + "learning_rate": 0.00013575563493041016, + "loss": 0.2666, + "step": 2613 + }, + { + "epoch": 0.965288035450517, + "grad_norm": 0.3239811658859253, + "learning_rate": 0.00013573100135484667, + "loss": 0.3062, + "step": 2614 + }, + { + "epoch": 0.9656573116691285, + "grad_norm": 0.264182448387146, + "learning_rate": 0.00013570636777928316, + "loss": 0.2863, + "step": 2615 + }, + { + "epoch": 0.96602658788774, + "grad_norm": 0.27280545234680176, + "learning_rate": 0.00013568173420371968, + "loss": 0.2984, + "step": 2616 + }, + { + "epoch": 0.9663958641063516, + "grad_norm": 0.22328899800777435, + "learning_rate": 0.00013565710062815616, + "loss": 0.2908, + "step": 2617 + }, + { + "epoch": 0.9667651403249631, + "grad_norm": 0.3466127812862396, + "learning_rate": 0.0001356324670525927, + "loss": 0.3602, + "step": 2618 + }, + { + "epoch": 0.9671344165435746, + "grad_norm": 0.3335649073123932, + "learning_rate": 0.0001356078334770292, + "loss": 0.3377, + "step": 2619 + }, + { + "epoch": 0.9675036927621861, + "grad_norm": 0.26888495683670044, + "learning_rate": 0.0001355831999014657, + "loss": 0.2695, + "step": 2620 + }, + { + "epoch": 0.9678729689807977, + "grad_norm": 0.2637009918689728, + "learning_rate": 0.0001355585663259022, + "loss": 0.2691, + "step": 2621 + }, + { + "epoch": 0.9682422451994092, + "grad_norm": 0.2704222798347473, + "learning_rate": 0.0001355339327503387, + "loss": 0.2754, + "step": 2622 + }, + { + "epoch": 0.9686115214180206, + "grad_norm": 0.24718932807445526, + "learning_rate": 0.00013550929917477522, + "loss": 0.2873, + "step": 2623 + }, + { + "epoch": 0.9689807976366323, + "grad_norm": 0.24533362686634064, + "learning_rate": 0.00013548466559921174, + "loss": 0.2914, + "step": 2624 + }, + { + "epoch": 0.9693500738552437, + "grad_norm": 0.2601556181907654, + "learning_rate": 0.00013546003202364823, + "loss": 0.2849, + "step": 2625 + }, + { + "epoch": 0.9697193500738552, + "grad_norm": 0.30134016275405884, + "learning_rate": 0.00013543539844808474, + "loss": 0.3174, + "step": 2626 + }, + { + "epoch": 0.9700886262924667, + "grad_norm": 0.22383323311805725, + "learning_rate": 0.00013541076487252126, + "loss": 0.2492, + "step": 2627 + }, + { + "epoch": 0.9704579025110783, + "grad_norm": 0.23088370263576508, + "learning_rate": 0.00013538613129695777, + "loss": 0.271, + "step": 2628 + }, + { + "epoch": 0.9708271787296898, + "grad_norm": 0.2410733997821808, + "learning_rate": 0.00013536149772139426, + "loss": 0.2462, + "step": 2629 + }, + { + "epoch": 0.9711964549483013, + "grad_norm": 0.27193793654441833, + "learning_rate": 0.00013533686414583077, + "loss": 0.28, + "step": 2630 + }, + { + "epoch": 0.9715657311669128, + "grad_norm": 0.28846636414527893, + "learning_rate": 0.00013531223057026726, + "loss": 0.3126, + "step": 2631 + }, + { + "epoch": 0.9719350073855244, + "grad_norm": 0.2804848253726959, + "learning_rate": 0.0001352875969947038, + "loss": 0.2986, + "step": 2632 + }, + { + "epoch": 0.9723042836041359, + "grad_norm": 0.3260886073112488, + "learning_rate": 0.0001352629634191403, + "loss": 0.3752, + "step": 2633 + }, + { + "epoch": 0.9726735598227474, + "grad_norm": 0.30501049757003784, + "learning_rate": 0.0001352383298435768, + "loss": 0.378, + "step": 2634 + }, + { + "epoch": 0.9730428360413589, + "grad_norm": 0.35624799132347107, + "learning_rate": 0.0001352136962680133, + "loss": 0.335, + "step": 2635 + }, + { + "epoch": 0.9734121122599705, + "grad_norm": 0.37001433968544006, + "learning_rate": 0.0001351890626924498, + "loss": 0.3088, + "step": 2636 + }, + { + "epoch": 0.973781388478582, + "grad_norm": 0.2995047867298126, + "learning_rate": 0.00013516442911688632, + "loss": 0.2721, + "step": 2637 + }, + { + "epoch": 0.9741506646971935, + "grad_norm": 0.2463473677635193, + "learning_rate": 0.00013513979554132284, + "loss": 0.2593, + "step": 2638 + }, + { + "epoch": 0.9745199409158051, + "grad_norm": 0.3171980381011963, + "learning_rate": 0.00013511516196575932, + "loss": 0.2885, + "step": 2639 + }, + { + "epoch": 0.9748892171344166, + "grad_norm": 0.262368768453598, + "learning_rate": 0.00013509052839019584, + "loss": 0.3022, + "step": 2640 + }, + { + "epoch": 0.975258493353028, + "grad_norm": 0.2606953978538513, + "learning_rate": 0.00013506589481463235, + "loss": 0.3043, + "step": 2641 + }, + { + "epoch": 0.9756277695716395, + "grad_norm": 0.27412235736846924, + "learning_rate": 0.00013504126123906887, + "loss": 0.3215, + "step": 2642 + }, + { + "epoch": 0.9759970457902511, + "grad_norm": 0.28935301303863525, + "learning_rate": 0.00013501662766350535, + "loss": 0.3425, + "step": 2643 + }, + { + "epoch": 0.9763663220088626, + "grad_norm": 0.31100738048553467, + "learning_rate": 0.00013499199408794187, + "loss": 0.2997, + "step": 2644 + }, + { + "epoch": 0.9767355982274741, + "grad_norm": 0.2823958694934845, + "learning_rate": 0.00013496736051237838, + "loss": 0.2709, + "step": 2645 + }, + { + "epoch": 0.9771048744460856, + "grad_norm": 0.28484511375427246, + "learning_rate": 0.0001349427269368149, + "loss": 0.314, + "step": 2646 + }, + { + "epoch": 0.9774741506646972, + "grad_norm": 0.22270408272743225, + "learning_rate": 0.00013491809336125139, + "loss": 0.2476, + "step": 2647 + }, + { + "epoch": 0.9778434268833087, + "grad_norm": 0.2287965565919876, + "learning_rate": 0.0001348934597856879, + "loss": 0.238, + "step": 2648 + }, + { + "epoch": 0.9782127031019202, + "grad_norm": 0.2475476861000061, + "learning_rate": 0.0001348688262101244, + "loss": 0.2874, + "step": 2649 + }, + { + "epoch": 0.9785819793205317, + "grad_norm": 0.3460521697998047, + "learning_rate": 0.00013484419263456093, + "loss": 0.3598, + "step": 2650 + }, + { + "epoch": 0.9785819793205317, + "eval_loss": 8.293087005615234, + "eval_runtime": 6.9152, + "eval_samples_per_second": 7.23, + "eval_steps_per_second": 1.012, + "step": 2650 + }, + { + "epoch": 0.9789512555391433, + "grad_norm": 0.2710550129413605, + "learning_rate": 0.00013481955905899742, + "loss": 0.2758, + "step": 2651 + }, + { + "epoch": 0.9793205317577548, + "grad_norm": 0.3149683177471161, + "learning_rate": 0.00013479492548343393, + "loss": 0.3762, + "step": 2652 + }, + { + "epoch": 0.9796898079763663, + "grad_norm": 0.2516147494316101, + "learning_rate": 0.00013477029190787042, + "loss": 0.2828, + "step": 2653 + }, + { + "epoch": 0.9800590841949779, + "grad_norm": 0.23255646228790283, + "learning_rate": 0.00013474565833230693, + "loss": 0.2488, + "step": 2654 + }, + { + "epoch": 0.9804283604135894, + "grad_norm": 0.2697564959526062, + "learning_rate": 0.00013472102475674345, + "loss": 0.2721, + "step": 2655 + }, + { + "epoch": 0.9807976366322009, + "grad_norm": 0.2285911589860916, + "learning_rate": 0.00013469639118117996, + "loss": 0.2624, + "step": 2656 + }, + { + "epoch": 0.9811669128508124, + "grad_norm": 0.24470436573028564, + "learning_rate": 0.00013467175760561645, + "loss": 0.2581, + "step": 2657 + }, + { + "epoch": 0.981536189069424, + "grad_norm": 0.28452184796333313, + "learning_rate": 0.00013464712403005297, + "loss": 0.2989, + "step": 2658 + }, + { + "epoch": 0.9819054652880355, + "grad_norm": 0.3286585807800293, + "learning_rate": 0.00013462249045448948, + "loss": 0.3385, + "step": 2659 + }, + { + "epoch": 0.982274741506647, + "grad_norm": 0.31793978810310364, + "learning_rate": 0.000134597856878926, + "loss": 0.2874, + "step": 2660 + }, + { + "epoch": 0.9826440177252584, + "grad_norm": 0.2451810985803604, + "learning_rate": 0.00013457322330336248, + "loss": 0.2745, + "step": 2661 + }, + { + "epoch": 0.98301329394387, + "grad_norm": 0.26427337527275085, + "learning_rate": 0.000134548589727799, + "loss": 0.2988, + "step": 2662 + }, + { + "epoch": 0.9833825701624815, + "grad_norm": 0.23884670436382294, + "learning_rate": 0.00013452395615223548, + "loss": 0.2698, + "step": 2663 + }, + { + "epoch": 0.983751846381093, + "grad_norm": 0.24663832783699036, + "learning_rate": 0.00013449932257667203, + "loss": 0.2618, + "step": 2664 + }, + { + "epoch": 0.9841211225997046, + "grad_norm": 0.2725020945072174, + "learning_rate": 0.00013447468900110851, + "loss": 0.2707, + "step": 2665 + }, + { + "epoch": 0.9844903988183161, + "grad_norm": 0.26264533400535583, + "learning_rate": 0.00013445005542554503, + "loss": 0.252, + "step": 2666 + }, + { + "epoch": 0.9848596750369276, + "grad_norm": 0.2789607346057892, + "learning_rate": 0.00013442542184998152, + "loss": 0.3439, + "step": 2667 + }, + { + "epoch": 0.9852289512555391, + "grad_norm": 0.2495069056749344, + "learning_rate": 0.00013440078827441803, + "loss": 0.2562, + "step": 2668 + }, + { + "epoch": 0.9855982274741507, + "grad_norm": 0.30534255504608154, + "learning_rate": 0.00013437615469885455, + "loss": 0.2818, + "step": 2669 + }, + { + "epoch": 0.9859675036927622, + "grad_norm": 0.2565981447696686, + "learning_rate": 0.00013435152112329106, + "loss": 0.3062, + "step": 2670 + }, + { + "epoch": 0.9863367799113737, + "grad_norm": 0.20889125764369965, + "learning_rate": 0.00013432688754772755, + "loss": 0.2585, + "step": 2671 + }, + { + "epoch": 0.9867060561299852, + "grad_norm": 0.3168131113052368, + "learning_rate": 0.00013430225397216406, + "loss": 0.2889, + "step": 2672 + }, + { + "epoch": 0.9870753323485968, + "grad_norm": 0.2701852023601532, + "learning_rate": 0.00013427762039660058, + "loss": 0.3193, + "step": 2673 + }, + { + "epoch": 0.9874446085672083, + "grad_norm": 0.2778410017490387, + "learning_rate": 0.0001342529868210371, + "loss": 0.2474, + "step": 2674 + }, + { + "epoch": 0.9878138847858198, + "grad_norm": 0.3304849863052368, + "learning_rate": 0.00013422835324547358, + "loss": 0.271, + "step": 2675 + }, + { + "epoch": 0.9881831610044313, + "grad_norm": 0.29668906331062317, + "learning_rate": 0.0001342037196699101, + "loss": 0.267, + "step": 2676 + }, + { + "epoch": 0.9885524372230429, + "grad_norm": 0.2762480676174164, + "learning_rate": 0.0001341790860943466, + "loss": 0.268, + "step": 2677 + }, + { + "epoch": 0.9889217134416544, + "grad_norm": 0.28095296025276184, + "learning_rate": 0.00013415445251878312, + "loss": 0.3132, + "step": 2678 + }, + { + "epoch": 0.9892909896602659, + "grad_norm": 0.28721529245376587, + "learning_rate": 0.0001341298189432196, + "loss": 0.2887, + "step": 2679 + }, + { + "epoch": 0.9896602658788775, + "grad_norm": 0.3234040141105652, + "learning_rate": 0.00013410518536765612, + "loss": 0.3346, + "step": 2680 + }, + { + "epoch": 0.990029542097489, + "grad_norm": 0.25142377614974976, + "learning_rate": 0.0001340805517920926, + "loss": 0.2283, + "step": 2681 + }, + { + "epoch": 0.9903988183161004, + "grad_norm": 0.23547294735908508, + "learning_rate": 0.00013405591821652915, + "loss": 0.2525, + "step": 2682 + }, + { + "epoch": 0.9907680945347119, + "grad_norm": 0.25455066561698914, + "learning_rate": 0.00013403128464096564, + "loss": 0.2965, + "step": 2683 + }, + { + "epoch": 0.9911373707533235, + "grad_norm": 0.3175659477710724, + "learning_rate": 0.00013400665106540216, + "loss": 0.3053, + "step": 2684 + }, + { + "epoch": 0.991506646971935, + "grad_norm": 0.31836122274398804, + "learning_rate": 0.00013398201748983864, + "loss": 0.3658, + "step": 2685 + }, + { + "epoch": 0.9918759231905465, + "grad_norm": 0.24422796070575714, + "learning_rate": 0.00013395738391427516, + "loss": 0.3234, + "step": 2686 + }, + { + "epoch": 0.992245199409158, + "grad_norm": 0.27074941992759705, + "learning_rate": 0.00013393275033871167, + "loss": 0.282, + "step": 2687 + }, + { + "epoch": 0.9926144756277696, + "grad_norm": 0.3228800594806671, + "learning_rate": 0.0001339081167631482, + "loss": 0.2893, + "step": 2688 + }, + { + "epoch": 0.9929837518463811, + "grad_norm": 0.24783533811569214, + "learning_rate": 0.00013388348318758468, + "loss": 0.3086, + "step": 2689 + }, + { + "epoch": 0.9933530280649926, + "grad_norm": 0.23847107589244843, + "learning_rate": 0.0001338588496120212, + "loss": 0.2799, + "step": 2690 + }, + { + "epoch": 0.9937223042836041, + "grad_norm": 0.23520159721374512, + "learning_rate": 0.0001338342160364577, + "loss": 0.2961, + "step": 2691 + }, + { + "epoch": 0.9940915805022157, + "grad_norm": 0.2620205581188202, + "learning_rate": 0.00013380958246089422, + "loss": 0.3022, + "step": 2692 + }, + { + "epoch": 0.9944608567208272, + "grad_norm": 0.24109342694282532, + "learning_rate": 0.0001337849488853307, + "loss": 0.2316, + "step": 2693 + }, + { + "epoch": 0.9948301329394387, + "grad_norm": 0.2254646122455597, + "learning_rate": 0.00013376031530976722, + "loss": 0.2405, + "step": 2694 + }, + { + "epoch": 0.9951994091580503, + "grad_norm": 0.30142900347709656, + "learning_rate": 0.0001337356817342037, + "loss": 0.3613, + "step": 2695 + }, + { + "epoch": 0.9955686853766618, + "grad_norm": 0.24732990562915802, + "learning_rate": 0.00013371104815864025, + "loss": 0.2463, + "step": 2696 + }, + { + "epoch": 0.9959379615952733, + "grad_norm": 0.24802136421203613, + "learning_rate": 0.00013368641458307674, + "loss": 0.2561, + "step": 2697 + }, + { + "epoch": 0.9963072378138847, + "grad_norm": 0.26919543743133545, + "learning_rate": 0.00013366178100751325, + "loss": 0.2762, + "step": 2698 + }, + { + "epoch": 0.9966765140324964, + "grad_norm": 0.2618715763092041, + "learning_rate": 0.00013363714743194974, + "loss": 0.2691, + "step": 2699 + }, + { + "epoch": 0.9970457902511078, + "grad_norm": 0.23027564585208893, + "learning_rate": 0.00013361251385638626, + "loss": 0.2599, + "step": 2700 + }, + { + "epoch": 0.9970457902511078, + "eval_loss": 8.474834442138672, + "eval_runtime": 6.9239, + "eval_samples_per_second": 7.221, + "eval_steps_per_second": 1.011, + "step": 2700 + }, + { + "epoch": 0.9974150664697193, + "grad_norm": 0.26839572191238403, + "learning_rate": 0.00013358788028082277, + "loss": 0.2727, + "step": 2701 + }, + { + "epoch": 0.9977843426883308, + "grad_norm": 0.2604323625564575, + "learning_rate": 0.00013356324670525928, + "loss": 0.2937, + "step": 2702 + }, + { + "epoch": 0.9981536189069424, + "grad_norm": 0.33425942063331604, + "learning_rate": 0.00013353861312969577, + "loss": 0.3517, + "step": 2703 + }, + { + "epoch": 0.9985228951255539, + "grad_norm": 0.2822682559490204, + "learning_rate": 0.0001335139795541323, + "loss": 0.3148, + "step": 2704 + }, + { + "epoch": 0.9988921713441654, + "grad_norm": 0.26111552119255066, + "learning_rate": 0.0001334893459785688, + "loss": 0.2507, + "step": 2705 + }, + { + "epoch": 0.9992614475627769, + "grad_norm": 0.26819875836372375, + "learning_rate": 0.00013346471240300532, + "loss": 0.2705, + "step": 2706 + }, + { + "epoch": 0.9996307237813885, + "grad_norm": 0.23910900950431824, + "learning_rate": 0.0001334400788274418, + "loss": 0.2277, + "step": 2707 + }, + { + "epoch": 1.0, + "grad_norm": 0.381069153547287, + "learning_rate": 0.00013341544525187832, + "loss": 0.3182, + "step": 2708 + }, + { + "epoch": 1.0003692762186116, + "grad_norm": 0.2615739107131958, + "learning_rate": 0.00013339081167631483, + "loss": 0.2785, + "step": 2709 + }, + { + "epoch": 1.000738552437223, + "grad_norm": 0.2380443811416626, + "learning_rate": 0.00013336617810075135, + "loss": 0.222, + "step": 2710 + }, + { + "epoch": 1.0011078286558346, + "grad_norm": 0.2520563006401062, + "learning_rate": 0.00013334154452518783, + "loss": 0.2315, + "step": 2711 + }, + { + "epoch": 1.0014771048744462, + "grad_norm": 0.22483482956886292, + "learning_rate": 0.00013331691094962435, + "loss": 0.2226, + "step": 2712 + }, + { + "epoch": 1.0018463810930576, + "grad_norm": 0.2389538735151291, + "learning_rate": 0.00013329227737406084, + "loss": 0.2134, + "step": 2713 + }, + { + "epoch": 1.0022156573116692, + "grad_norm": 0.23874297738075256, + "learning_rate": 0.00013326764379849738, + "loss": 0.213, + "step": 2714 + }, + { + "epoch": 1.0025849335302806, + "grad_norm": 0.26090678572654724, + "learning_rate": 0.00013324301022293387, + "loss": 0.2533, + "step": 2715 + }, + { + "epoch": 1.0029542097488922, + "grad_norm": 0.18605168163776398, + "learning_rate": 0.00013321837664737038, + "loss": 0.1938, + "step": 2716 + }, + { + "epoch": 1.0033234859675038, + "grad_norm": 0.26474621891975403, + "learning_rate": 0.00013319374307180687, + "loss": 0.2316, + "step": 2717 + }, + { + "epoch": 1.0036927621861151, + "grad_norm": 0.2105218470096588, + "learning_rate": 0.00013316910949624338, + "loss": 0.2179, + "step": 2718 + }, + { + "epoch": 1.0040620384047267, + "grad_norm": 0.2903737425804138, + "learning_rate": 0.0001331444759206799, + "loss": 0.2355, + "step": 2719 + }, + { + "epoch": 1.0044313146233383, + "grad_norm": 0.20291602611541748, + "learning_rate": 0.0001331198423451164, + "loss": 0.2066, + "step": 2720 + }, + { + "epoch": 1.0048005908419497, + "grad_norm": 0.2591138482093811, + "learning_rate": 0.0001330952087695529, + "loss": 0.2467, + "step": 2721 + }, + { + "epoch": 1.0051698670605613, + "grad_norm": 0.22016775608062744, + "learning_rate": 0.00013307057519398941, + "loss": 0.2126, + "step": 2722 + }, + { + "epoch": 1.005539143279173, + "grad_norm": 0.22761479020118713, + "learning_rate": 0.00013304594161842593, + "loss": 0.1896, + "step": 2723 + }, + { + "epoch": 1.0059084194977843, + "grad_norm": 0.2857967019081116, + "learning_rate": 0.00013302130804286244, + "loss": 0.2489, + "step": 2724 + }, + { + "epoch": 1.006277695716396, + "grad_norm": 0.2444583624601364, + "learning_rate": 0.00013299667446729893, + "loss": 0.2556, + "step": 2725 + }, + { + "epoch": 1.0066469719350073, + "grad_norm": 0.282543808221817, + "learning_rate": 0.00013297204089173545, + "loss": 0.1818, + "step": 2726 + }, + { + "epoch": 1.007016248153619, + "grad_norm": 0.20856013894081116, + "learning_rate": 0.00013294740731617193, + "loss": 0.1782, + "step": 2727 + }, + { + "epoch": 1.0073855243722305, + "grad_norm": 0.24826043844223022, + "learning_rate": 0.00013292277374060848, + "loss": 0.2387, + "step": 2728 + }, + { + "epoch": 1.0077548005908419, + "grad_norm": 0.23914384841918945, + "learning_rate": 0.00013289814016504496, + "loss": 0.2652, + "step": 2729 + }, + { + "epoch": 1.0081240768094535, + "grad_norm": 0.2623727321624756, + "learning_rate": 0.00013287350658948148, + "loss": 0.2501, + "step": 2730 + }, + { + "epoch": 1.008493353028065, + "grad_norm": 0.31962916254997253, + "learning_rate": 0.00013284887301391797, + "loss": 0.305, + "step": 2731 + }, + { + "epoch": 1.0088626292466765, + "grad_norm": 0.2621099650859833, + "learning_rate": 0.00013282423943835448, + "loss": 0.2683, + "step": 2732 + }, + { + "epoch": 1.009231905465288, + "grad_norm": 0.21160075068473816, + "learning_rate": 0.000132799605862791, + "loss": 0.2312, + "step": 2733 + }, + { + "epoch": 1.0096011816838995, + "grad_norm": 0.27014338970184326, + "learning_rate": 0.0001327749722872275, + "loss": 0.2555, + "step": 2734 + }, + { + "epoch": 1.009970457902511, + "grad_norm": 0.28267085552215576, + "learning_rate": 0.000132750338711664, + "loss": 0.2613, + "step": 2735 + }, + { + "epoch": 1.0103397341211227, + "grad_norm": 0.27195724844932556, + "learning_rate": 0.0001327257051361005, + "loss": 0.2207, + "step": 2736 + }, + { + "epoch": 1.010709010339734, + "grad_norm": 0.23639927804470062, + "learning_rate": 0.00013270107156053703, + "loss": 0.2419, + "step": 2737 + }, + { + "epoch": 1.0110782865583456, + "grad_norm": 0.25145962834358215, + "learning_rate": 0.00013267643798497354, + "loss": 0.2525, + "step": 2738 + }, + { + "epoch": 1.0114475627769572, + "grad_norm": 0.29074764251708984, + "learning_rate": 0.00013265180440941003, + "loss": 0.2396, + "step": 2739 + }, + { + "epoch": 1.0118168389955686, + "grad_norm": 0.2732088267803192, + "learning_rate": 0.00013262717083384654, + "loss": 0.2775, + "step": 2740 + }, + { + "epoch": 1.0121861152141802, + "grad_norm": 0.21709252893924713, + "learning_rate": 0.00013260253725828306, + "loss": 0.2067, + "step": 2741 + }, + { + "epoch": 1.0125553914327918, + "grad_norm": 0.23734478652477264, + "learning_rate": 0.00013257790368271957, + "loss": 0.269, + "step": 2742 + }, + { + "epoch": 1.0129246676514032, + "grad_norm": 0.2774570882320404, + "learning_rate": 0.00013255327010715606, + "loss": 0.2572, + "step": 2743 + }, + { + "epoch": 1.0132939438700148, + "grad_norm": 0.272756427526474, + "learning_rate": 0.00013252863653159257, + "loss": 0.2696, + "step": 2744 + }, + { + "epoch": 1.0136632200886262, + "grad_norm": 0.27521100640296936, + "learning_rate": 0.00013250400295602906, + "loss": 0.2628, + "step": 2745 + }, + { + "epoch": 1.0140324963072378, + "grad_norm": 0.4214564263820648, + "learning_rate": 0.0001324793693804656, + "loss": 0.2411, + "step": 2746 + }, + { + "epoch": 1.0144017725258494, + "grad_norm": 0.21545644104480743, + "learning_rate": 0.0001324547358049021, + "loss": 0.206, + "step": 2747 + }, + { + "epoch": 1.0147710487444608, + "grad_norm": 0.23862315714359283, + "learning_rate": 0.0001324301022293386, + "loss": 0.2168, + "step": 2748 + }, + { + "epoch": 1.0151403249630724, + "grad_norm": 0.2961968779563904, + "learning_rate": 0.0001324054686537751, + "loss": 0.2463, + "step": 2749 + }, + { + "epoch": 1.015509601181684, + "grad_norm": 0.23514299094676971, + "learning_rate": 0.0001323808350782116, + "loss": 0.2218, + "step": 2750 + }, + { + "epoch": 1.015509601181684, + "eval_loss": 8.441010475158691, + "eval_runtime": 6.9096, + "eval_samples_per_second": 7.236, + "eval_steps_per_second": 1.013, + "step": 2750 + }, + { + "epoch": 1.0158788774002954, + "grad_norm": 0.21323415637016296, + "learning_rate": 0.00013235620150264812, + "loss": 0.1738, + "step": 2751 + }, + { + "epoch": 1.016248153618907, + "grad_norm": 0.24373584985733032, + "learning_rate": 0.00013233156792708464, + "loss": 0.2322, + "step": 2752 + }, + { + "epoch": 1.0166174298375186, + "grad_norm": 0.3021080493927002, + "learning_rate": 0.00013230693435152112, + "loss": 0.2852, + "step": 2753 + }, + { + "epoch": 1.01698670605613, + "grad_norm": 0.26079070568084717, + "learning_rate": 0.00013228230077595764, + "loss": 0.2045, + "step": 2754 + }, + { + "epoch": 1.0173559822747416, + "grad_norm": 0.28082722425460815, + "learning_rate": 0.00013225766720039415, + "loss": 0.2395, + "step": 2755 + }, + { + "epoch": 1.017725258493353, + "grad_norm": 0.2571943998336792, + "learning_rate": 0.00013223303362483067, + "loss": 0.2217, + "step": 2756 + }, + { + "epoch": 1.0180945347119645, + "grad_norm": 0.22800298035144806, + "learning_rate": 0.00013220840004926716, + "loss": 0.2079, + "step": 2757 + }, + { + "epoch": 1.0184638109305761, + "grad_norm": 0.2597028315067291, + "learning_rate": 0.00013218376647370367, + "loss": 0.2187, + "step": 2758 + }, + { + "epoch": 1.0188330871491875, + "grad_norm": 0.26292890310287476, + "learning_rate": 0.00013215913289814016, + "loss": 0.2343, + "step": 2759 + }, + { + "epoch": 1.0192023633677991, + "grad_norm": 0.31723257899284363, + "learning_rate": 0.0001321344993225767, + "loss": 0.2896, + "step": 2760 + }, + { + "epoch": 1.0195716395864107, + "grad_norm": 0.2545534670352936, + "learning_rate": 0.0001321098657470132, + "loss": 0.2231, + "step": 2761 + }, + { + "epoch": 1.019940915805022, + "grad_norm": 0.22904063761234283, + "learning_rate": 0.0001320852321714497, + "loss": 0.2354, + "step": 2762 + }, + { + "epoch": 1.0203101920236337, + "grad_norm": 0.22065946459770203, + "learning_rate": 0.0001320605985958862, + "loss": 0.2302, + "step": 2763 + }, + { + "epoch": 1.020679468242245, + "grad_norm": 0.3344857096672058, + "learning_rate": 0.0001320359650203227, + "loss": 0.295, + "step": 2764 + }, + { + "epoch": 1.0210487444608567, + "grad_norm": 0.27599337697029114, + "learning_rate": 0.00013201133144475922, + "loss": 0.2496, + "step": 2765 + }, + { + "epoch": 1.0214180206794683, + "grad_norm": 0.23153887689113617, + "learning_rate": 0.00013198669786919573, + "loss": 0.2181, + "step": 2766 + }, + { + "epoch": 1.0217872968980797, + "grad_norm": 0.2671188712120056, + "learning_rate": 0.00013196206429363222, + "loss": 0.2531, + "step": 2767 + }, + { + "epoch": 1.0221565731166913, + "grad_norm": 0.28018835186958313, + "learning_rate": 0.00013193743071806874, + "loss": 0.2321, + "step": 2768 + }, + { + "epoch": 1.0225258493353029, + "grad_norm": 0.2690368890762329, + "learning_rate": 0.00013191279714250525, + "loss": 0.2595, + "step": 2769 + }, + { + "epoch": 1.0228951255539143, + "grad_norm": 0.31606605648994446, + "learning_rate": 0.00013188816356694176, + "loss": 0.2972, + "step": 2770 + }, + { + "epoch": 1.0232644017725259, + "grad_norm": 0.2672824561595917, + "learning_rate": 0.00013186352999137825, + "loss": 0.2299, + "step": 2771 + }, + { + "epoch": 1.0236336779911375, + "grad_norm": 0.23899973928928375, + "learning_rate": 0.00013183889641581477, + "loss": 0.1918, + "step": 2772 + }, + { + "epoch": 1.0240029542097489, + "grad_norm": 0.23761935532093048, + "learning_rate": 0.00013181426284025125, + "loss": 0.2458, + "step": 2773 + }, + { + "epoch": 1.0243722304283605, + "grad_norm": 0.24064522981643677, + "learning_rate": 0.0001317896292646878, + "loss": 0.258, + "step": 2774 + }, + { + "epoch": 1.0247415066469718, + "grad_norm": 0.2548474371433258, + "learning_rate": 0.00013176499568912428, + "loss": 0.2065, + "step": 2775 + }, + { + "epoch": 1.0251107828655834, + "grad_norm": 0.23399488627910614, + "learning_rate": 0.0001317403621135608, + "loss": 0.2425, + "step": 2776 + }, + { + "epoch": 1.025480059084195, + "grad_norm": 0.2868897616863251, + "learning_rate": 0.00013171572853799729, + "loss": 0.2272, + "step": 2777 + }, + { + "epoch": 1.0258493353028064, + "grad_norm": 0.2258249819278717, + "learning_rate": 0.0001316910949624338, + "loss": 0.2301, + "step": 2778 + }, + { + "epoch": 1.026218611521418, + "grad_norm": 0.20794442296028137, + "learning_rate": 0.00013166646138687032, + "loss": 0.1876, + "step": 2779 + }, + { + "epoch": 1.0265878877400296, + "grad_norm": 0.23677808046340942, + "learning_rate": 0.00013164182781130683, + "loss": 0.2363, + "step": 2780 + }, + { + "epoch": 1.026957163958641, + "grad_norm": 0.4296845495700836, + "learning_rate": 0.00013161719423574332, + "loss": 0.3163, + "step": 2781 + }, + { + "epoch": 1.0273264401772526, + "grad_norm": 0.2699335515499115, + "learning_rate": 0.00013159256066017983, + "loss": 0.2461, + "step": 2782 + }, + { + "epoch": 1.0276957163958642, + "grad_norm": 0.277008056640625, + "learning_rate": 0.00013156792708461635, + "loss": 0.2377, + "step": 2783 + }, + { + "epoch": 1.0280649926144756, + "grad_norm": 0.2013576775789261, + "learning_rate": 0.00013154329350905286, + "loss": 0.2038, + "step": 2784 + }, + { + "epoch": 1.0284342688330872, + "grad_norm": 0.2546544373035431, + "learning_rate": 0.00013151865993348935, + "loss": 0.2375, + "step": 2785 + }, + { + "epoch": 1.0288035450516986, + "grad_norm": 0.22223815321922302, + "learning_rate": 0.00013149402635792586, + "loss": 0.2083, + "step": 2786 + }, + { + "epoch": 1.0291728212703102, + "grad_norm": 0.23303988575935364, + "learning_rate": 0.00013146939278236238, + "loss": 0.2595, + "step": 2787 + }, + { + "epoch": 1.0295420974889218, + "grad_norm": 0.26260295510292053, + "learning_rate": 0.0001314447592067989, + "loss": 0.2396, + "step": 2788 + }, + { + "epoch": 1.0299113737075332, + "grad_norm": 0.25902485847473145, + "learning_rate": 0.00013142012563123538, + "loss": 0.2191, + "step": 2789 + }, + { + "epoch": 1.0302806499261448, + "grad_norm": 0.22949974238872528, + "learning_rate": 0.00013139549205567187, + "loss": 0.243, + "step": 2790 + }, + { + "epoch": 1.0306499261447564, + "grad_norm": 0.2843352258205414, + "learning_rate": 0.00013137085848010838, + "loss": 0.2184, + "step": 2791 + }, + { + "epoch": 1.0310192023633677, + "grad_norm": 0.2798159718513489, + "learning_rate": 0.0001313462249045449, + "loss": 0.2841, + "step": 2792 + }, + { + "epoch": 1.0313884785819794, + "grad_norm": 0.24332858622074127, + "learning_rate": 0.0001313215913289814, + "loss": 0.2047, + "step": 2793 + }, + { + "epoch": 1.0317577548005907, + "grad_norm": 0.23941193521022797, + "learning_rate": 0.0001312969577534179, + "loss": 0.1873, + "step": 2794 + }, + { + "epoch": 1.0321270310192023, + "grad_norm": 0.26138177514076233, + "learning_rate": 0.00013127232417785441, + "loss": 0.2695, + "step": 2795 + }, + { + "epoch": 1.032496307237814, + "grad_norm": 0.1991458386182785, + "learning_rate": 0.00013124769060229093, + "loss": 0.1941, + "step": 2796 + }, + { + "epoch": 1.0328655834564253, + "grad_norm": 0.22780875861644745, + "learning_rate": 0.00013122305702672744, + "loss": 0.262, + "step": 2797 + }, + { + "epoch": 1.033234859675037, + "grad_norm": 0.23241841793060303, + "learning_rate": 0.00013119842345116393, + "loss": 0.24, + "step": 2798 + }, + { + "epoch": 1.0336041358936485, + "grad_norm": 0.21401453018188477, + "learning_rate": 0.00013117378987560045, + "loss": 0.1737, + "step": 2799 + }, + { + "epoch": 1.03397341211226, + "grad_norm": 0.3072996735572815, + "learning_rate": 0.00013114915630003693, + "loss": 0.2315, + "step": 2800 + }, + { + "epoch": 1.03397341211226, + "eval_loss": 8.450515747070312, + "eval_runtime": 6.9077, + "eval_samples_per_second": 7.238, + "eval_steps_per_second": 1.013, + "step": 2800 + }, + { + "epoch": 1.0343426883308715, + "grad_norm": 0.27251601219177246, + "learning_rate": 0.00013112452272447347, + "loss": 0.2541, + "step": 2801 + }, + { + "epoch": 1.034711964549483, + "grad_norm": 0.23030078411102295, + "learning_rate": 0.00013109988914890996, + "loss": 0.2324, + "step": 2802 + }, + { + "epoch": 1.0350812407680945, + "grad_norm": 0.27154791355133057, + "learning_rate": 0.00013107525557334648, + "loss": 0.2504, + "step": 2803 + }, + { + "epoch": 1.035450516986706, + "grad_norm": 0.32601410150527954, + "learning_rate": 0.00013105062199778296, + "loss": 0.2745, + "step": 2804 + }, + { + "epoch": 1.0358197932053175, + "grad_norm": 0.24586603045463562, + "learning_rate": 0.00013102598842221948, + "loss": 0.2414, + "step": 2805 + }, + { + "epoch": 1.036189069423929, + "grad_norm": 0.21404922008514404, + "learning_rate": 0.000131001354846656, + "loss": 0.2147, + "step": 2806 + }, + { + "epoch": 1.0365583456425407, + "grad_norm": 0.22154846787452698, + "learning_rate": 0.0001309767212710925, + "loss": 0.2252, + "step": 2807 + }, + { + "epoch": 1.036927621861152, + "grad_norm": 0.21748924255371094, + "learning_rate": 0.000130952087695529, + "loss": 0.2128, + "step": 2808 + }, + { + "epoch": 1.0372968980797637, + "grad_norm": 0.24705617129802704, + "learning_rate": 0.0001309274541199655, + "loss": 0.2554, + "step": 2809 + }, + { + "epoch": 1.0376661742983753, + "grad_norm": 0.2688770592212677, + "learning_rate": 0.00013090282054440203, + "loss": 0.2731, + "step": 2810 + }, + { + "epoch": 1.0380354505169866, + "grad_norm": 0.2509062886238098, + "learning_rate": 0.00013087818696883854, + "loss": 0.2437, + "step": 2811 + }, + { + "epoch": 1.0384047267355982, + "grad_norm": 0.28050926327705383, + "learning_rate": 0.00013085355339327503, + "loss": 0.254, + "step": 2812 + }, + { + "epoch": 1.0387740029542099, + "grad_norm": 0.2680518925189972, + "learning_rate": 0.00013082891981771154, + "loss": 0.2957, + "step": 2813 + }, + { + "epoch": 1.0391432791728212, + "grad_norm": 0.2615833282470703, + "learning_rate": 0.00013080428624214806, + "loss": 0.2611, + "step": 2814 + }, + { + "epoch": 1.0395125553914328, + "grad_norm": 0.2402847856283188, + "learning_rate": 0.00013077965266658457, + "loss": 0.2271, + "step": 2815 + }, + { + "epoch": 1.0398818316100442, + "grad_norm": 0.25918516516685486, + "learning_rate": 0.00013075501909102106, + "loss": 0.2597, + "step": 2816 + }, + { + "epoch": 1.0402511078286558, + "grad_norm": 0.26455992460250854, + "learning_rate": 0.00013073038551545757, + "loss": 0.2366, + "step": 2817 + }, + { + "epoch": 1.0406203840472674, + "grad_norm": 0.26553401350975037, + "learning_rate": 0.00013070575193989406, + "loss": 0.2518, + "step": 2818 + }, + { + "epoch": 1.0409896602658788, + "grad_norm": 0.25620895624160767, + "learning_rate": 0.0001306811183643306, + "loss": 0.2334, + "step": 2819 + }, + { + "epoch": 1.0413589364844904, + "grad_norm": 0.33764657378196716, + "learning_rate": 0.0001306564847887671, + "loss": 0.2619, + "step": 2820 + }, + { + "epoch": 1.041728212703102, + "grad_norm": 0.2284594476222992, + "learning_rate": 0.0001306318512132036, + "loss": 0.2283, + "step": 2821 + }, + { + "epoch": 1.0420974889217134, + "grad_norm": 0.3451555669307709, + "learning_rate": 0.0001306072176376401, + "loss": 0.2835, + "step": 2822 + }, + { + "epoch": 1.042466765140325, + "grad_norm": 0.3266119062900543, + "learning_rate": 0.0001305825840620766, + "loss": 0.2346, + "step": 2823 + }, + { + "epoch": 1.0428360413589366, + "grad_norm": 0.270417183637619, + "learning_rate": 0.00013055795048651312, + "loss": 0.2647, + "step": 2824 + }, + { + "epoch": 1.043205317577548, + "grad_norm": 0.2889769673347473, + "learning_rate": 0.00013053331691094964, + "loss": 0.2359, + "step": 2825 + }, + { + "epoch": 1.0435745937961596, + "grad_norm": 0.26300638914108276, + "learning_rate": 0.00013050868333538612, + "loss": 0.2275, + "step": 2826 + }, + { + "epoch": 1.043943870014771, + "grad_norm": 0.2595803737640381, + "learning_rate": 0.00013048404975982264, + "loss": 0.2289, + "step": 2827 + }, + { + "epoch": 1.0443131462333826, + "grad_norm": 0.26959100365638733, + "learning_rate": 0.00013045941618425915, + "loss": 0.236, + "step": 2828 + }, + { + "epoch": 1.0446824224519942, + "grad_norm": 0.305092453956604, + "learning_rate": 0.00013043478260869567, + "loss": 0.2829, + "step": 2829 + }, + { + "epoch": 1.0450516986706055, + "grad_norm": 0.29510173201560974, + "learning_rate": 0.00013041014903313216, + "loss": 0.2493, + "step": 2830 + }, + { + "epoch": 1.0454209748892171, + "grad_norm": 0.24791450798511505, + "learning_rate": 0.00013038551545756867, + "loss": 0.2167, + "step": 2831 + }, + { + "epoch": 1.0457902511078287, + "grad_norm": 0.24968664348125458, + "learning_rate": 0.00013036088188200516, + "loss": 0.2237, + "step": 2832 + }, + { + "epoch": 1.0461595273264401, + "grad_norm": 0.2564988434314728, + "learning_rate": 0.0001303362483064417, + "loss": 0.2545, + "step": 2833 + }, + { + "epoch": 1.0465288035450517, + "grad_norm": 0.28506484627723694, + "learning_rate": 0.0001303116147308782, + "loss": 0.2627, + "step": 2834 + }, + { + "epoch": 1.0468980797636631, + "grad_norm": 0.36924222111701965, + "learning_rate": 0.0001302869811553147, + "loss": 0.252, + "step": 2835 + }, + { + "epoch": 1.0472673559822747, + "grad_norm": 0.25784701108932495, + "learning_rate": 0.0001302623475797512, + "loss": 0.2136, + "step": 2836 + }, + { + "epoch": 1.0476366322008863, + "grad_norm": 0.2246789187192917, + "learning_rate": 0.0001302377140041877, + "loss": 0.1641, + "step": 2837 + }, + { + "epoch": 1.0480059084194977, + "grad_norm": 0.23290549218654633, + "learning_rate": 0.00013021308042862422, + "loss": 0.1921, + "step": 2838 + }, + { + "epoch": 1.0483751846381093, + "grad_norm": 0.3374260663986206, + "learning_rate": 0.00013018844685306073, + "loss": 0.2527, + "step": 2839 + }, + { + "epoch": 1.048744460856721, + "grad_norm": 0.2262749969959259, + "learning_rate": 0.00013016381327749722, + "loss": 0.2271, + "step": 2840 + }, + { + "epoch": 1.0491137370753323, + "grad_norm": 0.22664733231067657, + "learning_rate": 0.00013013917970193374, + "loss": 0.2273, + "step": 2841 + }, + { + "epoch": 1.049483013293944, + "grad_norm": 0.32149672508239746, + "learning_rate": 0.00013011454612637025, + "loss": 0.2271, + "step": 2842 + }, + { + "epoch": 1.0498522895125555, + "grad_norm": 0.2273453325033188, + "learning_rate": 0.00013008991255080676, + "loss": 0.2454, + "step": 2843 + }, + { + "epoch": 1.0502215657311669, + "grad_norm": 0.27102231979370117, + "learning_rate": 0.00013006527897524325, + "loss": 0.2527, + "step": 2844 + }, + { + "epoch": 1.0505908419497785, + "grad_norm": 0.28361067175865173, + "learning_rate": 0.00013004064539967977, + "loss": 0.2754, + "step": 2845 + }, + { + "epoch": 1.0509601181683899, + "grad_norm": 0.2805883288383484, + "learning_rate": 0.00013001601182411628, + "loss": 0.2158, + "step": 2846 + }, + { + "epoch": 1.0513293943870015, + "grad_norm": 0.2733857333660126, + "learning_rate": 0.0001299913782485528, + "loss": 0.1971, + "step": 2847 + }, + { + "epoch": 1.051698670605613, + "grad_norm": 0.29386094212532043, + "learning_rate": 0.00012996674467298928, + "loss": 0.1798, + "step": 2848 + }, + { + "epoch": 1.0520679468242244, + "grad_norm": 0.22668612003326416, + "learning_rate": 0.0001299421110974258, + "loss": 0.1993, + "step": 2849 + }, + { + "epoch": 1.052437223042836, + "grad_norm": 0.28496038913726807, + "learning_rate": 0.00012991747752186229, + "loss": 0.283, + "step": 2850 + }, + { + "epoch": 1.052437223042836, + "eval_loss": 8.485991477966309, + "eval_runtime": 6.911, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 1.013, + "step": 2850 + }, + { + "epoch": 1.0528064992614476, + "grad_norm": 0.4445702135562897, + "learning_rate": 0.00012989284394629883, + "loss": 0.2598, + "step": 2851 + }, + { + "epoch": 1.053175775480059, + "grad_norm": 0.24115540087223053, + "learning_rate": 0.00012986821037073531, + "loss": 0.2482, + "step": 2852 + }, + { + "epoch": 1.0535450516986706, + "grad_norm": 0.24585190415382385, + "learning_rate": 0.00012984357679517183, + "loss": 0.1954, + "step": 2853 + }, + { + "epoch": 1.0539143279172822, + "grad_norm": 0.2481495589017868, + "learning_rate": 0.00012981894321960832, + "loss": 0.2114, + "step": 2854 + }, + { + "epoch": 1.0542836041358936, + "grad_norm": 0.25713133811950684, + "learning_rate": 0.00012979430964404483, + "loss": 0.2453, + "step": 2855 + }, + { + "epoch": 1.0546528803545052, + "grad_norm": 0.23062777519226074, + "learning_rate": 0.00012976967606848135, + "loss": 0.1944, + "step": 2856 + }, + { + "epoch": 1.0550221565731166, + "grad_norm": 0.25019457936286926, + "learning_rate": 0.00012974504249291786, + "loss": 0.2281, + "step": 2857 + }, + { + "epoch": 1.0553914327917282, + "grad_norm": 0.32851049304008484, + "learning_rate": 0.00012972040891735435, + "loss": 0.2215, + "step": 2858 + }, + { + "epoch": 1.0557607090103398, + "grad_norm": 0.2568628489971161, + "learning_rate": 0.00012969577534179086, + "loss": 0.2556, + "step": 2859 + }, + { + "epoch": 1.0561299852289512, + "grad_norm": 0.25231048464775085, + "learning_rate": 0.00012967114176622738, + "loss": 0.2075, + "step": 2860 + }, + { + "epoch": 1.0564992614475628, + "grad_norm": 0.28217604756355286, + "learning_rate": 0.0001296465081906639, + "loss": 0.2176, + "step": 2861 + }, + { + "epoch": 1.0568685376661744, + "grad_norm": 0.2996918857097626, + "learning_rate": 0.00012962187461510038, + "loss": 0.2595, + "step": 2862 + }, + { + "epoch": 1.0572378138847858, + "grad_norm": 0.2471679151058197, + "learning_rate": 0.0001295972410395369, + "loss": 0.208, + "step": 2863 + }, + { + "epoch": 1.0576070901033974, + "grad_norm": 0.3010309338569641, + "learning_rate": 0.00012957260746397338, + "loss": 0.2619, + "step": 2864 + }, + { + "epoch": 1.0579763663220088, + "grad_norm": 0.2689743638038635, + "learning_rate": 0.00012954797388840992, + "loss": 0.2686, + "step": 2865 + }, + { + "epoch": 1.0583456425406204, + "grad_norm": 0.2509540021419525, + "learning_rate": 0.0001295233403128464, + "loss": 0.2304, + "step": 2866 + }, + { + "epoch": 1.058714918759232, + "grad_norm": 0.23404230177402496, + "learning_rate": 0.00012949870673728293, + "loss": 0.2014, + "step": 2867 + }, + { + "epoch": 1.0590841949778433, + "grad_norm": 0.2765131890773773, + "learning_rate": 0.0001294740731617194, + "loss": 0.2585, + "step": 2868 + }, + { + "epoch": 1.059453471196455, + "grad_norm": 0.2308437079191208, + "learning_rate": 0.00012944943958615593, + "loss": 0.1909, + "step": 2869 + }, + { + "epoch": 1.0598227474150665, + "grad_norm": 0.28192681074142456, + "learning_rate": 0.00012942480601059244, + "loss": 0.2502, + "step": 2870 + }, + { + "epoch": 1.060192023633678, + "grad_norm": 0.31268104910850525, + "learning_rate": 0.00012940017243502896, + "loss": 0.276, + "step": 2871 + }, + { + "epoch": 1.0605612998522895, + "grad_norm": 0.21047550439834595, + "learning_rate": 0.00012937553885946544, + "loss": 0.185, + "step": 2872 + }, + { + "epoch": 1.0609305760709011, + "grad_norm": 0.3675978481769562, + "learning_rate": 0.00012935090528390196, + "loss": 0.2292, + "step": 2873 + }, + { + "epoch": 1.0612998522895125, + "grad_norm": 0.23332859575748444, + "learning_rate": 0.00012932627170833847, + "loss": 0.2299, + "step": 2874 + }, + { + "epoch": 1.0616691285081241, + "grad_norm": 0.24053311347961426, + "learning_rate": 0.000129301638132775, + "loss": 0.2154, + "step": 2875 + }, + { + "epoch": 1.0620384047267355, + "grad_norm": 0.2479230761528015, + "learning_rate": 0.00012927700455721148, + "loss": 0.2143, + "step": 2876 + }, + { + "epoch": 1.062407680945347, + "grad_norm": 0.24574218690395355, + "learning_rate": 0.000129252370981648, + "loss": 0.2133, + "step": 2877 + }, + { + "epoch": 1.0627769571639587, + "grad_norm": 0.262930691242218, + "learning_rate": 0.0001292277374060845, + "loss": 0.2316, + "step": 2878 + }, + { + "epoch": 1.06314623338257, + "grad_norm": 0.30103567242622375, + "learning_rate": 0.00012920310383052102, + "loss": 0.2468, + "step": 2879 + }, + { + "epoch": 1.0635155096011817, + "grad_norm": 0.2641828954219818, + "learning_rate": 0.0001291784702549575, + "loss": 0.2485, + "step": 2880 + }, + { + "epoch": 1.0638847858197933, + "grad_norm": 0.265886127948761, + "learning_rate": 0.00012915383667939402, + "loss": 0.2054, + "step": 2881 + }, + { + "epoch": 1.0642540620384047, + "grad_norm": 0.25658199191093445, + "learning_rate": 0.0001291292031038305, + "loss": 0.2458, + "step": 2882 + }, + { + "epoch": 1.0646233382570163, + "grad_norm": 0.2215060293674469, + "learning_rate": 0.00012910456952826705, + "loss": 0.2039, + "step": 2883 + }, + { + "epoch": 1.0649926144756279, + "grad_norm": 0.2803214490413666, + "learning_rate": 0.00012907993595270354, + "loss": 0.2148, + "step": 2884 + }, + { + "epoch": 1.0653618906942393, + "grad_norm": 0.2311931699514389, + "learning_rate": 0.00012905530237714005, + "loss": 0.2142, + "step": 2885 + }, + { + "epoch": 1.0657311669128509, + "grad_norm": 0.3004252314567566, + "learning_rate": 0.00012903066880157654, + "loss": 0.3018, + "step": 2886 + }, + { + "epoch": 1.0661004431314622, + "grad_norm": 0.24119549989700317, + "learning_rate": 0.00012900603522601306, + "loss": 0.2191, + "step": 2887 + }, + { + "epoch": 1.0664697193500738, + "grad_norm": 0.24185258150100708, + "learning_rate": 0.00012898140165044957, + "loss": 0.2293, + "step": 2888 + }, + { + "epoch": 1.0668389955686854, + "grad_norm": 0.3597058653831482, + "learning_rate": 0.00012895676807488609, + "loss": 0.3047, + "step": 2889 + }, + { + "epoch": 1.0672082717872968, + "grad_norm": 0.25713691115379333, + "learning_rate": 0.00012893213449932257, + "loss": 0.2349, + "step": 2890 + }, + { + "epoch": 1.0675775480059084, + "grad_norm": 0.2724282443523407, + "learning_rate": 0.0001289075009237591, + "loss": 0.2672, + "step": 2891 + }, + { + "epoch": 1.06794682422452, + "grad_norm": 0.2510925829410553, + "learning_rate": 0.0001288828673481956, + "loss": 0.2551, + "step": 2892 + }, + { + "epoch": 1.0683161004431314, + "grad_norm": 0.24488314986228943, + "learning_rate": 0.00012885823377263212, + "loss": 0.2156, + "step": 2893 + }, + { + "epoch": 1.068685376661743, + "grad_norm": 0.29250568151474, + "learning_rate": 0.0001288336001970686, + "loss": 0.2232, + "step": 2894 + }, + { + "epoch": 1.0690546528803546, + "grad_norm": 0.22155694663524628, + "learning_rate": 0.00012880896662150512, + "loss": 0.2371, + "step": 2895 + }, + { + "epoch": 1.069423929098966, + "grad_norm": 0.2740866243839264, + "learning_rate": 0.0001287843330459416, + "loss": 0.2544, + "step": 2896 + }, + { + "epoch": 1.0697932053175776, + "grad_norm": 0.294709175825119, + "learning_rate": 0.00012875969947037815, + "loss": 0.2737, + "step": 2897 + }, + { + "epoch": 1.070162481536189, + "grad_norm": 0.2706703841686249, + "learning_rate": 0.00012873506589481464, + "loss": 0.2188, + "step": 2898 + }, + { + "epoch": 1.0705317577548006, + "grad_norm": 0.2892598807811737, + "learning_rate": 0.00012871043231925115, + "loss": 0.2327, + "step": 2899 + }, + { + "epoch": 1.0709010339734122, + "grad_norm": 0.3209196925163269, + "learning_rate": 0.00012868579874368764, + "loss": 0.2298, + "step": 2900 + }, + { + "epoch": 1.0709010339734122, + "eval_loss": 8.449867248535156, + "eval_runtime": 6.9196, + "eval_samples_per_second": 7.226, + "eval_steps_per_second": 1.012, + "step": 2900 + }, + { + "epoch": 1.0712703101920236, + "grad_norm": 0.3323870301246643, + "learning_rate": 0.00012866116516812415, + "loss": 0.2279, + "step": 2901 + }, + { + "epoch": 1.0716395864106352, + "grad_norm": 0.25287455320358276, + "learning_rate": 0.00012863653159256067, + "loss": 0.2181, + "step": 2902 + }, + { + "epoch": 1.0720088626292468, + "grad_norm": 0.32346925139427185, + "learning_rate": 0.00012861189801699718, + "loss": 0.2169, + "step": 2903 + }, + { + "epoch": 1.0723781388478582, + "grad_norm": 0.299011766910553, + "learning_rate": 0.00012858726444143367, + "loss": 0.2431, + "step": 2904 + }, + { + "epoch": 1.0727474150664698, + "grad_norm": 0.2364770621061325, + "learning_rate": 0.00012856263086587018, + "loss": 0.2504, + "step": 2905 + }, + { + "epoch": 1.0731166912850811, + "grad_norm": 0.19738894701004028, + "learning_rate": 0.0001285379972903067, + "loss": 0.199, + "step": 2906 + }, + { + "epoch": 1.0734859675036927, + "grad_norm": 0.3166182041168213, + "learning_rate": 0.0001285133637147432, + "loss": 0.2804, + "step": 2907 + }, + { + "epoch": 1.0738552437223043, + "grad_norm": 0.2700210213661194, + "learning_rate": 0.0001284887301391797, + "loss": 0.2365, + "step": 2908 + }, + { + "epoch": 1.0742245199409157, + "grad_norm": 0.223306804895401, + "learning_rate": 0.00012846409656361622, + "loss": 0.2357, + "step": 2909 + }, + { + "epoch": 1.0745937961595273, + "grad_norm": 0.19979429244995117, + "learning_rate": 0.00012843946298805273, + "loss": 0.1915, + "step": 2910 + }, + { + "epoch": 1.074963072378139, + "grad_norm": 0.2926329970359802, + "learning_rate": 0.00012841482941248924, + "loss": 0.2597, + "step": 2911 + }, + { + "epoch": 1.0753323485967503, + "grad_norm": 0.2393726259469986, + "learning_rate": 0.00012839019583692573, + "loss": 0.2144, + "step": 2912 + }, + { + "epoch": 1.075701624815362, + "grad_norm": 0.24303549528121948, + "learning_rate": 0.00012836556226136225, + "loss": 0.2371, + "step": 2913 + }, + { + "epoch": 1.0760709010339735, + "grad_norm": 0.29544028639793396, + "learning_rate": 0.00012834092868579873, + "loss": 0.2682, + "step": 2914 + }, + { + "epoch": 1.076440177252585, + "grad_norm": 0.24073255062103271, + "learning_rate": 0.00012831629511023528, + "loss": 0.2031, + "step": 2915 + }, + { + "epoch": 1.0768094534711965, + "grad_norm": 0.270773321390152, + "learning_rate": 0.00012829166153467176, + "loss": 0.2121, + "step": 2916 + }, + { + "epoch": 1.0771787296898079, + "grad_norm": 0.3140622675418854, + "learning_rate": 0.00012826702795910828, + "loss": 0.2594, + "step": 2917 + }, + { + "epoch": 1.0775480059084195, + "grad_norm": 0.2280280441045761, + "learning_rate": 0.00012824239438354477, + "loss": 0.2079, + "step": 2918 + }, + { + "epoch": 1.077917282127031, + "grad_norm": 0.3007722795009613, + "learning_rate": 0.00012821776080798128, + "loss": 0.2212, + "step": 2919 + }, + { + "epoch": 1.0782865583456425, + "grad_norm": 0.2359761893749237, + "learning_rate": 0.0001281931272324178, + "loss": 0.2323, + "step": 2920 + }, + { + "epoch": 1.078655834564254, + "grad_norm": 0.20551949739456177, + "learning_rate": 0.0001281684936568543, + "loss": 0.1685, + "step": 2921 + }, + { + "epoch": 1.0790251107828657, + "grad_norm": 0.27626994252204895, + "learning_rate": 0.0001281438600812908, + "loss": 0.2541, + "step": 2922 + }, + { + "epoch": 1.079394387001477, + "grad_norm": 0.24405014514923096, + "learning_rate": 0.0001281192265057273, + "loss": 0.2323, + "step": 2923 + }, + { + "epoch": 1.0797636632200887, + "grad_norm": 0.28040629625320435, + "learning_rate": 0.00012809459293016383, + "loss": 0.2573, + "step": 2924 + }, + { + "epoch": 1.0801329394387, + "grad_norm": 0.3102937638759613, + "learning_rate": 0.00012806995935460034, + "loss": 0.2335, + "step": 2925 + }, + { + "epoch": 1.0805022156573116, + "grad_norm": 0.2559118866920471, + "learning_rate": 0.00012804532577903683, + "loss": 0.2668, + "step": 2926 + }, + { + "epoch": 1.0808714918759232, + "grad_norm": 0.22850985825061798, + "learning_rate": 0.00012802069220347334, + "loss": 0.1968, + "step": 2927 + }, + { + "epoch": 1.0812407680945346, + "grad_norm": 0.24528613686561584, + "learning_rate": 0.00012799605862790983, + "loss": 0.2349, + "step": 2928 + }, + { + "epoch": 1.0816100443131462, + "grad_norm": 0.23558145761489868, + "learning_rate": 0.00012797142505234637, + "loss": 0.1909, + "step": 2929 + }, + { + "epoch": 1.0819793205317578, + "grad_norm": 0.27739837765693665, + "learning_rate": 0.00012794679147678286, + "loss": 0.2368, + "step": 2930 + }, + { + "epoch": 1.0823485967503692, + "grad_norm": 0.23819200694561005, + "learning_rate": 0.00012792215790121938, + "loss": 0.2123, + "step": 2931 + }, + { + "epoch": 1.0827178729689808, + "grad_norm": 0.26758721470832825, + "learning_rate": 0.00012789752432565586, + "loss": 0.2514, + "step": 2932 + }, + { + "epoch": 1.0830871491875924, + "grad_norm": 0.2250298708677292, + "learning_rate": 0.00012787289075009238, + "loss": 0.2114, + "step": 2933 + }, + { + "epoch": 1.0834564254062038, + "grad_norm": 0.2457333356142044, + "learning_rate": 0.0001278482571745289, + "loss": 0.2165, + "step": 2934 + }, + { + "epoch": 1.0838257016248154, + "grad_norm": 0.3151920735836029, + "learning_rate": 0.0001278236235989654, + "loss": 0.2739, + "step": 2935 + }, + { + "epoch": 1.0841949778434268, + "grad_norm": 0.2202668935060501, + "learning_rate": 0.0001277989900234019, + "loss": 0.2016, + "step": 2936 + }, + { + "epoch": 1.0845642540620384, + "grad_norm": 0.21768949925899506, + "learning_rate": 0.0001277743564478384, + "loss": 0.2216, + "step": 2937 + }, + { + "epoch": 1.08493353028065, + "grad_norm": 0.3253072500228882, + "learning_rate": 0.00012774972287227492, + "loss": 0.2749, + "step": 2938 + }, + { + "epoch": 1.0853028064992614, + "grad_norm": 0.308607280254364, + "learning_rate": 0.00012772508929671144, + "loss": 0.2634, + "step": 2939 + }, + { + "epoch": 1.085672082717873, + "grad_norm": 0.23289059102535248, + "learning_rate": 0.00012770045572114793, + "loss": 0.215, + "step": 2940 + }, + { + "epoch": 1.0860413589364846, + "grad_norm": 0.2398148626089096, + "learning_rate": 0.00012767582214558444, + "loss": 0.2272, + "step": 2941 + }, + { + "epoch": 1.086410635155096, + "grad_norm": 0.2810850441455841, + "learning_rate": 0.00012765118857002093, + "loss": 0.2751, + "step": 2942 + }, + { + "epoch": 1.0867799113737076, + "grad_norm": 0.2803725004196167, + "learning_rate": 0.00012762655499445747, + "loss": 0.26, + "step": 2943 + }, + { + "epoch": 1.0871491875923192, + "grad_norm": 0.2764188349246979, + "learning_rate": 0.00012760192141889396, + "loss": 0.2535, + "step": 2944 + }, + { + "epoch": 1.0875184638109305, + "grad_norm": 0.22685247659683228, + "learning_rate": 0.00012757728784333047, + "loss": 0.1998, + "step": 2945 + }, + { + "epoch": 1.0878877400295421, + "grad_norm": 0.257934033870697, + "learning_rate": 0.00012755265426776696, + "loss": 0.2091, + "step": 2946 + }, + { + "epoch": 1.0882570162481535, + "grad_norm": 0.20494507253170013, + "learning_rate": 0.00012752802069220347, + "loss": 0.2097, + "step": 2947 + }, + { + "epoch": 1.0886262924667651, + "grad_norm": 0.2264721691608429, + "learning_rate": 0.00012750338711664, + "loss": 0.1818, + "step": 2948 + }, + { + "epoch": 1.0889955686853767, + "grad_norm": 0.2269432097673416, + "learning_rate": 0.0001274787535410765, + "loss": 0.2267, + "step": 2949 + }, + { + "epoch": 1.089364844903988, + "grad_norm": 0.25488942861557007, + "learning_rate": 0.000127454119965513, + "loss": 0.2135, + "step": 2950 + }, + { + "epoch": 1.089364844903988, + "eval_loss": 8.238913536071777, + "eval_runtime": 6.9152, + "eval_samples_per_second": 7.23, + "eval_steps_per_second": 1.012, + "step": 2950 + }, + { + "epoch": 1.0897341211225997, + "grad_norm": 0.22945912182331085, + "learning_rate": 0.0001274294863899495, + "loss": 0.1788, + "step": 2951 + }, + { + "epoch": 1.0901033973412113, + "grad_norm": 0.2284742146730423, + "learning_rate": 0.00012740485281438602, + "loss": 0.2261, + "step": 2952 + }, + { + "epoch": 1.0904726735598227, + "grad_norm": 0.2353018969297409, + "learning_rate": 0.00012738021923882253, + "loss": 0.2225, + "step": 2953 + }, + { + "epoch": 1.0908419497784343, + "grad_norm": 0.23720666766166687, + "learning_rate": 0.00012735558566325902, + "loss": 0.2322, + "step": 2954 + }, + { + "epoch": 1.091211225997046, + "grad_norm": 0.3234037458896637, + "learning_rate": 0.00012733095208769554, + "loss": 0.2583, + "step": 2955 + }, + { + "epoch": 1.0915805022156573, + "grad_norm": 0.2765007019042969, + "learning_rate": 0.00012730631851213205, + "loss": 0.2408, + "step": 2956 + }, + { + "epoch": 1.0919497784342689, + "grad_norm": 0.27248650789260864, + "learning_rate": 0.00012728168493656857, + "loss": 0.2286, + "step": 2957 + }, + { + "epoch": 1.0923190546528803, + "grad_norm": 0.25097253918647766, + "learning_rate": 0.00012725705136100505, + "loss": 0.2369, + "step": 2958 + }, + { + "epoch": 1.0926883308714919, + "grad_norm": 0.31896355748176575, + "learning_rate": 0.00012723241778544157, + "loss": 0.2813, + "step": 2959 + }, + { + "epoch": 1.0930576070901035, + "grad_norm": 0.27294644713401794, + "learning_rate": 0.00012720778420987806, + "loss": 0.2329, + "step": 2960 + }, + { + "epoch": 1.0934268833087148, + "grad_norm": 0.2927689552307129, + "learning_rate": 0.0001271831506343146, + "loss": 0.2493, + "step": 2961 + }, + { + "epoch": 1.0937961595273265, + "grad_norm": 0.2979494035243988, + "learning_rate": 0.00012715851705875108, + "loss": 0.2271, + "step": 2962 + }, + { + "epoch": 1.094165435745938, + "grad_norm": 0.24813397228717804, + "learning_rate": 0.0001271338834831876, + "loss": 0.2285, + "step": 2963 + }, + { + "epoch": 1.0945347119645494, + "grad_norm": 0.26949915289878845, + "learning_rate": 0.0001271092499076241, + "loss": 0.2426, + "step": 2964 + }, + { + "epoch": 1.094903988183161, + "grad_norm": 0.22388923168182373, + "learning_rate": 0.0001270846163320606, + "loss": 0.1951, + "step": 2965 + }, + { + "epoch": 1.0952732644017726, + "grad_norm": 0.27401968836784363, + "learning_rate": 0.00012705998275649712, + "loss": 0.2389, + "step": 2966 + }, + { + "epoch": 1.095642540620384, + "grad_norm": 0.2172209769487381, + "learning_rate": 0.00012703534918093363, + "loss": 0.1899, + "step": 2967 + }, + { + "epoch": 1.0960118168389956, + "grad_norm": 0.37609565258026123, + "learning_rate": 0.00012701071560537012, + "loss": 0.2729, + "step": 2968 + }, + { + "epoch": 1.096381093057607, + "grad_norm": 0.28436267375946045, + "learning_rate": 0.00012698608202980663, + "loss": 0.2682, + "step": 2969 + }, + { + "epoch": 1.0967503692762186, + "grad_norm": 0.2859165668487549, + "learning_rate": 0.00012696144845424315, + "loss": 0.2292, + "step": 2970 + }, + { + "epoch": 1.0971196454948302, + "grad_norm": 0.22722071409225464, + "learning_rate": 0.00012693681487867966, + "loss": 0.2015, + "step": 2971 + }, + { + "epoch": 1.0974889217134416, + "grad_norm": 0.3454102873802185, + "learning_rate": 0.00012691218130311615, + "loss": 0.2831, + "step": 2972 + }, + { + "epoch": 1.0978581979320532, + "grad_norm": 0.24200601875782013, + "learning_rate": 0.00012688754772755266, + "loss": 0.2212, + "step": 2973 + }, + { + "epoch": 1.0982274741506648, + "grad_norm": 0.22822780907154083, + "learning_rate": 0.00012686291415198915, + "loss": 0.2085, + "step": 2974 + }, + { + "epoch": 1.0985967503692762, + "grad_norm": 0.27999043464660645, + "learning_rate": 0.0001268382805764257, + "loss": 0.2645, + "step": 2975 + }, + { + "epoch": 1.0989660265878878, + "grad_norm": 0.27863240242004395, + "learning_rate": 0.00012681364700086218, + "loss": 0.2497, + "step": 2976 + }, + { + "epoch": 1.0993353028064992, + "grad_norm": 0.26704660058021545, + "learning_rate": 0.0001267890134252987, + "loss": 0.2752, + "step": 2977 + }, + { + "epoch": 1.0997045790251108, + "grad_norm": 0.2555709779262543, + "learning_rate": 0.00012676437984973518, + "loss": 0.3211, + "step": 2978 + }, + { + "epoch": 1.1000738552437224, + "grad_norm": 0.24935823678970337, + "learning_rate": 0.0001267397462741717, + "loss": 0.2223, + "step": 2979 + }, + { + "epoch": 1.1004431314623337, + "grad_norm": 0.23784668743610382, + "learning_rate": 0.0001267151126986082, + "loss": 0.2163, + "step": 2980 + }, + { + "epoch": 1.1008124076809453, + "grad_norm": 0.2791133522987366, + "learning_rate": 0.00012669047912304473, + "loss": 0.2396, + "step": 2981 + }, + { + "epoch": 1.101181683899557, + "grad_norm": 0.3145526349544525, + "learning_rate": 0.00012666584554748122, + "loss": 0.2359, + "step": 2982 + }, + { + "epoch": 1.1015509601181683, + "grad_norm": 0.2756912410259247, + "learning_rate": 0.00012664121197191773, + "loss": 0.2384, + "step": 2983 + }, + { + "epoch": 1.10192023633678, + "grad_norm": 0.33269283175468445, + "learning_rate": 0.00012661657839635424, + "loss": 0.2534, + "step": 2984 + }, + { + "epoch": 1.1022895125553913, + "grad_norm": 0.24811066687107086, + "learning_rate": 0.00012659194482079076, + "loss": 0.2326, + "step": 2985 + }, + { + "epoch": 1.102658788774003, + "grad_norm": 0.23978480696678162, + "learning_rate": 0.00012656731124522725, + "loss": 0.2432, + "step": 2986 + }, + { + "epoch": 1.1030280649926145, + "grad_norm": 0.2566649913787842, + "learning_rate": 0.00012654267766966376, + "loss": 0.2234, + "step": 2987 + }, + { + "epoch": 1.103397341211226, + "grad_norm": 0.2305290848016739, + "learning_rate": 0.00012651804409410028, + "loss": 0.2126, + "step": 2988 + }, + { + "epoch": 1.1037666174298375, + "grad_norm": 0.2616018056869507, + "learning_rate": 0.0001264934105185368, + "loss": 0.2396, + "step": 2989 + }, + { + "epoch": 1.104135893648449, + "grad_norm": 0.30988749861717224, + "learning_rate": 0.00012646877694297328, + "loss": 0.2591, + "step": 2990 + }, + { + "epoch": 1.1045051698670605, + "grad_norm": 0.2921716272830963, + "learning_rate": 0.0001264441433674098, + "loss": 0.2412, + "step": 2991 + }, + { + "epoch": 1.104874446085672, + "grad_norm": 0.32811033725738525, + "learning_rate": 0.00012641950979184628, + "loss": 0.2298, + "step": 2992 + }, + { + "epoch": 1.1052437223042837, + "grad_norm": 0.3138335943222046, + "learning_rate": 0.00012639487621628282, + "loss": 0.2541, + "step": 2993 + }, + { + "epoch": 1.105612998522895, + "grad_norm": 0.24116988480091095, + "learning_rate": 0.0001263702426407193, + "loss": 0.2122, + "step": 2994 + }, + { + "epoch": 1.1059822747415067, + "grad_norm": 0.2491578310728073, + "learning_rate": 0.00012634560906515582, + "loss": 0.2031, + "step": 2995 + }, + { + "epoch": 1.106351550960118, + "grad_norm": 0.22991780936717987, + "learning_rate": 0.0001263209754895923, + "loss": 0.2192, + "step": 2996 + }, + { + "epoch": 1.1067208271787297, + "grad_norm": 0.2700033485889435, + "learning_rate": 0.00012629634191402883, + "loss": 0.2228, + "step": 2997 + }, + { + "epoch": 1.1070901033973413, + "grad_norm": 0.34385159611701965, + "learning_rate": 0.00012627170833846534, + "loss": 0.2703, + "step": 2998 + }, + { + "epoch": 1.1074593796159526, + "grad_norm": 0.24987593293190002, + "learning_rate": 0.00012624707476290186, + "loss": 0.1976, + "step": 2999 + }, + { + "epoch": 1.1078286558345642, + "grad_norm": 0.27670469880104065, + "learning_rate": 0.00012622244118733834, + "loss": 0.229, + "step": 3000 + }, + { + "epoch": 1.1078286558345642, + "eval_loss": 8.378609657287598, + "eval_runtime": 6.9242, + "eval_samples_per_second": 7.221, + "eval_steps_per_second": 1.011, + "step": 3000 + }, + { + "epoch": 1.1081979320531758, + "grad_norm": 0.2337518185377121, + "learning_rate": 0.00012619780761177486, + "loss": 0.2047, + "step": 3001 + }, + { + "epoch": 1.1085672082717872, + "grad_norm": 0.2378792017698288, + "learning_rate": 0.00012617317403621137, + "loss": 0.2242, + "step": 3002 + }, + { + "epoch": 1.1089364844903988, + "grad_norm": 0.2966460883617401, + "learning_rate": 0.0001261485404606479, + "loss": 0.2498, + "step": 3003 + }, + { + "epoch": 1.1093057607090104, + "grad_norm": 0.22803139686584473, + "learning_rate": 0.00012612390688508437, + "loss": 0.1894, + "step": 3004 + }, + { + "epoch": 1.1096750369276218, + "grad_norm": 0.2894534468650818, + "learning_rate": 0.0001260992733095209, + "loss": 0.2343, + "step": 3005 + }, + { + "epoch": 1.1100443131462334, + "grad_norm": 0.24783295392990112, + "learning_rate": 0.00012607463973395738, + "loss": 0.2359, + "step": 3006 + }, + { + "epoch": 1.1104135893648448, + "grad_norm": 0.23934906721115112, + "learning_rate": 0.00012605000615839392, + "loss": 0.196, + "step": 3007 + }, + { + "epoch": 1.1107828655834564, + "grad_norm": 0.29602378606796265, + "learning_rate": 0.0001260253725828304, + "loss": 0.2254, + "step": 3008 + }, + { + "epoch": 1.111152141802068, + "grad_norm": 0.2830478250980377, + "learning_rate": 0.00012600073900726692, + "loss": 0.2309, + "step": 3009 + }, + { + "epoch": 1.1115214180206794, + "grad_norm": 0.29829126596450806, + "learning_rate": 0.0001259761054317034, + "loss": 0.234, + "step": 3010 + }, + { + "epoch": 1.111890694239291, + "grad_norm": 0.25990650057792664, + "learning_rate": 0.00012595147185613992, + "loss": 0.2133, + "step": 3011 + }, + { + "epoch": 1.1122599704579026, + "grad_norm": 0.22677458822727203, + "learning_rate": 0.00012592683828057644, + "loss": 0.2365, + "step": 3012 + }, + { + "epoch": 1.112629246676514, + "grad_norm": 0.2427213490009308, + "learning_rate": 0.00012590220470501295, + "loss": 0.1889, + "step": 3013 + }, + { + "epoch": 1.1129985228951256, + "grad_norm": 0.2340569943189621, + "learning_rate": 0.00012587757112944944, + "loss": 0.1966, + "step": 3014 + }, + { + "epoch": 1.1133677991137372, + "grad_norm": 0.33453550934791565, + "learning_rate": 0.00012585293755388595, + "loss": 0.2963, + "step": 3015 + }, + { + "epoch": 1.1137370753323486, + "grad_norm": 0.27313587069511414, + "learning_rate": 0.00012582830397832247, + "loss": 0.2069, + "step": 3016 + }, + { + "epoch": 1.1141063515509602, + "grad_norm": 0.2653758227825165, + "learning_rate": 0.00012580367040275898, + "loss": 0.2707, + "step": 3017 + }, + { + "epoch": 1.1144756277695715, + "grad_norm": 0.26495906710624695, + "learning_rate": 0.00012577903682719547, + "loss": 0.2491, + "step": 3018 + }, + { + "epoch": 1.1148449039881831, + "grad_norm": 0.2761906087398529, + "learning_rate": 0.00012575440325163199, + "loss": 0.2072, + "step": 3019 + }, + { + "epoch": 1.1152141802067947, + "grad_norm": 0.2400861233472824, + "learning_rate": 0.0001257297696760685, + "loss": 0.1899, + "step": 3020 + }, + { + "epoch": 1.1155834564254061, + "grad_norm": 0.2813641130924225, + "learning_rate": 0.000125705136100505, + "loss": 0.218, + "step": 3021 + }, + { + "epoch": 1.1159527326440177, + "grad_norm": 0.2475597858428955, + "learning_rate": 0.0001256805025249415, + "loss": 0.198, + "step": 3022 + }, + { + "epoch": 1.1163220088626293, + "grad_norm": 0.2877148985862732, + "learning_rate": 0.000125655868949378, + "loss": 0.2599, + "step": 3023 + }, + { + "epoch": 1.1166912850812407, + "grad_norm": 0.2387334257364273, + "learning_rate": 0.0001256312353738145, + "loss": 0.2205, + "step": 3024 + }, + { + "epoch": 1.1170605612998523, + "grad_norm": 0.21745145320892334, + "learning_rate": 0.00012560660179825102, + "loss": 0.1983, + "step": 3025 + }, + { + "epoch": 1.117429837518464, + "grad_norm": 0.22784286737442017, + "learning_rate": 0.00012558196822268753, + "loss": 0.2139, + "step": 3026 + }, + { + "epoch": 1.1177991137370753, + "grad_norm": 0.26237693428993225, + "learning_rate": 0.00012555733464712402, + "loss": 0.2487, + "step": 3027 + }, + { + "epoch": 1.118168389955687, + "grad_norm": 0.2777375876903534, + "learning_rate": 0.00012553270107156054, + "loss": 0.2358, + "step": 3028 + }, + { + "epoch": 1.1185376661742983, + "grad_norm": 0.2025119960308075, + "learning_rate": 0.00012550806749599705, + "loss": 0.188, + "step": 3029 + }, + { + "epoch": 1.1189069423929099, + "grad_norm": 0.2220081239938736, + "learning_rate": 0.00012548343392043357, + "loss": 0.2169, + "step": 3030 + }, + { + "epoch": 1.1192762186115215, + "grad_norm": 0.23842179775238037, + "learning_rate": 0.00012545880034487005, + "loss": 0.2256, + "step": 3031 + }, + { + "epoch": 1.1196454948301329, + "grad_norm": 0.23575958609580994, + "learning_rate": 0.00012543416676930657, + "loss": 0.1972, + "step": 3032 + }, + { + "epoch": 1.1200147710487445, + "grad_norm": 0.2646510601043701, + "learning_rate": 0.00012540953319374306, + "loss": 0.2872, + "step": 3033 + }, + { + "epoch": 1.120384047267356, + "grad_norm": 0.3003496825695038, + "learning_rate": 0.0001253848996181796, + "loss": 0.2201, + "step": 3034 + }, + { + "epoch": 1.1207533234859675, + "grad_norm": 0.2522369623184204, + "learning_rate": 0.00012536026604261608, + "loss": 0.2261, + "step": 3035 + }, + { + "epoch": 1.121122599704579, + "grad_norm": 0.24043703079223633, + "learning_rate": 0.0001253356324670526, + "loss": 0.2076, + "step": 3036 + }, + { + "epoch": 1.1214918759231907, + "grad_norm": 0.2606269121170044, + "learning_rate": 0.0001253109988914891, + "loss": 0.2108, + "step": 3037 + }, + { + "epoch": 1.121861152141802, + "grad_norm": 0.3020753264427185, + "learning_rate": 0.0001252863653159256, + "loss": 0.2785, + "step": 3038 + }, + { + "epoch": 1.1222304283604136, + "grad_norm": 0.3187282979488373, + "learning_rate": 0.00012526173174036212, + "loss": 0.2598, + "step": 3039 + }, + { + "epoch": 1.122599704579025, + "grad_norm": 0.29384467005729675, + "learning_rate": 0.00012523709816479863, + "loss": 0.2582, + "step": 3040 + }, + { + "epoch": 1.1229689807976366, + "grad_norm": 0.23518045246601105, + "learning_rate": 0.00012521246458923512, + "loss": 0.2195, + "step": 3041 + }, + { + "epoch": 1.1233382570162482, + "grad_norm": 0.23523001372814178, + "learning_rate": 0.00012518783101367163, + "loss": 0.2244, + "step": 3042 + }, + { + "epoch": 1.1237075332348596, + "grad_norm": 0.27626824378967285, + "learning_rate": 0.00012516319743810815, + "loss": 0.2256, + "step": 3043 + }, + { + "epoch": 1.1240768094534712, + "grad_norm": 0.22142411768436432, + "learning_rate": 0.00012513856386254466, + "loss": 0.2067, + "step": 3044 + }, + { + "epoch": 1.1244460856720828, + "grad_norm": 0.2683732807636261, + "learning_rate": 0.00012511393028698115, + "loss": 0.2683, + "step": 3045 + }, + { + "epoch": 1.1248153618906942, + "grad_norm": 0.23579442501068115, + "learning_rate": 0.00012508929671141766, + "loss": 0.2072, + "step": 3046 + }, + { + "epoch": 1.1251846381093058, + "grad_norm": 0.29119208455085754, + "learning_rate": 0.00012506466313585418, + "loss": 0.2518, + "step": 3047 + }, + { + "epoch": 1.1255539143279174, + "grad_norm": 0.21760676801204681, + "learning_rate": 0.0001250400295602907, + "loss": 0.1986, + "step": 3048 + }, + { + "epoch": 1.1259231905465288, + "grad_norm": 0.23465099930763245, + "learning_rate": 0.00012501539598472718, + "loss": 0.2109, + "step": 3049 + }, + { + "epoch": 1.1262924667651404, + "grad_norm": 0.2772041857242584, + "learning_rate": 0.0001249907624091637, + "loss": 0.2235, + "step": 3050 + }, + { + "epoch": 1.1262924667651404, + "eval_loss": 8.450030326843262, + "eval_runtime": 6.9101, + "eval_samples_per_second": 7.236, + "eval_steps_per_second": 1.013, + "step": 3050 + }, + { + "epoch": 1.1266617429837518, + "grad_norm": 0.2420531064271927, + "learning_rate": 0.00012496612883360018, + "loss": 0.2297, + "step": 3051 + }, + { + "epoch": 1.1270310192023634, + "grad_norm": 0.2992003858089447, + "learning_rate": 0.00012494149525803672, + "loss": 0.2802, + "step": 3052 + }, + { + "epoch": 1.127400295420975, + "grad_norm": 0.2346888780593872, + "learning_rate": 0.0001249168616824732, + "loss": 0.199, + "step": 3053 + }, + { + "epoch": 1.1277695716395864, + "grad_norm": 0.3090643882751465, + "learning_rate": 0.00012489222810690973, + "loss": 0.2611, + "step": 3054 + }, + { + "epoch": 1.128138847858198, + "grad_norm": 0.25954845547676086, + "learning_rate": 0.00012486759453134621, + "loss": 0.2199, + "step": 3055 + }, + { + "epoch": 1.1285081240768093, + "grad_norm": 0.24739718437194824, + "learning_rate": 0.00012484296095578273, + "loss": 0.2275, + "step": 3056 + }, + { + "epoch": 1.128877400295421, + "grad_norm": 0.2381451576948166, + "learning_rate": 0.00012481832738021924, + "loss": 0.2427, + "step": 3057 + }, + { + "epoch": 1.1292466765140325, + "grad_norm": 0.2932181656360626, + "learning_rate": 0.00012479369380465576, + "loss": 0.2306, + "step": 3058 + }, + { + "epoch": 1.129615952732644, + "grad_norm": 0.30831512808799744, + "learning_rate": 0.00012476906022909225, + "loss": 0.2551, + "step": 3059 + }, + { + "epoch": 1.1299852289512555, + "grad_norm": 0.25685325264930725, + "learning_rate": 0.00012474442665352876, + "loss": 0.2314, + "step": 3060 + }, + { + "epoch": 1.1303545051698671, + "grad_norm": 0.2983797490596771, + "learning_rate": 0.00012471979307796528, + "loss": 0.2793, + "step": 3061 + }, + { + "epoch": 1.1307237813884785, + "grad_norm": 0.2818678915500641, + "learning_rate": 0.0001246951595024018, + "loss": 0.2701, + "step": 3062 + }, + { + "epoch": 1.1310930576070901, + "grad_norm": 0.23537462949752808, + "learning_rate": 0.00012467052592683828, + "loss": 0.2541, + "step": 3063 + }, + { + "epoch": 1.1314623338257017, + "grad_norm": 0.25887569785118103, + "learning_rate": 0.0001246458923512748, + "loss": 0.1958, + "step": 3064 + }, + { + "epoch": 1.131831610044313, + "grad_norm": 0.22699254751205444, + "learning_rate": 0.00012462125877571128, + "loss": 0.2061, + "step": 3065 + }, + { + "epoch": 1.1322008862629247, + "grad_norm": 0.2543795704841614, + "learning_rate": 0.00012459662520014782, + "loss": 0.2033, + "step": 3066 + }, + { + "epoch": 1.132570162481536, + "grad_norm": 0.25490936636924744, + "learning_rate": 0.0001245719916245843, + "loss": 0.2585, + "step": 3067 + }, + { + "epoch": 1.1329394387001477, + "grad_norm": 0.3246995210647583, + "learning_rate": 0.00012454735804902082, + "loss": 0.2819, + "step": 3068 + }, + { + "epoch": 1.1333087149187593, + "grad_norm": 0.2574843764305115, + "learning_rate": 0.0001245227244734573, + "loss": 0.2566, + "step": 3069 + }, + { + "epoch": 1.1336779911373707, + "grad_norm": 0.23957401514053345, + "learning_rate": 0.00012449809089789383, + "loss": 0.2236, + "step": 3070 + }, + { + "epoch": 1.1340472673559823, + "grad_norm": 0.2911028563976288, + "learning_rate": 0.00012447345732233034, + "loss": 0.2904, + "step": 3071 + }, + { + "epoch": 1.1344165435745939, + "grad_norm": 0.2553776800632477, + "learning_rate": 0.00012444882374676686, + "loss": 0.2233, + "step": 3072 + }, + { + "epoch": 1.1347858197932053, + "grad_norm": 0.27068206667900085, + "learning_rate": 0.00012442419017120334, + "loss": 0.2384, + "step": 3073 + }, + { + "epoch": 1.1351550960118169, + "grad_norm": 0.2106127291917801, + "learning_rate": 0.00012439955659563986, + "loss": 0.2016, + "step": 3074 + }, + { + "epoch": 1.1355243722304285, + "grad_norm": 0.24056026339530945, + "learning_rate": 0.00012437492302007637, + "loss": 0.2056, + "step": 3075 + }, + { + "epoch": 1.1358936484490398, + "grad_norm": 0.27358096837997437, + "learning_rate": 0.0001243502894445129, + "loss": 0.2096, + "step": 3076 + }, + { + "epoch": 1.1362629246676514, + "grad_norm": 0.30243122577667236, + "learning_rate": 0.00012432565586894937, + "loss": 0.2587, + "step": 3077 + }, + { + "epoch": 1.1366322008862628, + "grad_norm": 0.2903537154197693, + "learning_rate": 0.0001243010222933859, + "loss": 0.2595, + "step": 3078 + }, + { + "epoch": 1.1370014771048744, + "grad_norm": 0.25704383850097656, + "learning_rate": 0.00012427638871782238, + "loss": 0.2283, + "step": 3079 + }, + { + "epoch": 1.137370753323486, + "grad_norm": 0.24480633437633514, + "learning_rate": 0.00012425175514225892, + "loss": 0.2219, + "step": 3080 + }, + { + "epoch": 1.1377400295420974, + "grad_norm": 0.31090837717056274, + "learning_rate": 0.0001242271215666954, + "loss": 0.2457, + "step": 3081 + }, + { + "epoch": 1.138109305760709, + "grad_norm": 0.31273677945137024, + "learning_rate": 0.00012420248799113192, + "loss": 0.2672, + "step": 3082 + }, + { + "epoch": 1.1384785819793206, + "grad_norm": 0.22598646581172943, + "learning_rate": 0.0001241778544155684, + "loss": 0.2085, + "step": 3083 + }, + { + "epoch": 1.138847858197932, + "grad_norm": 0.27792271971702576, + "learning_rate": 0.00012415322084000492, + "loss": 0.2124, + "step": 3084 + }, + { + "epoch": 1.1392171344165436, + "grad_norm": 0.24575480818748474, + "learning_rate": 0.00012412858726444144, + "loss": 0.2173, + "step": 3085 + }, + { + "epoch": 1.1395864106351552, + "grad_norm": 0.23398679494857788, + "learning_rate": 0.00012410395368887795, + "loss": 0.2029, + "step": 3086 + }, + { + "epoch": 1.1399556868537666, + "grad_norm": 0.20122838020324707, + "learning_rate": 0.00012407932011331444, + "loss": 0.1916, + "step": 3087 + }, + { + "epoch": 1.1403249630723782, + "grad_norm": 0.29287928342819214, + "learning_rate": 0.00012405468653775095, + "loss": 0.2308, + "step": 3088 + }, + { + "epoch": 1.1406942392909896, + "grad_norm": 0.21308663487434387, + "learning_rate": 0.00012403005296218747, + "loss": 0.1981, + "step": 3089 + }, + { + "epoch": 1.1410635155096012, + "grad_norm": 0.2087683230638504, + "learning_rate": 0.00012400541938662398, + "loss": 0.1912, + "step": 3090 + }, + { + "epoch": 1.1414327917282128, + "grad_norm": 0.2861213684082031, + "learning_rate": 0.00012398078581106047, + "loss": 0.2414, + "step": 3091 + }, + { + "epoch": 1.1418020679468242, + "grad_norm": 0.2477501779794693, + "learning_rate": 0.00012395615223549699, + "loss": 0.22, + "step": 3092 + }, + { + "epoch": 1.1421713441654358, + "grad_norm": 0.24152931571006775, + "learning_rate": 0.0001239315186599335, + "loss": 0.2215, + "step": 3093 + }, + { + "epoch": 1.1425406203840474, + "grad_norm": 0.26816463470458984, + "learning_rate": 0.00012390688508437001, + "loss": 0.2352, + "step": 3094 + }, + { + "epoch": 1.1429098966026587, + "grad_norm": 0.2798493206501007, + "learning_rate": 0.0001238822515088065, + "loss": 0.2508, + "step": 3095 + }, + { + "epoch": 1.1432791728212703, + "grad_norm": 0.2466522604227066, + "learning_rate": 0.00012385761793324302, + "loss": 0.2221, + "step": 3096 + }, + { + "epoch": 1.143648449039882, + "grad_norm": 0.2926315665245056, + "learning_rate": 0.0001238329843576795, + "loss": 0.2704, + "step": 3097 + }, + { + "epoch": 1.1440177252584933, + "grad_norm": 0.251381516456604, + "learning_rate": 0.00012380835078211605, + "loss": 0.2154, + "step": 3098 + }, + { + "epoch": 1.144387001477105, + "grad_norm": 0.2858138680458069, + "learning_rate": 0.00012378371720655253, + "loss": 0.2281, + "step": 3099 + }, + { + "epoch": 1.1447562776957163, + "grad_norm": 0.25521156191825867, + "learning_rate": 0.00012375908363098905, + "loss": 0.2661, + "step": 3100 + }, + { + "epoch": 1.1447562776957163, + "eval_loss": 8.390676498413086, + "eval_runtime": 6.9117, + "eval_samples_per_second": 7.234, + "eval_steps_per_second": 1.013, + "step": 3100 + }, + { + "epoch": 1.145125553914328, + "grad_norm": 0.2875896990299225, + "learning_rate": 0.00012373445005542554, + "loss": 0.2245, + "step": 3101 + }, + { + "epoch": 1.1454948301329395, + "grad_norm": 0.2725217342376709, + "learning_rate": 0.00012370981647986205, + "loss": 0.2507, + "step": 3102 + }, + { + "epoch": 1.145864106351551, + "grad_norm": 0.27444911003112793, + "learning_rate": 0.00012368518290429856, + "loss": 0.2249, + "step": 3103 + }, + { + "epoch": 1.1462333825701625, + "grad_norm": 0.2828966975212097, + "learning_rate": 0.00012366054932873508, + "loss": 0.2207, + "step": 3104 + }, + { + "epoch": 1.146602658788774, + "grad_norm": 0.25408756732940674, + "learning_rate": 0.00012363591575317157, + "loss": 0.2192, + "step": 3105 + }, + { + "epoch": 1.1469719350073855, + "grad_norm": 0.2717505693435669, + "learning_rate": 0.00012361128217760808, + "loss": 0.2302, + "step": 3106 + }, + { + "epoch": 1.147341211225997, + "grad_norm": 0.22665484249591827, + "learning_rate": 0.0001235866486020446, + "loss": 0.2117, + "step": 3107 + }, + { + "epoch": 1.1477104874446087, + "grad_norm": 0.24677586555480957, + "learning_rate": 0.0001235620150264811, + "loss": 0.2376, + "step": 3108 + }, + { + "epoch": 1.14807976366322, + "grad_norm": 0.25087496638298035, + "learning_rate": 0.0001235373814509176, + "loss": 0.173, + "step": 3109 + }, + { + "epoch": 1.1484490398818317, + "grad_norm": 0.24136517941951752, + "learning_rate": 0.0001235127478753541, + "loss": 0.2043, + "step": 3110 + }, + { + "epoch": 1.148818316100443, + "grad_norm": 0.35589680075645447, + "learning_rate": 0.0001234881142997906, + "loss": 0.2661, + "step": 3111 + }, + { + "epoch": 1.1491875923190547, + "grad_norm": 0.29157546162605286, + "learning_rate": 0.00012346348072422714, + "loss": 0.2951, + "step": 3112 + }, + { + "epoch": 1.1495568685376663, + "grad_norm": 0.2843800485134125, + "learning_rate": 0.00012343884714866363, + "loss": 0.2154, + "step": 3113 + }, + { + "epoch": 1.1499261447562776, + "grad_norm": 0.2830895483493805, + "learning_rate": 0.00012341421357310014, + "loss": 0.2533, + "step": 3114 + }, + { + "epoch": 1.1502954209748892, + "grad_norm": 0.3031349778175354, + "learning_rate": 0.00012338957999753663, + "loss": 0.2841, + "step": 3115 + }, + { + "epoch": 1.1506646971935006, + "grad_norm": 0.2768130302429199, + "learning_rate": 0.00012336494642197315, + "loss": 0.2842, + "step": 3116 + }, + { + "epoch": 1.1510339734121122, + "grad_norm": 0.2754193842411041, + "learning_rate": 0.00012334031284640966, + "loss": 0.2481, + "step": 3117 + }, + { + "epoch": 1.1514032496307238, + "grad_norm": 0.2646322548389435, + "learning_rate": 0.00012331567927084618, + "loss": 0.2228, + "step": 3118 + }, + { + "epoch": 1.1517725258493354, + "grad_norm": 0.21576353907585144, + "learning_rate": 0.00012329104569528266, + "loss": 0.1844, + "step": 3119 + }, + { + "epoch": 1.1521418020679468, + "grad_norm": 0.2434871643781662, + "learning_rate": 0.00012326641211971918, + "loss": 0.2017, + "step": 3120 + }, + { + "epoch": 1.1525110782865584, + "grad_norm": 0.25554201006889343, + "learning_rate": 0.0001232417785441557, + "loss": 0.1954, + "step": 3121 + }, + { + "epoch": 1.1528803545051698, + "grad_norm": 0.21986477077007294, + "learning_rate": 0.0001232171449685922, + "loss": 0.2203, + "step": 3122 + }, + { + "epoch": 1.1532496307237814, + "grad_norm": 0.24760174751281738, + "learning_rate": 0.0001231925113930287, + "loss": 0.2328, + "step": 3123 + }, + { + "epoch": 1.153618906942393, + "grad_norm": 0.2981676161289215, + "learning_rate": 0.0001231678778174652, + "loss": 0.2179, + "step": 3124 + }, + { + "epoch": 1.1539881831610044, + "grad_norm": 0.24826565384864807, + "learning_rate": 0.00012314324424190172, + "loss": 0.2109, + "step": 3125 + }, + { + "epoch": 1.154357459379616, + "grad_norm": 0.2580142617225647, + "learning_rate": 0.00012311861066633824, + "loss": 0.2084, + "step": 3126 + }, + { + "epoch": 1.1547267355982274, + "grad_norm": 0.29276618361473083, + "learning_rate": 0.00012309397709077473, + "loss": 0.2775, + "step": 3127 + }, + { + "epoch": 1.155096011816839, + "grad_norm": 0.27763423323631287, + "learning_rate": 0.00012306934351521124, + "loss": 0.2624, + "step": 3128 + }, + { + "epoch": 1.1554652880354506, + "grad_norm": 0.24732311069965363, + "learning_rate": 0.00012304470993964773, + "loss": 0.2097, + "step": 3129 + }, + { + "epoch": 1.155834564254062, + "grad_norm": 0.29466676712036133, + "learning_rate": 0.00012302007636408427, + "loss": 0.2935, + "step": 3130 + }, + { + "epoch": 1.1562038404726735, + "grad_norm": 0.27581796050071716, + "learning_rate": 0.00012299544278852076, + "loss": 0.203, + "step": 3131 + }, + { + "epoch": 1.1565731166912852, + "grad_norm": 0.23546083271503448, + "learning_rate": 0.00012297080921295727, + "loss": 0.227, + "step": 3132 + }, + { + "epoch": 1.1569423929098965, + "grad_norm": 0.25049880146980286, + "learning_rate": 0.00012294617563739376, + "loss": 0.2106, + "step": 3133 + }, + { + "epoch": 1.1573116691285081, + "grad_norm": 0.32762598991394043, + "learning_rate": 0.00012292154206183027, + "loss": 0.2403, + "step": 3134 + }, + { + "epoch": 1.1576809453471197, + "grad_norm": 0.29207855463027954, + "learning_rate": 0.0001228969084862668, + "loss": 0.2532, + "step": 3135 + }, + { + "epoch": 1.1580502215657311, + "grad_norm": 0.2992591857910156, + "learning_rate": 0.0001228722749107033, + "loss": 0.2637, + "step": 3136 + }, + { + "epoch": 1.1584194977843427, + "grad_norm": 0.2585567533969879, + "learning_rate": 0.0001228476413351398, + "loss": 0.2202, + "step": 3137 + }, + { + "epoch": 1.158788774002954, + "grad_norm": 0.40473684668540955, + "learning_rate": 0.0001228230077595763, + "loss": 0.3498, + "step": 3138 + }, + { + "epoch": 1.1591580502215657, + "grad_norm": 0.285372793674469, + "learning_rate": 0.00012279837418401282, + "loss": 0.2664, + "step": 3139 + }, + { + "epoch": 1.1595273264401773, + "grad_norm": 0.27922147512435913, + "learning_rate": 0.00012277374060844934, + "loss": 0.2506, + "step": 3140 + }, + { + "epoch": 1.1598966026587887, + "grad_norm": 0.28766176104545593, + "learning_rate": 0.00012274910703288582, + "loss": 0.2177, + "step": 3141 + }, + { + "epoch": 1.1602658788774003, + "grad_norm": 0.3553175926208496, + "learning_rate": 0.00012272447345732234, + "loss": 0.2537, + "step": 3142 + }, + { + "epoch": 1.160635155096012, + "grad_norm": 0.2734138071537018, + "learning_rate": 0.00012269983988175883, + "loss": 0.2245, + "step": 3143 + }, + { + "epoch": 1.1610044313146233, + "grad_norm": 0.24008162319660187, + "learning_rate": 0.00012267520630619537, + "loss": 0.1896, + "step": 3144 + }, + { + "epoch": 1.1613737075332349, + "grad_norm": 0.2744710147380829, + "learning_rate": 0.00012265057273063185, + "loss": 0.2283, + "step": 3145 + }, + { + "epoch": 1.1617429837518465, + "grad_norm": 0.2407720386981964, + "learning_rate": 0.00012262593915506837, + "loss": 0.2402, + "step": 3146 + }, + { + "epoch": 1.1621122599704579, + "grad_norm": 0.2725698947906494, + "learning_rate": 0.00012260130557950486, + "loss": 0.2357, + "step": 3147 + }, + { + "epoch": 1.1624815361890695, + "grad_norm": 0.23787416517734528, + "learning_rate": 0.00012257667200394137, + "loss": 0.2198, + "step": 3148 + }, + { + "epoch": 1.1628508124076808, + "grad_norm": 0.245778888463974, + "learning_rate": 0.00012255203842837789, + "loss": 0.2202, + "step": 3149 + }, + { + "epoch": 1.1632200886262924, + "grad_norm": 0.27273258566856384, + "learning_rate": 0.0001225274048528144, + "loss": 0.2686, + "step": 3150 + }, + { + "epoch": 1.1632200886262924, + "eval_loss": 8.340035438537598, + "eval_runtime": 6.9124, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 1.013, + "step": 3150 + }, + { + "epoch": 1.163589364844904, + "grad_norm": 0.24810905754566193, + "learning_rate": 0.0001225027712772509, + "loss": 0.2655, + "step": 3151 + }, + { + "epoch": 1.1639586410635154, + "grad_norm": 0.2381618320941925, + "learning_rate": 0.0001224781377016874, + "loss": 0.2199, + "step": 3152 + }, + { + "epoch": 1.164327917282127, + "grad_norm": 0.31908920407295227, + "learning_rate": 0.00012245350412612392, + "loss": 0.2649, + "step": 3153 + }, + { + "epoch": 1.1646971935007386, + "grad_norm": 0.2840368449687958, + "learning_rate": 0.00012242887055056043, + "loss": 0.2577, + "step": 3154 + }, + { + "epoch": 1.16506646971935, + "grad_norm": 0.22765623033046722, + "learning_rate": 0.00012240423697499692, + "loss": 0.2196, + "step": 3155 + }, + { + "epoch": 1.1654357459379616, + "grad_norm": 0.2288927435874939, + "learning_rate": 0.00012237960339943343, + "loss": 0.202, + "step": 3156 + }, + { + "epoch": 1.1658050221565732, + "grad_norm": 0.22664541006088257, + "learning_rate": 0.00012235496982386995, + "loss": 0.2049, + "step": 3157 + }, + { + "epoch": 1.1661742983751846, + "grad_norm": 0.2944018840789795, + "learning_rate": 0.00012233033624830646, + "loss": 0.2582, + "step": 3158 + }, + { + "epoch": 1.1665435745937962, + "grad_norm": 0.2676953375339508, + "learning_rate": 0.00012230570267274295, + "loss": 0.2248, + "step": 3159 + }, + { + "epoch": 1.1669128508124076, + "grad_norm": 0.2412702888250351, + "learning_rate": 0.00012228106909717947, + "loss": 0.2126, + "step": 3160 + }, + { + "epoch": 1.1672821270310192, + "grad_norm": 0.24819672107696533, + "learning_rate": 0.00012225643552161595, + "loss": 0.2384, + "step": 3161 + }, + { + "epoch": 1.1676514032496308, + "grad_norm": 0.2421681135892868, + "learning_rate": 0.0001222318019460525, + "loss": 0.2278, + "step": 3162 + }, + { + "epoch": 1.1680206794682422, + "grad_norm": 0.23160724341869354, + "learning_rate": 0.00012220716837048898, + "loss": 0.2232, + "step": 3163 + }, + { + "epoch": 1.1683899556868538, + "grad_norm": 0.2932402789592743, + "learning_rate": 0.0001221825347949255, + "loss": 0.2422, + "step": 3164 + }, + { + "epoch": 1.1687592319054654, + "grad_norm": 0.24574051797389984, + "learning_rate": 0.00012215790121936198, + "loss": 0.2347, + "step": 3165 + }, + { + "epoch": 1.1691285081240768, + "grad_norm": 0.2509894073009491, + "learning_rate": 0.0001221332676437985, + "loss": 0.2177, + "step": 3166 + }, + { + "epoch": 1.1694977843426884, + "grad_norm": 0.24169331789016724, + "learning_rate": 0.00012210863406823501, + "loss": 0.2087, + "step": 3167 + }, + { + "epoch": 1.1698670605613, + "grad_norm": 0.25094521045684814, + "learning_rate": 0.00012208400049267153, + "loss": 0.2018, + "step": 3168 + }, + { + "epoch": 1.1702363367799113, + "grad_norm": 0.32894349098205566, + "learning_rate": 0.00012205936691710802, + "loss": 0.2038, + "step": 3169 + }, + { + "epoch": 1.170605612998523, + "grad_norm": 0.2670305073261261, + "learning_rate": 0.00012203473334154454, + "loss": 0.2138, + "step": 3170 + }, + { + "epoch": 1.1709748892171343, + "grad_norm": 0.2411733716726303, + "learning_rate": 0.00012201009976598103, + "loss": 0.2087, + "step": 3171 + }, + { + "epoch": 1.171344165435746, + "grad_norm": 0.2547158896923065, + "learning_rate": 0.00012198546619041755, + "loss": 0.2497, + "step": 3172 + }, + { + "epoch": 1.1717134416543575, + "grad_norm": 0.21533368527889252, + "learning_rate": 0.00012196083261485405, + "loss": 0.1986, + "step": 3173 + }, + { + "epoch": 1.172082717872969, + "grad_norm": 0.26111504435539246, + "learning_rate": 0.00012193619903929056, + "loss": 0.1959, + "step": 3174 + }, + { + "epoch": 1.1724519940915805, + "grad_norm": 0.4638223350048065, + "learning_rate": 0.00012191156546372706, + "loss": 0.2357, + "step": 3175 + }, + { + "epoch": 1.172821270310192, + "grad_norm": 0.24222992360591888, + "learning_rate": 0.00012188693188816358, + "loss": 0.1952, + "step": 3176 + }, + { + "epoch": 1.1731905465288035, + "grad_norm": 0.2631170153617859, + "learning_rate": 0.00012186229831260008, + "loss": 0.2056, + "step": 3177 + }, + { + "epoch": 1.173559822747415, + "grad_norm": 0.27158698439598083, + "learning_rate": 0.0001218376647370366, + "loss": 0.25, + "step": 3178 + }, + { + "epoch": 1.1739290989660267, + "grad_norm": 0.22357377409934998, + "learning_rate": 0.0001218130311614731, + "loss": 0.1857, + "step": 3179 + }, + { + "epoch": 1.174298375184638, + "grad_norm": 0.23896613717079163, + "learning_rate": 0.00012178839758590961, + "loss": 0.24, + "step": 3180 + }, + { + "epoch": 1.1746676514032497, + "grad_norm": 0.2463010549545288, + "learning_rate": 0.00012176376401034611, + "loss": 0.2058, + "step": 3181 + }, + { + "epoch": 1.175036927621861, + "grad_norm": 0.27682992815971375, + "learning_rate": 0.00012173913043478263, + "loss": 0.2537, + "step": 3182 + }, + { + "epoch": 1.1754062038404727, + "grad_norm": 0.27977579832077026, + "learning_rate": 0.00012171449685921911, + "loss": 0.2669, + "step": 3183 + }, + { + "epoch": 1.1757754800590843, + "grad_norm": 0.24095742404460907, + "learning_rate": 0.00012168986328365564, + "loss": 0.2041, + "step": 3184 + }, + { + "epoch": 1.1761447562776957, + "grad_norm": 0.2625512182712555, + "learning_rate": 0.00012166522970809213, + "loss": 0.2463, + "step": 3185 + }, + { + "epoch": 1.1765140324963073, + "grad_norm": 0.2417324334383011, + "learning_rate": 0.00012164059613252866, + "loss": 0.2115, + "step": 3186 + }, + { + "epoch": 1.1768833087149186, + "grad_norm": 0.2739996016025543, + "learning_rate": 0.00012161596255696514, + "loss": 0.2088, + "step": 3187 + }, + { + "epoch": 1.1772525849335302, + "grad_norm": 0.2593826353549957, + "learning_rate": 0.00012159132898140166, + "loss": 0.2554, + "step": 3188 + }, + { + "epoch": 1.1776218611521418, + "grad_norm": 0.23065000772476196, + "learning_rate": 0.00012156669540583816, + "loss": 0.2025, + "step": 3189 + }, + { + "epoch": 1.1779911373707532, + "grad_norm": 0.25530335307121277, + "learning_rate": 0.00012154206183027467, + "loss": 0.2452, + "step": 3190 + }, + { + "epoch": 1.1783604135893648, + "grad_norm": 0.2744583785533905, + "learning_rate": 0.00012151742825471118, + "loss": 0.2236, + "step": 3191 + }, + { + "epoch": 1.1787296898079764, + "grad_norm": 0.24074983596801758, + "learning_rate": 0.00012149279467914769, + "loss": 0.2115, + "step": 3192 + }, + { + "epoch": 1.1790989660265878, + "grad_norm": 0.2629273235797882, + "learning_rate": 0.00012146816110358419, + "loss": 0.2461, + "step": 3193 + }, + { + "epoch": 1.1794682422451994, + "grad_norm": 0.30756059288978577, + "learning_rate": 0.0001214435275280207, + "loss": 0.2896, + "step": 3194 + }, + { + "epoch": 1.179837518463811, + "grad_norm": 0.23834991455078125, + "learning_rate": 0.00012141889395245721, + "loss": 0.2027, + "step": 3195 + }, + { + "epoch": 1.1802067946824224, + "grad_norm": 0.2319520115852356, + "learning_rate": 0.00012139426037689372, + "loss": 0.2036, + "step": 3196 + }, + { + "epoch": 1.180576070901034, + "grad_norm": 0.2667308449745178, + "learning_rate": 0.00012136962680133022, + "loss": 0.2513, + "step": 3197 + }, + { + "epoch": 1.1809453471196454, + "grad_norm": 0.263604998588562, + "learning_rate": 0.00012134499322576674, + "loss": 0.227, + "step": 3198 + }, + { + "epoch": 1.181314623338257, + "grad_norm": 0.2538369596004486, + "learning_rate": 0.00012132035965020322, + "loss": 0.2155, + "step": 3199 + }, + { + "epoch": 1.1816838995568686, + "grad_norm": 0.22317415475845337, + "learning_rate": 0.00012129572607463975, + "loss": 0.2002, + "step": 3200 + }, + { + "epoch": 1.1816838995568686, + "eval_loss": 8.267110824584961, + "eval_runtime": 6.922, + "eval_samples_per_second": 7.223, + "eval_steps_per_second": 1.011, + "step": 3200 + }, + { + "epoch": 1.18205317577548, + "grad_norm": 0.2647877633571625, + "learning_rate": 0.00012127109249907624, + "loss": 0.2237, + "step": 3201 + }, + { + "epoch": 1.1824224519940916, + "grad_norm": 0.28878480195999146, + "learning_rate": 0.00012124645892351277, + "loss": 0.2418, + "step": 3202 + }, + { + "epoch": 1.1827917282127032, + "grad_norm": 0.27412959933280945, + "learning_rate": 0.00012122182534794926, + "loss": 0.2081, + "step": 3203 + }, + { + "epoch": 1.1831610044313146, + "grad_norm": 0.21949930489063263, + "learning_rate": 0.00012119719177238577, + "loss": 0.2104, + "step": 3204 + }, + { + "epoch": 1.1835302806499262, + "grad_norm": 0.2462879866361618, + "learning_rate": 0.00012117255819682227, + "loss": 0.2233, + "step": 3205 + }, + { + "epoch": 1.1838995568685378, + "grad_norm": 0.23553521931171417, + "learning_rate": 0.00012114792462125879, + "loss": 0.2298, + "step": 3206 + }, + { + "epoch": 1.1842688330871491, + "grad_norm": 0.22991472482681274, + "learning_rate": 0.00012112329104569529, + "loss": 0.1935, + "step": 3207 + }, + { + "epoch": 1.1846381093057607, + "grad_norm": 0.29040002822875977, + "learning_rate": 0.0001210986574701318, + "loss": 0.2299, + "step": 3208 + }, + { + "epoch": 1.1850073855243721, + "grad_norm": 0.28040045499801636, + "learning_rate": 0.0001210740238945683, + "loss": 0.2234, + "step": 3209 + }, + { + "epoch": 1.1853766617429837, + "grad_norm": 0.27813273668289185, + "learning_rate": 0.00012104939031900482, + "loss": 0.2437, + "step": 3210 + }, + { + "epoch": 1.1857459379615953, + "grad_norm": 0.2803947329521179, + "learning_rate": 0.00012102475674344132, + "loss": 0.2398, + "step": 3211 + }, + { + "epoch": 1.1861152141802067, + "grad_norm": 0.25766292214393616, + "learning_rate": 0.00012100012316787783, + "loss": 0.225, + "step": 3212 + }, + { + "epoch": 1.1864844903988183, + "grad_norm": 0.23333033919334412, + "learning_rate": 0.00012097548959231434, + "loss": 0.2232, + "step": 3213 + }, + { + "epoch": 1.18685376661743, + "grad_norm": 0.3438679873943329, + "learning_rate": 0.00012095085601675085, + "loss": 0.2269, + "step": 3214 + }, + { + "epoch": 1.1872230428360413, + "grad_norm": 0.24346406757831573, + "learning_rate": 0.00012092622244118734, + "loss": 0.2013, + "step": 3215 + }, + { + "epoch": 1.187592319054653, + "grad_norm": 0.23038794100284576, + "learning_rate": 0.00012090158886562387, + "loss": 0.1991, + "step": 3216 + }, + { + "epoch": 1.1879615952732645, + "grad_norm": 0.26206523180007935, + "learning_rate": 0.00012087695529006035, + "loss": 0.2177, + "step": 3217 + }, + { + "epoch": 1.1883308714918759, + "grad_norm": 0.25569677352905273, + "learning_rate": 0.00012085232171449688, + "loss": 0.2223, + "step": 3218 + }, + { + "epoch": 1.1887001477104875, + "grad_norm": 0.27414456009864807, + "learning_rate": 0.00012082768813893337, + "loss": 0.2361, + "step": 3219 + }, + { + "epoch": 1.1890694239290989, + "grad_norm": 0.2250797599554062, + "learning_rate": 0.00012080305456336988, + "loss": 0.2243, + "step": 3220 + }, + { + "epoch": 1.1894387001477105, + "grad_norm": 0.24654324352741241, + "learning_rate": 0.00012077842098780638, + "loss": 0.2229, + "step": 3221 + }, + { + "epoch": 1.189807976366322, + "grad_norm": 0.23722168803215027, + "learning_rate": 0.0001207537874122429, + "loss": 0.2304, + "step": 3222 + }, + { + "epoch": 1.1901772525849335, + "grad_norm": 0.26113882660865784, + "learning_rate": 0.0001207291538366794, + "loss": 0.2279, + "step": 3223 + }, + { + "epoch": 1.190546528803545, + "grad_norm": 0.2567733824253082, + "learning_rate": 0.00012070452026111591, + "loss": 0.2201, + "step": 3224 + }, + { + "epoch": 1.1909158050221567, + "grad_norm": 0.2425006628036499, + "learning_rate": 0.00012067988668555242, + "loss": 0.2376, + "step": 3225 + }, + { + "epoch": 1.191285081240768, + "grad_norm": 0.24800774455070496, + "learning_rate": 0.00012065525310998893, + "loss": 0.194, + "step": 3226 + }, + { + "epoch": 1.1916543574593796, + "grad_norm": 0.29246145486831665, + "learning_rate": 0.00012063061953442543, + "loss": 0.2935, + "step": 3227 + }, + { + "epoch": 1.1920236336779912, + "grad_norm": 0.19384582340717316, + "learning_rate": 0.00012060598595886195, + "loss": 0.2006, + "step": 3228 + }, + { + "epoch": 1.1923929098966026, + "grad_norm": 0.23506811261177063, + "learning_rate": 0.00012058135238329843, + "loss": 0.1995, + "step": 3229 + }, + { + "epoch": 1.1927621861152142, + "grad_norm": 0.20495739579200745, + "learning_rate": 0.00012055671880773496, + "loss": 0.1806, + "step": 3230 + }, + { + "epoch": 1.1931314623338256, + "grad_norm": 0.2660287916660309, + "learning_rate": 0.00012053208523217145, + "loss": 0.214, + "step": 3231 + }, + { + "epoch": 1.1935007385524372, + "grad_norm": 0.2588471472263336, + "learning_rate": 0.00012050745165660798, + "loss": 0.2075, + "step": 3232 + }, + { + "epoch": 1.1938700147710488, + "grad_norm": 0.3432180881500244, + "learning_rate": 0.00012048281808104447, + "loss": 0.2647, + "step": 3233 + }, + { + "epoch": 1.1942392909896602, + "grad_norm": 0.25156551599502563, + "learning_rate": 0.00012045818450548098, + "loss": 0.2489, + "step": 3234 + }, + { + "epoch": 1.1946085672082718, + "grad_norm": 0.201694056391716, + "learning_rate": 0.00012043355092991748, + "loss": 0.1932, + "step": 3235 + }, + { + "epoch": 1.1949778434268834, + "grad_norm": 0.23648051917552948, + "learning_rate": 0.000120408917354354, + "loss": 0.2348, + "step": 3236 + }, + { + "epoch": 1.1953471196454948, + "grad_norm": 0.2742224931716919, + "learning_rate": 0.0001203842837787905, + "loss": 0.256, + "step": 3237 + }, + { + "epoch": 1.1957163958641064, + "grad_norm": 0.2593318223953247, + "learning_rate": 0.00012035965020322701, + "loss": 0.212, + "step": 3238 + }, + { + "epoch": 1.196085672082718, + "grad_norm": 0.23279283940792084, + "learning_rate": 0.00012033501662766351, + "loss": 0.2201, + "step": 3239 + }, + { + "epoch": 1.1964549483013294, + "grad_norm": 0.24237380921840668, + "learning_rate": 0.00012031038305210003, + "loss": 0.2008, + "step": 3240 + }, + { + "epoch": 1.196824224519941, + "grad_norm": 0.23196551203727722, + "learning_rate": 0.00012028574947653653, + "loss": 0.2118, + "step": 3241 + }, + { + "epoch": 1.1971935007385524, + "grad_norm": 0.25500011444091797, + "learning_rate": 0.00012026111590097304, + "loss": 0.2031, + "step": 3242 + }, + { + "epoch": 1.197562776957164, + "grad_norm": 0.21559029817581177, + "learning_rate": 0.00012023648232540954, + "loss": 0.2199, + "step": 3243 + }, + { + "epoch": 1.1979320531757756, + "grad_norm": 0.24756810069084167, + "learning_rate": 0.00012021184874984606, + "loss": 0.2013, + "step": 3244 + }, + { + "epoch": 1.198301329394387, + "grad_norm": 0.2566717565059662, + "learning_rate": 0.00012018721517428255, + "loss": 0.1911, + "step": 3245 + }, + { + "epoch": 1.1986706056129985, + "grad_norm": 0.2147149294614792, + "learning_rate": 0.00012016258159871907, + "loss": 0.2167, + "step": 3246 + }, + { + "epoch": 1.19903988183161, + "grad_norm": 0.2479647994041443, + "learning_rate": 0.00012013794802315556, + "loss": 0.211, + "step": 3247 + }, + { + "epoch": 1.1994091580502215, + "grad_norm": 0.3019788861274719, + "learning_rate": 0.00012011331444759209, + "loss": 0.236, + "step": 3248 + }, + { + "epoch": 1.1997784342688331, + "grad_norm": 0.2598505914211273, + "learning_rate": 0.00012008868087202858, + "loss": 0.2309, + "step": 3249 + }, + { + "epoch": 1.2001477104874447, + "grad_norm": 0.26086798310279846, + "learning_rate": 0.00012006404729646509, + "loss": 0.1969, + "step": 3250 + }, + { + "epoch": 1.2001477104874447, + "eval_loss": 8.288263320922852, + "eval_runtime": 6.9109, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 1.013, + "step": 3250 + }, + { + "epoch": 1.200516986706056, + "grad_norm": 0.2878490388393402, + "learning_rate": 0.00012003941372090159, + "loss": 0.2227, + "step": 3251 + }, + { + "epoch": 1.2008862629246677, + "grad_norm": 0.25834518671035767, + "learning_rate": 0.00012001478014533811, + "loss": 0.2421, + "step": 3252 + }, + { + "epoch": 1.201255539143279, + "grad_norm": 0.22195979952812195, + "learning_rate": 0.00011999014656977461, + "loss": 0.2141, + "step": 3253 + }, + { + "epoch": 1.2016248153618907, + "grad_norm": 0.2155759185552597, + "learning_rate": 0.00011996551299421111, + "loss": 0.203, + "step": 3254 + }, + { + "epoch": 1.2019940915805023, + "grad_norm": 0.25331732630729675, + "learning_rate": 0.00011994087941864762, + "loss": 0.2099, + "step": 3255 + }, + { + "epoch": 1.2023633677991137, + "grad_norm": 0.34362849593162537, + "learning_rate": 0.00011991624584308411, + "loss": 0.247, + "step": 3256 + }, + { + "epoch": 1.2027326440177253, + "grad_norm": 0.2557450830936432, + "learning_rate": 0.00011989161226752064, + "loss": 0.2322, + "step": 3257 + }, + { + "epoch": 1.2031019202363367, + "grad_norm": 0.2926187217235565, + "learning_rate": 0.00011986697869195713, + "loss": 0.2326, + "step": 3258 + }, + { + "epoch": 1.2034711964549483, + "grad_norm": 0.29786282777786255, + "learning_rate": 0.00011984234511639366, + "loss": 0.2906, + "step": 3259 + }, + { + "epoch": 1.2038404726735599, + "grad_norm": 0.2573811411857605, + "learning_rate": 0.00011981771154083014, + "loss": 0.2143, + "step": 3260 + }, + { + "epoch": 1.2042097488921713, + "grad_norm": 0.2751642167568207, + "learning_rate": 0.00011979307796526666, + "loss": 0.2291, + "step": 3261 + }, + { + "epoch": 1.2045790251107829, + "grad_norm": 0.22257913649082184, + "learning_rate": 0.00011976844438970316, + "loss": 0.2321, + "step": 3262 + }, + { + "epoch": 1.2049483013293945, + "grad_norm": 0.21001161634922028, + "learning_rate": 0.00011974381081413967, + "loss": 0.1737, + "step": 3263 + }, + { + "epoch": 1.2053175775480058, + "grad_norm": 0.24512171745300293, + "learning_rate": 0.00011971917723857618, + "loss": 0.2318, + "step": 3264 + }, + { + "epoch": 1.2056868537666174, + "grad_norm": 0.257089763879776, + "learning_rate": 0.00011969454366301269, + "loss": 0.2165, + "step": 3265 + }, + { + "epoch": 1.206056129985229, + "grad_norm": 0.23673245310783386, + "learning_rate": 0.00011966991008744919, + "loss": 0.2319, + "step": 3266 + }, + { + "epoch": 1.2064254062038404, + "grad_norm": 0.2873469591140747, + "learning_rate": 0.0001196452765118857, + "loss": 0.2674, + "step": 3267 + }, + { + "epoch": 1.206794682422452, + "grad_norm": 0.3001554012298584, + "learning_rate": 0.0001196206429363222, + "loss": 0.2267, + "step": 3268 + }, + { + "epoch": 1.2071639586410634, + "grad_norm": 0.24801422655582428, + "learning_rate": 0.00011959600936075872, + "loss": 0.2366, + "step": 3269 + }, + { + "epoch": 1.207533234859675, + "grad_norm": 0.2508806586265564, + "learning_rate": 0.00011957137578519522, + "loss": 0.2422, + "step": 3270 + }, + { + "epoch": 1.2079025110782866, + "grad_norm": 0.26135170459747314, + "learning_rate": 0.00011954674220963174, + "loss": 0.2431, + "step": 3271 + }, + { + "epoch": 1.208271787296898, + "grad_norm": 0.23587903380393982, + "learning_rate": 0.00011952210863406822, + "loss": 0.2367, + "step": 3272 + }, + { + "epoch": 1.2086410635155096, + "grad_norm": 0.2692301869392395, + "learning_rate": 0.00011949747505850475, + "loss": 0.2541, + "step": 3273 + }, + { + "epoch": 1.2090103397341212, + "grad_norm": 0.2857707440853119, + "learning_rate": 0.00011947284148294124, + "loss": 0.2544, + "step": 3274 + }, + { + "epoch": 1.2093796159527326, + "grad_norm": 0.2758999466896057, + "learning_rate": 0.00011944820790737777, + "loss": 0.2883, + "step": 3275 + }, + { + "epoch": 1.2097488921713442, + "grad_norm": 0.30602675676345825, + "learning_rate": 0.00011942357433181426, + "loss": 0.2431, + "step": 3276 + }, + { + "epoch": 1.2101181683899558, + "grad_norm": 0.33125412464141846, + "learning_rate": 0.00011939894075625077, + "loss": 0.2765, + "step": 3277 + }, + { + "epoch": 1.2104874446085672, + "grad_norm": 0.27180907130241394, + "learning_rate": 0.00011937430718068727, + "loss": 0.2538, + "step": 3278 + }, + { + "epoch": 1.2108567208271788, + "grad_norm": 0.25376594066619873, + "learning_rate": 0.00011934967360512379, + "loss": 0.2297, + "step": 3279 + }, + { + "epoch": 1.2112259970457901, + "grad_norm": 0.26222750544548035, + "learning_rate": 0.00011932504002956029, + "loss": 0.2027, + "step": 3280 + }, + { + "epoch": 1.2115952732644018, + "grad_norm": 0.23355819284915924, + "learning_rate": 0.0001193004064539968, + "loss": 0.2239, + "step": 3281 + }, + { + "epoch": 1.2119645494830134, + "grad_norm": 0.3795492947101593, + "learning_rate": 0.0001192757728784333, + "loss": 0.293, + "step": 3282 + }, + { + "epoch": 1.2123338257016247, + "grad_norm": 0.24195772409439087, + "learning_rate": 0.00011925113930286982, + "loss": 0.2095, + "step": 3283 + }, + { + "epoch": 1.2127031019202363, + "grad_norm": 0.27835386991500854, + "learning_rate": 0.00011922650572730632, + "loss": 0.24, + "step": 3284 + }, + { + "epoch": 1.213072378138848, + "grad_norm": 0.20145586133003235, + "learning_rate": 0.00011920187215174283, + "loss": 0.1753, + "step": 3285 + }, + { + "epoch": 1.2134416543574593, + "grad_norm": 0.2768704295158386, + "learning_rate": 0.00011917723857617933, + "loss": 0.2525, + "step": 3286 + }, + { + "epoch": 1.213810930576071, + "grad_norm": 0.28876692056655884, + "learning_rate": 0.00011915260500061585, + "loss": 0.2374, + "step": 3287 + }, + { + "epoch": 1.2141802067946825, + "grad_norm": 0.28393030166625977, + "learning_rate": 0.00011912797142505234, + "loss": 0.2271, + "step": 3288 + }, + { + "epoch": 1.214549483013294, + "grad_norm": 0.23999620974063873, + "learning_rate": 0.00011910333784948886, + "loss": 0.2084, + "step": 3289 + }, + { + "epoch": 1.2149187592319055, + "grad_norm": 0.3205350637435913, + "learning_rate": 0.00011907870427392535, + "loss": 0.2619, + "step": 3290 + }, + { + "epoch": 1.215288035450517, + "grad_norm": 0.263473242521286, + "learning_rate": 0.00011905407069836188, + "loss": 0.2555, + "step": 3291 + }, + { + "epoch": 1.2156573116691285, + "grad_norm": 0.2530558109283447, + "learning_rate": 0.00011902943712279837, + "loss": 0.2143, + "step": 3292 + }, + { + "epoch": 1.21602658788774, + "grad_norm": 0.24290473759174347, + "learning_rate": 0.00011900480354723488, + "loss": 0.2377, + "step": 3293 + }, + { + "epoch": 1.2163958641063515, + "grad_norm": 0.2556678056716919, + "learning_rate": 0.00011898016997167138, + "loss": 0.2169, + "step": 3294 + }, + { + "epoch": 1.216765140324963, + "grad_norm": 0.27232617139816284, + "learning_rate": 0.0001189555363961079, + "loss": 0.2277, + "step": 3295 + }, + { + "epoch": 1.2171344165435747, + "grad_norm": 0.24958842992782593, + "learning_rate": 0.0001189309028205444, + "loss": 0.2688, + "step": 3296 + }, + { + "epoch": 1.217503692762186, + "grad_norm": 0.26248520612716675, + "learning_rate": 0.00011890626924498091, + "loss": 0.2574, + "step": 3297 + }, + { + "epoch": 1.2178729689807977, + "grad_norm": 0.23428164422512054, + "learning_rate": 0.00011888163566941742, + "loss": 0.2181, + "step": 3298 + }, + { + "epoch": 1.2182422451994093, + "grad_norm": 0.23519310355186462, + "learning_rate": 0.00011885700209385393, + "loss": 0.224, + "step": 3299 + }, + { + "epoch": 1.2186115214180206, + "grad_norm": 0.25486820936203003, + "learning_rate": 0.00011883236851829043, + "loss": 0.2408, + "step": 3300 + }, + { + "epoch": 1.2186115214180206, + "eval_loss": 8.363037109375, + "eval_runtime": 6.9147, + "eval_samples_per_second": 7.231, + "eval_steps_per_second": 1.012, + "step": 3300 + }, + { + "epoch": 1.2189807976366323, + "grad_norm": 0.25926437973976135, + "learning_rate": 0.00011880773494272695, + "loss": 0.2303, + "step": 3301 + }, + { + "epoch": 1.2193500738552436, + "grad_norm": 0.2779652178287506, + "learning_rate": 0.00011878310136716345, + "loss": 0.2452, + "step": 3302 + }, + { + "epoch": 1.2197193500738552, + "grad_norm": 0.28274989128112793, + "learning_rate": 0.00011875846779159996, + "loss": 0.2551, + "step": 3303 + }, + { + "epoch": 1.2200886262924668, + "grad_norm": 0.21953511238098145, + "learning_rate": 0.00011873383421603645, + "loss": 0.2286, + "step": 3304 + }, + { + "epoch": 1.2204579025110782, + "grad_norm": 0.3077790439128876, + "learning_rate": 0.00011870920064047298, + "loss": 0.239, + "step": 3305 + }, + { + "epoch": 1.2208271787296898, + "grad_norm": 0.2430875599384308, + "learning_rate": 0.00011868456706490946, + "loss": 0.2318, + "step": 3306 + }, + { + "epoch": 1.2211964549483014, + "grad_norm": 0.23860754072666168, + "learning_rate": 0.00011865993348934599, + "loss": 0.241, + "step": 3307 + }, + { + "epoch": 1.2215657311669128, + "grad_norm": 0.27392128109931946, + "learning_rate": 0.00011863529991378248, + "loss": 0.2161, + "step": 3308 + }, + { + "epoch": 1.2219350073855244, + "grad_norm": 0.26613715291023254, + "learning_rate": 0.000118610666338219, + "loss": 0.2489, + "step": 3309 + }, + { + "epoch": 1.222304283604136, + "grad_norm": 0.29265210032463074, + "learning_rate": 0.0001185860327626555, + "loss": 0.2605, + "step": 3310 + }, + { + "epoch": 1.2226735598227474, + "grad_norm": 0.2343636006116867, + "learning_rate": 0.00011856139918709201, + "loss": 0.201, + "step": 3311 + }, + { + "epoch": 1.223042836041359, + "grad_norm": 0.2594150900840759, + "learning_rate": 0.00011853676561152851, + "loss": 0.2695, + "step": 3312 + }, + { + "epoch": 1.2234121122599704, + "grad_norm": 0.22403880953788757, + "learning_rate": 0.00011851213203596503, + "loss": 0.209, + "step": 3313 + }, + { + "epoch": 1.223781388478582, + "grad_norm": 0.2343108206987381, + "learning_rate": 0.00011848749846040153, + "loss": 0.2552, + "step": 3314 + }, + { + "epoch": 1.2241506646971936, + "grad_norm": 0.24968460202217102, + "learning_rate": 0.00011846286488483804, + "loss": 0.195, + "step": 3315 + }, + { + "epoch": 1.224519940915805, + "grad_norm": 0.25107911229133606, + "learning_rate": 0.00011843823130927454, + "loss": 0.218, + "step": 3316 + }, + { + "epoch": 1.2248892171344166, + "grad_norm": 0.3153674304485321, + "learning_rate": 0.00011841359773371106, + "loss": 0.2234, + "step": 3317 + }, + { + "epoch": 1.225258493353028, + "grad_norm": 0.21191003918647766, + "learning_rate": 0.00011838896415814756, + "loss": 0.1933, + "step": 3318 + }, + { + "epoch": 1.2256277695716395, + "grad_norm": 0.25615596771240234, + "learning_rate": 0.00011836433058258407, + "loss": 0.2257, + "step": 3319 + }, + { + "epoch": 1.2259970457902511, + "grad_norm": 0.27919718623161316, + "learning_rate": 0.00011833969700702056, + "loss": 0.1987, + "step": 3320 + }, + { + "epoch": 1.2263663220088628, + "grad_norm": 0.2541070282459259, + "learning_rate": 0.00011831506343145709, + "loss": 0.2105, + "step": 3321 + }, + { + "epoch": 1.2267355982274741, + "grad_norm": 0.26886770129203796, + "learning_rate": 0.00011829042985589358, + "loss": 0.1937, + "step": 3322 + }, + { + "epoch": 1.2271048744460857, + "grad_norm": 0.30414992570877075, + "learning_rate": 0.0001182657962803301, + "loss": 0.2772, + "step": 3323 + }, + { + "epoch": 1.2274741506646971, + "grad_norm": 0.2989824712276459, + "learning_rate": 0.00011824116270476659, + "loss": 0.2755, + "step": 3324 + }, + { + "epoch": 1.2278434268833087, + "grad_norm": 0.32459351420402527, + "learning_rate": 0.00011821652912920311, + "loss": 0.2444, + "step": 3325 + }, + { + "epoch": 1.2282127031019203, + "grad_norm": 0.25471627712249756, + "learning_rate": 0.00011819189555363961, + "loss": 0.2293, + "step": 3326 + }, + { + "epoch": 1.2285819793205317, + "grad_norm": 0.306276798248291, + "learning_rate": 0.00011816726197807612, + "loss": 0.2455, + "step": 3327 + }, + { + "epoch": 1.2289512555391433, + "grad_norm": 0.22191748023033142, + "learning_rate": 0.00011814262840251262, + "loss": 0.2043, + "step": 3328 + }, + { + "epoch": 1.2293205317577547, + "grad_norm": 0.2712024450302124, + "learning_rate": 0.00011811799482694914, + "loss": 0.2272, + "step": 3329 + }, + { + "epoch": 1.2296898079763663, + "grad_norm": 0.2807033061981201, + "learning_rate": 0.00011809336125138564, + "loss": 0.2038, + "step": 3330 + }, + { + "epoch": 1.230059084194978, + "grad_norm": 0.2215922772884369, + "learning_rate": 0.00011806872767582215, + "loss": 0.1967, + "step": 3331 + }, + { + "epoch": 1.2304283604135893, + "grad_norm": 0.2950519025325775, + "learning_rate": 0.00011804409410025866, + "loss": 0.276, + "step": 3332 + }, + { + "epoch": 1.2307976366322009, + "grad_norm": 0.2861984372138977, + "learning_rate": 0.00011801946052469517, + "loss": 0.2104, + "step": 3333 + }, + { + "epoch": 1.2311669128508125, + "grad_norm": 0.2402583658695221, + "learning_rate": 0.00011799482694913167, + "loss": 0.2229, + "step": 3334 + }, + { + "epoch": 1.2315361890694239, + "grad_norm": 0.2987872064113617, + "learning_rate": 0.00011797019337356819, + "loss": 0.2264, + "step": 3335 + }, + { + "epoch": 1.2319054652880355, + "grad_norm": 0.2632824182510376, + "learning_rate": 0.00011794555979800467, + "loss": 0.2364, + "step": 3336 + }, + { + "epoch": 1.232274741506647, + "grad_norm": 0.23499926924705505, + "learning_rate": 0.0001179209262224412, + "loss": 0.2195, + "step": 3337 + }, + { + "epoch": 1.2326440177252584, + "grad_norm": 0.268235981464386, + "learning_rate": 0.00011789629264687769, + "loss": 0.2046, + "step": 3338 + }, + { + "epoch": 1.23301329394387, + "grad_norm": 0.3327055871486664, + "learning_rate": 0.00011787165907131422, + "loss": 0.2504, + "step": 3339 + }, + { + "epoch": 1.2333825701624814, + "grad_norm": 0.2737831771373749, + "learning_rate": 0.0001178470254957507, + "loss": 0.2421, + "step": 3340 + }, + { + "epoch": 1.233751846381093, + "grad_norm": 0.26479947566986084, + "learning_rate": 0.00011782239192018722, + "loss": 0.2387, + "step": 3341 + }, + { + "epoch": 1.2341211225997046, + "grad_norm": 0.2526566982269287, + "learning_rate": 0.00011779775834462372, + "loss": 0.2256, + "step": 3342 + }, + { + "epoch": 1.234490398818316, + "grad_norm": 0.2856113910675049, + "learning_rate": 0.00011777312476906024, + "loss": 0.2403, + "step": 3343 + }, + { + "epoch": 1.2348596750369276, + "grad_norm": 0.23721155524253845, + "learning_rate": 0.00011774849119349674, + "loss": 0.2409, + "step": 3344 + }, + { + "epoch": 1.2352289512555392, + "grad_norm": 0.22775903344154358, + "learning_rate": 0.00011772385761793325, + "loss": 0.2125, + "step": 3345 + }, + { + "epoch": 1.2355982274741506, + "grad_norm": 0.37607520818710327, + "learning_rate": 0.00011769922404236975, + "loss": 0.2905, + "step": 3346 + }, + { + "epoch": 1.2359675036927622, + "grad_norm": 0.2471715658903122, + "learning_rate": 0.00011767459046680627, + "loss": 0.2331, + "step": 3347 + }, + { + "epoch": 1.2363367799113738, + "grad_norm": 0.2758427560329437, + "learning_rate": 0.00011764995689124277, + "loss": 0.2616, + "step": 3348 + }, + { + "epoch": 1.2367060561299852, + "grad_norm": 0.2578900158405304, + "learning_rate": 0.00011762532331567928, + "loss": 0.2445, + "step": 3349 + }, + { + "epoch": 1.2370753323485968, + "grad_norm": 0.2560451328754425, + "learning_rate": 0.00011760068974011578, + "loss": 0.2331, + "step": 3350 + }, + { + "epoch": 1.2370753323485968, + "eval_loss": 8.416973114013672, + "eval_runtime": 6.9213, + "eval_samples_per_second": 7.224, + "eval_steps_per_second": 1.011, + "step": 3350 + }, + { + "epoch": 1.2374446085672082, + "grad_norm": 0.2963328957557678, + "learning_rate": 0.0001175760561645523, + "loss": 0.2591, + "step": 3351 + }, + { + "epoch": 1.2378138847858198, + "grad_norm": 0.26694539189338684, + "learning_rate": 0.00011755142258898879, + "loss": 0.213, + "step": 3352 + }, + { + "epoch": 1.2381831610044314, + "grad_norm": 0.2384127378463745, + "learning_rate": 0.00011752678901342531, + "loss": 0.2584, + "step": 3353 + }, + { + "epoch": 1.2385524372230428, + "grad_norm": 0.2888193726539612, + "learning_rate": 0.0001175021554378618, + "loss": 0.2766, + "step": 3354 + }, + { + "epoch": 1.2389217134416544, + "grad_norm": 0.22999149560928345, + "learning_rate": 0.00011747752186229833, + "loss": 0.2198, + "step": 3355 + }, + { + "epoch": 1.239290989660266, + "grad_norm": 0.20463885366916656, + "learning_rate": 0.00011745288828673482, + "loss": 0.1983, + "step": 3356 + }, + { + "epoch": 1.2396602658788773, + "grad_norm": 0.2699209451675415, + "learning_rate": 0.00011742825471117133, + "loss": 0.2528, + "step": 3357 + }, + { + "epoch": 1.240029542097489, + "grad_norm": 0.2296416461467743, + "learning_rate": 0.00011740362113560783, + "loss": 0.2013, + "step": 3358 + }, + { + "epoch": 1.2403988183161005, + "grad_norm": 0.22649043798446655, + "learning_rate": 0.00011737898756004435, + "loss": 0.225, + "step": 3359 + }, + { + "epoch": 1.240768094534712, + "grad_norm": 0.23239751160144806, + "learning_rate": 0.00011735435398448085, + "loss": 0.1814, + "step": 3360 + }, + { + "epoch": 1.2411373707533235, + "grad_norm": 0.22629684209823608, + "learning_rate": 0.00011732972040891736, + "loss": 0.2041, + "step": 3361 + }, + { + "epoch": 1.241506646971935, + "grad_norm": 0.3109305500984192, + "learning_rate": 0.00011730508683335386, + "loss": 0.2318, + "step": 3362 + }, + { + "epoch": 1.2418759231905465, + "grad_norm": 0.2607688307762146, + "learning_rate": 0.00011728045325779038, + "loss": 0.2533, + "step": 3363 + }, + { + "epoch": 1.2422451994091581, + "grad_norm": 0.23955689370632172, + "learning_rate": 0.00011725581968222688, + "loss": 0.2073, + "step": 3364 + }, + { + "epoch": 1.2426144756277695, + "grad_norm": 0.23761025071144104, + "learning_rate": 0.0001172311861066634, + "loss": 0.2199, + "step": 3365 + }, + { + "epoch": 1.242983751846381, + "grad_norm": 0.3183135986328125, + "learning_rate": 0.0001172065525310999, + "loss": 0.2584, + "step": 3366 + }, + { + "epoch": 1.2433530280649927, + "grad_norm": 0.2968287467956543, + "learning_rate": 0.00011718191895553641, + "loss": 0.2598, + "step": 3367 + }, + { + "epoch": 1.243722304283604, + "grad_norm": 0.29538244009017944, + "learning_rate": 0.0001171572853799729, + "loss": 0.2158, + "step": 3368 + }, + { + "epoch": 1.2440915805022157, + "grad_norm": 0.2288307398557663, + "learning_rate": 0.00011713265180440943, + "loss": 0.2243, + "step": 3369 + }, + { + "epoch": 1.2444608567208273, + "grad_norm": 0.23554591834545135, + "learning_rate": 0.00011710801822884591, + "loss": 0.2444, + "step": 3370 + }, + { + "epoch": 1.2448301329394387, + "grad_norm": 0.2647620737552643, + "learning_rate": 0.00011708338465328244, + "loss": 0.2216, + "step": 3371 + }, + { + "epoch": 1.2451994091580503, + "grad_norm": 0.366571843624115, + "learning_rate": 0.00011705875107771893, + "loss": 0.2178, + "step": 3372 + }, + { + "epoch": 1.2455686853766617, + "grad_norm": 0.26439544558525085, + "learning_rate": 0.00011703411750215544, + "loss": 0.2316, + "step": 3373 + }, + { + "epoch": 1.2459379615952733, + "grad_norm": 0.2776379883289337, + "learning_rate": 0.00011700948392659195, + "loss": 0.2356, + "step": 3374 + }, + { + "epoch": 1.2463072378138849, + "grad_norm": 0.27136820554733276, + "learning_rate": 0.00011698485035102846, + "loss": 0.2286, + "step": 3375 + }, + { + "epoch": 1.2466765140324962, + "grad_norm": 0.2680774927139282, + "learning_rate": 0.00011696021677546496, + "loss": 0.2291, + "step": 3376 + }, + { + "epoch": 1.2470457902511078, + "grad_norm": 0.25082525610923767, + "learning_rate": 0.00011693558319990148, + "loss": 0.2328, + "step": 3377 + }, + { + "epoch": 1.2474150664697194, + "grad_norm": 0.22933636605739594, + "learning_rate": 0.00011691094962433798, + "loss": 0.224, + "step": 3378 + }, + { + "epoch": 1.2477843426883308, + "grad_norm": 0.30019885301589966, + "learning_rate": 0.00011688631604877449, + "loss": 0.285, + "step": 3379 + }, + { + "epoch": 1.2481536189069424, + "grad_norm": 0.26965227723121643, + "learning_rate": 0.00011686168247321099, + "loss": 0.2933, + "step": 3380 + }, + { + "epoch": 1.248522895125554, + "grad_norm": 0.3054683208465576, + "learning_rate": 0.00011683704889764751, + "loss": 0.246, + "step": 3381 + }, + { + "epoch": 1.2488921713441654, + "grad_norm": 0.25285640358924866, + "learning_rate": 0.000116812415322084, + "loss": 0.2108, + "step": 3382 + }, + { + "epoch": 1.249261447562777, + "grad_norm": 0.27096039056777954, + "learning_rate": 0.00011678778174652052, + "loss": 0.2066, + "step": 3383 + }, + { + "epoch": 1.2496307237813884, + "grad_norm": 0.3022547960281372, + "learning_rate": 0.00011676314817095701, + "loss": 0.3064, + "step": 3384 + }, + { + "epoch": 1.25, + "grad_norm": 0.2609975337982178, + "learning_rate": 0.00011673851459539354, + "loss": 0.2359, + "step": 3385 + }, + { + "epoch": 1.2503692762186116, + "grad_norm": 0.22686047852039337, + "learning_rate": 0.00011671388101983003, + "loss": 0.2154, + "step": 3386 + }, + { + "epoch": 1.250738552437223, + "grad_norm": 0.2349785417318344, + "learning_rate": 0.00011668924744426654, + "loss": 0.1998, + "step": 3387 + }, + { + "epoch": 1.2511078286558346, + "grad_norm": 0.21435336768627167, + "learning_rate": 0.00011666461386870304, + "loss": 0.1983, + "step": 3388 + }, + { + "epoch": 1.251477104874446, + "grad_norm": 0.2593035101890564, + "learning_rate": 0.00011663998029313956, + "loss": 0.1971, + "step": 3389 + }, + { + "epoch": 1.2518463810930576, + "grad_norm": 0.24976903200149536, + "learning_rate": 0.00011661534671757606, + "loss": 0.2494, + "step": 3390 + }, + { + "epoch": 1.2522156573116692, + "grad_norm": 0.2570582330226898, + "learning_rate": 0.00011659071314201257, + "loss": 0.2234, + "step": 3391 + }, + { + "epoch": 1.2525849335302808, + "grad_norm": 0.2553063929080963, + "learning_rate": 0.00011656607956644907, + "loss": 0.2131, + "step": 3392 + }, + { + "epoch": 1.2529542097488922, + "grad_norm": 0.2569276988506317, + "learning_rate": 0.00011654144599088559, + "loss": 0.2378, + "step": 3393 + }, + { + "epoch": 1.2533234859675038, + "grad_norm": 0.25869986414909363, + "learning_rate": 0.00011651681241532209, + "loss": 0.2287, + "step": 3394 + }, + { + "epoch": 1.2536927621861151, + "grad_norm": 0.26297512650489807, + "learning_rate": 0.0001164921788397586, + "loss": 0.2192, + "step": 3395 + }, + { + "epoch": 1.2540620384047267, + "grad_norm": 0.24470414221286774, + "learning_rate": 0.0001164675452641951, + "loss": 0.2113, + "step": 3396 + }, + { + "epoch": 1.2544313146233383, + "grad_norm": 0.27882763743400574, + "learning_rate": 0.00011644291168863162, + "loss": 0.238, + "step": 3397 + }, + { + "epoch": 1.2548005908419497, + "grad_norm": 0.2359365075826645, + "learning_rate": 0.00011641827811306811, + "loss": 0.2055, + "step": 3398 + }, + { + "epoch": 1.2551698670605613, + "grad_norm": 0.2696508765220642, + "learning_rate": 0.00011639364453750463, + "loss": 0.2327, + "step": 3399 + }, + { + "epoch": 1.2555391432791727, + "grad_norm": 0.2626379430294037, + "learning_rate": 0.00011636901096194112, + "loss": 0.239, + "step": 3400 + }, + { + "epoch": 1.2555391432791727, + "eval_loss": 8.523516654968262, + "eval_runtime": 6.903, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 1.014, + "step": 3400 + }, + { + "epoch": 1.2559084194977843, + "grad_norm": 0.32457777857780457, + "learning_rate": 0.00011634437738637765, + "loss": 0.2507, + "step": 3401 + }, + { + "epoch": 1.256277695716396, + "grad_norm": 0.2613844573497772, + "learning_rate": 0.00011631974381081414, + "loss": 0.23, + "step": 3402 + }, + { + "epoch": 1.2566469719350075, + "grad_norm": 0.23312924802303314, + "learning_rate": 0.00011629511023525065, + "loss": 0.2119, + "step": 3403 + }, + { + "epoch": 1.257016248153619, + "grad_norm": 0.286294549703598, + "learning_rate": 0.00011627047665968715, + "loss": 0.2378, + "step": 3404 + }, + { + "epoch": 1.2573855243722305, + "grad_norm": 0.24122202396392822, + "learning_rate": 0.00011624584308412367, + "loss": 0.2026, + "step": 3405 + }, + { + "epoch": 1.2577548005908419, + "grad_norm": 0.34664401412010193, + "learning_rate": 0.00011622120950856017, + "loss": 0.2553, + "step": 3406 + }, + { + "epoch": 1.2581240768094535, + "grad_norm": 0.3065793514251709, + "learning_rate": 0.00011619657593299668, + "loss": 0.272, + "step": 3407 + }, + { + "epoch": 1.258493353028065, + "grad_norm": 0.3267727196216583, + "learning_rate": 0.00011617194235743319, + "loss": 0.2577, + "step": 3408 + }, + { + "epoch": 1.2588626292466765, + "grad_norm": 0.29558831453323364, + "learning_rate": 0.0001161473087818697, + "loss": 0.2415, + "step": 3409 + }, + { + "epoch": 1.259231905465288, + "grad_norm": 0.28601300716400146, + "learning_rate": 0.0001161226752063062, + "loss": 0.2491, + "step": 3410 + }, + { + "epoch": 1.2596011816838995, + "grad_norm": 0.25276196002960205, + "learning_rate": 0.00011609804163074272, + "loss": 0.2412, + "step": 3411 + }, + { + "epoch": 1.259970457902511, + "grad_norm": 0.2742173373699188, + "learning_rate": 0.00011607340805517922, + "loss": 0.2365, + "step": 3412 + }, + { + "epoch": 1.2603397341211227, + "grad_norm": 0.22434216737747192, + "learning_rate": 0.00011604877447961573, + "loss": 0.1869, + "step": 3413 + }, + { + "epoch": 1.2607090103397343, + "grad_norm": 0.21254268288612366, + "learning_rate": 0.00011602414090405222, + "loss": 0.1931, + "step": 3414 + }, + { + "epoch": 1.2610782865583456, + "grad_norm": 0.2525831460952759, + "learning_rate": 0.00011599950732848875, + "loss": 0.1967, + "step": 3415 + }, + { + "epoch": 1.2614475627769572, + "grad_norm": 0.25302761793136597, + "learning_rate": 0.00011597487375292523, + "loss": 0.224, + "step": 3416 + }, + { + "epoch": 1.2618168389955686, + "grad_norm": 0.2513731122016907, + "learning_rate": 0.00011595024017736176, + "loss": 0.2347, + "step": 3417 + }, + { + "epoch": 1.2621861152141802, + "grad_norm": 0.25753411650657654, + "learning_rate": 0.00011592560660179825, + "loss": 0.227, + "step": 3418 + }, + { + "epoch": 1.2625553914327918, + "grad_norm": 0.2561231255531311, + "learning_rate": 0.00011590097302623477, + "loss": 0.2689, + "step": 3419 + }, + { + "epoch": 1.2629246676514032, + "grad_norm": 0.25286149978637695, + "learning_rate": 0.00011587633945067127, + "loss": 0.2569, + "step": 3420 + }, + { + "epoch": 1.2632939438700148, + "grad_norm": 0.24448955059051514, + "learning_rate": 0.00011585170587510778, + "loss": 0.2249, + "step": 3421 + }, + { + "epoch": 1.2636632200886262, + "grad_norm": 0.2631427049636841, + "learning_rate": 0.00011582707229954428, + "loss": 0.2688, + "step": 3422 + }, + { + "epoch": 1.2640324963072378, + "grad_norm": 0.20674455165863037, + "learning_rate": 0.0001158024387239808, + "loss": 0.1816, + "step": 3423 + }, + { + "epoch": 1.2644017725258494, + "grad_norm": 0.22658397257328033, + "learning_rate": 0.0001157778051484173, + "loss": 0.2076, + "step": 3424 + }, + { + "epoch": 1.2647710487444608, + "grad_norm": 0.2082243263721466, + "learning_rate": 0.00011575317157285381, + "loss": 0.1896, + "step": 3425 + }, + { + "epoch": 1.2651403249630724, + "grad_norm": 0.28609052300453186, + "learning_rate": 0.00011572853799729031, + "loss": 0.2102, + "step": 3426 + }, + { + "epoch": 1.2655096011816838, + "grad_norm": 0.2633495628833771, + "learning_rate": 0.00011570390442172683, + "loss": 0.2353, + "step": 3427 + }, + { + "epoch": 1.2658788774002954, + "grad_norm": 0.20005057752132416, + "learning_rate": 0.00011567927084616333, + "loss": 0.1654, + "step": 3428 + }, + { + "epoch": 1.266248153618907, + "grad_norm": 0.2864704132080078, + "learning_rate": 0.00011565463727059984, + "loss": 0.2377, + "step": 3429 + }, + { + "epoch": 1.2666174298375186, + "grad_norm": 0.23344750702381134, + "learning_rate": 0.00011563000369503633, + "loss": 0.2357, + "step": 3430 + }, + { + "epoch": 1.26698670605613, + "grad_norm": 0.24360081553459167, + "learning_rate": 0.00011560537011947286, + "loss": 0.2246, + "step": 3431 + }, + { + "epoch": 1.2673559822747416, + "grad_norm": 0.3829514980316162, + "learning_rate": 0.00011558073654390935, + "loss": 0.2279, + "step": 3432 + }, + { + "epoch": 1.267725258493353, + "grad_norm": 0.29327312111854553, + "learning_rate": 0.00011555610296834588, + "loss": 0.2274, + "step": 3433 + }, + { + "epoch": 1.2680945347119645, + "grad_norm": 0.23207516968250275, + "learning_rate": 0.00011553146939278236, + "loss": 0.1646, + "step": 3434 + }, + { + "epoch": 1.2684638109305761, + "grad_norm": 0.231863871216774, + "learning_rate": 0.00011550683581721888, + "loss": 0.2271, + "step": 3435 + }, + { + "epoch": 1.2688330871491875, + "grad_norm": 0.29306188225746155, + "learning_rate": 0.00011548220224165538, + "loss": 0.2844, + "step": 3436 + }, + { + "epoch": 1.2692023633677991, + "grad_norm": 0.3142595887184143, + "learning_rate": 0.00011545756866609189, + "loss": 0.262, + "step": 3437 + }, + { + "epoch": 1.2695716395864105, + "grad_norm": 0.24813604354858398, + "learning_rate": 0.0001154329350905284, + "loss": 0.2413, + "step": 3438 + }, + { + "epoch": 1.269940915805022, + "grad_norm": 0.2181418389081955, + "learning_rate": 0.00011540830151496491, + "loss": 0.21, + "step": 3439 + }, + { + "epoch": 1.2703101920236337, + "grad_norm": 0.22441615164279938, + "learning_rate": 0.00011538366793940141, + "loss": 0.2056, + "step": 3440 + }, + { + "epoch": 1.2706794682422453, + "grad_norm": 0.24306073784828186, + "learning_rate": 0.00011535903436383792, + "loss": 0.2332, + "step": 3441 + }, + { + "epoch": 1.2710487444608567, + "grad_norm": 0.24106436967849731, + "learning_rate": 0.00011533440078827443, + "loss": 0.2168, + "step": 3442 + }, + { + "epoch": 1.2714180206794683, + "grad_norm": 0.26992201805114746, + "learning_rate": 0.00011530976721271094, + "loss": 0.2022, + "step": 3443 + }, + { + "epoch": 1.2717872968980797, + "grad_norm": 0.2588542401790619, + "learning_rate": 0.00011528513363714744, + "loss": 0.2264, + "step": 3444 + }, + { + "epoch": 1.2721565731166913, + "grad_norm": 0.25823917984962463, + "learning_rate": 0.00011526050006158396, + "loss": 0.2169, + "step": 3445 + }, + { + "epoch": 1.2725258493353029, + "grad_norm": 0.2877916991710663, + "learning_rate": 0.00011523586648602044, + "loss": 0.2279, + "step": 3446 + }, + { + "epoch": 1.2728951255539143, + "grad_norm": 0.25977274775505066, + "learning_rate": 0.00011521123291045697, + "loss": 0.2276, + "step": 3447 + }, + { + "epoch": 1.2732644017725259, + "grad_norm": 0.27222439646720886, + "learning_rate": 0.00011518659933489346, + "loss": 0.2621, + "step": 3448 + }, + { + "epoch": 1.2736336779911372, + "grad_norm": 0.2803106904029846, + "learning_rate": 0.00011516196575932999, + "loss": 0.2257, + "step": 3449 + }, + { + "epoch": 1.2740029542097489, + "grad_norm": 0.24286209046840668, + "learning_rate": 0.00011513733218376648, + "loss": 0.2106, + "step": 3450 + }, + { + "epoch": 1.2740029542097489, + "eval_loss": 8.425955772399902, + "eval_runtime": 6.9174, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 1.012, + "step": 3450 + }, + { + "epoch": 1.2743722304283605, + "grad_norm": 0.2712884247303009, + "learning_rate": 0.00011511269860820299, + "loss": 0.2119, + "step": 3451 + }, + { + "epoch": 1.274741506646972, + "grad_norm": 0.25911641120910645, + "learning_rate": 0.00011508806503263949, + "loss": 0.2224, + "step": 3452 + }, + { + "epoch": 1.2751107828655834, + "grad_norm": 0.29832640290260315, + "learning_rate": 0.000115063431457076, + "loss": 0.2862, + "step": 3453 + }, + { + "epoch": 1.275480059084195, + "grad_norm": 0.3105472922325134, + "learning_rate": 0.0001150387978815125, + "loss": 0.2398, + "step": 3454 + }, + { + "epoch": 1.2758493353028064, + "grad_norm": 0.3013194501399994, + "learning_rate": 0.00011501416430594902, + "loss": 0.2191, + "step": 3455 + }, + { + "epoch": 1.276218611521418, + "grad_norm": 0.6256669759750366, + "learning_rate": 0.00011498953073038552, + "loss": 0.2396, + "step": 3456 + }, + { + "epoch": 1.2765878877400296, + "grad_norm": 0.2438044548034668, + "learning_rate": 0.00011496489715482204, + "loss": 0.2131, + "step": 3457 + }, + { + "epoch": 1.276957163958641, + "grad_norm": 0.3112978935241699, + "learning_rate": 0.00011494026357925854, + "loss": 0.244, + "step": 3458 + }, + { + "epoch": 1.2773264401772526, + "grad_norm": 0.327286034822464, + "learning_rate": 0.00011491563000369505, + "loss": 0.2875, + "step": 3459 + }, + { + "epoch": 1.277695716395864, + "grad_norm": 0.31151580810546875, + "learning_rate": 0.00011489099642813155, + "loss": 0.21, + "step": 3460 + }, + { + "epoch": 1.2780649926144756, + "grad_norm": 0.2683573365211487, + "learning_rate": 0.00011486636285256807, + "loss": 0.2157, + "step": 3461 + }, + { + "epoch": 1.2784342688330872, + "grad_norm": 0.28166618943214417, + "learning_rate": 0.00011484172927700456, + "loss": 0.2213, + "step": 3462 + }, + { + "epoch": 1.2788035450516988, + "grad_norm": 0.28906145691871643, + "learning_rate": 0.00011481709570144108, + "loss": 0.2486, + "step": 3463 + }, + { + "epoch": 1.2791728212703102, + "grad_norm": 0.29438501596450806, + "learning_rate": 0.00011479246212587757, + "loss": 0.2505, + "step": 3464 + }, + { + "epoch": 1.2795420974889218, + "grad_norm": 0.2749755382537842, + "learning_rate": 0.0001147678285503141, + "loss": 0.2261, + "step": 3465 + }, + { + "epoch": 1.2799113737075332, + "grad_norm": 0.2997390329837799, + "learning_rate": 0.00011474319497475059, + "loss": 0.2483, + "step": 3466 + }, + { + "epoch": 1.2802806499261448, + "grad_norm": 0.3680136203765869, + "learning_rate": 0.0001147185613991871, + "loss": 0.2698, + "step": 3467 + }, + { + "epoch": 1.2806499261447564, + "grad_norm": 0.328579843044281, + "learning_rate": 0.0001146939278236236, + "loss": 0.3016, + "step": 3468 + }, + { + "epoch": 1.2810192023633677, + "grad_norm": 0.26369476318359375, + "learning_rate": 0.00011466929424806012, + "loss": 0.2201, + "step": 3469 + }, + { + "epoch": 1.2813884785819794, + "grad_norm": 0.27216023206710815, + "learning_rate": 0.00011464466067249662, + "loss": 0.2085, + "step": 3470 + }, + { + "epoch": 1.2817577548005907, + "grad_norm": 0.241181418299675, + "learning_rate": 0.00011462002709693313, + "loss": 0.1941, + "step": 3471 + }, + { + "epoch": 1.2821270310192023, + "grad_norm": 0.29305508732795715, + "learning_rate": 0.00011459539352136963, + "loss": 0.2817, + "step": 3472 + }, + { + "epoch": 1.282496307237814, + "grad_norm": 0.27724137902259827, + "learning_rate": 0.00011457075994580615, + "loss": 0.2285, + "step": 3473 + }, + { + "epoch": 1.2828655834564255, + "grad_norm": 0.28394389152526855, + "learning_rate": 0.00011454612637024265, + "loss": 0.2713, + "step": 3474 + }, + { + "epoch": 1.283234859675037, + "grad_norm": 0.27055153250694275, + "learning_rate": 0.00011452149279467916, + "loss": 0.2237, + "step": 3475 + }, + { + "epoch": 1.2836041358936485, + "grad_norm": 0.24068449437618256, + "learning_rate": 0.00011449685921911567, + "loss": 0.2398, + "step": 3476 + }, + { + "epoch": 1.28397341211226, + "grad_norm": 0.23853908479213715, + "learning_rate": 0.00011447222564355218, + "loss": 0.2044, + "step": 3477 + }, + { + "epoch": 1.2843426883308715, + "grad_norm": 0.27512553334236145, + "learning_rate": 0.00011444759206798867, + "loss": 0.2019, + "step": 3478 + }, + { + "epoch": 1.284711964549483, + "grad_norm": 0.22928085923194885, + "learning_rate": 0.0001144229584924252, + "loss": 0.2249, + "step": 3479 + }, + { + "epoch": 1.2850812407680945, + "grad_norm": 0.23123067617416382, + "learning_rate": 0.00011439832491686168, + "loss": 0.219, + "step": 3480 + }, + { + "epoch": 1.285450516986706, + "grad_norm": 0.2043607085943222, + "learning_rate": 0.00011437369134129821, + "loss": 0.2033, + "step": 3481 + }, + { + "epoch": 1.2858197932053175, + "grad_norm": 0.2710270583629608, + "learning_rate": 0.0001143490577657347, + "loss": 0.2649, + "step": 3482 + }, + { + "epoch": 1.286189069423929, + "grad_norm": 0.21949158608913422, + "learning_rate": 0.00011432442419017121, + "loss": 0.2099, + "step": 3483 + }, + { + "epoch": 1.2865583456425407, + "grad_norm": 0.25183525681495667, + "learning_rate": 0.00011429979061460772, + "loss": 0.2572, + "step": 3484 + }, + { + "epoch": 1.286927621861152, + "grad_norm": 0.23705171048641205, + "learning_rate": 0.00011427515703904422, + "loss": 0.2578, + "step": 3485 + }, + { + "epoch": 1.2872968980797637, + "grad_norm": 0.326200932264328, + "learning_rate": 0.00011425052346348073, + "loss": 0.2475, + "step": 3486 + }, + { + "epoch": 1.2876661742983753, + "grad_norm": 0.2294246405363083, + "learning_rate": 0.00011422588988791723, + "loss": 0.1903, + "step": 3487 + }, + { + "epoch": 1.2880354505169866, + "grad_norm": 0.2455328106880188, + "learning_rate": 0.00011420125631235375, + "loss": 0.2288, + "step": 3488 + }, + { + "epoch": 1.2884047267355982, + "grad_norm": 0.2682133913040161, + "learning_rate": 0.00011417662273679023, + "loss": 0.2395, + "step": 3489 + }, + { + "epoch": 1.2887740029542099, + "grad_norm": 0.22708268463611603, + "learning_rate": 0.00011415198916122676, + "loss": 0.2167, + "step": 3490 + }, + { + "epoch": 1.2891432791728212, + "grad_norm": 0.2839600145816803, + "learning_rate": 0.00011412735558566325, + "loss": 0.2726, + "step": 3491 + }, + { + "epoch": 1.2895125553914328, + "grad_norm": 0.2724458873271942, + "learning_rate": 0.00011410272201009978, + "loss": 0.1905, + "step": 3492 + }, + { + "epoch": 1.2898818316100442, + "grad_norm": 0.22909487783908844, + "learning_rate": 0.00011407808843453627, + "loss": 0.223, + "step": 3493 + }, + { + "epoch": 1.2902511078286558, + "grad_norm": 0.31607377529144287, + "learning_rate": 0.00011405345485897278, + "loss": 0.2788, + "step": 3494 + }, + { + "epoch": 1.2906203840472674, + "grad_norm": 0.26251718401908875, + "learning_rate": 0.00011402882128340928, + "loss": 0.211, + "step": 3495 + }, + { + "epoch": 1.2909896602658788, + "grad_norm": 0.22165197134017944, + "learning_rate": 0.0001140041877078458, + "loss": 0.2178, + "step": 3496 + }, + { + "epoch": 1.2913589364844904, + "grad_norm": 0.2483806163072586, + "learning_rate": 0.0001139795541322823, + "loss": 0.2003, + "step": 3497 + }, + { + "epoch": 1.2917282127031018, + "grad_norm": 0.2736043930053711, + "learning_rate": 0.00011395492055671881, + "loss": 0.2487, + "step": 3498 + }, + { + "epoch": 1.2920974889217134, + "grad_norm": 0.22725573182106018, + "learning_rate": 0.00011393028698115531, + "loss": 0.1838, + "step": 3499 + }, + { + "epoch": 1.292466765140325, + "grad_norm": 0.2202163189649582, + "learning_rate": 0.00011390565340559183, + "loss": 0.1961, + "step": 3500 + }, + { + "epoch": 1.292466765140325, + "eval_loss": 8.431181907653809, + "eval_runtime": 6.9186, + "eval_samples_per_second": 7.227, + "eval_steps_per_second": 1.012, + "step": 3500 + }, + { + "epoch": 1.2928360413589366, + "grad_norm": 0.2373214066028595, + "learning_rate": 0.00011388101983002833, + "loss": 0.2271, + "step": 3501 + }, + { + "epoch": 1.293205317577548, + "grad_norm": 0.2979702353477478, + "learning_rate": 0.00011385638625446484, + "loss": 0.251, + "step": 3502 + }, + { + "epoch": 1.2935745937961596, + "grad_norm": 0.2323768436908722, + "learning_rate": 0.00011383175267890134, + "loss": 0.1932, + "step": 3503 + }, + { + "epoch": 1.293943870014771, + "grad_norm": 0.4226436913013458, + "learning_rate": 0.00011380711910333786, + "loss": 0.2564, + "step": 3504 + }, + { + "epoch": 1.2943131462333826, + "grad_norm": 0.2765912115573883, + "learning_rate": 0.00011378248552777435, + "loss": 0.2345, + "step": 3505 + }, + { + "epoch": 1.2946824224519942, + "grad_norm": 0.29060566425323486, + "learning_rate": 0.00011375785195221087, + "loss": 0.2087, + "step": 3506 + }, + { + "epoch": 1.2950516986706055, + "grad_norm": 0.2734554409980774, + "learning_rate": 0.00011373321837664736, + "loss": 0.2653, + "step": 3507 + }, + { + "epoch": 1.2954209748892171, + "grad_norm": 0.31372523307800293, + "learning_rate": 0.00011370858480108389, + "loss": 0.2722, + "step": 3508 + }, + { + "epoch": 1.2957902511078285, + "grad_norm": 0.2653447091579437, + "learning_rate": 0.00011368395122552038, + "loss": 0.26, + "step": 3509 + }, + { + "epoch": 1.2961595273264401, + "grad_norm": 0.24433931708335876, + "learning_rate": 0.00011365931764995689, + "loss": 0.2336, + "step": 3510 + }, + { + "epoch": 1.2965288035450517, + "grad_norm": 0.30695024132728577, + "learning_rate": 0.0001136346840743934, + "loss": 0.2529, + "step": 3511 + }, + { + "epoch": 1.2968980797636633, + "grad_norm": 0.27168330550193787, + "learning_rate": 0.00011361005049882991, + "loss": 0.2229, + "step": 3512 + }, + { + "epoch": 1.2972673559822747, + "grad_norm": 0.2662791311740875, + "learning_rate": 0.00011358541692326641, + "loss": 0.2441, + "step": 3513 + }, + { + "epoch": 1.2976366322008863, + "grad_norm": 0.26673001050949097, + "learning_rate": 0.00011356078334770292, + "loss": 0.2382, + "step": 3514 + }, + { + "epoch": 1.2980059084194977, + "grad_norm": 0.24687881767749786, + "learning_rate": 0.00011353614977213943, + "loss": 0.2291, + "step": 3515 + }, + { + "epoch": 1.2983751846381093, + "grad_norm": 0.23660744726657867, + "learning_rate": 0.00011351151619657594, + "loss": 0.2262, + "step": 3516 + }, + { + "epoch": 1.298744460856721, + "grad_norm": 0.28196680545806885, + "learning_rate": 0.00011348688262101244, + "loss": 0.2394, + "step": 3517 + }, + { + "epoch": 1.2991137370753323, + "grad_norm": 0.26121199131011963, + "learning_rate": 0.00011346224904544896, + "loss": 0.2211, + "step": 3518 + }, + { + "epoch": 1.299483013293944, + "grad_norm": 0.2198493778705597, + "learning_rate": 0.00011343761546988546, + "loss": 0.195, + "step": 3519 + }, + { + "epoch": 1.2998522895125553, + "grad_norm": 0.31639131903648376, + "learning_rate": 0.00011341298189432197, + "loss": 0.2576, + "step": 3520 + }, + { + "epoch": 1.3002215657311669, + "grad_norm": 0.31901511549949646, + "learning_rate": 0.00011338834831875846, + "loss": 0.2666, + "step": 3521 + }, + { + "epoch": 1.3005908419497785, + "grad_norm": 0.3152129054069519, + "learning_rate": 0.00011336371474319499, + "loss": 0.3, + "step": 3522 + }, + { + "epoch": 1.30096011816839, + "grad_norm": 0.23618915677070618, + "learning_rate": 0.00011333908116763147, + "loss": 0.202, + "step": 3523 + }, + { + "epoch": 1.3013293943870015, + "grad_norm": 0.2665996551513672, + "learning_rate": 0.000113314447592068, + "loss": 0.2564, + "step": 3524 + }, + { + "epoch": 1.301698670605613, + "grad_norm": 0.22175969183444977, + "learning_rate": 0.00011328981401650449, + "loss": 0.2127, + "step": 3525 + }, + { + "epoch": 1.3020679468242244, + "grad_norm": 0.2564530670642853, + "learning_rate": 0.000113265180440941, + "loss": 0.2381, + "step": 3526 + }, + { + "epoch": 1.302437223042836, + "grad_norm": 0.25428542494773865, + "learning_rate": 0.0001132405468653775, + "loss": 0.2295, + "step": 3527 + }, + { + "epoch": 1.3028064992614476, + "grad_norm": 0.2503000795841217, + "learning_rate": 0.00011321591328981402, + "loss": 0.2071, + "step": 3528 + }, + { + "epoch": 1.303175775480059, + "grad_norm": 0.24263231456279755, + "learning_rate": 0.00011319127971425052, + "loss": 0.2385, + "step": 3529 + }, + { + "epoch": 1.3035450516986706, + "grad_norm": 0.3493305444717407, + "learning_rate": 0.00011316664613868704, + "loss": 0.2707, + "step": 3530 + }, + { + "epoch": 1.303914327917282, + "grad_norm": 0.2467355579137802, + "learning_rate": 0.00011314201256312354, + "loss": 0.2313, + "step": 3531 + }, + { + "epoch": 1.3042836041358936, + "grad_norm": 0.23463772237300873, + "learning_rate": 0.00011311737898756005, + "loss": 0.2166, + "step": 3532 + }, + { + "epoch": 1.3046528803545052, + "grad_norm": 0.21148861944675446, + "learning_rate": 0.00011309274541199655, + "loss": 0.1849, + "step": 3533 + }, + { + "epoch": 1.3050221565731168, + "grad_norm": 0.24731232225894928, + "learning_rate": 0.00011306811183643307, + "loss": 0.1832, + "step": 3534 + }, + { + "epoch": 1.3053914327917282, + "grad_norm": 0.31160566210746765, + "learning_rate": 0.00011304347826086956, + "loss": 0.2774, + "step": 3535 + }, + { + "epoch": 1.3057607090103398, + "grad_norm": 0.29273363947868347, + "learning_rate": 0.00011301884468530608, + "loss": 0.2228, + "step": 3536 + }, + { + "epoch": 1.3061299852289512, + "grad_norm": 0.29804956912994385, + "learning_rate": 0.00011299421110974257, + "loss": 0.2946, + "step": 3537 + }, + { + "epoch": 1.3064992614475628, + "grad_norm": 0.2274644821882248, + "learning_rate": 0.0001129695775341791, + "loss": 0.2211, + "step": 3538 + }, + { + "epoch": 1.3068685376661744, + "grad_norm": 0.28826260566711426, + "learning_rate": 0.00011294494395861559, + "loss": 0.2216, + "step": 3539 + }, + { + "epoch": 1.3072378138847858, + "grad_norm": 0.25884541869163513, + "learning_rate": 0.0001129203103830521, + "loss": 0.2494, + "step": 3540 + }, + { + "epoch": 1.3076070901033974, + "grad_norm": 0.2520054578781128, + "learning_rate": 0.0001128956768074886, + "loss": 0.2121, + "step": 3541 + }, + { + "epoch": 1.3079763663220088, + "grad_norm": 0.2243940234184265, + "learning_rate": 0.00011287104323192512, + "loss": 0.2242, + "step": 3542 + }, + { + "epoch": 1.3083456425406204, + "grad_norm": 0.27346163988113403, + "learning_rate": 0.00011284640965636162, + "loss": 0.2451, + "step": 3543 + }, + { + "epoch": 1.308714918759232, + "grad_norm": 0.22108355164527893, + "learning_rate": 0.00011282177608079813, + "loss": 0.1919, + "step": 3544 + }, + { + "epoch": 1.3090841949778436, + "grad_norm": 0.2784762978553772, + "learning_rate": 0.00011279714250523463, + "loss": 0.2347, + "step": 3545 + }, + { + "epoch": 1.309453471196455, + "grad_norm": 0.21379348635673523, + "learning_rate": 0.00011277250892967115, + "loss": 0.1853, + "step": 3546 + }, + { + "epoch": 1.3098227474150665, + "grad_norm": 0.2326963245868683, + "learning_rate": 0.00011274787535410765, + "loss": 0.212, + "step": 3547 + }, + { + "epoch": 1.310192023633678, + "grad_norm": 0.26175549626350403, + "learning_rate": 0.00011272324177854416, + "loss": 0.2166, + "step": 3548 + }, + { + "epoch": 1.3105612998522895, + "grad_norm": 0.2624920904636383, + "learning_rate": 0.00011269860820298067, + "loss": 0.2258, + "step": 3549 + }, + { + "epoch": 1.3109305760709011, + "grad_norm": 0.23074592649936676, + "learning_rate": 0.00011267397462741718, + "loss": 0.2259, + "step": 3550 + }, + { + "epoch": 1.3109305760709011, + "eval_loss": 8.341036796569824, + "eval_runtime": 6.925, + "eval_samples_per_second": 7.22, + "eval_steps_per_second": 1.011, + "step": 3550 + }, + { + "epoch": 1.3112998522895125, + "grad_norm": 0.23249943554401398, + "learning_rate": 0.00011264934105185367, + "loss": 0.2306, + "step": 3551 + }, + { + "epoch": 1.3116691285081241, + "grad_norm": 0.23920269310474396, + "learning_rate": 0.0001126247074762902, + "loss": 0.2073, + "step": 3552 + }, + { + "epoch": 1.3120384047267355, + "grad_norm": 0.2603892385959625, + "learning_rate": 0.00011260007390072668, + "loss": 0.2405, + "step": 3553 + }, + { + "epoch": 1.312407680945347, + "grad_norm": 0.2553781569004059, + "learning_rate": 0.00011257544032516321, + "loss": 0.2219, + "step": 3554 + }, + { + "epoch": 1.3127769571639587, + "grad_norm": 0.3106338083744049, + "learning_rate": 0.0001125508067495997, + "loss": 0.2874, + "step": 3555 + }, + { + "epoch": 1.31314623338257, + "grad_norm": 0.2726644277572632, + "learning_rate": 0.00011252617317403621, + "loss": 0.2696, + "step": 3556 + }, + { + "epoch": 1.3135155096011817, + "grad_norm": 0.27001407742500305, + "learning_rate": 0.00011250153959847271, + "loss": 0.275, + "step": 3557 + }, + { + "epoch": 1.3138847858197933, + "grad_norm": 0.3006684184074402, + "learning_rate": 0.00011247690602290923, + "loss": 0.2538, + "step": 3558 + }, + { + "epoch": 1.3142540620384047, + "grad_norm": 0.25130030512809753, + "learning_rate": 0.00011245227244734573, + "loss": 0.2134, + "step": 3559 + }, + { + "epoch": 1.3146233382570163, + "grad_norm": 0.30597347021102905, + "learning_rate": 0.00011242763887178225, + "loss": 0.2888, + "step": 3560 + }, + { + "epoch": 1.3149926144756279, + "grad_norm": 0.25837674736976624, + "learning_rate": 0.00011240300529621875, + "loss": 0.2476, + "step": 3561 + }, + { + "epoch": 1.3153618906942393, + "grad_norm": 0.24690522253513336, + "learning_rate": 0.00011237837172065526, + "loss": 0.2445, + "step": 3562 + }, + { + "epoch": 1.3157311669128509, + "grad_norm": 0.3022606074810028, + "learning_rate": 0.00011235373814509176, + "loss": 0.2818, + "step": 3563 + }, + { + "epoch": 1.3161004431314622, + "grad_norm": 0.25030407309532166, + "learning_rate": 0.00011232910456952828, + "loss": 0.2383, + "step": 3564 + }, + { + "epoch": 1.3164697193500738, + "grad_norm": 0.2692084312438965, + "learning_rate": 0.00011230447099396478, + "loss": 0.2471, + "step": 3565 + }, + { + "epoch": 1.3168389955686854, + "grad_norm": 0.24791176617145538, + "learning_rate": 0.00011227983741840129, + "loss": 0.1957, + "step": 3566 + }, + { + "epoch": 1.3172082717872968, + "grad_norm": 0.26559215784072876, + "learning_rate": 0.00011225520384283778, + "loss": 0.2425, + "step": 3567 + }, + { + "epoch": 1.3175775480059084, + "grad_norm": 0.2630021274089813, + "learning_rate": 0.00011223057026727431, + "loss": 0.2247, + "step": 3568 + }, + { + "epoch": 1.3179468242245198, + "grad_norm": 0.34925708174705505, + "learning_rate": 0.0001122059366917108, + "loss": 0.2687, + "step": 3569 + }, + { + "epoch": 1.3183161004431314, + "grad_norm": 0.2590206563472748, + "learning_rate": 0.00011218130311614732, + "loss": 0.2419, + "step": 3570 + }, + { + "epoch": 1.318685376661743, + "grad_norm": 0.25965169072151184, + "learning_rate": 0.00011215666954058381, + "loss": 0.2089, + "step": 3571 + }, + { + "epoch": 1.3190546528803546, + "grad_norm": 0.2514553368091583, + "learning_rate": 0.00011213203596502033, + "loss": 0.2305, + "step": 3572 + }, + { + "epoch": 1.319423929098966, + "grad_norm": 0.27882346510887146, + "learning_rate": 0.00011210740238945683, + "loss": 0.2096, + "step": 3573 + }, + { + "epoch": 1.3197932053175776, + "grad_norm": 0.28281453251838684, + "learning_rate": 0.00011208276881389334, + "loss": 0.2395, + "step": 3574 + }, + { + "epoch": 1.320162481536189, + "grad_norm": 0.2583619952201843, + "learning_rate": 0.00011205813523832984, + "loss": 0.2433, + "step": 3575 + }, + { + "epoch": 1.3205317577548006, + "grad_norm": 0.26199811697006226, + "learning_rate": 0.00011203350166276636, + "loss": 0.2489, + "step": 3576 + }, + { + "epoch": 1.3209010339734122, + "grad_norm": 0.2452738732099533, + "learning_rate": 0.00011200886808720286, + "loss": 0.2173, + "step": 3577 + }, + { + "epoch": 1.3212703101920236, + "grad_norm": 0.21720845997333527, + "learning_rate": 0.00011198423451163937, + "loss": 0.1927, + "step": 3578 + }, + { + "epoch": 1.3216395864106352, + "grad_norm": 0.2322448045015335, + "learning_rate": 0.00011195960093607587, + "loss": 0.2123, + "step": 3579 + }, + { + "epoch": 1.3220088626292466, + "grad_norm": 0.28059622645378113, + "learning_rate": 0.00011193496736051239, + "loss": 0.2264, + "step": 3580 + }, + { + "epoch": 1.3223781388478582, + "grad_norm": 0.2937006950378418, + "learning_rate": 0.00011191033378494889, + "loss": 0.219, + "step": 3581 + }, + { + "epoch": 1.3227474150664698, + "grad_norm": 0.22826345264911652, + "learning_rate": 0.0001118857002093854, + "loss": 0.2127, + "step": 3582 + }, + { + "epoch": 1.3231166912850814, + "grad_norm": 0.2461910843849182, + "learning_rate": 0.00011186106663382189, + "loss": 0.1944, + "step": 3583 + }, + { + "epoch": 1.3234859675036927, + "grad_norm": 0.3390319049358368, + "learning_rate": 0.00011183643305825842, + "loss": 0.2262, + "step": 3584 + }, + { + "epoch": 1.3238552437223043, + "grad_norm": 0.32451796531677246, + "learning_rate": 0.00011181179948269491, + "loss": 0.2746, + "step": 3585 + }, + { + "epoch": 1.3242245199409157, + "grad_norm": 0.29742759466171265, + "learning_rate": 0.00011178716590713144, + "loss": 0.2495, + "step": 3586 + }, + { + "epoch": 1.3245937961595273, + "grad_norm": 0.22607167065143585, + "learning_rate": 0.00011176253233156792, + "loss": 0.1896, + "step": 3587 + }, + { + "epoch": 1.324963072378139, + "grad_norm": 0.22262106835842133, + "learning_rate": 0.00011173789875600444, + "loss": 0.1929, + "step": 3588 + }, + { + "epoch": 1.3253323485967503, + "grad_norm": 0.3493277430534363, + "learning_rate": 0.00011171326518044094, + "loss": 0.363, + "step": 3589 + }, + { + "epoch": 1.325701624815362, + "grad_norm": 0.27974843978881836, + "learning_rate": 0.00011168863160487745, + "loss": 0.2483, + "step": 3590 + }, + { + "epoch": 1.3260709010339733, + "grad_norm": 0.29236069321632385, + "learning_rate": 0.00011166399802931396, + "loss": 0.2239, + "step": 3591 + }, + { + "epoch": 1.326440177252585, + "grad_norm": 0.2721322476863861, + "learning_rate": 0.00011163936445375047, + "loss": 0.2229, + "step": 3592 + }, + { + "epoch": 1.3268094534711965, + "grad_norm": 0.2501920163631439, + "learning_rate": 0.00011161473087818697, + "loss": 0.2054, + "step": 3593 + }, + { + "epoch": 1.327178729689808, + "grad_norm": 0.26513510942459106, + "learning_rate": 0.00011159009730262349, + "loss": 0.2543, + "step": 3594 + }, + { + "epoch": 1.3275480059084195, + "grad_norm": 0.25501394271850586, + "learning_rate": 0.00011156546372705999, + "loss": 0.1906, + "step": 3595 + }, + { + "epoch": 1.327917282127031, + "grad_norm": 0.28721457719802856, + "learning_rate": 0.0001115408301514965, + "loss": 0.2423, + "step": 3596 + }, + { + "epoch": 1.3282865583456425, + "grad_norm": 0.2379792332649231, + "learning_rate": 0.000111516196575933, + "loss": 0.2108, + "step": 3597 + }, + { + "epoch": 1.328655834564254, + "grad_norm": 0.23381534218788147, + "learning_rate": 0.00011149156300036952, + "loss": 0.2163, + "step": 3598 + }, + { + "epoch": 1.3290251107828657, + "grad_norm": 0.30069708824157715, + "learning_rate": 0.000111466929424806, + "loss": 0.2491, + "step": 3599 + }, + { + "epoch": 1.329394387001477, + "grad_norm": 0.24118848145008087, + "learning_rate": 0.00011144229584924253, + "loss": 0.2267, + "step": 3600 + }, + { + "epoch": 1.329394387001477, + "eval_loss": 8.504515647888184, + "eval_runtime": 6.9068, + "eval_samples_per_second": 7.239, + "eval_steps_per_second": 1.013, + "step": 3600 + }, + { + "epoch": 1.3297636632200887, + "grad_norm": 0.2513163685798645, + "learning_rate": 0.00011141766227367902, + "loss": 0.2323, + "step": 3601 + }, + { + "epoch": 1.3301329394387, + "grad_norm": 0.231904536485672, + "learning_rate": 0.00011139302869811555, + "loss": 0.2049, + "step": 3602 + }, + { + "epoch": 1.3305022156573116, + "grad_norm": 0.23805534839630127, + "learning_rate": 0.00011136839512255204, + "loss": 0.2226, + "step": 3603 + }, + { + "epoch": 1.3308714918759232, + "grad_norm": 0.29955872893333435, + "learning_rate": 0.00011134376154698855, + "loss": 0.2565, + "step": 3604 + }, + { + "epoch": 1.3312407680945348, + "grad_norm": 0.2277589738368988, + "learning_rate": 0.00011131912797142505, + "loss": 0.2024, + "step": 3605 + }, + { + "epoch": 1.3316100443131462, + "grad_norm": 0.22374404966831207, + "learning_rate": 0.00011129449439586157, + "loss": 0.2015, + "step": 3606 + }, + { + "epoch": 1.3319793205317578, + "grad_norm": 0.3369079828262329, + "learning_rate": 0.00011126986082029807, + "loss": 0.3126, + "step": 3607 + }, + { + "epoch": 1.3323485967503692, + "grad_norm": 0.3014650344848633, + "learning_rate": 0.00011124522724473458, + "loss": 0.1889, + "step": 3608 + }, + { + "epoch": 1.3327178729689808, + "grad_norm": 0.24866700172424316, + "learning_rate": 0.00011122059366917108, + "loss": 0.2239, + "step": 3609 + }, + { + "epoch": 1.3330871491875924, + "grad_norm": 0.3198398947715759, + "learning_rate": 0.0001111959600936076, + "loss": 0.2319, + "step": 3610 + }, + { + "epoch": 1.3334564254062038, + "grad_norm": 0.24034808576107025, + "learning_rate": 0.0001111713265180441, + "loss": 0.2143, + "step": 3611 + }, + { + "epoch": 1.3338257016248154, + "grad_norm": 0.24557794630527496, + "learning_rate": 0.00011114669294248061, + "loss": 0.2177, + "step": 3612 + }, + { + "epoch": 1.3341949778434268, + "grad_norm": 0.2556517720222473, + "learning_rate": 0.00011112205936691711, + "loss": 0.2007, + "step": 3613 + }, + { + "epoch": 1.3345642540620384, + "grad_norm": 0.27443718910217285, + "learning_rate": 0.00011109742579135363, + "loss": 0.2327, + "step": 3614 + }, + { + "epoch": 1.33493353028065, + "grad_norm": 0.27578532695770264, + "learning_rate": 0.00011107279221579012, + "loss": 0.2165, + "step": 3615 + }, + { + "epoch": 1.3353028064992616, + "grad_norm": 0.23919197916984558, + "learning_rate": 0.00011104815864022664, + "loss": 0.1766, + "step": 3616 + }, + { + "epoch": 1.335672082717873, + "grad_norm": 0.21860438585281372, + "learning_rate": 0.00011102352506466313, + "loss": 0.2276, + "step": 3617 + }, + { + "epoch": 1.3360413589364846, + "grad_norm": 0.313192754983902, + "learning_rate": 0.00011099889148909966, + "loss": 0.2655, + "step": 3618 + }, + { + "epoch": 1.336410635155096, + "grad_norm": 0.31504395604133606, + "learning_rate": 0.00011097425791353615, + "loss": 0.2843, + "step": 3619 + }, + { + "epoch": 1.3367799113737076, + "grad_norm": 0.30465227365493774, + "learning_rate": 0.00011094962433797266, + "loss": 0.2525, + "step": 3620 + }, + { + "epoch": 1.3371491875923192, + "grad_norm": 0.2728731632232666, + "learning_rate": 0.00011092499076240916, + "loss": 0.2429, + "step": 3621 + }, + { + "epoch": 1.3375184638109305, + "grad_norm": 0.2500064969062805, + "learning_rate": 0.00011090035718684568, + "loss": 0.2015, + "step": 3622 + }, + { + "epoch": 1.3378877400295421, + "grad_norm": 0.2735447287559509, + "learning_rate": 0.00011087572361128218, + "loss": 0.2465, + "step": 3623 + }, + { + "epoch": 1.3382570162481535, + "grad_norm": 0.2987755239009857, + "learning_rate": 0.0001108510900357187, + "loss": 0.212, + "step": 3624 + }, + { + "epoch": 1.3386262924667651, + "grad_norm": 0.28150293231010437, + "learning_rate": 0.0001108264564601552, + "loss": 0.2344, + "step": 3625 + }, + { + "epoch": 1.3389955686853767, + "grad_norm": 0.34891730546951294, + "learning_rate": 0.00011080182288459171, + "loss": 0.2374, + "step": 3626 + }, + { + "epoch": 1.339364844903988, + "grad_norm": 0.31624752283096313, + "learning_rate": 0.00011077718930902821, + "loss": 0.2084, + "step": 3627 + }, + { + "epoch": 1.3397341211225997, + "grad_norm": 0.3266696631908417, + "learning_rate": 0.00011075255573346473, + "loss": 0.2634, + "step": 3628 + }, + { + "epoch": 1.340103397341211, + "grad_norm": 0.25475212931632996, + "learning_rate": 0.00011072792215790123, + "loss": 0.2417, + "step": 3629 + }, + { + "epoch": 1.3404726735598227, + "grad_norm": 0.23606355488300323, + "learning_rate": 0.00011070328858233774, + "loss": 0.2291, + "step": 3630 + }, + { + "epoch": 1.3408419497784343, + "grad_norm": 0.3048766851425171, + "learning_rate": 0.00011067865500677423, + "loss": 0.2755, + "step": 3631 + }, + { + "epoch": 1.341211225997046, + "grad_norm": 0.25779712200164795, + "learning_rate": 0.00011065402143121076, + "loss": 0.2435, + "step": 3632 + }, + { + "epoch": 1.3415805022156573, + "grad_norm": 0.24137096107006073, + "learning_rate": 0.00011062938785564724, + "loss": 0.1817, + "step": 3633 + }, + { + "epoch": 1.3419497784342689, + "grad_norm": 0.2800375819206238, + "learning_rate": 0.00011060475428008377, + "loss": 0.2459, + "step": 3634 + }, + { + "epoch": 1.3423190546528803, + "grad_norm": 0.22358538210391998, + "learning_rate": 0.00011058012070452026, + "loss": 0.2026, + "step": 3635 + }, + { + "epoch": 1.3426883308714919, + "grad_norm": 0.2660253643989563, + "learning_rate": 0.00011055548712895677, + "loss": 0.2482, + "step": 3636 + }, + { + "epoch": 1.3430576070901035, + "grad_norm": 0.25324636697769165, + "learning_rate": 0.00011053085355339328, + "loss": 0.2184, + "step": 3637 + }, + { + "epoch": 1.3434268833087148, + "grad_norm": 0.23177511990070343, + "learning_rate": 0.00011050621997782979, + "loss": 0.2054, + "step": 3638 + }, + { + "epoch": 1.3437961595273265, + "grad_norm": 0.25638583302497864, + "learning_rate": 0.00011048158640226629, + "loss": 0.2324, + "step": 3639 + }, + { + "epoch": 1.3441654357459378, + "grad_norm": 0.28212302923202515, + "learning_rate": 0.0001104569528267028, + "loss": 0.2129, + "step": 3640 + }, + { + "epoch": 1.3445347119645494, + "grad_norm": 0.195074662566185, + "learning_rate": 0.00011043231925113931, + "loss": 0.1846, + "step": 3641 + }, + { + "epoch": 1.344903988183161, + "grad_norm": 0.2357679009437561, + "learning_rate": 0.00011040768567557582, + "loss": 0.1935, + "step": 3642 + }, + { + "epoch": 1.3452732644017726, + "grad_norm": 0.2375728338956833, + "learning_rate": 0.00011038305210001232, + "loss": 0.1754, + "step": 3643 + }, + { + "epoch": 1.345642540620384, + "grad_norm": 0.22941121459007263, + "learning_rate": 0.00011035841852444884, + "loss": 0.2044, + "step": 3644 + }, + { + "epoch": 1.3460118168389956, + "grad_norm": 0.3009500801563263, + "learning_rate": 0.00011033378494888534, + "loss": 0.2085, + "step": 3645 + }, + { + "epoch": 1.346381093057607, + "grad_norm": 0.3025873303413391, + "learning_rate": 0.00011030915137332185, + "loss": 0.253, + "step": 3646 + }, + { + "epoch": 1.3467503692762186, + "grad_norm": 0.20187218487262726, + "learning_rate": 0.00011028451779775834, + "loss": 0.1849, + "step": 3647 + }, + { + "epoch": 1.3471196454948302, + "grad_norm": 0.2797483503818512, + "learning_rate": 0.00011025988422219487, + "loss": 0.2245, + "step": 3648 + }, + { + "epoch": 1.3474889217134416, + "grad_norm": 0.2563313841819763, + "learning_rate": 0.00011023525064663136, + "loss": 0.2197, + "step": 3649 + }, + { + "epoch": 1.3478581979320532, + "grad_norm": 0.26572248339653015, + "learning_rate": 0.00011021061707106789, + "loss": 0.2286, + "step": 3650 + }, + { + "epoch": 1.3478581979320532, + "eval_loss": 8.558237075805664, + "eval_runtime": 6.9119, + "eval_samples_per_second": 7.234, + "eval_steps_per_second": 1.013, + "step": 3650 + }, + { + "epoch": 1.3482274741506646, + "grad_norm": 0.2977292835712433, + "learning_rate": 0.00011018598349550437, + "loss": 0.2024, + "step": 3651 + }, + { + "epoch": 1.3485967503692762, + "grad_norm": 0.2794358432292938, + "learning_rate": 0.00011016134991994089, + "loss": 0.2347, + "step": 3652 + }, + { + "epoch": 1.3489660265878878, + "grad_norm": 0.23722723126411438, + "learning_rate": 0.00011013671634437739, + "loss": 0.2044, + "step": 3653 + }, + { + "epoch": 1.3493353028064994, + "grad_norm": 0.2510066628456116, + "learning_rate": 0.0001101120827688139, + "loss": 0.2121, + "step": 3654 + }, + { + "epoch": 1.3497045790251108, + "grad_norm": 0.24998068809509277, + "learning_rate": 0.0001100874491932504, + "loss": 0.2483, + "step": 3655 + }, + { + "epoch": 1.3500738552437224, + "grad_norm": 0.2867424488067627, + "learning_rate": 0.00011006281561768692, + "loss": 0.2521, + "step": 3656 + }, + { + "epoch": 1.3504431314623337, + "grad_norm": 0.30556896328926086, + "learning_rate": 0.00011003818204212342, + "loss": 0.2422, + "step": 3657 + }, + { + "epoch": 1.3508124076809453, + "grad_norm": 0.24585922062397003, + "learning_rate": 0.00011001354846655993, + "loss": 0.2064, + "step": 3658 + }, + { + "epoch": 1.351181683899557, + "grad_norm": 0.2715083956718445, + "learning_rate": 0.00010998891489099644, + "loss": 0.2522, + "step": 3659 + }, + { + "epoch": 1.3515509601181683, + "grad_norm": 0.3475584387779236, + "learning_rate": 0.00010996428131543295, + "loss": 0.2961, + "step": 3660 + }, + { + "epoch": 1.35192023633678, + "grad_norm": 0.2693030536174774, + "learning_rate": 0.00010993964773986945, + "loss": 0.2341, + "step": 3661 + }, + { + "epoch": 1.3522895125553913, + "grad_norm": 0.28587526082992554, + "learning_rate": 0.00010991501416430597, + "loss": 0.2077, + "step": 3662 + }, + { + "epoch": 1.352658788774003, + "grad_norm": 0.253993421792984, + "learning_rate": 0.00010989038058874245, + "loss": 0.2, + "step": 3663 + }, + { + "epoch": 1.3530280649926145, + "grad_norm": 0.2769867181777954, + "learning_rate": 0.00010986574701317898, + "loss": 0.2539, + "step": 3664 + }, + { + "epoch": 1.3533973412112261, + "grad_norm": 0.27828729152679443, + "learning_rate": 0.00010984111343761547, + "loss": 0.2236, + "step": 3665 + }, + { + "epoch": 1.3537666174298375, + "grad_norm": 0.2315187007188797, + "learning_rate": 0.000109816479862052, + "loss": 0.2077, + "step": 3666 + }, + { + "epoch": 1.354135893648449, + "grad_norm": 0.2761039137840271, + "learning_rate": 0.00010979184628648848, + "loss": 0.2387, + "step": 3667 + }, + { + "epoch": 1.3545051698670605, + "grad_norm": 0.25750407576560974, + "learning_rate": 0.000109767212710925, + "loss": 0.2289, + "step": 3668 + }, + { + "epoch": 1.354874446085672, + "grad_norm": 0.27624422311782837, + "learning_rate": 0.0001097425791353615, + "loss": 0.2393, + "step": 3669 + }, + { + "epoch": 1.3552437223042837, + "grad_norm": 0.2541770040988922, + "learning_rate": 0.00010971794555979802, + "loss": 0.2142, + "step": 3670 + }, + { + "epoch": 1.355612998522895, + "grad_norm": 0.24549750983715057, + "learning_rate": 0.00010969331198423452, + "loss": 0.2191, + "step": 3671 + }, + { + "epoch": 1.3559822747415067, + "grad_norm": 0.2809002995491028, + "learning_rate": 0.00010966867840867103, + "loss": 0.211, + "step": 3672 + }, + { + "epoch": 1.356351550960118, + "grad_norm": 0.2596236765384674, + "learning_rate": 0.00010964404483310753, + "loss": 0.2316, + "step": 3673 + }, + { + "epoch": 1.3567208271787297, + "grad_norm": 0.2927226126194, + "learning_rate": 0.00010961941125754405, + "loss": 0.2036, + "step": 3674 + }, + { + "epoch": 1.3570901033973413, + "grad_norm": 0.27640584111213684, + "learning_rate": 0.00010959477768198055, + "loss": 0.2626, + "step": 3675 + }, + { + "epoch": 1.3574593796159529, + "grad_norm": 0.31869152188301086, + "learning_rate": 0.00010957014410641706, + "loss": 0.2695, + "step": 3676 + }, + { + "epoch": 1.3578286558345642, + "grad_norm": 0.28587234020233154, + "learning_rate": 0.00010954551053085356, + "loss": 0.2175, + "step": 3677 + }, + { + "epoch": 1.3581979320531758, + "grad_norm": 0.23689669370651245, + "learning_rate": 0.00010952087695529008, + "loss": 0.2434, + "step": 3678 + }, + { + "epoch": 1.3585672082717872, + "grad_norm": 0.24472549557685852, + "learning_rate": 0.00010949624337972657, + "loss": 0.2154, + "step": 3679 + }, + { + "epoch": 1.3589364844903988, + "grad_norm": 0.2610950171947479, + "learning_rate": 0.0001094716098041631, + "loss": 0.2349, + "step": 3680 + }, + { + "epoch": 1.3593057607090104, + "grad_norm": 0.2528665065765381, + "learning_rate": 0.00010944697622859958, + "loss": 0.1917, + "step": 3681 + }, + { + "epoch": 1.3596750369276218, + "grad_norm": 0.24010154604911804, + "learning_rate": 0.00010942234265303611, + "loss": 0.1861, + "step": 3682 + }, + { + "epoch": 1.3600443131462334, + "grad_norm": 0.2947288155555725, + "learning_rate": 0.0001093977090774726, + "loss": 0.2759, + "step": 3683 + }, + { + "epoch": 1.3604135893648448, + "grad_norm": 0.265164315700531, + "learning_rate": 0.00010937307550190911, + "loss": 0.2458, + "step": 3684 + }, + { + "epoch": 1.3607828655834564, + "grad_norm": 0.26043519377708435, + "learning_rate": 0.00010934844192634561, + "loss": 0.2708, + "step": 3685 + }, + { + "epoch": 1.361152141802068, + "grad_norm": 0.2578504681587219, + "learning_rate": 0.00010932380835078213, + "loss": 0.2139, + "step": 3686 + }, + { + "epoch": 1.3615214180206794, + "grad_norm": 0.25657927989959717, + "learning_rate": 0.00010929917477521863, + "loss": 0.2468, + "step": 3687 + }, + { + "epoch": 1.361890694239291, + "grad_norm": 0.20949628949165344, + "learning_rate": 0.00010927454119965514, + "loss": 0.1814, + "step": 3688 + }, + { + "epoch": 1.3622599704579026, + "grad_norm": 0.22606565058231354, + "learning_rate": 0.00010924990762409164, + "loss": 0.206, + "step": 3689 + }, + { + "epoch": 1.362629246676514, + "grad_norm": 0.31376326084136963, + "learning_rate": 0.00010922527404852816, + "loss": 0.2578, + "step": 3690 + }, + { + "epoch": 1.3629985228951256, + "grad_norm": 0.2469448447227478, + "learning_rate": 0.00010920064047296466, + "loss": 0.1969, + "step": 3691 + }, + { + "epoch": 1.3633677991137372, + "grad_norm": 0.29679930210113525, + "learning_rate": 0.00010917600689740117, + "loss": 0.2421, + "step": 3692 + }, + { + "epoch": 1.3637370753323486, + "grad_norm": 0.24599914252758026, + "learning_rate": 0.00010915137332183766, + "loss": 0.2246, + "step": 3693 + }, + { + "epoch": 1.3641063515509602, + "grad_norm": 0.21572944521903992, + "learning_rate": 0.00010912673974627419, + "loss": 0.1627, + "step": 3694 + }, + { + "epoch": 1.3644756277695715, + "grad_norm": 0.2573620080947876, + "learning_rate": 0.00010910210617071068, + "loss": 0.2337, + "step": 3695 + }, + { + "epoch": 1.3648449039881831, + "grad_norm": 0.27278414368629456, + "learning_rate": 0.0001090774725951472, + "loss": 0.2364, + "step": 3696 + }, + { + "epoch": 1.3652141802067947, + "grad_norm": 0.22569341957569122, + "learning_rate": 0.0001090528390195837, + "loss": 0.2134, + "step": 3697 + }, + { + "epoch": 1.3655834564254061, + "grad_norm": 0.2763185501098633, + "learning_rate": 0.00010902820544402021, + "loss": 0.2655, + "step": 3698 + }, + { + "epoch": 1.3659527326440177, + "grad_norm": 0.27340152859687805, + "learning_rate": 0.00010900357186845671, + "loss": 0.2494, + "step": 3699 + }, + { + "epoch": 1.3663220088626291, + "grad_norm": 0.291229784488678, + "learning_rate": 0.00010897893829289322, + "loss": 0.2696, + "step": 3700 + }, + { + "epoch": 1.3663220088626291, + "eval_loss": 8.587318420410156, + "eval_runtime": 6.9122, + "eval_samples_per_second": 7.234, + "eval_steps_per_second": 1.013, + "step": 3700 + }, + { + "epoch": 1.3666912850812407, + "grad_norm": 0.25626906752586365, + "learning_rate": 0.00010895430471732973, + "loss": 0.2699, + "step": 3701 + }, + { + "epoch": 1.3670605612998523, + "grad_norm": 0.2999981939792633, + "learning_rate": 0.00010892967114176624, + "loss": 0.2449, + "step": 3702 + }, + { + "epoch": 1.367429837518464, + "grad_norm": 0.2927432358264923, + "learning_rate": 0.00010890503756620274, + "loss": 0.2436, + "step": 3703 + }, + { + "epoch": 1.3677991137370753, + "grad_norm": 0.24510280787944794, + "learning_rate": 0.00010888040399063926, + "loss": 0.2338, + "step": 3704 + }, + { + "epoch": 1.368168389955687, + "grad_norm": 0.29051750898361206, + "learning_rate": 0.00010885577041507576, + "loss": 0.2472, + "step": 3705 + }, + { + "epoch": 1.3685376661742983, + "grad_norm": 0.2787736654281616, + "learning_rate": 0.00010883113683951227, + "loss": 0.2416, + "step": 3706 + }, + { + "epoch": 1.3689069423929099, + "grad_norm": 0.24734459817409515, + "learning_rate": 0.00010880650326394877, + "loss": 0.2133, + "step": 3707 + }, + { + "epoch": 1.3692762186115215, + "grad_norm": 0.2410026490688324, + "learning_rate": 0.00010878186968838529, + "loss": 0.2145, + "step": 3708 + }, + { + "epoch": 1.3696454948301329, + "grad_norm": 0.2571277320384979, + "learning_rate": 0.00010875723611282177, + "loss": 0.2193, + "step": 3709 + }, + { + "epoch": 1.3700147710487445, + "grad_norm": 0.24685828387737274, + "learning_rate": 0.0001087326025372583, + "loss": 0.206, + "step": 3710 + }, + { + "epoch": 1.3703840472673559, + "grad_norm": 0.37469449639320374, + "learning_rate": 0.00010870796896169479, + "loss": 0.2297, + "step": 3711 + }, + { + "epoch": 1.3707533234859675, + "grad_norm": 0.3278387188911438, + "learning_rate": 0.00010868333538613132, + "loss": 0.244, + "step": 3712 + }, + { + "epoch": 1.371122599704579, + "grad_norm": 0.23818838596343994, + "learning_rate": 0.0001086587018105678, + "loss": 0.2255, + "step": 3713 + }, + { + "epoch": 1.3714918759231907, + "grad_norm": 0.299718976020813, + "learning_rate": 0.00010863406823500432, + "loss": 0.2637, + "step": 3714 + }, + { + "epoch": 1.371861152141802, + "grad_norm": 0.2648521661758423, + "learning_rate": 0.00010860943465944082, + "loss": 0.2257, + "step": 3715 + }, + { + "epoch": 1.3722304283604136, + "grad_norm": 0.2553204596042633, + "learning_rate": 0.00010858480108387734, + "loss": 0.23, + "step": 3716 + }, + { + "epoch": 1.372599704579025, + "grad_norm": 0.3448072373867035, + "learning_rate": 0.00010856016750831384, + "loss": 0.3174, + "step": 3717 + }, + { + "epoch": 1.3729689807976366, + "grad_norm": 0.31398898363113403, + "learning_rate": 0.00010853553393275034, + "loss": 0.2662, + "step": 3718 + }, + { + "epoch": 1.3733382570162482, + "grad_norm": 0.29658469557762146, + "learning_rate": 0.00010851090035718685, + "loss": 0.2545, + "step": 3719 + }, + { + "epoch": 1.3737075332348596, + "grad_norm": 0.2800571918487549, + "learning_rate": 0.00010848626678162334, + "loss": 0.2665, + "step": 3720 + }, + { + "epoch": 1.3740768094534712, + "grad_norm": 0.246707484126091, + "learning_rate": 0.00010846163320605987, + "loss": 0.2163, + "step": 3721 + }, + { + "epoch": 1.3744460856720826, + "grad_norm": 0.20510061085224152, + "learning_rate": 0.00010843699963049636, + "loss": 0.1704, + "step": 3722 + }, + { + "epoch": 1.3748153618906942, + "grad_norm": 0.2519279420375824, + "learning_rate": 0.00010841236605493288, + "loss": 0.2663, + "step": 3723 + }, + { + "epoch": 1.3751846381093058, + "grad_norm": 0.21594831347465515, + "learning_rate": 0.00010838773247936937, + "loss": 0.2036, + "step": 3724 + }, + { + "epoch": 1.3755539143279174, + "grad_norm": 0.24677294492721558, + "learning_rate": 0.00010836309890380589, + "loss": 0.2114, + "step": 3725 + }, + { + "epoch": 1.3759231905465288, + "grad_norm": 0.240212544798851, + "learning_rate": 0.00010833846532824239, + "loss": 0.2075, + "step": 3726 + }, + { + "epoch": 1.3762924667651404, + "grad_norm": 0.2338992804288864, + "learning_rate": 0.0001083138317526789, + "loss": 0.2115, + "step": 3727 + }, + { + "epoch": 1.3766617429837518, + "grad_norm": 0.23598712682724, + "learning_rate": 0.0001082891981771154, + "loss": 0.1944, + "step": 3728 + }, + { + "epoch": 1.3770310192023634, + "grad_norm": 0.2576069235801697, + "learning_rate": 0.00010826456460155192, + "loss": 0.2128, + "step": 3729 + }, + { + "epoch": 1.377400295420975, + "grad_norm": 0.32071012258529663, + "learning_rate": 0.00010823993102598842, + "loss": 0.2905, + "step": 3730 + }, + { + "epoch": 1.3777695716395864, + "grad_norm": 0.23212775588035583, + "learning_rate": 0.00010821529745042493, + "loss": 0.2095, + "step": 3731 + }, + { + "epoch": 1.378138847858198, + "grad_norm": 0.2688630521297455, + "learning_rate": 0.00010819066387486143, + "loss": 0.2442, + "step": 3732 + }, + { + "epoch": 1.3785081240768093, + "grad_norm": 0.2218068540096283, + "learning_rate": 0.00010816603029929795, + "loss": 0.224, + "step": 3733 + }, + { + "epoch": 1.378877400295421, + "grad_norm": 0.31034815311431885, + "learning_rate": 0.00010814139672373445, + "loss": 0.2481, + "step": 3734 + }, + { + "epoch": 1.3792466765140325, + "grad_norm": 0.2643583118915558, + "learning_rate": 0.00010811676314817097, + "loss": 0.2065, + "step": 3735 + }, + { + "epoch": 1.3796159527326441, + "grad_norm": 0.3374829888343811, + "learning_rate": 0.00010809212957260745, + "loss": 0.2613, + "step": 3736 + }, + { + "epoch": 1.3799852289512555, + "grad_norm": 0.2745453715324402, + "learning_rate": 0.00010806749599704398, + "loss": 0.246, + "step": 3737 + }, + { + "epoch": 1.3803545051698671, + "grad_norm": 0.3025694787502289, + "learning_rate": 0.00010804286242148047, + "loss": 0.2332, + "step": 3738 + }, + { + "epoch": 1.3807237813884785, + "grad_norm": 0.28260600566864014, + "learning_rate": 0.000108018228845917, + "loss": 0.2392, + "step": 3739 + }, + { + "epoch": 1.3810930576070901, + "grad_norm": 0.26889243721961975, + "learning_rate": 0.00010799359527035348, + "loss": 0.2345, + "step": 3740 + }, + { + "epoch": 1.3814623338257017, + "grad_norm": 0.24898390471935272, + "learning_rate": 0.00010796896169479, + "loss": 0.1907, + "step": 3741 + }, + { + "epoch": 1.381831610044313, + "grad_norm": 0.23814311623573303, + "learning_rate": 0.0001079443281192265, + "loss": 0.1867, + "step": 3742 + }, + { + "epoch": 1.3822008862629247, + "grad_norm": 0.2221551537513733, + "learning_rate": 0.00010791969454366301, + "loss": 0.2047, + "step": 3743 + }, + { + "epoch": 1.382570162481536, + "grad_norm": 0.2880096137523651, + "learning_rate": 0.00010789506096809952, + "loss": 0.2326, + "step": 3744 + }, + { + "epoch": 1.3829394387001477, + "grad_norm": 0.26900210976600647, + "learning_rate": 0.00010787042739253603, + "loss": 0.2622, + "step": 3745 + }, + { + "epoch": 1.3833087149187593, + "grad_norm": 0.28007206320762634, + "learning_rate": 0.00010784579381697253, + "loss": 0.2594, + "step": 3746 + }, + { + "epoch": 1.3836779911373709, + "grad_norm": 0.23602940142154694, + "learning_rate": 0.00010782116024140905, + "loss": 0.1992, + "step": 3747 + }, + { + "epoch": 1.3840472673559823, + "grad_norm": 0.2357938438653946, + "learning_rate": 0.00010779652666584555, + "loss": 0.1932, + "step": 3748 + }, + { + "epoch": 1.3844165435745939, + "grad_norm": 0.23136715590953827, + "learning_rate": 0.00010777189309028206, + "loss": 0.2392, + "step": 3749 + }, + { + "epoch": 1.3847858197932053, + "grad_norm": 0.3321192264556885, + "learning_rate": 0.00010774725951471856, + "loss": 0.2679, + "step": 3750 + }, + { + "epoch": 1.3847858197932053, + "eval_loss": 8.7389554977417, + "eval_runtime": 6.9211, + "eval_samples_per_second": 7.224, + "eval_steps_per_second": 1.011, + "step": 3750 + }, + { + "epoch": 1.3851550960118169, + "grad_norm": 0.2925759553909302, + "learning_rate": 0.00010772262593915508, + "loss": 0.2392, + "step": 3751 + }, + { + "epoch": 1.3855243722304285, + "grad_norm": 0.24933800101280212, + "learning_rate": 0.00010769799236359157, + "loss": 0.2153, + "step": 3752 + }, + { + "epoch": 1.3858936484490398, + "grad_norm": 0.23895122110843658, + "learning_rate": 0.0001076733587880281, + "loss": 0.2268, + "step": 3753 + }, + { + "epoch": 1.3862629246676514, + "grad_norm": 0.2631763517856598, + "learning_rate": 0.00010764872521246458, + "loss": 0.2105, + "step": 3754 + }, + { + "epoch": 1.3866322008862628, + "grad_norm": 0.2636767327785492, + "learning_rate": 0.00010762409163690111, + "loss": 0.2065, + "step": 3755 + }, + { + "epoch": 1.3870014771048744, + "grad_norm": 0.26057958602905273, + "learning_rate": 0.0001075994580613376, + "loss": 0.2314, + "step": 3756 + }, + { + "epoch": 1.387370753323486, + "grad_norm": 0.321280837059021, + "learning_rate": 0.00010757482448577411, + "loss": 0.2401, + "step": 3757 + }, + { + "epoch": 1.3877400295420974, + "grad_norm": 0.24892683327198029, + "learning_rate": 0.00010755019091021061, + "loss": 0.2373, + "step": 3758 + }, + { + "epoch": 1.388109305760709, + "grad_norm": 0.27814623713493347, + "learning_rate": 0.00010752555733464713, + "loss": 0.2824, + "step": 3759 + }, + { + "epoch": 1.3884785819793206, + "grad_norm": 0.23491588234901428, + "learning_rate": 0.00010750092375908363, + "loss": 0.2318, + "step": 3760 + }, + { + "epoch": 1.388847858197932, + "grad_norm": 0.2898375988006592, + "learning_rate": 0.00010747629018352014, + "loss": 0.3019, + "step": 3761 + }, + { + "epoch": 1.3892171344165436, + "grad_norm": 0.246374249458313, + "learning_rate": 0.00010745165660795664, + "loss": 0.2009, + "step": 3762 + }, + { + "epoch": 1.3895864106351552, + "grad_norm": 0.23861859738826752, + "learning_rate": 0.00010742702303239316, + "loss": 0.229, + "step": 3763 + }, + { + "epoch": 1.3899556868537666, + "grad_norm": 0.2956044673919678, + "learning_rate": 0.00010740238945682966, + "loss": 0.2499, + "step": 3764 + }, + { + "epoch": 1.3903249630723782, + "grad_norm": 0.24681687355041504, + "learning_rate": 0.00010737775588126617, + "loss": 0.2726, + "step": 3765 + }, + { + "epoch": 1.3906942392909896, + "grad_norm": 0.2876182496547699, + "learning_rate": 0.00010735312230570268, + "loss": 0.2893, + "step": 3766 + }, + { + "epoch": 1.3910635155096012, + "grad_norm": 0.23373618721961975, + "learning_rate": 0.00010732848873013919, + "loss": 0.2189, + "step": 3767 + }, + { + "epoch": 1.3914327917282128, + "grad_norm": 0.2955532371997833, + "learning_rate": 0.00010730385515457568, + "loss": 0.2562, + "step": 3768 + }, + { + "epoch": 1.3918020679468242, + "grad_norm": 0.24331963062286377, + "learning_rate": 0.0001072792215790122, + "loss": 0.2448, + "step": 3769 + }, + { + "epoch": 1.3921713441654358, + "grad_norm": 0.2632501423358917, + "learning_rate": 0.00010725458800344869, + "loss": 0.2559, + "step": 3770 + }, + { + "epoch": 1.3925406203840471, + "grad_norm": 0.223826602101326, + "learning_rate": 0.00010722995442788522, + "loss": 0.218, + "step": 3771 + }, + { + "epoch": 1.3929098966026587, + "grad_norm": 0.26094454526901245, + "learning_rate": 0.00010720532085232171, + "loss": 0.2421, + "step": 3772 + }, + { + "epoch": 1.3932791728212703, + "grad_norm": 0.2748807668685913, + "learning_rate": 0.00010718068727675822, + "loss": 0.2239, + "step": 3773 + }, + { + "epoch": 1.393648449039882, + "grad_norm": 0.22410407662391663, + "learning_rate": 0.00010715605370119472, + "loss": 0.201, + "step": 3774 + }, + { + "epoch": 1.3940177252584933, + "grad_norm": 0.34046974778175354, + "learning_rate": 0.00010713142012563124, + "loss": 0.3283, + "step": 3775 + }, + { + "epoch": 1.394387001477105, + "grad_norm": 0.241068497300148, + "learning_rate": 0.00010710678655006774, + "loss": 0.2165, + "step": 3776 + }, + { + "epoch": 1.3947562776957163, + "grad_norm": 0.23202429711818695, + "learning_rate": 0.00010708215297450425, + "loss": 0.2444, + "step": 3777 + }, + { + "epoch": 1.395125553914328, + "grad_norm": 0.2641740143299103, + "learning_rate": 0.00010705751939894076, + "loss": 0.2224, + "step": 3778 + }, + { + "epoch": 1.3954948301329395, + "grad_norm": 0.3078364133834839, + "learning_rate": 0.00010703288582337727, + "loss": 0.2757, + "step": 3779 + }, + { + "epoch": 1.395864106351551, + "grad_norm": 0.29337382316589355, + "learning_rate": 0.00010700825224781377, + "loss": 0.2015, + "step": 3780 + }, + { + "epoch": 1.3962333825701625, + "grad_norm": 0.3006405234336853, + "learning_rate": 0.00010698361867225029, + "loss": 0.2148, + "step": 3781 + }, + { + "epoch": 1.3966026587887739, + "grad_norm": 0.27394089102745056, + "learning_rate": 0.00010695898509668679, + "loss": 0.2684, + "step": 3782 + }, + { + "epoch": 1.3969719350073855, + "grad_norm": 0.24215780198574066, + "learning_rate": 0.0001069343515211233, + "loss": 0.2427, + "step": 3783 + }, + { + "epoch": 1.397341211225997, + "grad_norm": 0.26099783182144165, + "learning_rate": 0.00010690971794555979, + "loss": 0.2115, + "step": 3784 + }, + { + "epoch": 1.3977104874446087, + "grad_norm": 0.2992009222507477, + "learning_rate": 0.00010688508436999632, + "loss": 0.2405, + "step": 3785 + }, + { + "epoch": 1.39807976366322, + "grad_norm": 0.2526547908782959, + "learning_rate": 0.0001068604507944328, + "loss": 0.2396, + "step": 3786 + }, + { + "epoch": 1.3984490398818317, + "grad_norm": 0.35082533955574036, + "learning_rate": 0.00010683581721886933, + "loss": 0.2781, + "step": 3787 + }, + { + "epoch": 1.398818316100443, + "grad_norm": 0.22801473736763, + "learning_rate": 0.00010681118364330582, + "loss": 0.2046, + "step": 3788 + }, + { + "epoch": 1.3991875923190547, + "grad_norm": 0.3150949776172638, + "learning_rate": 0.00010678655006774234, + "loss": 0.2548, + "step": 3789 + }, + { + "epoch": 1.3995568685376663, + "grad_norm": 0.25640758872032166, + "learning_rate": 0.00010676191649217884, + "loss": 0.2228, + "step": 3790 + }, + { + "epoch": 1.3999261447562776, + "grad_norm": 0.2546674609184265, + "learning_rate": 0.00010673728291661535, + "loss": 0.2067, + "step": 3791 + }, + { + "epoch": 1.4002954209748892, + "grad_norm": 0.28234007954597473, + "learning_rate": 0.00010671264934105185, + "loss": 0.2574, + "step": 3792 + }, + { + "epoch": 1.4006646971935006, + "grad_norm": 0.2242916077375412, + "learning_rate": 0.00010668801576548837, + "loss": 0.2234, + "step": 3793 + }, + { + "epoch": 1.4010339734121122, + "grad_norm": 0.2561994791030884, + "learning_rate": 0.00010666338218992487, + "loss": 0.224, + "step": 3794 + }, + { + "epoch": 1.4014032496307238, + "grad_norm": 0.3414594233036041, + "learning_rate": 0.00010663874861436138, + "loss": 0.2679, + "step": 3795 + }, + { + "epoch": 1.4017725258493354, + "grad_norm": 0.28087905049324036, + "learning_rate": 0.00010661411503879788, + "loss": 0.2396, + "step": 3796 + }, + { + "epoch": 1.4021418020679468, + "grad_norm": 0.2722638249397278, + "learning_rate": 0.0001065894814632344, + "loss": 0.2149, + "step": 3797 + }, + { + "epoch": 1.4025110782865584, + "grad_norm": 0.2816917598247528, + "learning_rate": 0.0001065648478876709, + "loss": 0.2132, + "step": 3798 + }, + { + "epoch": 1.4028803545051698, + "grad_norm": 0.25728243589401245, + "learning_rate": 0.00010654021431210741, + "loss": 0.2223, + "step": 3799 + }, + { + "epoch": 1.4032496307237814, + "grad_norm": 0.28917446732521057, + "learning_rate": 0.0001065155807365439, + "loss": 0.2487, + "step": 3800 + }, + { + "epoch": 1.4032496307237814, + "eval_loss": 8.67109489440918, + "eval_runtime": 6.9174, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 1.012, + "step": 3800 + }, + { + "epoch": 1.403618906942393, + "grad_norm": 0.24054156243801117, + "learning_rate": 0.00010649094716098043, + "loss": 0.2168, + "step": 3801 + }, + { + "epoch": 1.4039881831610044, + "grad_norm": 0.23652881383895874, + "learning_rate": 0.00010646631358541692, + "loss": 0.2212, + "step": 3802 + }, + { + "epoch": 1.404357459379616, + "grad_norm": 0.3209688365459442, + "learning_rate": 0.00010644168000985345, + "loss": 0.2378, + "step": 3803 + }, + { + "epoch": 1.4047267355982274, + "grad_norm": 0.2922946512699127, + "learning_rate": 0.00010641704643428993, + "loss": 0.278, + "step": 3804 + }, + { + "epoch": 1.405096011816839, + "grad_norm": 0.25192025303840637, + "learning_rate": 0.00010639241285872645, + "loss": 0.2384, + "step": 3805 + }, + { + "epoch": 1.4054652880354506, + "grad_norm": 0.33324214816093445, + "learning_rate": 0.00010636777928316295, + "loss": 0.2404, + "step": 3806 + }, + { + "epoch": 1.4058345642540622, + "grad_norm": 0.2159721851348877, + "learning_rate": 0.00010634314570759946, + "loss": 0.2065, + "step": 3807 + }, + { + "epoch": 1.4062038404726735, + "grad_norm": 0.24972684681415558, + "learning_rate": 0.00010631851213203596, + "loss": 0.2421, + "step": 3808 + }, + { + "epoch": 1.4065731166912852, + "grad_norm": 0.2546807825565338, + "learning_rate": 0.00010629387855647248, + "loss": 0.2148, + "step": 3809 + }, + { + "epoch": 1.4069423929098965, + "grad_norm": 0.2856742739677429, + "learning_rate": 0.00010626924498090898, + "loss": 0.2598, + "step": 3810 + }, + { + "epoch": 1.4073116691285081, + "grad_norm": 0.20295575261116028, + "learning_rate": 0.0001062446114053455, + "loss": 0.1728, + "step": 3811 + }, + { + "epoch": 1.4076809453471197, + "grad_norm": 0.26216527819633484, + "learning_rate": 0.000106219977829782, + "loss": 0.2154, + "step": 3812 + }, + { + "epoch": 1.4080502215657311, + "grad_norm": 0.22021998465061188, + "learning_rate": 0.00010619534425421851, + "loss": 0.2094, + "step": 3813 + }, + { + "epoch": 1.4084194977843427, + "grad_norm": 0.2220689207315445, + "learning_rate": 0.00010617071067865501, + "loss": 0.2254, + "step": 3814 + }, + { + "epoch": 1.408788774002954, + "grad_norm": 0.2655491232872009, + "learning_rate": 0.00010614607710309153, + "loss": 0.2063, + "step": 3815 + }, + { + "epoch": 1.4091580502215657, + "grad_norm": 0.5775415897369385, + "learning_rate": 0.00010612144352752801, + "loss": 0.285, + "step": 3816 + }, + { + "epoch": 1.4095273264401773, + "grad_norm": 0.3346193730831146, + "learning_rate": 0.00010609680995196454, + "loss": 0.2774, + "step": 3817 + }, + { + "epoch": 1.409896602658789, + "grad_norm": 0.28976693749427795, + "learning_rate": 0.00010607217637640103, + "loss": 0.212, + "step": 3818 + }, + { + "epoch": 1.4102658788774003, + "grad_norm": 0.2844235301017761, + "learning_rate": 0.00010604754280083756, + "loss": 0.2504, + "step": 3819 + }, + { + "epoch": 1.410635155096012, + "grad_norm": 0.2223500907421112, + "learning_rate": 0.00010602290922527405, + "loss": 0.1835, + "step": 3820 + }, + { + "epoch": 1.4110044313146233, + "grad_norm": 0.2432069629430771, + "learning_rate": 0.00010599827564971056, + "loss": 0.2249, + "step": 3821 + }, + { + "epoch": 1.4113737075332349, + "grad_norm": 0.21774660050868988, + "learning_rate": 0.00010597364207414706, + "loss": 0.2118, + "step": 3822 + }, + { + "epoch": 1.4117429837518465, + "grad_norm": 0.24662066996097565, + "learning_rate": 0.00010594900849858358, + "loss": 0.22, + "step": 3823 + }, + { + "epoch": 1.4121122599704579, + "grad_norm": 0.22315095365047455, + "learning_rate": 0.00010592437492302008, + "loss": 0.1839, + "step": 3824 + }, + { + "epoch": 1.4124815361890695, + "grad_norm": 0.25096753239631653, + "learning_rate": 0.00010589974134745659, + "loss": 0.2113, + "step": 3825 + }, + { + "epoch": 1.4128508124076808, + "grad_norm": 0.24644961953163147, + "learning_rate": 0.00010587510777189309, + "loss": 0.2199, + "step": 3826 + }, + { + "epoch": 1.4132200886262924, + "grad_norm": 0.2502542734146118, + "learning_rate": 0.00010585047419632961, + "loss": 0.2251, + "step": 3827 + }, + { + "epoch": 1.413589364844904, + "grad_norm": 0.23292404413223267, + "learning_rate": 0.00010582584062076611, + "loss": 0.2238, + "step": 3828 + }, + { + "epoch": 1.4139586410635154, + "grad_norm": 0.2620003819465637, + "learning_rate": 0.00010580120704520262, + "loss": 0.1935, + "step": 3829 + }, + { + "epoch": 1.414327917282127, + "grad_norm": 0.24223050475120544, + "learning_rate": 0.00010577657346963912, + "loss": 0.2303, + "step": 3830 + }, + { + "epoch": 1.4146971935007384, + "grad_norm": 0.31552866101264954, + "learning_rate": 0.00010575193989407564, + "loss": 0.2303, + "step": 3831 + }, + { + "epoch": 1.41506646971935, + "grad_norm": 0.2255706638097763, + "learning_rate": 0.00010572730631851213, + "loss": 0.2239, + "step": 3832 + }, + { + "epoch": 1.4154357459379616, + "grad_norm": 0.23371100425720215, + "learning_rate": 0.00010570267274294865, + "loss": 0.212, + "step": 3833 + }, + { + "epoch": 1.4158050221565732, + "grad_norm": 0.3008963465690613, + "learning_rate": 0.00010567803916738514, + "loss": 0.2086, + "step": 3834 + }, + { + "epoch": 1.4161742983751846, + "grad_norm": 0.2378990650177002, + "learning_rate": 0.00010565340559182167, + "loss": 0.2276, + "step": 3835 + }, + { + "epoch": 1.4165435745937962, + "grad_norm": 0.24055680632591248, + "learning_rate": 0.00010562877201625816, + "loss": 0.1942, + "step": 3836 + }, + { + "epoch": 1.4169128508124076, + "grad_norm": 0.25177863240242004, + "learning_rate": 0.00010560413844069467, + "loss": 0.198, + "step": 3837 + }, + { + "epoch": 1.4172821270310192, + "grad_norm": 0.2898646593093872, + "learning_rate": 0.00010557950486513117, + "loss": 0.2538, + "step": 3838 + }, + { + "epoch": 1.4176514032496308, + "grad_norm": 0.2893555760383606, + "learning_rate": 0.00010555487128956769, + "loss": 0.2458, + "step": 3839 + }, + { + "epoch": 1.4180206794682422, + "grad_norm": 0.27754053473472595, + "learning_rate": 0.00010553023771400419, + "loss": 0.2469, + "step": 3840 + }, + { + "epoch": 1.4183899556868538, + "grad_norm": 0.20141923427581787, + "learning_rate": 0.0001055056041384407, + "loss": 0.1829, + "step": 3841 + }, + { + "epoch": 1.4187592319054652, + "grad_norm": 0.2939125895500183, + "learning_rate": 0.0001054809705628772, + "loss": 0.2312, + "step": 3842 + }, + { + "epoch": 1.4191285081240768, + "grad_norm": 0.2897227704524994, + "learning_rate": 0.00010545633698731372, + "loss": 0.2271, + "step": 3843 + }, + { + "epoch": 1.4194977843426884, + "grad_norm": 0.2758951485157013, + "learning_rate": 0.00010543170341175022, + "loss": 0.2364, + "step": 3844 + }, + { + "epoch": 1.4198670605613, + "grad_norm": 0.250351220369339, + "learning_rate": 0.00010540706983618674, + "loss": 0.236, + "step": 3845 + }, + { + "epoch": 1.4202363367799113, + "grad_norm": 0.22679243981838226, + "learning_rate": 0.00010538243626062322, + "loss": 0.2148, + "step": 3846 + }, + { + "epoch": 1.420605612998523, + "grad_norm": 0.23274806141853333, + "learning_rate": 0.00010535780268505975, + "loss": 0.2266, + "step": 3847 + }, + { + "epoch": 1.4209748892171343, + "grad_norm": 0.254705011844635, + "learning_rate": 0.00010533316910949624, + "loss": 0.2018, + "step": 3848 + }, + { + "epoch": 1.421344165435746, + "grad_norm": 0.32447507977485657, + "learning_rate": 0.00010530853553393277, + "loss": 0.2871, + "step": 3849 + }, + { + "epoch": 1.4217134416543575, + "grad_norm": 0.24856328964233398, + "learning_rate": 0.00010528390195836925, + "loss": 0.2191, + "step": 3850 + }, + { + "epoch": 1.4217134416543575, + "eval_loss": 8.493885040283203, + "eval_runtime": 6.9171, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 1.012, + "step": 3850 + }, + { + "epoch": 1.422082717872969, + "grad_norm": 0.236580953001976, + "learning_rate": 0.00010525926838280577, + "loss": 0.2083, + "step": 3851 + }, + { + "epoch": 1.4224519940915805, + "grad_norm": 0.273490846157074, + "learning_rate": 0.00010523463480724227, + "loss": 0.2203, + "step": 3852 + }, + { + "epoch": 1.422821270310192, + "grad_norm": 0.24787402153015137, + "learning_rate": 0.00010521000123167878, + "loss": 0.1921, + "step": 3853 + }, + { + "epoch": 1.4231905465288035, + "grad_norm": 0.3680501878261566, + "learning_rate": 0.00010518536765611529, + "loss": 0.2466, + "step": 3854 + }, + { + "epoch": 1.423559822747415, + "grad_norm": 0.29665929079055786, + "learning_rate": 0.0001051607340805518, + "loss": 0.252, + "step": 3855 + }, + { + "epoch": 1.4239290989660267, + "grad_norm": 0.18300104141235352, + "learning_rate": 0.0001051361005049883, + "loss": 0.1614, + "step": 3856 + }, + { + "epoch": 1.424298375184638, + "grad_norm": 0.27013885974884033, + "learning_rate": 0.00010511146692942482, + "loss": 0.1912, + "step": 3857 + }, + { + "epoch": 1.4246676514032497, + "grad_norm": 0.2220534086227417, + "learning_rate": 0.00010508683335386132, + "loss": 0.2067, + "step": 3858 + }, + { + "epoch": 1.425036927621861, + "grad_norm": 0.2669315040111542, + "learning_rate": 0.00010506219977829783, + "loss": 0.2232, + "step": 3859 + }, + { + "epoch": 1.4254062038404727, + "grad_norm": 0.2877553403377533, + "learning_rate": 0.00010503756620273433, + "loss": 0.2327, + "step": 3860 + }, + { + "epoch": 1.4257754800590843, + "grad_norm": 0.32978078722953796, + "learning_rate": 0.00010501293262717085, + "loss": 0.2624, + "step": 3861 + }, + { + "epoch": 1.4261447562776957, + "grad_norm": 0.2621108293533325, + "learning_rate": 0.00010498829905160734, + "loss": 0.2262, + "step": 3862 + }, + { + "epoch": 1.4265140324963073, + "grad_norm": 0.4598044455051422, + "learning_rate": 0.00010496366547604386, + "loss": 0.2697, + "step": 3863 + }, + { + "epoch": 1.4268833087149186, + "grad_norm": 0.2835172414779663, + "learning_rate": 0.00010493903190048035, + "loss": 0.2466, + "step": 3864 + }, + { + "epoch": 1.4272525849335302, + "grad_norm": 0.2745411992073059, + "learning_rate": 0.00010491439832491688, + "loss": 0.2141, + "step": 3865 + }, + { + "epoch": 1.4276218611521418, + "grad_norm": 0.2918604612350464, + "learning_rate": 0.00010488976474935337, + "loss": 0.295, + "step": 3866 + }, + { + "epoch": 1.4279911373707534, + "grad_norm": 0.22567543387413025, + "learning_rate": 0.00010486513117378988, + "loss": 0.1784, + "step": 3867 + }, + { + "epoch": 1.4283604135893648, + "grad_norm": 0.267691433429718, + "learning_rate": 0.00010484049759822638, + "loss": 0.2096, + "step": 3868 + }, + { + "epoch": 1.4287296898079764, + "grad_norm": 0.23182958364486694, + "learning_rate": 0.0001048158640226629, + "loss": 0.2085, + "step": 3869 + }, + { + "epoch": 1.4290989660265878, + "grad_norm": 0.3311103284358978, + "learning_rate": 0.0001047912304470994, + "loss": 0.2376, + "step": 3870 + }, + { + "epoch": 1.4294682422451994, + "grad_norm": 0.32816746830940247, + "learning_rate": 0.00010476659687153591, + "loss": 0.2496, + "step": 3871 + }, + { + "epoch": 1.429837518463811, + "grad_norm": 0.28248363733291626, + "learning_rate": 0.00010474196329597241, + "loss": 0.2253, + "step": 3872 + }, + { + "epoch": 1.4302067946824224, + "grad_norm": 0.22626779973506927, + "learning_rate": 0.00010471732972040893, + "loss": 0.2213, + "step": 3873 + }, + { + "epoch": 1.430576070901034, + "grad_norm": 0.23796947300434113, + "learning_rate": 0.00010469269614484543, + "loss": 0.194, + "step": 3874 + }, + { + "epoch": 1.4309453471196454, + "grad_norm": 0.31326824426651, + "learning_rate": 0.00010466806256928194, + "loss": 0.2773, + "step": 3875 + }, + { + "epoch": 1.431314623338257, + "grad_norm": 0.2317780703306198, + "learning_rate": 0.00010464342899371845, + "loss": 0.2082, + "step": 3876 + }, + { + "epoch": 1.4316838995568686, + "grad_norm": 0.23444223403930664, + "learning_rate": 0.00010461879541815496, + "loss": 0.1977, + "step": 3877 + }, + { + "epoch": 1.4320531757754802, + "grad_norm": 0.24713462591171265, + "learning_rate": 0.00010459416184259145, + "loss": 0.2306, + "step": 3878 + }, + { + "epoch": 1.4324224519940916, + "grad_norm": 0.27644142508506775, + "learning_rate": 0.00010456952826702798, + "loss": 0.2073, + "step": 3879 + }, + { + "epoch": 1.4327917282127032, + "grad_norm": 0.3052675127983093, + "learning_rate": 0.00010454489469146446, + "loss": 0.2266, + "step": 3880 + }, + { + "epoch": 1.4331610044313146, + "grad_norm": 0.2568247616291046, + "learning_rate": 0.00010452026111590099, + "loss": 0.2321, + "step": 3881 + }, + { + "epoch": 1.4335302806499262, + "grad_norm": 0.3004004657268524, + "learning_rate": 0.00010449562754033748, + "loss": 0.2229, + "step": 3882 + }, + { + "epoch": 1.4338995568685378, + "grad_norm": 0.22793149948120117, + "learning_rate": 0.000104470993964774, + "loss": 0.1976, + "step": 3883 + }, + { + "epoch": 1.4342688330871491, + "grad_norm": 0.32156893610954285, + "learning_rate": 0.0001044463603892105, + "loss": 0.2338, + "step": 3884 + }, + { + "epoch": 1.4346381093057607, + "grad_norm": 0.2936916947364807, + "learning_rate": 0.00010442172681364701, + "loss": 0.2495, + "step": 3885 + }, + { + "epoch": 1.4350073855243721, + "grad_norm": 0.27445074915885925, + "learning_rate": 0.00010439709323808351, + "loss": 0.2352, + "step": 3886 + }, + { + "epoch": 1.4353766617429837, + "grad_norm": 0.22757753729820251, + "learning_rate": 0.00010437245966252003, + "loss": 0.2224, + "step": 3887 + }, + { + "epoch": 1.4357459379615953, + "grad_norm": 0.2889946401119232, + "learning_rate": 0.00010434782608695653, + "loss": 0.2429, + "step": 3888 + }, + { + "epoch": 1.4361152141802067, + "grad_norm": 0.30941131711006165, + "learning_rate": 0.00010432319251139304, + "loss": 0.2567, + "step": 3889 + }, + { + "epoch": 1.4364844903988183, + "grad_norm": 0.2866005599498749, + "learning_rate": 0.00010429855893582954, + "loss": 0.2605, + "step": 3890 + }, + { + "epoch": 1.43685376661743, + "grad_norm": 0.3011987805366516, + "learning_rate": 0.00010427392536026606, + "loss": 0.2715, + "step": 3891 + }, + { + "epoch": 1.4372230428360413, + "grad_norm": 0.2503297030925751, + "learning_rate": 0.00010424929178470256, + "loss": 0.2456, + "step": 3892 + }, + { + "epoch": 1.437592319054653, + "grad_norm": 0.24963368475437164, + "learning_rate": 0.00010422465820913907, + "loss": 0.2192, + "step": 3893 + }, + { + "epoch": 1.4379615952732645, + "grad_norm": 0.26045989990234375, + "learning_rate": 0.00010420002463357556, + "loss": 0.2787, + "step": 3894 + }, + { + "epoch": 1.4383308714918759, + "grad_norm": 0.29313600063323975, + "learning_rate": 0.00010417539105801209, + "loss": 0.1916, + "step": 3895 + }, + { + "epoch": 1.4387001477104875, + "grad_norm": 0.23288677632808685, + "learning_rate": 0.00010415075748244858, + "loss": 0.2194, + "step": 3896 + }, + { + "epoch": 1.4390694239290989, + "grad_norm": 0.23005591332912445, + "learning_rate": 0.0001041261239068851, + "loss": 0.2225, + "step": 3897 + }, + { + "epoch": 1.4394387001477105, + "grad_norm": 0.22140176594257355, + "learning_rate": 0.00010410149033132159, + "loss": 0.1911, + "step": 3898 + }, + { + "epoch": 1.439807976366322, + "grad_norm": 0.28162136673927307, + "learning_rate": 0.0001040768567557581, + "loss": 0.2119, + "step": 3899 + }, + { + "epoch": 1.4401772525849335, + "grad_norm": 0.28900739550590515, + "learning_rate": 0.00010405222318019461, + "loss": 0.2304, + "step": 3900 + }, + { + "epoch": 1.4401772525849335, + "eval_loss": 8.565169334411621, + "eval_runtime": 6.9146, + "eval_samples_per_second": 7.231, + "eval_steps_per_second": 1.012, + "step": 3900 + }, + { + "epoch": 1.440546528803545, + "grad_norm": 0.260626882314682, + "learning_rate": 0.00010402758960463112, + "loss": 0.2211, + "step": 3901 + }, + { + "epoch": 1.4409158050221564, + "grad_norm": 0.24782182276248932, + "learning_rate": 0.00010400295602906762, + "loss": 0.2217, + "step": 3902 + }, + { + "epoch": 1.441285081240768, + "grad_norm": 0.296936571598053, + "learning_rate": 0.00010397832245350414, + "loss": 0.2221, + "step": 3903 + }, + { + "epoch": 1.4416543574593796, + "grad_norm": 0.21856023371219635, + "learning_rate": 0.00010395368887794064, + "loss": 0.1909, + "step": 3904 + }, + { + "epoch": 1.4420236336779912, + "grad_norm": 0.2765868604183197, + "learning_rate": 0.00010392905530237715, + "loss": 0.2121, + "step": 3905 + }, + { + "epoch": 1.4423929098966026, + "grad_norm": 0.2787695527076721, + "learning_rate": 0.00010390442172681365, + "loss": 0.2339, + "step": 3906 + }, + { + "epoch": 1.4427621861152142, + "grad_norm": 0.262008935213089, + "learning_rate": 0.00010387978815125017, + "loss": 0.2195, + "step": 3907 + }, + { + "epoch": 1.4431314623338256, + "grad_norm": 0.29457738995552063, + "learning_rate": 0.00010385515457568667, + "loss": 0.267, + "step": 3908 + }, + { + "epoch": 1.4435007385524372, + "grad_norm": 0.2304389774799347, + "learning_rate": 0.00010383052100012318, + "loss": 0.1811, + "step": 3909 + }, + { + "epoch": 1.4438700147710488, + "grad_norm": 0.2589552700519562, + "learning_rate": 0.00010380588742455967, + "loss": 0.2211, + "step": 3910 + }, + { + "epoch": 1.4442392909896602, + "grad_norm": 0.2457135170698166, + "learning_rate": 0.0001037812538489962, + "loss": 0.2188, + "step": 3911 + }, + { + "epoch": 1.4446085672082718, + "grad_norm": 0.27453845739364624, + "learning_rate": 0.00010375662027343269, + "loss": 0.2484, + "step": 3912 + }, + { + "epoch": 1.4449778434268832, + "grad_norm": 0.2856067717075348, + "learning_rate": 0.00010373198669786922, + "loss": 0.2469, + "step": 3913 + }, + { + "epoch": 1.4453471196454948, + "grad_norm": 0.2643541991710663, + "learning_rate": 0.0001037073531223057, + "loss": 0.193, + "step": 3914 + }, + { + "epoch": 1.4457163958641064, + "grad_norm": 0.2563317120075226, + "learning_rate": 0.00010368271954674222, + "loss": 0.2205, + "step": 3915 + }, + { + "epoch": 1.446085672082718, + "grad_norm": 0.30884408950805664, + "learning_rate": 0.00010365808597117872, + "loss": 0.2501, + "step": 3916 + }, + { + "epoch": 1.4464549483013294, + "grad_norm": 0.31442710757255554, + "learning_rate": 0.00010363345239561523, + "loss": 0.2275, + "step": 3917 + }, + { + "epoch": 1.446824224519941, + "grad_norm": 0.2886759042739868, + "learning_rate": 0.00010360881882005173, + "loss": 0.2512, + "step": 3918 + }, + { + "epoch": 1.4471935007385524, + "grad_norm": 0.24880017340183258, + "learning_rate": 0.00010358418524448825, + "loss": 0.2001, + "step": 3919 + }, + { + "epoch": 1.447562776957164, + "grad_norm": 0.2707426846027374, + "learning_rate": 0.00010355955166892475, + "loss": 0.2026, + "step": 3920 + }, + { + "epoch": 1.4479320531757756, + "grad_norm": 0.28365567326545715, + "learning_rate": 0.00010353491809336127, + "loss": 0.2373, + "step": 3921 + }, + { + "epoch": 1.448301329394387, + "grad_norm": 0.26209262013435364, + "learning_rate": 0.00010351028451779777, + "loss": 0.2511, + "step": 3922 + }, + { + "epoch": 1.4486706056129985, + "grad_norm": 0.24650391936302185, + "learning_rate": 0.00010348565094223428, + "loss": 0.2231, + "step": 3923 + }, + { + "epoch": 1.44903988183161, + "grad_norm": 0.2551679313182831, + "learning_rate": 0.00010346101736667078, + "loss": 0.2173, + "step": 3924 + }, + { + "epoch": 1.4494091580502215, + "grad_norm": 0.24441155791282654, + "learning_rate": 0.0001034363837911073, + "loss": 0.2081, + "step": 3925 + }, + { + "epoch": 1.4497784342688331, + "grad_norm": 0.2704668641090393, + "learning_rate": 0.00010341175021554378, + "loss": 0.2757, + "step": 3926 + }, + { + "epoch": 1.4501477104874447, + "grad_norm": 0.32258719205856323, + "learning_rate": 0.00010338711663998031, + "loss": 0.2472, + "step": 3927 + }, + { + "epoch": 1.450516986706056, + "grad_norm": 0.23374660313129425, + "learning_rate": 0.0001033624830644168, + "loss": 0.199, + "step": 3928 + }, + { + "epoch": 1.4508862629246677, + "grad_norm": 0.2942441403865814, + "learning_rate": 0.00010333784948885333, + "loss": 0.2496, + "step": 3929 + }, + { + "epoch": 1.451255539143279, + "grad_norm": 0.27912795543670654, + "learning_rate": 0.00010331321591328982, + "loss": 0.2334, + "step": 3930 + }, + { + "epoch": 1.4516248153618907, + "grad_norm": 0.28949761390686035, + "learning_rate": 0.00010328858233772633, + "loss": 0.2161, + "step": 3931 + }, + { + "epoch": 1.4519940915805023, + "grad_norm": 0.234833762049675, + "learning_rate": 0.00010326394876216283, + "loss": 0.2047, + "step": 3932 + }, + { + "epoch": 1.4523633677991137, + "grad_norm": 0.29237592220306396, + "learning_rate": 0.00010323931518659935, + "loss": 0.2201, + "step": 3933 + }, + { + "epoch": 1.4527326440177253, + "grad_norm": 0.25121167302131653, + "learning_rate": 0.00010321468161103585, + "loss": 0.2253, + "step": 3934 + }, + { + "epoch": 1.4531019202363367, + "grad_norm": 0.2713802456855774, + "learning_rate": 0.00010319004803547236, + "loss": 0.216, + "step": 3935 + }, + { + "epoch": 1.4534711964549483, + "grad_norm": 0.25745826959609985, + "learning_rate": 0.00010316541445990886, + "loss": 0.2068, + "step": 3936 + }, + { + "epoch": 1.4538404726735599, + "grad_norm": 0.30310261249542236, + "learning_rate": 0.00010314078088434538, + "loss": 0.2456, + "step": 3937 + }, + { + "epoch": 1.4542097488921715, + "grad_norm": 0.31646987795829773, + "learning_rate": 0.00010311614730878188, + "loss": 0.2401, + "step": 3938 + }, + { + "epoch": 1.4545790251107829, + "grad_norm": 0.21331490576267242, + "learning_rate": 0.0001030915137332184, + "loss": 0.1731, + "step": 3939 + }, + { + "epoch": 1.4549483013293945, + "grad_norm": 0.313210129737854, + "learning_rate": 0.0001030668801576549, + "loss": 0.2505, + "step": 3940 + }, + { + "epoch": 1.4553175775480058, + "grad_norm": 0.29993730783462524, + "learning_rate": 0.00010304224658209141, + "loss": 0.2837, + "step": 3941 + }, + { + "epoch": 1.4556868537666174, + "grad_norm": 0.28829681873321533, + "learning_rate": 0.0001030176130065279, + "loss": 0.2511, + "step": 3942 + }, + { + "epoch": 1.456056129985229, + "grad_norm": 0.3343363106250763, + "learning_rate": 0.00010299297943096442, + "loss": 0.3171, + "step": 3943 + }, + { + "epoch": 1.4564254062038404, + "grad_norm": 0.2859583795070648, + "learning_rate": 0.00010296834585540091, + "loss": 0.2066, + "step": 3944 + }, + { + "epoch": 1.456794682422452, + "grad_norm": 0.22887571156024933, + "learning_rate": 0.00010294371227983744, + "loss": 0.1812, + "step": 3945 + }, + { + "epoch": 1.4571639586410634, + "grad_norm": 0.29305964708328247, + "learning_rate": 0.00010291907870427393, + "loss": 0.2145, + "step": 3946 + }, + { + "epoch": 1.457533234859675, + "grad_norm": 0.24731211364269257, + "learning_rate": 0.00010289444512871044, + "loss": 0.2289, + "step": 3947 + }, + { + "epoch": 1.4579025110782866, + "grad_norm": 0.3187045156955719, + "learning_rate": 0.00010286981155314694, + "loss": 0.278, + "step": 3948 + }, + { + "epoch": 1.4582717872968982, + "grad_norm": 0.26375070214271545, + "learning_rate": 0.00010284517797758344, + "loss": 0.2248, + "step": 3949 + }, + { + "epoch": 1.4586410635155096, + "grad_norm": 0.23622861504554749, + "learning_rate": 0.00010282054440201996, + "loss": 0.2196, + "step": 3950 + }, + { + "epoch": 1.4586410635155096, + "eval_loss": 8.594433784484863, + "eval_runtime": 6.9141, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 1.012, + "step": 3950 + }, + { + "epoch": 1.4590103397341212, + "grad_norm": 0.25164759159088135, + "learning_rate": 0.00010279591082645646, + "loss": 0.2426, + "step": 3951 + }, + { + "epoch": 1.4593796159527326, + "grad_norm": 0.2208353579044342, + "learning_rate": 0.00010277127725089298, + "loss": 0.1871, + "step": 3952 + }, + { + "epoch": 1.4597488921713442, + "grad_norm": 0.22995372116565704, + "learning_rate": 0.00010274664367532946, + "loss": 0.2033, + "step": 3953 + }, + { + "epoch": 1.4601181683899558, + "grad_norm": 0.24344350397586823, + "learning_rate": 0.00010272201009976599, + "loss": 0.2402, + "step": 3954 + }, + { + "epoch": 1.4604874446085672, + "grad_norm": 0.2608914077281952, + "learning_rate": 0.00010269737652420248, + "loss": 0.2366, + "step": 3955 + }, + { + "epoch": 1.4608567208271788, + "grad_norm": 0.2236889749765396, + "learning_rate": 0.000102672742948639, + "loss": 0.235, + "step": 3956 + }, + { + "epoch": 1.4612259970457901, + "grad_norm": 0.20806574821472168, + "learning_rate": 0.0001026481093730755, + "loss": 0.2009, + "step": 3957 + }, + { + "epoch": 1.4615952732644018, + "grad_norm": 0.2754175662994385, + "learning_rate": 0.00010262347579751201, + "loss": 0.2563, + "step": 3958 + }, + { + "epoch": 1.4619645494830134, + "grad_norm": 0.2548564672470093, + "learning_rate": 0.00010259884222194851, + "loss": 0.226, + "step": 3959 + }, + { + "epoch": 1.4623338257016247, + "grad_norm": 0.2693333029747009, + "learning_rate": 0.00010257420864638502, + "loss": 0.2305, + "step": 3960 + }, + { + "epoch": 1.4627031019202363, + "grad_norm": 0.24773281812667847, + "learning_rate": 0.00010254957507082153, + "loss": 0.1993, + "step": 3961 + }, + { + "epoch": 1.463072378138848, + "grad_norm": 0.23438863456249237, + "learning_rate": 0.00010252494149525804, + "loss": 0.2245, + "step": 3962 + }, + { + "epoch": 1.4634416543574593, + "grad_norm": 0.3115812838077545, + "learning_rate": 0.00010250030791969454, + "loss": 0.2542, + "step": 3963 + }, + { + "epoch": 1.463810930576071, + "grad_norm": 0.3025296926498413, + "learning_rate": 0.00010247567434413106, + "loss": 0.2084, + "step": 3964 + }, + { + "epoch": 1.4641802067946825, + "grad_norm": 0.2728077471256256, + "learning_rate": 0.00010245104076856756, + "loss": 0.2291, + "step": 3965 + }, + { + "epoch": 1.464549483013294, + "grad_norm": 0.2873958349227905, + "learning_rate": 0.00010242640719300407, + "loss": 0.2629, + "step": 3966 + }, + { + "epoch": 1.4649187592319055, + "grad_norm": 0.3071512281894684, + "learning_rate": 0.00010240177361744057, + "loss": 0.2323, + "step": 3967 + }, + { + "epoch": 1.465288035450517, + "grad_norm": 0.25157779455184937, + "learning_rate": 0.00010237714004187709, + "loss": 0.2099, + "step": 3968 + }, + { + "epoch": 1.4656573116691285, + "grad_norm": 0.301322340965271, + "learning_rate": 0.00010235250646631357, + "loss": 0.2598, + "step": 3969 + }, + { + "epoch": 1.46602658788774, + "grad_norm": 0.311323881149292, + "learning_rate": 0.0001023278728907501, + "loss": 0.281, + "step": 3970 + }, + { + "epoch": 1.4663958641063515, + "grad_norm": 0.2952243685722351, + "learning_rate": 0.00010230323931518659, + "loss": 0.2302, + "step": 3971 + }, + { + "epoch": 1.466765140324963, + "grad_norm": 0.26137974858283997, + "learning_rate": 0.00010227860573962312, + "loss": 0.2324, + "step": 3972 + }, + { + "epoch": 1.4671344165435745, + "grad_norm": 0.2205859273672104, + "learning_rate": 0.0001022539721640596, + "loss": 0.1953, + "step": 3973 + }, + { + "epoch": 1.467503692762186, + "grad_norm": 0.290164053440094, + "learning_rate": 0.00010222933858849612, + "loss": 0.2436, + "step": 3974 + }, + { + "epoch": 1.4678729689807977, + "grad_norm": 0.2907581031322479, + "learning_rate": 0.00010220470501293262, + "loss": 0.2719, + "step": 3975 + }, + { + "epoch": 1.4682422451994093, + "grad_norm": 0.2194526344537735, + "learning_rate": 0.00010218007143736914, + "loss": 0.2102, + "step": 3976 + }, + { + "epoch": 1.4686115214180206, + "grad_norm": 0.2592501938343048, + "learning_rate": 0.00010215543786180564, + "loss": 0.2171, + "step": 3977 + }, + { + "epoch": 1.4689807976366323, + "grad_norm": 0.2976078689098358, + "learning_rate": 0.00010213080428624215, + "loss": 0.2478, + "step": 3978 + }, + { + "epoch": 1.4693500738552436, + "grad_norm": 0.23973433673381805, + "learning_rate": 0.00010210617071067865, + "loss": 0.2019, + "step": 3979 + }, + { + "epoch": 1.4697193500738552, + "grad_norm": 0.2462967187166214, + "learning_rate": 0.00010208153713511517, + "loss": 0.2061, + "step": 3980 + }, + { + "epoch": 1.4700886262924668, + "grad_norm": 0.3289574086666107, + "learning_rate": 0.00010205690355955167, + "loss": 0.2402, + "step": 3981 + }, + { + "epoch": 1.4704579025110782, + "grad_norm": 0.21579718589782715, + "learning_rate": 0.00010203226998398818, + "loss": 0.2258, + "step": 3982 + }, + { + "epoch": 1.4708271787296898, + "grad_norm": 0.274550199508667, + "learning_rate": 0.00010200763640842467, + "loss": 0.2207, + "step": 3983 + }, + { + "epoch": 1.4711964549483012, + "grad_norm": 0.2437625527381897, + "learning_rate": 0.0001019830028328612, + "loss": 0.2109, + "step": 3984 + }, + { + "epoch": 1.4715657311669128, + "grad_norm": 0.24673601984977722, + "learning_rate": 0.00010195836925729769, + "loss": 0.2138, + "step": 3985 + }, + { + "epoch": 1.4719350073855244, + "grad_norm": 0.2122892290353775, + "learning_rate": 0.00010193373568173422, + "loss": 0.1682, + "step": 3986 + }, + { + "epoch": 1.472304283604136, + "grad_norm": 0.3209424912929535, + "learning_rate": 0.0001019091021061707, + "loss": 0.2426, + "step": 3987 + }, + { + "epoch": 1.4726735598227474, + "grad_norm": 0.3087669312953949, + "learning_rate": 0.00010188446853060723, + "loss": 0.2256, + "step": 3988 + }, + { + "epoch": 1.473042836041359, + "grad_norm": 0.282467246055603, + "learning_rate": 0.00010185983495504372, + "loss": 0.2578, + "step": 3989 + }, + { + "epoch": 1.4734121122599704, + "grad_norm": 0.2821834087371826, + "learning_rate": 0.00010183520137948023, + "loss": 0.2713, + "step": 3990 + }, + { + "epoch": 1.473781388478582, + "grad_norm": 0.24819496273994446, + "learning_rate": 0.00010181056780391673, + "loss": 0.2228, + "step": 3991 + }, + { + "epoch": 1.4741506646971936, + "grad_norm": 0.27728283405303955, + "learning_rate": 0.00010178593422835325, + "loss": 0.2735, + "step": 3992 + }, + { + "epoch": 1.474519940915805, + "grad_norm": 0.2658727169036865, + "learning_rate": 0.00010176130065278975, + "loss": 0.2166, + "step": 3993 + }, + { + "epoch": 1.4748892171344166, + "grad_norm": 0.236416757106781, + "learning_rate": 0.00010173666707722626, + "loss": 0.2298, + "step": 3994 + }, + { + "epoch": 1.475258493353028, + "grad_norm": 0.32338541746139526, + "learning_rate": 0.00010171203350166277, + "loss": 0.2443, + "step": 3995 + }, + { + "epoch": 1.4756277695716395, + "grad_norm": 0.30736929178237915, + "learning_rate": 0.00010168739992609928, + "loss": 0.2225, + "step": 3996 + }, + { + "epoch": 1.4759970457902511, + "grad_norm": 0.26603782176971436, + "learning_rate": 0.00010166276635053578, + "loss": 0.241, + "step": 3997 + }, + { + "epoch": 1.4763663220088628, + "grad_norm": 0.24234913289546967, + "learning_rate": 0.0001016381327749723, + "loss": 0.2346, + "step": 3998 + }, + { + "epoch": 1.4767355982274741, + "grad_norm": 0.2570359706878662, + "learning_rate": 0.00010161349919940878, + "loss": 0.1785, + "step": 3999 + }, + { + "epoch": 1.4771048744460857, + "grad_norm": 0.2983315885066986, + "learning_rate": 0.00010158886562384531, + "loss": 0.2701, + "step": 4000 + }, + { + "epoch": 1.4771048744460857, + "eval_loss": 8.63985824584961, + "eval_runtime": 6.9204, + "eval_samples_per_second": 7.225, + "eval_steps_per_second": 1.012, + "step": 4000 + }, + { + "epoch": 1.4774741506646971, + "grad_norm": 0.23354892432689667, + "learning_rate": 0.0001015642320482818, + "loss": 0.2009, + "step": 4001 + }, + { + "epoch": 1.4778434268833087, + "grad_norm": 0.30223730206489563, + "learning_rate": 0.00010153959847271833, + "loss": 0.2429, + "step": 4002 + }, + { + "epoch": 1.4782127031019203, + "grad_norm": 0.2657831311225891, + "learning_rate": 0.00010151496489715482, + "loss": 0.2541, + "step": 4003 + }, + { + "epoch": 1.4785819793205317, + "grad_norm": 0.2590239942073822, + "learning_rate": 0.00010149033132159133, + "loss": 0.2252, + "step": 4004 + }, + { + "epoch": 1.4789512555391433, + "grad_norm": 0.2795875668525696, + "learning_rate": 0.00010146569774602783, + "loss": 0.2275, + "step": 4005 + }, + { + "epoch": 1.4793205317577547, + "grad_norm": 0.2059694230556488, + "learning_rate": 0.00010144106417046435, + "loss": 0.1876, + "step": 4006 + }, + { + "epoch": 1.4796898079763663, + "grad_norm": 0.2916269600391388, + "learning_rate": 0.00010141643059490085, + "loss": 0.2123, + "step": 4007 + }, + { + "epoch": 1.480059084194978, + "grad_norm": 0.22915804386138916, + "learning_rate": 0.00010139179701933736, + "loss": 0.2097, + "step": 4008 + }, + { + "epoch": 1.4804283604135895, + "grad_norm": 0.2544005811214447, + "learning_rate": 0.00010136716344377386, + "loss": 0.2199, + "step": 4009 + }, + { + "epoch": 1.4807976366322009, + "grad_norm": 0.30995282530784607, + "learning_rate": 0.00010134252986821038, + "loss": 0.2484, + "step": 4010 + }, + { + "epoch": 1.4811669128508125, + "grad_norm": 0.23280447721481323, + "learning_rate": 0.00010131789629264688, + "loss": 0.1837, + "step": 4011 + }, + { + "epoch": 1.4815361890694239, + "grad_norm": 0.30665484070777893, + "learning_rate": 0.00010129326271708339, + "loss": 0.2493, + "step": 4012 + }, + { + "epoch": 1.4819054652880355, + "grad_norm": 0.2421862781047821, + "learning_rate": 0.0001012686291415199, + "loss": 0.2193, + "step": 4013 + }, + { + "epoch": 1.482274741506647, + "grad_norm": 0.2910977900028229, + "learning_rate": 0.00010124399556595641, + "loss": 0.2155, + "step": 4014 + }, + { + "epoch": 1.4826440177252584, + "grad_norm": 0.23159882426261902, + "learning_rate": 0.0001012193619903929, + "loss": 0.2277, + "step": 4015 + }, + { + "epoch": 1.48301329394387, + "grad_norm": 0.2863536775112152, + "learning_rate": 0.00010119472841482942, + "loss": 0.2375, + "step": 4016 + }, + { + "epoch": 1.4833825701624814, + "grad_norm": 0.2295531928539276, + "learning_rate": 0.00010117009483926591, + "loss": 0.2018, + "step": 4017 + }, + { + "epoch": 1.483751846381093, + "grad_norm": 0.2687823176383972, + "learning_rate": 0.00010114546126370244, + "loss": 0.2129, + "step": 4018 + }, + { + "epoch": 1.4841211225997046, + "grad_norm": 0.3753030002117157, + "learning_rate": 0.00010112082768813893, + "loss": 0.2581, + "step": 4019 + }, + { + "epoch": 1.4844903988183162, + "grad_norm": 0.24089625477790833, + "learning_rate": 0.00010109619411257544, + "loss": 0.2412, + "step": 4020 + }, + { + "epoch": 1.4848596750369276, + "grad_norm": 0.2735785245895386, + "learning_rate": 0.00010107156053701194, + "loss": 0.2386, + "step": 4021 + }, + { + "epoch": 1.4852289512555392, + "grad_norm": 0.25373977422714233, + "learning_rate": 0.00010104692696144846, + "loss": 0.2426, + "step": 4022 + }, + { + "epoch": 1.4855982274741506, + "grad_norm": 0.22873155772686005, + "learning_rate": 0.00010102229338588496, + "loss": 0.2297, + "step": 4023 + }, + { + "epoch": 1.4859675036927622, + "grad_norm": 0.2810438871383667, + "learning_rate": 0.00010099765981032147, + "loss": 0.2595, + "step": 4024 + }, + { + "epoch": 1.4863367799113738, + "grad_norm": 0.22204075753688812, + "learning_rate": 0.00010097302623475797, + "loss": 0.1934, + "step": 4025 + }, + { + "epoch": 1.4867060561299852, + "grad_norm": 0.2631056308746338, + "learning_rate": 0.00010094839265919449, + "loss": 0.2211, + "step": 4026 + }, + { + "epoch": 1.4870753323485968, + "grad_norm": 0.24389302730560303, + "learning_rate": 0.00010092375908363099, + "loss": 0.186, + "step": 4027 + }, + { + "epoch": 1.4874446085672082, + "grad_norm": 0.27736514806747437, + "learning_rate": 0.0001008991255080675, + "loss": 0.2173, + "step": 4028 + }, + { + "epoch": 1.4878138847858198, + "grad_norm": 0.26858869194984436, + "learning_rate": 0.000100874491932504, + "loss": 0.2276, + "step": 4029 + }, + { + "epoch": 1.4881831610044314, + "grad_norm": 0.2619730234146118, + "learning_rate": 0.00010084985835694052, + "loss": 0.2315, + "step": 4030 + }, + { + "epoch": 1.4885524372230428, + "grad_norm": 0.25516992807388306, + "learning_rate": 0.00010082522478137701, + "loss": 0.2039, + "step": 4031 + }, + { + "epoch": 1.4889217134416544, + "grad_norm": 0.2682039737701416, + "learning_rate": 0.00010080059120581354, + "loss": 0.2051, + "step": 4032 + }, + { + "epoch": 1.4892909896602657, + "grad_norm": 0.4364745318889618, + "learning_rate": 0.00010077595763025002, + "loss": 0.2665, + "step": 4033 + }, + { + "epoch": 1.4896602658788773, + "grad_norm": 0.2910602390766144, + "learning_rate": 0.00010075132405468655, + "loss": 0.2756, + "step": 4034 + }, + { + "epoch": 1.490029542097489, + "grad_norm": 0.2666470408439636, + "learning_rate": 0.00010072669047912304, + "loss": 0.2429, + "step": 4035 + }, + { + "epoch": 1.4903988183161005, + "grad_norm": 0.22863835096359253, + "learning_rate": 0.00010070205690355955, + "loss": 0.1877, + "step": 4036 + }, + { + "epoch": 1.490768094534712, + "grad_norm": 0.2519991993904114, + "learning_rate": 0.00010067742332799606, + "loss": 0.2134, + "step": 4037 + }, + { + "epoch": 1.4911373707533235, + "grad_norm": 0.23635603487491608, + "learning_rate": 0.00010065278975243257, + "loss": 0.2251, + "step": 4038 + }, + { + "epoch": 1.491506646971935, + "grad_norm": 0.23089922964572906, + "learning_rate": 0.00010062815617686907, + "loss": 0.2112, + "step": 4039 + }, + { + "epoch": 1.4918759231905465, + "grad_norm": 0.35299113392829895, + "learning_rate": 0.00010060352260130559, + "loss": 0.2618, + "step": 4040 + }, + { + "epoch": 1.4922451994091581, + "grad_norm": 0.2394355982542038, + "learning_rate": 0.00010057888902574209, + "loss": 0.1815, + "step": 4041 + }, + { + "epoch": 1.4926144756277695, + "grad_norm": 0.2555207908153534, + "learning_rate": 0.0001005542554501786, + "loss": 0.2228, + "step": 4042 + }, + { + "epoch": 1.492983751846381, + "grad_norm": 0.27796247601509094, + "learning_rate": 0.0001005296218746151, + "loss": 0.2525, + "step": 4043 + }, + { + "epoch": 1.4933530280649925, + "grad_norm": 0.36150631308555603, + "learning_rate": 0.00010050498829905162, + "loss": 0.2504, + "step": 4044 + }, + { + "epoch": 1.493722304283604, + "grad_norm": 0.3423403203487396, + "learning_rate": 0.00010048035472348812, + "loss": 0.2211, + "step": 4045 + }, + { + "epoch": 1.4940915805022157, + "grad_norm": 0.2036154568195343, + "learning_rate": 0.00010045572114792463, + "loss": 0.1935, + "step": 4046 + }, + { + "epoch": 1.4944608567208273, + "grad_norm": 0.25069642066955566, + "learning_rate": 0.00010043108757236112, + "loss": 0.245, + "step": 4047 + }, + { + "epoch": 1.4948301329394387, + "grad_norm": 0.24127745628356934, + "learning_rate": 0.00010040645399679765, + "loss": 0.2149, + "step": 4048 + }, + { + "epoch": 1.4951994091580503, + "grad_norm": 0.29710525274276733, + "learning_rate": 0.00010038182042123414, + "loss": 0.2385, + "step": 4049 + }, + { + "epoch": 1.4955686853766617, + "grad_norm": 0.2539612948894501, + "learning_rate": 0.00010035718684567066, + "loss": 0.2371, + "step": 4050 + }, + { + "epoch": 1.4955686853766617, + "eval_loss": 8.449037551879883, + "eval_runtime": 6.9096, + "eval_samples_per_second": 7.236, + "eval_steps_per_second": 1.013, + "step": 4050 + }, + { + "epoch": 1.4959379615952733, + "grad_norm": 0.22461967170238495, + "learning_rate": 0.00010033255327010715, + "loss": 0.2056, + "step": 4051 + }, + { + "epoch": 1.4963072378138849, + "grad_norm": 0.23153838515281677, + "learning_rate": 0.00010030791969454367, + "loss": 0.1918, + "step": 4052 + }, + { + "epoch": 1.4966765140324962, + "grad_norm": 0.28983598947525024, + "learning_rate": 0.00010028328611898017, + "loss": 0.2272, + "step": 4053 + }, + { + "epoch": 1.4970457902511078, + "grad_norm": 0.2837114930152893, + "learning_rate": 0.00010025865254341668, + "loss": 0.2195, + "step": 4054 + }, + { + "epoch": 1.4974150664697192, + "grad_norm": 0.22369125485420227, + "learning_rate": 0.00010023401896785318, + "loss": 0.2071, + "step": 4055 + }, + { + "epoch": 1.4977843426883308, + "grad_norm": 0.24951887130737305, + "learning_rate": 0.0001002093853922897, + "loss": 0.2148, + "step": 4056 + }, + { + "epoch": 1.4981536189069424, + "grad_norm": 0.23311880230903625, + "learning_rate": 0.0001001847518167262, + "loss": 0.1876, + "step": 4057 + }, + { + "epoch": 1.498522895125554, + "grad_norm": 0.2553228735923767, + "learning_rate": 0.00010016011824116271, + "loss": 0.2183, + "step": 4058 + }, + { + "epoch": 1.4988921713441654, + "grad_norm": 0.29303085803985596, + "learning_rate": 0.00010013548466559921, + "loss": 0.2361, + "step": 4059 + }, + { + "epoch": 1.499261447562777, + "grad_norm": 0.25596314668655396, + "learning_rate": 0.00010011085109003573, + "loss": 0.2514, + "step": 4060 + }, + { + "epoch": 1.4996307237813884, + "grad_norm": 0.24215568602085114, + "learning_rate": 0.00010008621751447223, + "loss": 0.1874, + "step": 4061 + }, + { + "epoch": 1.5, + "grad_norm": 0.2759939432144165, + "learning_rate": 0.00010006158393890875, + "loss": 0.2244, + "step": 4062 + }, + { + "epoch": 1.5003692762186116, + "grad_norm": 0.284406453371048, + "learning_rate": 0.00010003695036334523, + "loss": 0.2208, + "step": 4063 + }, + { + "epoch": 1.5007385524372232, + "grad_norm": 0.22903361916542053, + "learning_rate": 0.00010001231678778176, + "loss": 0.2137, + "step": 4064 + }, + { + "epoch": 1.5011078286558346, + "grad_norm": 0.23467519879341125, + "learning_rate": 9.998768321221825e-05, + "loss": 0.1695, + "step": 4065 + }, + { + "epoch": 1.501477104874446, + "grad_norm": 0.23066309094429016, + "learning_rate": 9.996304963665476e-05, + "loss": 0.2071, + "step": 4066 + }, + { + "epoch": 1.5018463810930576, + "grad_norm": 0.3020581007003784, + "learning_rate": 9.993841606109126e-05, + "loss": 0.2127, + "step": 4067 + }, + { + "epoch": 1.5022156573116692, + "grad_norm": 0.26200565695762634, + "learning_rate": 9.991378248552778e-05, + "loss": 0.2101, + "step": 4068 + }, + { + "epoch": 1.5025849335302808, + "grad_norm": 0.2620743215084076, + "learning_rate": 9.988914890996428e-05, + "loss": 0.21, + "step": 4069 + }, + { + "epoch": 1.5029542097488922, + "grad_norm": 0.22514945268630981, + "learning_rate": 9.986451533440078e-05, + "loss": 0.1868, + "step": 4070 + }, + { + "epoch": 1.5033234859675035, + "grad_norm": 0.28968510031700134, + "learning_rate": 9.98398817588373e-05, + "loss": 0.2273, + "step": 4071 + }, + { + "epoch": 1.5036927621861151, + "grad_norm": 0.27703857421875, + "learning_rate": 9.98152481832738e-05, + "loss": 0.2187, + "step": 4072 + }, + { + "epoch": 1.5040620384047267, + "grad_norm": 0.22061565518379211, + "learning_rate": 9.979061460771031e-05, + "loss": 0.1796, + "step": 4073 + }, + { + "epoch": 1.5044313146233383, + "grad_norm": 0.2902814745903015, + "learning_rate": 9.976598103214681e-05, + "loss": 0.2282, + "step": 4074 + }, + { + "epoch": 1.5048005908419497, + "grad_norm": 0.32060572504997253, + "learning_rate": 9.974134745658333e-05, + "loss": 0.2267, + "step": 4075 + }, + { + "epoch": 1.5051698670605613, + "grad_norm": 0.23995429277420044, + "learning_rate": 9.971671388101983e-05, + "loss": 0.2287, + "step": 4076 + }, + { + "epoch": 1.5055391432791727, + "grad_norm": 0.30146509408950806, + "learning_rate": 9.969208030545634e-05, + "loss": 0.2389, + "step": 4077 + }, + { + "epoch": 1.5059084194977843, + "grad_norm": 0.2871498167514801, + "learning_rate": 9.966744672989284e-05, + "loss": 0.2063, + "step": 4078 + }, + { + "epoch": 1.506277695716396, + "grad_norm": 0.3023194372653961, + "learning_rate": 9.964281315432935e-05, + "loss": 0.2163, + "step": 4079 + }, + { + "epoch": 1.5066469719350075, + "grad_norm": 0.25719213485717773, + "learning_rate": 9.961817957876586e-05, + "loss": 0.1947, + "step": 4080 + }, + { + "epoch": 1.507016248153619, + "grad_norm": 0.25235921144485474, + "learning_rate": 9.959354600320236e-05, + "loss": 0.2022, + "step": 4081 + }, + { + "epoch": 1.5073855243722303, + "grad_norm": 0.2576795518398285, + "learning_rate": 9.956891242763888e-05, + "loss": 0.2052, + "step": 4082 + }, + { + "epoch": 1.5077548005908419, + "grad_norm": 0.2466406226158142, + "learning_rate": 9.954427885207538e-05, + "loss": 0.2116, + "step": 4083 + }, + { + "epoch": 1.5081240768094535, + "grad_norm": 0.2741953432559967, + "learning_rate": 9.951964527651189e-05, + "loss": 0.2215, + "step": 4084 + }, + { + "epoch": 1.508493353028065, + "grad_norm": 0.3012452721595764, + "learning_rate": 9.949501170094839e-05, + "loss": 0.2332, + "step": 4085 + }, + { + "epoch": 1.5088626292466765, + "grad_norm": 0.30661165714263916, + "learning_rate": 9.94703781253849e-05, + "loss": 0.2684, + "step": 4086 + }, + { + "epoch": 1.509231905465288, + "grad_norm": 0.22163408994674683, + "learning_rate": 9.944574454982141e-05, + "loss": 0.2075, + "step": 4087 + }, + { + "epoch": 1.5096011816838995, + "grad_norm": 0.28589728474617004, + "learning_rate": 9.942111097425791e-05, + "loss": 0.2445, + "step": 4088 + }, + { + "epoch": 1.509970457902511, + "grad_norm": 0.27572137117385864, + "learning_rate": 9.939647739869442e-05, + "loss": 0.2319, + "step": 4089 + }, + { + "epoch": 1.5103397341211227, + "grad_norm": 0.30095669627189636, + "learning_rate": 9.937184382313092e-05, + "loss": 0.2566, + "step": 4090 + }, + { + "epoch": 1.5107090103397343, + "grad_norm": 0.26606544852256775, + "learning_rate": 9.934721024756744e-05, + "loss": 0.2494, + "step": 4091 + }, + { + "epoch": 1.5110782865583456, + "grad_norm": 0.2739373445510864, + "learning_rate": 9.932257667200394e-05, + "loss": 0.2218, + "step": 4092 + }, + { + "epoch": 1.511447562776957, + "grad_norm": 0.24987280368804932, + "learning_rate": 9.929794309644046e-05, + "loss": 0.1903, + "step": 4093 + }, + { + "epoch": 1.5118168389955686, + "grad_norm": 0.2534765601158142, + "learning_rate": 9.927330952087696e-05, + "loss": 0.2213, + "step": 4094 + }, + { + "epoch": 1.5121861152141802, + "grad_norm": 0.27780649065971375, + "learning_rate": 9.924867594531346e-05, + "loss": 0.2097, + "step": 4095 + }, + { + "epoch": 1.5125553914327918, + "grad_norm": 0.27256882190704346, + "learning_rate": 9.922404236974997e-05, + "loss": 0.1906, + "step": 4096 + }, + { + "epoch": 1.5129246676514032, + "grad_norm": 0.25040626525878906, + "learning_rate": 9.919940879418647e-05, + "loss": 0.2264, + "step": 4097 + }, + { + "epoch": 1.5132939438700148, + "grad_norm": 0.26737144589424133, + "learning_rate": 9.917477521862299e-05, + "loss": 0.2051, + "step": 4098 + }, + { + "epoch": 1.5136632200886262, + "grad_norm": 0.25263530015945435, + "learning_rate": 9.915014164305949e-05, + "loss": 0.221, + "step": 4099 + }, + { + "epoch": 1.5140324963072378, + "grad_norm": 0.22754964232444763, + "learning_rate": 9.9125508067496e-05, + "loss": 0.2231, + "step": 4100 + }, + { + "epoch": 1.5140324963072378, + "eval_loss": 8.37662410736084, + "eval_runtime": 6.9081, + "eval_samples_per_second": 7.238, + "eval_steps_per_second": 1.013, + "step": 4100 + }, + { + "epoch": 1.5144017725258494, + "grad_norm": 0.31572312116622925, + "learning_rate": 9.91008744919325e-05, + "loss": 0.2303, + "step": 4101 + }, + { + "epoch": 1.514771048744461, + "grad_norm": 0.26505789160728455, + "learning_rate": 9.9076240916369e-05, + "loss": 0.2089, + "step": 4102 + }, + { + "epoch": 1.5151403249630724, + "grad_norm": 0.2949261963367462, + "learning_rate": 9.905160734080552e-05, + "loss": 0.2356, + "step": 4103 + }, + { + "epoch": 1.5155096011816838, + "grad_norm": 0.24580992758274078, + "learning_rate": 9.902697376524202e-05, + "loss": 0.2137, + "step": 4104 + }, + { + "epoch": 1.5158788774002954, + "grad_norm": 0.29294055700302124, + "learning_rate": 9.900234018967854e-05, + "loss": 0.2355, + "step": 4105 + }, + { + "epoch": 1.516248153618907, + "grad_norm": 0.28830960392951965, + "learning_rate": 9.897770661411504e-05, + "loss": 0.2513, + "step": 4106 + }, + { + "epoch": 1.5166174298375186, + "grad_norm": 0.2431422770023346, + "learning_rate": 9.895307303855155e-05, + "loss": 0.1998, + "step": 4107 + }, + { + "epoch": 1.51698670605613, + "grad_norm": 0.2487974613904953, + "learning_rate": 9.892843946298805e-05, + "loss": 0.2422, + "step": 4108 + }, + { + "epoch": 1.5173559822747416, + "grad_norm": 0.21284465491771698, + "learning_rate": 9.890380588742457e-05, + "loss": 0.2016, + "step": 4109 + }, + { + "epoch": 1.517725258493353, + "grad_norm": 0.31350409984588623, + "learning_rate": 9.887917231186107e-05, + "loss": 0.2712, + "step": 4110 + }, + { + "epoch": 1.5180945347119645, + "grad_norm": 0.28220972418785095, + "learning_rate": 9.885453873629757e-05, + "loss": 0.2598, + "step": 4111 + }, + { + "epoch": 1.5184638109305761, + "grad_norm": 0.24219532310962677, + "learning_rate": 9.882990516073408e-05, + "loss": 0.2185, + "step": 4112 + }, + { + "epoch": 1.5188330871491877, + "grad_norm": 0.2549615502357483, + "learning_rate": 9.880527158517059e-05, + "loss": 0.2376, + "step": 4113 + }, + { + "epoch": 1.5192023633677991, + "grad_norm": 0.3109208941459656, + "learning_rate": 9.87806380096071e-05, + "loss": 0.2527, + "step": 4114 + }, + { + "epoch": 1.5195716395864105, + "grad_norm": 0.26037460565567017, + "learning_rate": 9.87560044340436e-05, + "loss": 0.2111, + "step": 4115 + }, + { + "epoch": 1.519940915805022, + "grad_norm": 0.2412434220314026, + "learning_rate": 9.873137085848012e-05, + "loss": 0.1972, + "step": 4116 + }, + { + "epoch": 1.5203101920236337, + "grad_norm": 0.24877823889255524, + "learning_rate": 9.870673728291662e-05, + "loss": 0.1945, + "step": 4117 + }, + { + "epoch": 1.5206794682422453, + "grad_norm": 0.22844979166984558, + "learning_rate": 9.868210370735312e-05, + "loss": 0.2156, + "step": 4118 + }, + { + "epoch": 1.5210487444608567, + "grad_norm": 0.2412596344947815, + "learning_rate": 9.865747013178963e-05, + "loss": 0.2036, + "step": 4119 + }, + { + "epoch": 1.5214180206794683, + "grad_norm": 0.27456697821617126, + "learning_rate": 9.863283655622613e-05, + "loss": 0.2264, + "step": 4120 + }, + { + "epoch": 1.5217872968980797, + "grad_norm": 0.2727751135826111, + "learning_rate": 9.860820298066265e-05, + "loss": 0.2451, + "step": 4121 + }, + { + "epoch": 1.5221565731166913, + "grad_norm": 0.22774657607078552, + "learning_rate": 9.858356940509915e-05, + "loss": 0.199, + "step": 4122 + }, + { + "epoch": 1.5225258493353029, + "grad_norm": 0.2858784794807434, + "learning_rate": 9.855893582953566e-05, + "loss": 0.2269, + "step": 4123 + }, + { + "epoch": 1.5228951255539145, + "grad_norm": 0.2411343902349472, + "learning_rate": 9.853430225397217e-05, + "loss": 0.2145, + "step": 4124 + }, + { + "epoch": 1.5232644017725259, + "grad_norm": 0.2849372625350952, + "learning_rate": 9.850966867840868e-05, + "loss": 0.2363, + "step": 4125 + }, + { + "epoch": 1.5236336779911372, + "grad_norm": 0.22996686398983002, + "learning_rate": 9.848503510284518e-05, + "loss": 0.1847, + "step": 4126 + }, + { + "epoch": 1.5240029542097489, + "grad_norm": 0.2791785001754761, + "learning_rate": 9.846040152728168e-05, + "loss": 0.2405, + "step": 4127 + }, + { + "epoch": 1.5243722304283605, + "grad_norm": 0.3429658114910126, + "learning_rate": 9.84357679517182e-05, + "loss": 0.2295, + "step": 4128 + }, + { + "epoch": 1.524741506646972, + "grad_norm": 0.2761457562446594, + "learning_rate": 9.84111343761547e-05, + "loss": 0.2223, + "step": 4129 + }, + { + "epoch": 1.5251107828655834, + "grad_norm": 0.2568742036819458, + "learning_rate": 9.838650080059121e-05, + "loss": 0.2373, + "step": 4130 + }, + { + "epoch": 1.5254800590841948, + "grad_norm": 0.3383117616176605, + "learning_rate": 9.836186722502771e-05, + "loss": 0.239, + "step": 4131 + }, + { + "epoch": 1.5258493353028064, + "grad_norm": 0.25479748845100403, + "learning_rate": 9.833723364946423e-05, + "loss": 0.2593, + "step": 4132 + }, + { + "epoch": 1.526218611521418, + "grad_norm": 0.33271050453186035, + "learning_rate": 9.831260007390073e-05, + "loss": 0.2289, + "step": 4133 + }, + { + "epoch": 1.5265878877400296, + "grad_norm": 0.21592937409877777, + "learning_rate": 9.828796649833723e-05, + "loss": 0.2019, + "step": 4134 + }, + { + "epoch": 1.5269571639586412, + "grad_norm": 0.26369190216064453, + "learning_rate": 9.826333292277374e-05, + "loss": 0.2177, + "step": 4135 + }, + { + "epoch": 1.5273264401772526, + "grad_norm": 0.2416723519563675, + "learning_rate": 9.823869934721025e-05, + "loss": 0.1921, + "step": 4136 + }, + { + "epoch": 1.527695716395864, + "grad_norm": 0.20632563531398773, + "learning_rate": 9.821406577164676e-05, + "loss": 0.1996, + "step": 4137 + }, + { + "epoch": 1.5280649926144756, + "grad_norm": 0.29609620571136475, + "learning_rate": 9.818943219608326e-05, + "loss": 0.2513, + "step": 4138 + }, + { + "epoch": 1.5284342688330872, + "grad_norm": 0.3195131719112396, + "learning_rate": 9.816479862051978e-05, + "loss": 0.271, + "step": 4139 + }, + { + "epoch": 1.5288035450516988, + "grad_norm": 0.24665427207946777, + "learning_rate": 9.814016504495628e-05, + "loss": 0.1922, + "step": 4140 + }, + { + "epoch": 1.5291728212703102, + "grad_norm": 0.26155832409858704, + "learning_rate": 9.811553146939278e-05, + "loss": 0.2085, + "step": 4141 + }, + { + "epoch": 1.5295420974889216, + "grad_norm": 0.3678491711616516, + "learning_rate": 9.809089789382929e-05, + "loss": 0.2657, + "step": 4142 + }, + { + "epoch": 1.5299113737075332, + "grad_norm": 0.21838617324829102, + "learning_rate": 9.80662643182658e-05, + "loss": 0.2322, + "step": 4143 + }, + { + "epoch": 1.5302806499261448, + "grad_norm": 0.25652527809143066, + "learning_rate": 9.804163074270231e-05, + "loss": 0.2045, + "step": 4144 + }, + { + "epoch": 1.5306499261447564, + "grad_norm": 0.2718833088874817, + "learning_rate": 9.801699716713881e-05, + "loss": 0.2126, + "step": 4145 + }, + { + "epoch": 1.5310192023633677, + "grad_norm": 0.28380289673805237, + "learning_rate": 9.799236359157532e-05, + "loss": 0.2445, + "step": 4146 + }, + { + "epoch": 1.5313884785819794, + "grad_norm": 0.2486022263765335, + "learning_rate": 9.796773001601183e-05, + "loss": 0.2134, + "step": 4147 + }, + { + "epoch": 1.5317577548005907, + "grad_norm": 0.24982161819934845, + "learning_rate": 9.794309644044834e-05, + "loss": 0.2293, + "step": 4148 + }, + { + "epoch": 1.5321270310192023, + "grad_norm": 0.2932395040988922, + "learning_rate": 9.791846286488484e-05, + "loss": 0.2282, + "step": 4149 + }, + { + "epoch": 1.532496307237814, + "grad_norm": 0.30521366000175476, + "learning_rate": 9.789382928932134e-05, + "loss": 0.2849, + "step": 4150 + }, + { + "epoch": 1.532496307237814, + "eval_loss": 8.489041328430176, + "eval_runtime": 6.9374, + "eval_samples_per_second": 7.207, + "eval_steps_per_second": 1.009, + "step": 4150 + }, + { + "epoch": 1.5328655834564255, + "grad_norm": 0.26532071828842163, + "learning_rate": 9.786919571375786e-05, + "loss": 0.2104, + "step": 4151 + }, + { + "epoch": 1.533234859675037, + "grad_norm": 0.28178825974464417, + "learning_rate": 9.784456213819436e-05, + "loss": 0.2248, + "step": 4152 + }, + { + "epoch": 1.5336041358936483, + "grad_norm": 0.24331597983837128, + "learning_rate": 9.781992856263087e-05, + "loss": 0.1935, + "step": 4153 + }, + { + "epoch": 1.53397341211226, + "grad_norm": 0.299343466758728, + "learning_rate": 9.779529498706737e-05, + "loss": 0.2134, + "step": 4154 + }, + { + "epoch": 1.5343426883308715, + "grad_norm": 0.23296886682510376, + "learning_rate": 9.777066141150389e-05, + "loss": 0.2068, + "step": 4155 + }, + { + "epoch": 1.534711964549483, + "grad_norm": 0.23826006054878235, + "learning_rate": 9.774602783594039e-05, + "loss": 0.1971, + "step": 4156 + }, + { + "epoch": 1.5350812407680945, + "grad_norm": 0.3189060091972351, + "learning_rate": 9.772139426037689e-05, + "loss": 0.2465, + "step": 4157 + }, + { + "epoch": 1.535450516986706, + "grad_norm": 0.2898745536804199, + "learning_rate": 9.76967606848134e-05, + "loss": 0.2283, + "step": 4158 + }, + { + "epoch": 1.5358197932053175, + "grad_norm": 0.2701876163482666, + "learning_rate": 9.76721271092499e-05, + "loss": 0.2092, + "step": 4159 + }, + { + "epoch": 1.536189069423929, + "grad_norm": 0.25028571486473083, + "learning_rate": 9.764749353368642e-05, + "loss": 0.2188, + "step": 4160 + }, + { + "epoch": 1.5365583456425407, + "grad_norm": 0.2506429851055145, + "learning_rate": 9.762285995812292e-05, + "loss": 0.2023, + "step": 4161 + }, + { + "epoch": 1.5369276218611523, + "grad_norm": 0.2807749807834625, + "learning_rate": 9.759822638255944e-05, + "loss": 0.2319, + "step": 4162 + }, + { + "epoch": 1.5372968980797637, + "grad_norm": 0.28508177399635315, + "learning_rate": 9.757359280699594e-05, + "loss": 0.226, + "step": 4163 + }, + { + "epoch": 1.537666174298375, + "grad_norm": 0.2668566405773163, + "learning_rate": 9.754895923143245e-05, + "loss": 0.2336, + "step": 4164 + }, + { + "epoch": 1.5380354505169866, + "grad_norm": 0.29818591475486755, + "learning_rate": 9.752432565586895e-05, + "loss": 0.234, + "step": 4165 + }, + { + "epoch": 1.5384047267355982, + "grad_norm": 0.28137752413749695, + "learning_rate": 9.749969208030545e-05, + "loss": 0.2426, + "step": 4166 + }, + { + "epoch": 1.5387740029542099, + "grad_norm": 0.25029632449150085, + "learning_rate": 9.747505850474197e-05, + "loss": 0.2181, + "step": 4167 + }, + { + "epoch": 1.5391432791728212, + "grad_norm": 0.2866497337818146, + "learning_rate": 9.745042492917847e-05, + "loss": 0.2269, + "step": 4168 + }, + { + "epoch": 1.5395125553914328, + "grad_norm": 0.257595419883728, + "learning_rate": 9.742579135361499e-05, + "loss": 0.2237, + "step": 4169 + }, + { + "epoch": 1.5398818316100442, + "grad_norm": 0.2662784159183502, + "learning_rate": 9.740115777805149e-05, + "loss": 0.219, + "step": 4170 + }, + { + "epoch": 1.5402511078286558, + "grad_norm": 0.2836635410785675, + "learning_rate": 9.7376524202488e-05, + "loss": 0.2507, + "step": 4171 + }, + { + "epoch": 1.5406203840472674, + "grad_norm": 0.2796580195426941, + "learning_rate": 9.73518906269245e-05, + "loss": 0.2188, + "step": 4172 + }, + { + "epoch": 1.540989660265879, + "grad_norm": 0.2581844627857208, + "learning_rate": 9.7327257051361e-05, + "loss": 0.2196, + "step": 4173 + }, + { + "epoch": 1.5413589364844904, + "grad_norm": 0.19786061346530914, + "learning_rate": 9.730262347579752e-05, + "loss": 0.1921, + "step": 4174 + }, + { + "epoch": 1.5417282127031018, + "grad_norm": 0.3039240837097168, + "learning_rate": 9.727798990023402e-05, + "loss": 0.2573, + "step": 4175 + }, + { + "epoch": 1.5420974889217134, + "grad_norm": 0.2714802920818329, + "learning_rate": 9.725335632467053e-05, + "loss": 0.2344, + "step": 4176 + }, + { + "epoch": 1.542466765140325, + "grad_norm": 0.29975318908691406, + "learning_rate": 9.722872274910703e-05, + "loss": 0.2177, + "step": 4177 + }, + { + "epoch": 1.5428360413589366, + "grad_norm": 0.26628202199935913, + "learning_rate": 9.720408917354355e-05, + "loss": 0.2214, + "step": 4178 + }, + { + "epoch": 1.543205317577548, + "grad_norm": 0.2539972960948944, + "learning_rate": 9.717945559798005e-05, + "loss": 0.1982, + "step": 4179 + }, + { + "epoch": 1.5435745937961596, + "grad_norm": 0.2528001368045807, + "learning_rate": 9.715482202241656e-05, + "loss": 0.2154, + "step": 4180 + }, + { + "epoch": 1.543943870014771, + "grad_norm": 0.24206940829753876, + "learning_rate": 9.713018844685307e-05, + "loss": 0.2202, + "step": 4181 + }, + { + "epoch": 1.5443131462333826, + "grad_norm": 0.27567821741104126, + "learning_rate": 9.710555487128957e-05, + "loss": 0.2295, + "step": 4182 + }, + { + "epoch": 1.5446824224519942, + "grad_norm": 0.3037053048610687, + "learning_rate": 9.708092129572608e-05, + "loss": 0.2278, + "step": 4183 + }, + { + "epoch": 1.5450516986706058, + "grad_norm": 0.271829217672348, + "learning_rate": 9.705628772016258e-05, + "loss": 0.2217, + "step": 4184 + }, + { + "epoch": 1.5454209748892171, + "grad_norm": 0.2635287344455719, + "learning_rate": 9.70316541445991e-05, + "loss": 0.2399, + "step": 4185 + }, + { + "epoch": 1.5457902511078285, + "grad_norm": 0.2651749849319458, + "learning_rate": 9.70070205690356e-05, + "loss": 0.2607, + "step": 4186 + }, + { + "epoch": 1.5461595273264401, + "grad_norm": 0.25804170966148376, + "learning_rate": 9.698238699347211e-05, + "loss": 0.2095, + "step": 4187 + }, + { + "epoch": 1.5465288035450517, + "grad_norm": 0.25195181369781494, + "learning_rate": 9.695775341790861e-05, + "loss": 0.2181, + "step": 4188 + }, + { + "epoch": 1.5468980797636633, + "grad_norm": 0.2833308279514313, + "learning_rate": 9.693311984234512e-05, + "loss": 0.2257, + "step": 4189 + }, + { + "epoch": 1.5472673559822747, + "grad_norm": 0.2703004777431488, + "learning_rate": 9.690848626678163e-05, + "loss": 0.2386, + "step": 4190 + }, + { + "epoch": 1.547636632200886, + "grad_norm": 0.26954033970832825, + "learning_rate": 9.688385269121813e-05, + "loss": 0.2544, + "step": 4191 + }, + { + "epoch": 1.5480059084194977, + "grad_norm": 0.21081748604774475, + "learning_rate": 9.685921911565465e-05, + "loss": 0.2057, + "step": 4192 + }, + { + "epoch": 1.5483751846381093, + "grad_norm": 0.26732054352760315, + "learning_rate": 9.683458554009115e-05, + "loss": 0.1914, + "step": 4193 + }, + { + "epoch": 1.548744460856721, + "grad_norm": 0.2383386343717575, + "learning_rate": 9.680995196452766e-05, + "loss": 0.2009, + "step": 4194 + }, + { + "epoch": 1.5491137370753325, + "grad_norm": 0.2868897318840027, + "learning_rate": 9.678531838896416e-05, + "loss": 0.2418, + "step": 4195 + }, + { + "epoch": 1.549483013293944, + "grad_norm": 0.25290942192077637, + "learning_rate": 9.676068481340068e-05, + "loss": 0.2372, + "step": 4196 + }, + { + "epoch": 1.5498522895125553, + "grad_norm": 0.24155691266059875, + "learning_rate": 9.673605123783718e-05, + "loss": 0.2404, + "step": 4197 + }, + { + "epoch": 1.5502215657311669, + "grad_norm": 0.23448802530765533, + "learning_rate": 9.671141766227368e-05, + "loss": 0.2228, + "step": 4198 + }, + { + "epoch": 1.5505908419497785, + "grad_norm": 0.260168194770813, + "learning_rate": 9.66867840867102e-05, + "loss": 0.2025, + "step": 4199 + }, + { + "epoch": 1.55096011816839, + "grad_norm": 0.2804551124572754, + "learning_rate": 9.66621505111467e-05, + "loss": 0.2428, + "step": 4200 + }, + { + "epoch": 1.55096011816839, + "eval_loss": 8.607993125915527, + "eval_runtime": 6.9034, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 1.014, + "step": 4200 + }, + { + "epoch": 1.5513293943870015, + "grad_norm": 0.3006301522254944, + "learning_rate": 9.663751693558321e-05, + "loss": 0.3024, + "step": 4201 + }, + { + "epoch": 1.5516986706056128, + "grad_norm": 0.25674816966056824, + "learning_rate": 9.661288336001971e-05, + "loss": 0.2422, + "step": 4202 + }, + { + "epoch": 1.5520679468242244, + "grad_norm": 0.2826618552207947, + "learning_rate": 9.658824978445623e-05, + "loss": 0.2606, + "step": 4203 + }, + { + "epoch": 1.552437223042836, + "grad_norm": 0.2765415906906128, + "learning_rate": 9.656361620889273e-05, + "loss": 0.215, + "step": 4204 + }, + { + "epoch": 1.5528064992614476, + "grad_norm": 0.3028270900249481, + "learning_rate": 9.653898263332923e-05, + "loss": 0.2379, + "step": 4205 + }, + { + "epoch": 1.553175775480059, + "grad_norm": 0.238161101937294, + "learning_rate": 9.651434905776574e-05, + "loss": 0.2107, + "step": 4206 + }, + { + "epoch": 1.5535450516986706, + "grad_norm": 0.2637631893157959, + "learning_rate": 9.648971548220224e-05, + "loss": 0.2354, + "step": 4207 + }, + { + "epoch": 1.553914327917282, + "grad_norm": 0.27503272891044617, + "learning_rate": 9.646508190663876e-05, + "loss": 0.2578, + "step": 4208 + }, + { + "epoch": 1.5542836041358936, + "grad_norm": 0.3316762447357178, + "learning_rate": 9.644044833107526e-05, + "loss": 0.2336, + "step": 4209 + }, + { + "epoch": 1.5546528803545052, + "grad_norm": 0.2363661229610443, + "learning_rate": 9.641581475551177e-05, + "loss": 0.2141, + "step": 4210 + }, + { + "epoch": 1.5550221565731168, + "grad_norm": 0.2649475634098053, + "learning_rate": 9.639118117994827e-05, + "loss": 0.179, + "step": 4211 + }, + { + "epoch": 1.5553914327917282, + "grad_norm": 0.27166005969047546, + "learning_rate": 9.636654760438479e-05, + "loss": 0.2122, + "step": 4212 + }, + { + "epoch": 1.5557607090103396, + "grad_norm": 0.24952058494091034, + "learning_rate": 9.634191402882129e-05, + "loss": 0.2177, + "step": 4213 + }, + { + "epoch": 1.5561299852289512, + "grad_norm": 0.2725524604320526, + "learning_rate": 9.631728045325779e-05, + "loss": 0.2582, + "step": 4214 + }, + { + "epoch": 1.5564992614475628, + "grad_norm": 0.25051164627075195, + "learning_rate": 9.62926468776943e-05, + "loss": 0.2169, + "step": 4215 + }, + { + "epoch": 1.5568685376661744, + "grad_norm": 0.2443505972623825, + "learning_rate": 9.626801330213081e-05, + "loss": 0.2521, + "step": 4216 + }, + { + "epoch": 1.5572378138847858, + "grad_norm": 0.26448705792427063, + "learning_rate": 9.624337972656732e-05, + "loss": 0.206, + "step": 4217 + }, + { + "epoch": 1.5576070901033974, + "grad_norm": 0.3053038418292999, + "learning_rate": 9.621874615100382e-05, + "loss": 0.2534, + "step": 4218 + }, + { + "epoch": 1.5579763663220088, + "grad_norm": 0.27799609303474426, + "learning_rate": 9.619411257544034e-05, + "loss": 0.3057, + "step": 4219 + }, + { + "epoch": 1.5583456425406204, + "grad_norm": 0.2521724998950958, + "learning_rate": 9.616947899987684e-05, + "loss": 0.237, + "step": 4220 + }, + { + "epoch": 1.558714918759232, + "grad_norm": 0.22812429070472717, + "learning_rate": 9.614484542431334e-05, + "loss": 0.2043, + "step": 4221 + }, + { + "epoch": 1.5590841949778436, + "grad_norm": 0.27307596802711487, + "learning_rate": 9.612021184874985e-05, + "loss": 0.2027, + "step": 4222 + }, + { + "epoch": 1.559453471196455, + "grad_norm": 0.2876955270767212, + "learning_rate": 9.609557827318636e-05, + "loss": 0.2235, + "step": 4223 + }, + { + "epoch": 1.5598227474150663, + "grad_norm": 0.25584229826927185, + "learning_rate": 9.607094469762287e-05, + "loss": 0.2009, + "step": 4224 + }, + { + "epoch": 1.560192023633678, + "grad_norm": 0.29169970750808716, + "learning_rate": 9.604631112205937e-05, + "loss": 0.2206, + "step": 4225 + }, + { + "epoch": 1.5605612998522895, + "grad_norm": 0.2679328918457031, + "learning_rate": 9.602167754649589e-05, + "loss": 0.2243, + "step": 4226 + }, + { + "epoch": 1.5609305760709011, + "grad_norm": 0.2782287299633026, + "learning_rate": 9.599704397093239e-05, + "loss": 0.2083, + "step": 4227 + }, + { + "epoch": 1.5612998522895125, + "grad_norm": 0.24246180057525635, + "learning_rate": 9.597241039536889e-05, + "loss": 0.2148, + "step": 4228 + }, + { + "epoch": 1.5616691285081241, + "grad_norm": 0.2631807327270508, + "learning_rate": 9.59477768198054e-05, + "loss": 0.2281, + "step": 4229 + }, + { + "epoch": 1.5620384047267355, + "grad_norm": 0.2711851894855499, + "learning_rate": 9.59231432442419e-05, + "loss": 0.1956, + "step": 4230 + }, + { + "epoch": 1.562407680945347, + "grad_norm": 0.24073992669582367, + "learning_rate": 9.589850966867842e-05, + "loss": 0.2178, + "step": 4231 + }, + { + "epoch": 1.5627769571639587, + "grad_norm": 0.2774311602115631, + "learning_rate": 9.587387609311492e-05, + "loss": 0.2189, + "step": 4232 + }, + { + "epoch": 1.5631462333825703, + "grad_norm": 0.28396135568618774, + "learning_rate": 9.584924251755143e-05, + "loss": 0.2206, + "step": 4233 + }, + { + "epoch": 1.5635155096011817, + "grad_norm": 0.26733914017677307, + "learning_rate": 9.582460894198794e-05, + "loss": 0.2223, + "step": 4234 + }, + { + "epoch": 1.563884785819793, + "grad_norm": 0.24736519157886505, + "learning_rate": 9.579997536642445e-05, + "loss": 0.2228, + "step": 4235 + }, + { + "epoch": 1.5642540620384047, + "grad_norm": 0.2417195737361908, + "learning_rate": 9.577534179086095e-05, + "loss": 0.218, + "step": 4236 + }, + { + "epoch": 1.5646233382570163, + "grad_norm": 0.266829252243042, + "learning_rate": 9.575070821529745e-05, + "loss": 0.2227, + "step": 4237 + }, + { + "epoch": 1.5649926144756279, + "grad_norm": 0.26049941778182983, + "learning_rate": 9.572607463973397e-05, + "loss": 0.2114, + "step": 4238 + }, + { + "epoch": 1.5653618906942393, + "grad_norm": 0.17307057976722717, + "learning_rate": 9.570144106417047e-05, + "loss": 0.1572, + "step": 4239 + }, + { + "epoch": 1.5657311669128509, + "grad_norm": 0.26147300004959106, + "learning_rate": 9.567680748860698e-05, + "loss": 0.2094, + "step": 4240 + }, + { + "epoch": 1.5661004431314622, + "grad_norm": 0.22852693498134613, + "learning_rate": 9.565217391304348e-05, + "loss": 0.2068, + "step": 4241 + }, + { + "epoch": 1.5664697193500738, + "grad_norm": 0.275422066450119, + "learning_rate": 9.562754033748e-05, + "loss": 0.2361, + "step": 4242 + }, + { + "epoch": 1.5668389955686854, + "grad_norm": 0.24394144117832184, + "learning_rate": 9.56029067619165e-05, + "loss": 0.2088, + "step": 4243 + }, + { + "epoch": 1.567208271787297, + "grad_norm": 0.27227410674095154, + "learning_rate": 9.5578273186353e-05, + "loss": 0.2047, + "step": 4244 + }, + { + "epoch": 1.5675775480059084, + "grad_norm": 0.34393373131752014, + "learning_rate": 9.555363961078951e-05, + "loss": 0.268, + "step": 4245 + }, + { + "epoch": 1.5679468242245198, + "grad_norm": 0.3661457896232605, + "learning_rate": 9.552900603522602e-05, + "loss": 0.2346, + "step": 4246 + }, + { + "epoch": 1.5683161004431314, + "grad_norm": 0.28046879172325134, + "learning_rate": 9.550437245966253e-05, + "loss": 0.2538, + "step": 4247 + }, + { + "epoch": 1.568685376661743, + "grad_norm": 0.4853428304195404, + "learning_rate": 9.547973888409903e-05, + "loss": 0.2191, + "step": 4248 + }, + { + "epoch": 1.5690546528803546, + "grad_norm": 0.2943223714828491, + "learning_rate": 9.545510530853555e-05, + "loss": 0.2603, + "step": 4249 + }, + { + "epoch": 1.569423929098966, + "grad_norm": 0.2574216425418854, + "learning_rate": 9.543047173297205e-05, + "loss": 0.1965, + "step": 4250 + }, + { + "epoch": 1.569423929098966, + "eval_loss": 8.583556175231934, + "eval_runtime": 6.911, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 1.013, + "step": 4250 + }, + { + "epoch": 1.5697932053175776, + "grad_norm": 0.2280886173248291, + "learning_rate": 9.540583815740856e-05, + "loss": 0.2, + "step": 4251 + }, + { + "epoch": 1.570162481536189, + "grad_norm": 0.2786300480365753, + "learning_rate": 9.538120458184506e-05, + "loss": 0.2436, + "step": 4252 + }, + { + "epoch": 1.5705317577548006, + "grad_norm": 0.30089664459228516, + "learning_rate": 9.535657100628156e-05, + "loss": 0.2156, + "step": 4253 + }, + { + "epoch": 1.5709010339734122, + "grad_norm": 0.3307819068431854, + "learning_rate": 9.533193743071808e-05, + "loss": 0.2441, + "step": 4254 + }, + { + "epoch": 1.5712703101920238, + "grad_norm": 0.26521754264831543, + "learning_rate": 9.530730385515458e-05, + "loss": 0.2309, + "step": 4255 + }, + { + "epoch": 1.5716395864106352, + "grad_norm": 0.24399127066135406, + "learning_rate": 9.52826702795911e-05, + "loss": 0.208, + "step": 4256 + }, + { + "epoch": 1.5720088626292466, + "grad_norm": 0.26744556427001953, + "learning_rate": 9.52580367040276e-05, + "loss": 0.2589, + "step": 4257 + }, + { + "epoch": 1.5723781388478582, + "grad_norm": 0.3047908842563629, + "learning_rate": 9.523340312846411e-05, + "loss": 0.2437, + "step": 4258 + }, + { + "epoch": 1.5727474150664698, + "grad_norm": 0.24568799138069153, + "learning_rate": 9.520876955290061e-05, + "loss": 0.1756, + "step": 4259 + }, + { + "epoch": 1.5731166912850814, + "grad_norm": 0.293376624584198, + "learning_rate": 9.518413597733711e-05, + "loss": 0.2475, + "step": 4260 + }, + { + "epoch": 1.5734859675036927, + "grad_norm": 0.21811045706272125, + "learning_rate": 9.515950240177363e-05, + "loss": 0.1666, + "step": 4261 + }, + { + "epoch": 1.5738552437223041, + "grad_norm": 0.2748865783214569, + "learning_rate": 9.513486882621013e-05, + "loss": 0.2435, + "step": 4262 + }, + { + "epoch": 1.5742245199409157, + "grad_norm": 0.2553178668022156, + "learning_rate": 9.511023525064664e-05, + "loss": 0.2161, + "step": 4263 + }, + { + "epoch": 1.5745937961595273, + "grad_norm": 0.25822287797927856, + "learning_rate": 9.508560167508314e-05, + "loss": 0.1921, + "step": 4264 + }, + { + "epoch": 1.574963072378139, + "grad_norm": 0.29153579473495483, + "learning_rate": 9.506096809951966e-05, + "loss": 0.2393, + "step": 4265 + }, + { + "epoch": 1.5753323485967505, + "grad_norm": 0.3022720515727997, + "learning_rate": 9.503633452395616e-05, + "loss": 0.2243, + "step": 4266 + }, + { + "epoch": 1.575701624815362, + "grad_norm": 0.2753484547138214, + "learning_rate": 9.501170094839267e-05, + "loss": 0.2446, + "step": 4267 + }, + { + "epoch": 1.5760709010339733, + "grad_norm": 0.26976677775382996, + "learning_rate": 9.498706737282918e-05, + "loss": 0.2328, + "step": 4268 + }, + { + "epoch": 1.576440177252585, + "grad_norm": 0.2824487090110779, + "learning_rate": 9.496243379726568e-05, + "loss": 0.1868, + "step": 4269 + }, + { + "epoch": 1.5768094534711965, + "grad_norm": 0.22535039484500885, + "learning_rate": 9.493780022170219e-05, + "loss": 0.1917, + "step": 4270 + }, + { + "epoch": 1.577178729689808, + "grad_norm": 0.26481205224990845, + "learning_rate": 9.491316664613869e-05, + "loss": 0.2641, + "step": 4271 + }, + { + "epoch": 1.5775480059084195, + "grad_norm": 0.22678865492343903, + "learning_rate": 9.488853307057521e-05, + "loss": 0.2086, + "step": 4272 + }, + { + "epoch": 1.5779172821270309, + "grad_norm": 0.27742093801498413, + "learning_rate": 9.486389949501171e-05, + "loss": 0.2641, + "step": 4273 + }, + { + "epoch": 1.5782865583456425, + "grad_norm": 0.2587934136390686, + "learning_rate": 9.483926591944822e-05, + "loss": 0.2365, + "step": 4274 + }, + { + "epoch": 1.578655834564254, + "grad_norm": 0.28948602080345154, + "learning_rate": 9.481463234388472e-05, + "loss": 0.2134, + "step": 4275 + }, + { + "epoch": 1.5790251107828657, + "grad_norm": 0.29900646209716797, + "learning_rate": 9.478999876832122e-05, + "loss": 0.2732, + "step": 4276 + }, + { + "epoch": 1.579394387001477, + "grad_norm": 0.21964584290981293, + "learning_rate": 9.476536519275774e-05, + "loss": 0.1803, + "step": 4277 + }, + { + "epoch": 1.5797636632200887, + "grad_norm": 0.39024025201797485, + "learning_rate": 9.474073161719424e-05, + "loss": 0.2882, + "step": 4278 + }, + { + "epoch": 1.5801329394387, + "grad_norm": 0.23472507297992706, + "learning_rate": 9.471609804163076e-05, + "loss": 0.1944, + "step": 4279 + }, + { + "epoch": 1.5805022156573116, + "grad_norm": 0.31429603695869446, + "learning_rate": 9.469146446606726e-05, + "loss": 0.2304, + "step": 4280 + }, + { + "epoch": 1.5808714918759232, + "grad_norm": 0.26538583636283875, + "learning_rate": 9.466683089050377e-05, + "loss": 0.2622, + "step": 4281 + }, + { + "epoch": 1.5812407680945348, + "grad_norm": 0.2561086118221283, + "learning_rate": 9.464219731494027e-05, + "loss": 0.2264, + "step": 4282 + }, + { + "epoch": 1.5816100443131462, + "grad_norm": 0.24844366312026978, + "learning_rate": 9.461756373937679e-05, + "loss": 0.2188, + "step": 4283 + }, + { + "epoch": 1.5819793205317576, + "grad_norm": 0.270039439201355, + "learning_rate": 9.459293016381329e-05, + "loss": 0.2321, + "step": 4284 + }, + { + "epoch": 1.5823485967503692, + "grad_norm": 0.2678910791873932, + "learning_rate": 9.456829658824979e-05, + "loss": 0.2087, + "step": 4285 + }, + { + "epoch": 1.5827178729689808, + "grad_norm": 0.25707265734672546, + "learning_rate": 9.45436630126863e-05, + "loss": 0.209, + "step": 4286 + }, + { + "epoch": 1.5830871491875924, + "grad_norm": 0.24305176734924316, + "learning_rate": 9.45190294371228e-05, + "loss": 0.1968, + "step": 4287 + }, + { + "epoch": 1.5834564254062038, + "grad_norm": 0.2654803991317749, + "learning_rate": 9.449439586155932e-05, + "loss": 0.2065, + "step": 4288 + }, + { + "epoch": 1.5838257016248154, + "grad_norm": 0.3010595142841339, + "learning_rate": 9.446976228599582e-05, + "loss": 0.2704, + "step": 4289 + }, + { + "epoch": 1.5841949778434268, + "grad_norm": 0.2822839617729187, + "learning_rate": 9.444512871043233e-05, + "loss": 0.2438, + "step": 4290 + }, + { + "epoch": 1.5845642540620384, + "grad_norm": 0.2414712905883789, + "learning_rate": 9.442049513486884e-05, + "loss": 0.2086, + "step": 4291 + }, + { + "epoch": 1.58493353028065, + "grad_norm": 0.19916607439517975, + "learning_rate": 9.439586155930534e-05, + "loss": 0.1889, + "step": 4292 + }, + { + "epoch": 1.5853028064992616, + "grad_norm": 0.290366530418396, + "learning_rate": 9.437122798374185e-05, + "loss": 0.2611, + "step": 4293 + }, + { + "epoch": 1.585672082717873, + "grad_norm": 0.2957131862640381, + "learning_rate": 9.434659440817835e-05, + "loss": 0.2007, + "step": 4294 + }, + { + "epoch": 1.5860413589364843, + "grad_norm": 0.2877734303474426, + "learning_rate": 9.432196083261487e-05, + "loss": 0.2412, + "step": 4295 + }, + { + "epoch": 1.586410635155096, + "grad_norm": 0.2387487143278122, + "learning_rate": 9.429732725705137e-05, + "loss": 0.188, + "step": 4296 + }, + { + "epoch": 1.5867799113737076, + "grad_norm": 0.3164692521095276, + "learning_rate": 9.427269368148787e-05, + "loss": 0.2918, + "step": 4297 + }, + { + "epoch": 1.5871491875923192, + "grad_norm": 0.2371818721294403, + "learning_rate": 9.424806010592437e-05, + "loss": 0.1848, + "step": 4298 + }, + { + "epoch": 1.5875184638109305, + "grad_norm": 0.24461007118225098, + "learning_rate": 9.422342653036089e-05, + "loss": 0.2165, + "step": 4299 + }, + { + "epoch": 1.5878877400295421, + "grad_norm": 0.266857385635376, + "learning_rate": 9.419879295479739e-05, + "loss": 0.2539, + "step": 4300 + }, + { + "epoch": 1.5878877400295421, + "eval_loss": 8.731088638305664, + "eval_runtime": 6.9074, + "eval_samples_per_second": 7.239, + "eval_steps_per_second": 1.013, + "step": 4300 + }, + { + "epoch": 1.5882570162481535, + "grad_norm": 0.25499698519706726, + "learning_rate": 9.41741593792339e-05, + "loss": 0.2091, + "step": 4301 + }, + { + "epoch": 1.5886262924667651, + "grad_norm": 0.2722216844558716, + "learning_rate": 9.41495258036704e-05, + "loss": 0.2108, + "step": 4302 + }, + { + "epoch": 1.5889955686853767, + "grad_norm": 0.27079811692237854, + "learning_rate": 9.41248922281069e-05, + "loss": 0.2194, + "step": 4303 + }, + { + "epoch": 1.5893648449039883, + "grad_norm": 0.25969022512435913, + "learning_rate": 9.410025865254342e-05, + "loss": 0.2056, + "step": 4304 + }, + { + "epoch": 1.5897341211225997, + "grad_norm": 0.2638678252696991, + "learning_rate": 9.407562507697992e-05, + "loss": 0.2049, + "step": 4305 + }, + { + "epoch": 1.590103397341211, + "grad_norm": 0.25118133425712585, + "learning_rate": 9.405099150141643e-05, + "loss": 0.1993, + "step": 4306 + }, + { + "epoch": 1.5904726735598227, + "grad_norm": 0.22928109765052795, + "learning_rate": 9.402635792585293e-05, + "loss": 0.1917, + "step": 4307 + }, + { + "epoch": 1.5908419497784343, + "grad_norm": 0.2853640913963318, + "learning_rate": 9.400172435028945e-05, + "loss": 0.2379, + "step": 4308 + }, + { + "epoch": 1.591211225997046, + "grad_norm": 0.34507888555526733, + "learning_rate": 9.397709077472595e-05, + "loss": 0.2714, + "step": 4309 + }, + { + "epoch": 1.5915805022156573, + "grad_norm": 0.2786507308483124, + "learning_rate": 9.395245719916245e-05, + "loss": 0.2251, + "step": 4310 + }, + { + "epoch": 1.5919497784342689, + "grad_norm": 0.2837584614753723, + "learning_rate": 9.392782362359897e-05, + "loss": 0.2336, + "step": 4311 + }, + { + "epoch": 1.5923190546528803, + "grad_norm": 0.2021365910768509, + "learning_rate": 9.390319004803547e-05, + "loss": 0.1868, + "step": 4312 + }, + { + "epoch": 1.5926883308714919, + "grad_norm": 0.21287457644939423, + "learning_rate": 9.387855647247198e-05, + "loss": 0.1837, + "step": 4313 + }, + { + "epoch": 1.5930576070901035, + "grad_norm": 0.29180389642715454, + "learning_rate": 9.385392289690848e-05, + "loss": 0.2128, + "step": 4314 + }, + { + "epoch": 1.593426883308715, + "grad_norm": 0.2749641239643097, + "learning_rate": 9.3829289321345e-05, + "loss": 0.2073, + "step": 4315 + }, + { + "epoch": 1.5937961595273265, + "grad_norm": 0.2640233039855957, + "learning_rate": 9.38046557457815e-05, + "loss": 0.2467, + "step": 4316 + }, + { + "epoch": 1.5941654357459378, + "grad_norm": 0.31479495763778687, + "learning_rate": 9.378002217021801e-05, + "loss": 0.253, + "step": 4317 + }, + { + "epoch": 1.5945347119645494, + "grad_norm": 0.32752975821495056, + "learning_rate": 9.375538859465451e-05, + "loss": 0.2746, + "step": 4318 + }, + { + "epoch": 1.594903988183161, + "grad_norm": 0.2716265320777893, + "learning_rate": 9.373075501909102e-05, + "loss": 0.252, + "step": 4319 + }, + { + "epoch": 1.5952732644017726, + "grad_norm": 0.26598629355430603, + "learning_rate": 9.370612144352753e-05, + "loss": 0.2319, + "step": 4320 + }, + { + "epoch": 1.595642540620384, + "grad_norm": 0.2427518665790558, + "learning_rate": 9.368148786796403e-05, + "loss": 0.2091, + "step": 4321 + }, + { + "epoch": 1.5960118168389956, + "grad_norm": 0.36686015129089355, + "learning_rate": 9.365685429240055e-05, + "loss": 0.3001, + "step": 4322 + }, + { + "epoch": 1.596381093057607, + "grad_norm": 0.27344608306884766, + "learning_rate": 9.363222071683705e-05, + "loss": 0.2236, + "step": 4323 + }, + { + "epoch": 1.5967503692762186, + "grad_norm": 0.25575870275497437, + "learning_rate": 9.360758714127356e-05, + "loss": 0.2236, + "step": 4324 + }, + { + "epoch": 1.5971196454948302, + "grad_norm": 0.2633550465106964, + "learning_rate": 9.358295356571006e-05, + "loss": 0.1927, + "step": 4325 + }, + { + "epoch": 1.5974889217134418, + "grad_norm": 0.26798486709594727, + "learning_rate": 9.355831999014656e-05, + "loss": 0.2251, + "step": 4326 + }, + { + "epoch": 1.5978581979320532, + "grad_norm": 0.3109707534313202, + "learning_rate": 9.353368641458308e-05, + "loss": 0.2266, + "step": 4327 + }, + { + "epoch": 1.5982274741506646, + "grad_norm": 0.27218085527420044, + "learning_rate": 9.350905283901958e-05, + "loss": 0.2129, + "step": 4328 + }, + { + "epoch": 1.5985967503692762, + "grad_norm": 0.323910117149353, + "learning_rate": 9.34844192634561e-05, + "loss": 0.2893, + "step": 4329 + }, + { + "epoch": 1.5989660265878878, + "grad_norm": 0.29320478439331055, + "learning_rate": 9.34597856878926e-05, + "loss": 0.2249, + "step": 4330 + }, + { + "epoch": 1.5993353028064994, + "grad_norm": 0.28432488441467285, + "learning_rate": 9.343515211232911e-05, + "loss": 0.2616, + "step": 4331 + }, + { + "epoch": 1.5997045790251108, + "grad_norm": 0.2829086482524872, + "learning_rate": 9.341051853676561e-05, + "loss": 0.2229, + "step": 4332 + }, + { + "epoch": 1.6000738552437221, + "grad_norm": 0.29658791422843933, + "learning_rate": 9.338588496120213e-05, + "loss": 0.2278, + "step": 4333 + }, + { + "epoch": 1.6004431314623337, + "grad_norm": 0.25604844093322754, + "learning_rate": 9.336125138563863e-05, + "loss": 0.2345, + "step": 4334 + }, + { + "epoch": 1.6008124076809453, + "grad_norm": 0.28668370842933655, + "learning_rate": 9.333661781007513e-05, + "loss": 0.2626, + "step": 4335 + }, + { + "epoch": 1.601181683899557, + "grad_norm": 0.26734551787376404, + "learning_rate": 9.331198423451164e-05, + "loss": 0.2585, + "step": 4336 + }, + { + "epoch": 1.6015509601181686, + "grad_norm": 0.2896179258823395, + "learning_rate": 9.328735065894814e-05, + "loss": 0.2438, + "step": 4337 + }, + { + "epoch": 1.60192023633678, + "grad_norm": 0.2624828815460205, + "learning_rate": 9.326271708338466e-05, + "loss": 0.2337, + "step": 4338 + }, + { + "epoch": 1.6022895125553913, + "grad_norm": 0.24414487183094025, + "learning_rate": 9.323808350782116e-05, + "loss": 0.2332, + "step": 4339 + }, + { + "epoch": 1.602658788774003, + "grad_norm": 0.25343552231788635, + "learning_rate": 9.321344993225767e-05, + "loss": 0.2235, + "step": 4340 + }, + { + "epoch": 1.6030280649926145, + "grad_norm": 0.2515551745891571, + "learning_rate": 9.318881635669417e-05, + "loss": 0.2418, + "step": 4341 + }, + { + "epoch": 1.6033973412112261, + "grad_norm": 0.2902366518974304, + "learning_rate": 9.316418278113068e-05, + "loss": 0.2333, + "step": 4342 + }, + { + "epoch": 1.6037666174298375, + "grad_norm": 0.2690475285053253, + "learning_rate": 9.313954920556719e-05, + "loss": 0.2669, + "step": 4343 + }, + { + "epoch": 1.6041358936484489, + "grad_norm": 0.3078955113887787, + "learning_rate": 9.311491563000369e-05, + "loss": 0.26, + "step": 4344 + }, + { + "epoch": 1.6045051698670605, + "grad_norm": 0.22262433171272278, + "learning_rate": 9.30902820544402e-05, + "loss": 0.2101, + "step": 4345 + }, + { + "epoch": 1.604874446085672, + "grad_norm": 0.2608828544616699, + "learning_rate": 9.306564847887671e-05, + "loss": 0.2098, + "step": 4346 + }, + { + "epoch": 1.6052437223042837, + "grad_norm": 0.26358357071876526, + "learning_rate": 9.304101490331322e-05, + "loss": 0.2232, + "step": 4347 + }, + { + "epoch": 1.605612998522895, + "grad_norm": 0.25944313406944275, + "learning_rate": 9.301638132774972e-05, + "loss": 0.2143, + "step": 4348 + }, + { + "epoch": 1.6059822747415067, + "grad_norm": 0.2558457553386688, + "learning_rate": 9.299174775218624e-05, + "loss": 0.2125, + "step": 4349 + }, + { + "epoch": 1.606351550960118, + "grad_norm": 0.2252526432275772, + "learning_rate": 9.296711417662274e-05, + "loss": 0.1943, + "step": 4350 + }, + { + "epoch": 1.606351550960118, + "eval_loss": 8.560277938842773, + "eval_runtime": 6.906, + "eval_samples_per_second": 7.24, + "eval_steps_per_second": 1.014, + "step": 4350 + }, + { + "epoch": 1.6067208271787297, + "grad_norm": 0.324086993932724, + "learning_rate": 9.294248060105924e-05, + "loss": 0.2458, + "step": 4351 + }, + { + "epoch": 1.6070901033973413, + "grad_norm": 0.2828923761844635, + "learning_rate": 9.291784702549575e-05, + "loss": 0.2571, + "step": 4352 + }, + { + "epoch": 1.6074593796159529, + "grad_norm": 0.23643232882022858, + "learning_rate": 9.289321344993226e-05, + "loss": 0.2355, + "step": 4353 + }, + { + "epoch": 1.6078286558345642, + "grad_norm": 0.20197457075119019, + "learning_rate": 9.286857987436877e-05, + "loss": 0.1875, + "step": 4354 + }, + { + "epoch": 1.6081979320531756, + "grad_norm": 0.28388962149620056, + "learning_rate": 9.284394629880527e-05, + "loss": 0.2473, + "step": 4355 + }, + { + "epoch": 1.6085672082717872, + "grad_norm": 0.2933094799518585, + "learning_rate": 9.281931272324179e-05, + "loss": 0.247, + "step": 4356 + }, + { + "epoch": 1.6089364844903988, + "grad_norm": 0.22914531826972961, + "learning_rate": 9.279467914767829e-05, + "loss": 0.1943, + "step": 4357 + }, + { + "epoch": 1.6093057607090104, + "grad_norm": 0.2677878141403198, + "learning_rate": 9.277004557211479e-05, + "loss": 0.211, + "step": 4358 + }, + { + "epoch": 1.6096750369276218, + "grad_norm": 0.35986587405204773, + "learning_rate": 9.27454119965513e-05, + "loss": 0.2793, + "step": 4359 + }, + { + "epoch": 1.6100443131462334, + "grad_norm": 0.2859800457954407, + "learning_rate": 9.27207784209878e-05, + "loss": 0.1968, + "step": 4360 + }, + { + "epoch": 1.6104135893648448, + "grad_norm": 0.21314610540866852, + "learning_rate": 9.269614484542432e-05, + "loss": 0.1765, + "step": 4361 + }, + { + "epoch": 1.6107828655834564, + "grad_norm": 0.31760886311531067, + "learning_rate": 9.267151126986082e-05, + "loss": 0.2448, + "step": 4362 + }, + { + "epoch": 1.611152141802068, + "grad_norm": 0.2576107382774353, + "learning_rate": 9.264687769429733e-05, + "loss": 0.2316, + "step": 4363 + }, + { + "epoch": 1.6115214180206796, + "grad_norm": 0.28768306970596313, + "learning_rate": 9.262224411873384e-05, + "loss": 0.2169, + "step": 4364 + }, + { + "epoch": 1.611890694239291, + "grad_norm": 0.26528215408325195, + "learning_rate": 9.259761054317035e-05, + "loss": 0.2359, + "step": 4365 + }, + { + "epoch": 1.6122599704579024, + "grad_norm": 0.28595995903015137, + "learning_rate": 9.257297696760685e-05, + "loss": 0.2401, + "step": 4366 + }, + { + "epoch": 1.612629246676514, + "grad_norm": 0.2672710418701172, + "learning_rate": 9.254834339204335e-05, + "loss": 0.2397, + "step": 4367 + }, + { + "epoch": 1.6129985228951256, + "grad_norm": 0.24663472175598145, + "learning_rate": 9.252370981647987e-05, + "loss": 0.1983, + "step": 4368 + }, + { + "epoch": 1.6133677991137372, + "grad_norm": 0.3181290626525879, + "learning_rate": 9.249907624091637e-05, + "loss": 0.2636, + "step": 4369 + }, + { + "epoch": 1.6137370753323486, + "grad_norm": 0.26299792528152466, + "learning_rate": 9.247444266535288e-05, + "loss": 0.2118, + "step": 4370 + }, + { + "epoch": 1.6141063515509602, + "grad_norm": 0.31882691383361816, + "learning_rate": 9.244980908978938e-05, + "loss": 0.2355, + "step": 4371 + }, + { + "epoch": 1.6144756277695715, + "grad_norm": 0.22851574420928955, + "learning_rate": 9.24251755142259e-05, + "loss": 0.2006, + "step": 4372 + }, + { + "epoch": 1.6148449039881831, + "grad_norm": 0.21132752299308777, + "learning_rate": 9.24005419386624e-05, + "loss": 0.1882, + "step": 4373 + }, + { + "epoch": 1.6152141802067947, + "grad_norm": 0.24345532059669495, + "learning_rate": 9.23759083630989e-05, + "loss": 0.2019, + "step": 4374 + }, + { + "epoch": 1.6155834564254064, + "grad_norm": 0.25397005677223206, + "learning_rate": 9.235127478753542e-05, + "loss": 0.2083, + "step": 4375 + }, + { + "epoch": 1.6159527326440177, + "grad_norm": 0.2710159420967102, + "learning_rate": 9.232664121197192e-05, + "loss": 0.232, + "step": 4376 + }, + { + "epoch": 1.6163220088626291, + "grad_norm": 0.25521108508110046, + "learning_rate": 9.230200763640843e-05, + "loss": 0.2198, + "step": 4377 + }, + { + "epoch": 1.6166912850812407, + "grad_norm": 0.2474714070558548, + "learning_rate": 9.227737406084493e-05, + "loss": 0.1896, + "step": 4378 + }, + { + "epoch": 1.6170605612998523, + "grad_norm": 0.2805114686489105, + "learning_rate": 9.225274048528145e-05, + "loss": 0.2242, + "step": 4379 + }, + { + "epoch": 1.617429837518464, + "grad_norm": 0.25245898962020874, + "learning_rate": 9.222810690971795e-05, + "loss": 0.2315, + "step": 4380 + }, + { + "epoch": 1.6177991137370753, + "grad_norm": 0.25641945004463196, + "learning_rate": 9.220347333415445e-05, + "loss": 0.1768, + "step": 4381 + }, + { + "epoch": 1.618168389955687, + "grad_norm": 0.28127458691596985, + "learning_rate": 9.217883975859096e-05, + "loss": 0.2339, + "step": 4382 + }, + { + "epoch": 1.6185376661742983, + "grad_norm": 0.313975989818573, + "learning_rate": 9.215420618302746e-05, + "loss": 0.2652, + "step": 4383 + }, + { + "epoch": 1.6189069423929099, + "grad_norm": 0.3414907157421112, + "learning_rate": 9.212957260746398e-05, + "loss": 0.2699, + "step": 4384 + }, + { + "epoch": 1.6192762186115215, + "grad_norm": 0.25880712270736694, + "learning_rate": 9.210493903190048e-05, + "loss": 0.2282, + "step": 4385 + }, + { + "epoch": 1.619645494830133, + "grad_norm": 0.3199182450771332, + "learning_rate": 9.2080305456337e-05, + "loss": 0.2325, + "step": 4386 + }, + { + "epoch": 1.6200147710487445, + "grad_norm": 0.2711215913295746, + "learning_rate": 9.20556718807735e-05, + "loss": 0.2246, + "step": 4387 + }, + { + "epoch": 1.6203840472673559, + "grad_norm": 0.25906720757484436, + "learning_rate": 9.203103830521001e-05, + "loss": 0.2138, + "step": 4388 + }, + { + "epoch": 1.6207533234859675, + "grad_norm": 0.23228012025356293, + "learning_rate": 9.200640472964651e-05, + "loss": 0.1772, + "step": 4389 + }, + { + "epoch": 1.621122599704579, + "grad_norm": 0.2670915424823761, + "learning_rate": 9.198177115408301e-05, + "loss": 0.1799, + "step": 4390 + }, + { + "epoch": 1.6214918759231907, + "grad_norm": 0.24604369699954987, + "learning_rate": 9.195713757851953e-05, + "loss": 0.2523, + "step": 4391 + }, + { + "epoch": 1.621861152141802, + "grad_norm": 0.21664880216121674, + "learning_rate": 9.193250400295603e-05, + "loss": 0.1905, + "step": 4392 + }, + { + "epoch": 1.6222304283604134, + "grad_norm": 0.24081917107105255, + "learning_rate": 9.190787042739254e-05, + "loss": 0.2349, + "step": 4393 + }, + { + "epoch": 1.622599704579025, + "grad_norm": 0.2866224944591522, + "learning_rate": 9.188323685182904e-05, + "loss": 0.2482, + "step": 4394 + }, + { + "epoch": 1.6229689807976366, + "grad_norm": 0.29684486985206604, + "learning_rate": 9.185860327626556e-05, + "loss": 0.2158, + "step": 4395 + }, + { + "epoch": 1.6233382570162482, + "grad_norm": 0.2610672116279602, + "learning_rate": 9.183396970070206e-05, + "loss": 0.2357, + "step": 4396 + }, + { + "epoch": 1.6237075332348598, + "grad_norm": 0.23020614683628082, + "learning_rate": 9.180933612513856e-05, + "loss": 0.195, + "step": 4397 + }, + { + "epoch": 1.6240768094534712, + "grad_norm": 0.25154760479927063, + "learning_rate": 9.178470254957508e-05, + "loss": 0.1978, + "step": 4398 + }, + { + "epoch": 1.6244460856720826, + "grad_norm": 0.30506688356399536, + "learning_rate": 9.176006897401158e-05, + "loss": 0.2421, + "step": 4399 + }, + { + "epoch": 1.6248153618906942, + "grad_norm": 0.27587535977363586, + "learning_rate": 9.173543539844809e-05, + "loss": 0.2454, + "step": 4400 + }, + { + "epoch": 1.6248153618906942, + "eval_loss": 8.603718757629395, + "eval_runtime": 6.9105, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 1.013, + "step": 4400 + }, + { + "epoch": 1.6251846381093058, + "grad_norm": 0.30779048800468445, + "learning_rate": 9.171080182288459e-05, + "loss": 0.2153, + "step": 4401 + }, + { + "epoch": 1.6255539143279174, + "grad_norm": 0.3112994432449341, + "learning_rate": 9.168616824732111e-05, + "loss": 0.2374, + "step": 4402 + }, + { + "epoch": 1.6259231905465288, + "grad_norm": 0.3864116668701172, + "learning_rate": 9.166153467175761e-05, + "loss": 0.2948, + "step": 4403 + }, + { + "epoch": 1.6262924667651402, + "grad_norm": 0.2731153666973114, + "learning_rate": 9.163690109619412e-05, + "loss": 0.2038, + "step": 4404 + }, + { + "epoch": 1.6266617429837518, + "grad_norm": 0.24435505270957947, + "learning_rate": 9.161226752063062e-05, + "loss": 0.2165, + "step": 4405 + }, + { + "epoch": 1.6270310192023634, + "grad_norm": 0.2985380291938782, + "learning_rate": 9.158763394506713e-05, + "loss": 0.2402, + "step": 4406 + }, + { + "epoch": 1.627400295420975, + "grad_norm": 0.24354571104049683, + "learning_rate": 9.156300036950364e-05, + "loss": 0.2205, + "step": 4407 + }, + { + "epoch": 1.6277695716395866, + "grad_norm": 0.2648698687553406, + "learning_rate": 9.153836679394014e-05, + "loss": 0.2424, + "step": 4408 + }, + { + "epoch": 1.628138847858198, + "grad_norm": 0.24913465976715088, + "learning_rate": 9.151373321837666e-05, + "loss": 0.1698, + "step": 4409 + }, + { + "epoch": 1.6285081240768093, + "grad_norm": 0.24999235570430756, + "learning_rate": 9.148909964281316e-05, + "loss": 0.2289, + "step": 4410 + }, + { + "epoch": 1.628877400295421, + "grad_norm": 0.3075230121612549, + "learning_rate": 9.146446606724967e-05, + "loss": 0.2186, + "step": 4411 + }, + { + "epoch": 1.6292466765140325, + "grad_norm": 0.26258617639541626, + "learning_rate": 9.143983249168617e-05, + "loss": 0.2338, + "step": 4412 + }, + { + "epoch": 1.6296159527326441, + "grad_norm": 0.3226744830608368, + "learning_rate": 9.141519891612267e-05, + "loss": 0.2535, + "step": 4413 + }, + { + "epoch": 1.6299852289512555, + "grad_norm": 0.27790889143943787, + "learning_rate": 9.139056534055919e-05, + "loss": 0.2105, + "step": 4414 + }, + { + "epoch": 1.630354505169867, + "grad_norm": 0.28507715463638306, + "learning_rate": 9.136593176499569e-05, + "loss": 0.2273, + "step": 4415 + }, + { + "epoch": 1.6307237813884785, + "grad_norm": 0.2365242838859558, + "learning_rate": 9.13412981894322e-05, + "loss": 0.1995, + "step": 4416 + }, + { + "epoch": 1.6310930576070901, + "grad_norm": 0.27237915992736816, + "learning_rate": 9.13166646138687e-05, + "loss": 0.2262, + "step": 4417 + }, + { + "epoch": 1.6314623338257017, + "grad_norm": 0.2524102032184601, + "learning_rate": 9.129203103830522e-05, + "loss": 0.2149, + "step": 4418 + }, + { + "epoch": 1.631831610044313, + "grad_norm": 0.22756114602088928, + "learning_rate": 9.126739746274172e-05, + "loss": 0.221, + "step": 4419 + }, + { + "epoch": 1.6322008862629247, + "grad_norm": 0.2564525306224823, + "learning_rate": 9.124276388717824e-05, + "loss": 0.1862, + "step": 4420 + }, + { + "epoch": 1.632570162481536, + "grad_norm": 0.22287985682487488, + "learning_rate": 9.121813031161474e-05, + "loss": 0.1892, + "step": 4421 + }, + { + "epoch": 1.6329394387001477, + "grad_norm": 0.23777393996715546, + "learning_rate": 9.119349673605124e-05, + "loss": 0.208, + "step": 4422 + }, + { + "epoch": 1.6333087149187593, + "grad_norm": 0.3179725706577301, + "learning_rate": 9.116886316048775e-05, + "loss": 0.2411, + "step": 4423 + }, + { + "epoch": 1.6336779911373709, + "grad_norm": 0.255845308303833, + "learning_rate": 9.114422958492425e-05, + "loss": 0.2089, + "step": 4424 + }, + { + "epoch": 1.6340472673559823, + "grad_norm": 0.26181289553642273, + "learning_rate": 9.111959600936077e-05, + "loss": 0.1946, + "step": 4425 + }, + { + "epoch": 1.6344165435745936, + "grad_norm": 0.33533358573913574, + "learning_rate": 9.109496243379727e-05, + "loss": 0.2516, + "step": 4426 + }, + { + "epoch": 1.6347858197932053, + "grad_norm": 0.22640307247638702, + "learning_rate": 9.107032885823378e-05, + "loss": 0.1803, + "step": 4427 + }, + { + "epoch": 1.6351550960118169, + "grad_norm": 0.27431797981262207, + "learning_rate": 9.104569528267028e-05, + "loss": 0.2378, + "step": 4428 + }, + { + "epoch": 1.6355243722304285, + "grad_norm": 0.25171250104904175, + "learning_rate": 9.102106170710679e-05, + "loss": 0.2006, + "step": 4429 + }, + { + "epoch": 1.6358936484490398, + "grad_norm": 0.2583106458187103, + "learning_rate": 9.09964281315433e-05, + "loss": 0.2313, + "step": 4430 + }, + { + "epoch": 1.6362629246676514, + "grad_norm": 0.2734076976776123, + "learning_rate": 9.09717945559798e-05, + "loss": 0.2193, + "step": 4431 + }, + { + "epoch": 1.6366322008862628, + "grad_norm": 0.23614242672920227, + "learning_rate": 9.094716098041632e-05, + "loss": 0.1847, + "step": 4432 + }, + { + "epoch": 1.6370014771048744, + "grad_norm": 0.39616069197654724, + "learning_rate": 9.092252740485282e-05, + "loss": 0.2203, + "step": 4433 + }, + { + "epoch": 1.637370753323486, + "grad_norm": 0.2629946768283844, + "learning_rate": 9.089789382928933e-05, + "loss": 0.2253, + "step": 4434 + }, + { + "epoch": 1.6377400295420976, + "grad_norm": 0.3351542055606842, + "learning_rate": 9.087326025372583e-05, + "loss": 0.203, + "step": 4435 + }, + { + "epoch": 1.638109305760709, + "grad_norm": 0.3559204041957855, + "learning_rate": 9.084862667816235e-05, + "loss": 0.2843, + "step": 4436 + }, + { + "epoch": 1.6384785819793204, + "grad_norm": 0.21351653337478638, + "learning_rate": 9.082399310259885e-05, + "loss": 0.1876, + "step": 4437 + }, + { + "epoch": 1.638847858197932, + "grad_norm": 0.2293735295534134, + "learning_rate": 9.079935952703535e-05, + "loss": 0.2115, + "step": 4438 + }, + { + "epoch": 1.6392171344165436, + "grad_norm": 0.2577255666255951, + "learning_rate": 9.077472595147186e-05, + "loss": 0.2504, + "step": 4439 + }, + { + "epoch": 1.6395864106351552, + "grad_norm": 0.2347128838300705, + "learning_rate": 9.075009237590837e-05, + "loss": 0.2109, + "step": 4440 + }, + { + "epoch": 1.6399556868537666, + "grad_norm": 0.25779810547828674, + "learning_rate": 9.072545880034488e-05, + "loss": 0.2077, + "step": 4441 + }, + { + "epoch": 1.6403249630723782, + "grad_norm": 0.26894909143447876, + "learning_rate": 9.070082522478138e-05, + "loss": 0.2174, + "step": 4442 + }, + { + "epoch": 1.6406942392909896, + "grad_norm": 0.2303198128938675, + "learning_rate": 9.06761916492179e-05, + "loss": 0.2154, + "step": 4443 + }, + { + "epoch": 1.6410635155096012, + "grad_norm": 0.37128573656082153, + "learning_rate": 9.06515580736544e-05, + "loss": 0.2366, + "step": 4444 + }, + { + "epoch": 1.6414327917282128, + "grad_norm": 0.3127291202545166, + "learning_rate": 9.06269244980909e-05, + "loss": 0.2435, + "step": 4445 + }, + { + "epoch": 1.6418020679468244, + "grad_norm": 0.23107470571994781, + "learning_rate": 9.060229092252741e-05, + "loss": 0.2, + "step": 4446 + }, + { + "epoch": 1.6421713441654358, + "grad_norm": 0.28541290760040283, + "learning_rate": 9.057765734696391e-05, + "loss": 0.257, + "step": 4447 + }, + { + "epoch": 1.6425406203840471, + "grad_norm": 0.24584290385246277, + "learning_rate": 9.055302377140043e-05, + "loss": 0.2145, + "step": 4448 + }, + { + "epoch": 1.6429098966026587, + "grad_norm": 0.2853289544582367, + "learning_rate": 9.052839019583693e-05, + "loss": 0.2586, + "step": 4449 + }, + { + "epoch": 1.6432791728212703, + "grad_norm": 0.26111945509910583, + "learning_rate": 9.050375662027344e-05, + "loss": 0.2684, + "step": 4450 + }, + { + "epoch": 1.6432791728212703, + "eval_loss": 8.450728416442871, + "eval_runtime": 6.9114, + "eval_samples_per_second": 7.234, + "eval_steps_per_second": 1.013, + "step": 4450 + }, + { + "epoch": 1.643648449039882, + "grad_norm": 0.2327665239572525, + "learning_rate": 9.047912304470995e-05, + "loss": 0.2307, + "step": 4451 + }, + { + "epoch": 1.6440177252584933, + "grad_norm": 0.26080307364463806, + "learning_rate": 9.045448946914645e-05, + "loss": 0.2198, + "step": 4452 + }, + { + "epoch": 1.644387001477105, + "grad_norm": 0.28412219882011414, + "learning_rate": 9.042985589358296e-05, + "loss": 0.2584, + "step": 4453 + }, + { + "epoch": 1.6447562776957163, + "grad_norm": 0.25336772203445435, + "learning_rate": 9.040522231801946e-05, + "loss": 0.2215, + "step": 4454 + }, + { + "epoch": 1.645125553914328, + "grad_norm": 0.27296313643455505, + "learning_rate": 9.038058874245598e-05, + "loss": 0.2286, + "step": 4455 + }, + { + "epoch": 1.6454948301329395, + "grad_norm": 0.3685697019100189, + "learning_rate": 9.035595516689248e-05, + "loss": 0.2698, + "step": 4456 + }, + { + "epoch": 1.6458641063515511, + "grad_norm": 0.22689512372016907, + "learning_rate": 9.033132159132899e-05, + "loss": 0.2155, + "step": 4457 + }, + { + "epoch": 1.6462333825701625, + "grad_norm": 0.2767208218574524, + "learning_rate": 9.03066880157655e-05, + "loss": 0.2148, + "step": 4458 + }, + { + "epoch": 1.6466026587887739, + "grad_norm": 0.2580651640892029, + "learning_rate": 9.028205444020201e-05, + "loss": 0.1992, + "step": 4459 + }, + { + "epoch": 1.6469719350073855, + "grad_norm": 0.22029533982276917, + "learning_rate": 9.025742086463851e-05, + "loss": 0.1998, + "step": 4460 + }, + { + "epoch": 1.647341211225997, + "grad_norm": 0.3048872947692871, + "learning_rate": 9.023278728907501e-05, + "loss": 0.2698, + "step": 4461 + }, + { + "epoch": 1.6477104874446087, + "grad_norm": 0.22069992125034332, + "learning_rate": 9.020815371351152e-05, + "loss": 0.2031, + "step": 4462 + }, + { + "epoch": 1.64807976366322, + "grad_norm": 0.2402629405260086, + "learning_rate": 9.018352013794803e-05, + "loss": 0.2322, + "step": 4463 + }, + { + "epoch": 1.6484490398818314, + "grad_norm": 0.2560381293296814, + "learning_rate": 9.015888656238454e-05, + "loss": 0.2272, + "step": 4464 + }, + { + "epoch": 1.648818316100443, + "grad_norm": 0.3089340329170227, + "learning_rate": 9.013425298682104e-05, + "loss": 0.2148, + "step": 4465 + }, + { + "epoch": 1.6491875923190547, + "grad_norm": 0.27589669823646545, + "learning_rate": 9.010961941125756e-05, + "loss": 0.2303, + "step": 4466 + }, + { + "epoch": 1.6495568685376663, + "grad_norm": 0.3324155807495117, + "learning_rate": 9.008498583569406e-05, + "loss": 0.236, + "step": 4467 + }, + { + "epoch": 1.6499261447562779, + "grad_norm": 0.23615163564682007, + "learning_rate": 9.006035226013056e-05, + "loss": 0.2205, + "step": 4468 + }, + { + "epoch": 1.6502954209748892, + "grad_norm": 0.2249806523323059, + "learning_rate": 9.003571868456707e-05, + "loss": 0.1887, + "step": 4469 + }, + { + "epoch": 1.6506646971935006, + "grad_norm": 0.2413998246192932, + "learning_rate": 9.001108510900357e-05, + "loss": 0.2149, + "step": 4470 + }, + { + "epoch": 1.6510339734121122, + "grad_norm": 0.23416151106357574, + "learning_rate": 8.998645153344009e-05, + "loss": 0.2094, + "step": 4471 + }, + { + "epoch": 1.6514032496307238, + "grad_norm": 0.22638297080993652, + "learning_rate": 8.996181795787659e-05, + "loss": 0.1766, + "step": 4472 + }, + { + "epoch": 1.6517725258493354, + "grad_norm": 0.313290536403656, + "learning_rate": 8.99371843823131e-05, + "loss": 0.2551, + "step": 4473 + }, + { + "epoch": 1.6521418020679468, + "grad_norm": 0.2513146698474884, + "learning_rate": 8.99125508067496e-05, + "loss": 0.2278, + "step": 4474 + }, + { + "epoch": 1.6525110782865582, + "grad_norm": 0.3417368233203888, + "learning_rate": 8.988791723118612e-05, + "loss": 0.1914, + "step": 4475 + }, + { + "epoch": 1.6528803545051698, + "grad_norm": 0.23986800014972687, + "learning_rate": 8.986328365562262e-05, + "loss": 0.2004, + "step": 4476 + }, + { + "epoch": 1.6532496307237814, + "grad_norm": 0.27774810791015625, + "learning_rate": 8.983865008005912e-05, + "loss": 0.2218, + "step": 4477 + }, + { + "epoch": 1.653618906942393, + "grad_norm": 0.22682726383209229, + "learning_rate": 8.981401650449564e-05, + "loss": 0.1863, + "step": 4478 + }, + { + "epoch": 1.6539881831610044, + "grad_norm": 0.2327871471643448, + "learning_rate": 8.978938292893214e-05, + "loss": 0.1801, + "step": 4479 + }, + { + "epoch": 1.654357459379616, + "grad_norm": 0.28414270281791687, + "learning_rate": 8.976474935336865e-05, + "loss": 0.2481, + "step": 4480 + }, + { + "epoch": 1.6547267355982274, + "grad_norm": 0.24541908502578735, + "learning_rate": 8.974011577780515e-05, + "loss": 0.2035, + "step": 4481 + }, + { + "epoch": 1.655096011816839, + "grad_norm": 0.29181188344955444, + "learning_rate": 8.971548220224167e-05, + "loss": 0.2255, + "step": 4482 + }, + { + "epoch": 1.6554652880354506, + "grad_norm": 0.28111740946769714, + "learning_rate": 8.969084862667817e-05, + "loss": 0.2323, + "step": 4483 + }, + { + "epoch": 1.6558345642540622, + "grad_norm": 0.26628875732421875, + "learning_rate": 8.966621505111467e-05, + "loss": 0.2336, + "step": 4484 + }, + { + "epoch": 1.6562038404726735, + "grad_norm": 0.23158717155456543, + "learning_rate": 8.964158147555119e-05, + "loss": 0.2151, + "step": 4485 + }, + { + "epoch": 1.656573116691285, + "grad_norm": 0.26410263776779175, + "learning_rate": 8.961694789998769e-05, + "loss": 0.2457, + "step": 4486 + }, + { + "epoch": 1.6569423929098965, + "grad_norm": 0.25557923316955566, + "learning_rate": 8.95923143244242e-05, + "loss": 0.212, + "step": 4487 + }, + { + "epoch": 1.6573116691285081, + "grad_norm": 0.27062490582466125, + "learning_rate": 8.95676807488607e-05, + "loss": 0.2099, + "step": 4488 + }, + { + "epoch": 1.6576809453471197, + "grad_norm": 0.2580481469631195, + "learning_rate": 8.954304717329722e-05, + "loss": 0.2054, + "step": 4489 + }, + { + "epoch": 1.6580502215657311, + "grad_norm": 0.24587883055210114, + "learning_rate": 8.951841359773372e-05, + "loss": 0.2223, + "step": 4490 + }, + { + "epoch": 1.6584194977843427, + "grad_norm": 0.31387656927108765, + "learning_rate": 8.949378002217023e-05, + "loss": 0.2229, + "step": 4491 + }, + { + "epoch": 1.658788774002954, + "grad_norm": 0.21689686179161072, + "learning_rate": 8.946914644660673e-05, + "loss": 0.1798, + "step": 4492 + }, + { + "epoch": 1.6591580502215657, + "grad_norm": 0.23878653347492218, + "learning_rate": 8.944451287104323e-05, + "loss": 0.213, + "step": 4493 + }, + { + "epoch": 1.6595273264401773, + "grad_norm": 0.3204362690448761, + "learning_rate": 8.941987929547975e-05, + "loss": 0.2331, + "step": 4494 + }, + { + "epoch": 1.659896602658789, + "grad_norm": 0.2179742306470871, + "learning_rate": 8.939524571991625e-05, + "loss": 0.2212, + "step": 4495 + }, + { + "epoch": 1.6602658788774003, + "grad_norm": 0.2339923083782196, + "learning_rate": 8.937061214435276e-05, + "loss": 0.2096, + "step": 4496 + }, + { + "epoch": 1.6606351550960117, + "grad_norm": 0.2619478702545166, + "learning_rate": 8.934597856878927e-05, + "loss": 0.1959, + "step": 4497 + }, + { + "epoch": 1.6610044313146233, + "grad_norm": 0.3399810194969177, + "learning_rate": 8.932134499322578e-05, + "loss": 0.303, + "step": 4498 + }, + { + "epoch": 1.6613737075332349, + "grad_norm": 0.3005475103855133, + "learning_rate": 8.929671141766228e-05, + "loss": 0.2258, + "step": 4499 + }, + { + "epoch": 1.6617429837518465, + "grad_norm": 0.2556156516075134, + "learning_rate": 8.927207784209878e-05, + "loss": 0.2144, + "step": 4500 + }, + { + "epoch": 1.6617429837518465, + "eval_loss": 8.411576271057129, + "eval_runtime": 6.9174, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 1.012, + "step": 4500 + }, + { + "epoch": 1.6621122599704579, + "grad_norm": 0.23614928126335144, + "learning_rate": 8.92474442665353e-05, + "loss": 0.2139, + "step": 4501 + }, + { + "epoch": 1.6624815361890695, + "grad_norm": 0.29147398471832275, + "learning_rate": 8.92228106909718e-05, + "loss": 0.238, + "step": 4502 + }, + { + "epoch": 1.6628508124076808, + "grad_norm": 0.26998746395111084, + "learning_rate": 8.919817711540831e-05, + "loss": 0.2305, + "step": 4503 + }, + { + "epoch": 1.6632200886262924, + "grad_norm": 0.3895392119884491, + "learning_rate": 8.917354353984481e-05, + "loss": 0.2263, + "step": 4504 + }, + { + "epoch": 1.663589364844904, + "grad_norm": 0.2641807496547699, + "learning_rate": 8.914890996428133e-05, + "loss": 0.2073, + "step": 4505 + }, + { + "epoch": 1.6639586410635157, + "grad_norm": 0.27693089842796326, + "learning_rate": 8.912427638871783e-05, + "loss": 0.2241, + "step": 4506 + }, + { + "epoch": 1.664327917282127, + "grad_norm": 0.30198314785957336, + "learning_rate": 8.909964281315434e-05, + "loss": 0.2526, + "step": 4507 + }, + { + "epoch": 1.6646971935007384, + "grad_norm": 0.27313119173049927, + "learning_rate": 8.907500923759085e-05, + "loss": 0.2714, + "step": 4508 + }, + { + "epoch": 1.66506646971935, + "grad_norm": 0.2115759700536728, + "learning_rate": 8.905037566202735e-05, + "loss": 0.1949, + "step": 4509 + }, + { + "epoch": 1.6654357459379616, + "grad_norm": 0.2874866724014282, + "learning_rate": 8.902574208646386e-05, + "loss": 0.23, + "step": 4510 + }, + { + "epoch": 1.6658050221565732, + "grad_norm": 0.26632991433143616, + "learning_rate": 8.900110851090036e-05, + "loss": 0.2592, + "step": 4511 + }, + { + "epoch": 1.6661742983751846, + "grad_norm": 0.2636313736438751, + "learning_rate": 8.897647493533688e-05, + "loss": 0.2216, + "step": 4512 + }, + { + "epoch": 1.6665435745937962, + "grad_norm": 0.2382415533065796, + "learning_rate": 8.895184135977338e-05, + "loss": 0.2181, + "step": 4513 + }, + { + "epoch": 1.6669128508124076, + "grad_norm": 0.2203097939491272, + "learning_rate": 8.892720778420989e-05, + "loss": 0.2042, + "step": 4514 + }, + { + "epoch": 1.6672821270310192, + "grad_norm": 0.26166653633117676, + "learning_rate": 8.89025742086464e-05, + "loss": 0.1809, + "step": 4515 + }, + { + "epoch": 1.6676514032496308, + "grad_norm": 0.43701672554016113, + "learning_rate": 8.88779406330829e-05, + "loss": 0.2743, + "step": 4516 + }, + { + "epoch": 1.6680206794682424, + "grad_norm": 0.26765570044517517, + "learning_rate": 8.885330705751941e-05, + "loss": 0.2422, + "step": 4517 + }, + { + "epoch": 1.6683899556868538, + "grad_norm": 0.25912296772003174, + "learning_rate": 8.882867348195591e-05, + "loss": 0.2217, + "step": 4518 + }, + { + "epoch": 1.6687592319054652, + "grad_norm": 0.26312991976737976, + "learning_rate": 8.880403990639243e-05, + "loss": 0.2188, + "step": 4519 + }, + { + "epoch": 1.6691285081240768, + "grad_norm": 0.27818185091018677, + "learning_rate": 8.877940633082893e-05, + "loss": 0.2211, + "step": 4520 + }, + { + "epoch": 1.6694977843426884, + "grad_norm": 0.22294266521930695, + "learning_rate": 8.875477275526544e-05, + "loss": 0.2, + "step": 4521 + }, + { + "epoch": 1.6698670605613, + "grad_norm": 0.23186852037906647, + "learning_rate": 8.873013917970194e-05, + "loss": 0.2025, + "step": 4522 + }, + { + "epoch": 1.6702363367799113, + "grad_norm": 0.2861183285713196, + "learning_rate": 8.870550560413846e-05, + "loss": 0.27, + "step": 4523 + }, + { + "epoch": 1.670605612998523, + "grad_norm": 0.2688869535923004, + "learning_rate": 8.868087202857496e-05, + "loss": 0.2306, + "step": 4524 + }, + { + "epoch": 1.6709748892171343, + "grad_norm": 0.23829078674316406, + "learning_rate": 8.865623845301146e-05, + "loss": 0.2056, + "step": 4525 + }, + { + "epoch": 1.671344165435746, + "grad_norm": 0.2583219110965729, + "learning_rate": 8.863160487744797e-05, + "loss": 0.2285, + "step": 4526 + }, + { + "epoch": 1.6717134416543575, + "grad_norm": 0.30879443883895874, + "learning_rate": 8.860697130188447e-05, + "loss": 0.2348, + "step": 4527 + }, + { + "epoch": 1.6720827178729691, + "grad_norm": 0.2706676125526428, + "learning_rate": 8.858233772632099e-05, + "loss": 0.2211, + "step": 4528 + }, + { + "epoch": 1.6724519940915805, + "grad_norm": 0.23030823469161987, + "learning_rate": 8.855770415075748e-05, + "loss": 0.2062, + "step": 4529 + }, + { + "epoch": 1.672821270310192, + "grad_norm": 0.2939527928829193, + "learning_rate": 8.853307057519399e-05, + "loss": 0.2615, + "step": 4530 + }, + { + "epoch": 1.6731905465288035, + "grad_norm": 0.2624320983886719, + "learning_rate": 8.850843699963049e-05, + "loss": 0.2091, + "step": 4531 + }, + { + "epoch": 1.673559822747415, + "grad_norm": 0.20729155838489532, + "learning_rate": 8.848380342406701e-05, + "loss": 0.1914, + "step": 4532 + }, + { + "epoch": 1.6739290989660267, + "grad_norm": 0.2327888309955597, + "learning_rate": 8.845916984850351e-05, + "loss": 0.2, + "step": 4533 + }, + { + "epoch": 1.674298375184638, + "grad_norm": 0.31823375821113586, + "learning_rate": 8.843453627294001e-05, + "loss": 0.2119, + "step": 4534 + }, + { + "epoch": 1.6746676514032495, + "grad_norm": 0.25384020805358887, + "learning_rate": 8.840990269737652e-05, + "loss": 0.1903, + "step": 4535 + }, + { + "epoch": 1.675036927621861, + "grad_norm": 0.287805438041687, + "learning_rate": 8.838526912181303e-05, + "loss": 0.2306, + "step": 4536 + }, + { + "epoch": 1.6754062038404727, + "grad_norm": 0.3064287602901459, + "learning_rate": 8.836063554624954e-05, + "loss": 0.1921, + "step": 4537 + }, + { + "epoch": 1.6757754800590843, + "grad_norm": 0.264697790145874, + "learning_rate": 8.833600197068604e-05, + "loss": 0.2248, + "step": 4538 + }, + { + "epoch": 1.6761447562776959, + "grad_norm": 0.2815844714641571, + "learning_rate": 8.831136839512256e-05, + "loss": 0.2633, + "step": 4539 + }, + { + "epoch": 1.6765140324963073, + "grad_norm": 0.2589470148086548, + "learning_rate": 8.828673481955906e-05, + "loss": 0.1819, + "step": 4540 + }, + { + "epoch": 1.6768833087149186, + "grad_norm": 0.21535301208496094, + "learning_rate": 8.826210124399557e-05, + "loss": 0.1881, + "step": 4541 + }, + { + "epoch": 1.6772525849335302, + "grad_norm": 0.29212647676467896, + "learning_rate": 8.823746766843207e-05, + "loss": 0.2185, + "step": 4542 + }, + { + "epoch": 1.6776218611521418, + "grad_norm": 0.2741040289402008, + "learning_rate": 8.821283409286857e-05, + "loss": 0.2196, + "step": 4543 + }, + { + "epoch": 1.6779911373707534, + "grad_norm": 0.2383076548576355, + "learning_rate": 8.818820051730509e-05, + "loss": 0.2046, + "step": 4544 + }, + { + "epoch": 1.6783604135893648, + "grad_norm": 0.2639963924884796, + "learning_rate": 8.816356694174159e-05, + "loss": 0.2334, + "step": 4545 + }, + { + "epoch": 1.6787296898079762, + "grad_norm": 0.3037216067314148, + "learning_rate": 8.81389333661781e-05, + "loss": 0.1947, + "step": 4546 + }, + { + "epoch": 1.6790989660265878, + "grad_norm": 0.2624236047267914, + "learning_rate": 8.81142997906146e-05, + "loss": 0.2056, + "step": 4547 + }, + { + "epoch": 1.6794682422451994, + "grad_norm": 0.23620982468128204, + "learning_rate": 8.808966621505112e-05, + "loss": 0.1979, + "step": 4548 + }, + { + "epoch": 1.679837518463811, + "grad_norm": 0.21172912418842316, + "learning_rate": 8.806503263948762e-05, + "loss": 0.177, + "step": 4549 + }, + { + "epoch": 1.6802067946824224, + "grad_norm": 0.3856848478317261, + "learning_rate": 8.804039906392412e-05, + "loss": 0.259, + "step": 4550 + }, + { + "epoch": 1.6802067946824224, + "eval_loss": 8.5475435256958, + "eval_runtime": 6.9151, + "eval_samples_per_second": 7.231, + "eval_steps_per_second": 1.012, + "step": 4550 + }, + { + "epoch": 1.680576070901034, + "grad_norm": 0.23105467855930328, + "learning_rate": 8.801576548836064e-05, + "loss": 0.2161, + "step": 4551 + }, + { + "epoch": 1.6809453471196454, + "grad_norm": 0.29655417799949646, + "learning_rate": 8.799113191279714e-05, + "loss": 0.217, + "step": 4552 + }, + { + "epoch": 1.681314623338257, + "grad_norm": 0.2758445739746094, + "learning_rate": 8.796649833723365e-05, + "loss": 0.2431, + "step": 4553 + }, + { + "epoch": 1.6816838995568686, + "grad_norm": 0.2686046063899994, + "learning_rate": 8.794186476167015e-05, + "loss": 0.1973, + "step": 4554 + }, + { + "epoch": 1.6820531757754802, + "grad_norm": 0.24882875382900238, + "learning_rate": 8.791723118610667e-05, + "loss": 0.2335, + "step": 4555 + }, + { + "epoch": 1.6824224519940916, + "grad_norm": 0.25683221220970154, + "learning_rate": 8.789259761054317e-05, + "loss": 0.2231, + "step": 4556 + }, + { + "epoch": 1.682791728212703, + "grad_norm": 0.25730934739112854, + "learning_rate": 8.786796403497968e-05, + "loss": 0.252, + "step": 4557 + }, + { + "epoch": 1.6831610044313146, + "grad_norm": 0.2846654951572418, + "learning_rate": 8.784333045941618e-05, + "loss": 0.2449, + "step": 4558 + }, + { + "epoch": 1.6835302806499262, + "grad_norm": 0.4119119346141815, + "learning_rate": 8.781869688385269e-05, + "loss": 0.2866, + "step": 4559 + }, + { + "epoch": 1.6838995568685378, + "grad_norm": 0.2867501974105835, + "learning_rate": 8.77940633082892e-05, + "loss": 0.229, + "step": 4560 + }, + { + "epoch": 1.6842688330871491, + "grad_norm": 0.2786272168159485, + "learning_rate": 8.77694297327257e-05, + "loss": 0.2252, + "step": 4561 + }, + { + "epoch": 1.6846381093057607, + "grad_norm": 0.3096006512641907, + "learning_rate": 8.774479615716222e-05, + "loss": 0.232, + "step": 4562 + }, + { + "epoch": 1.6850073855243721, + "grad_norm": 0.3025314509868622, + "learning_rate": 8.772016258159872e-05, + "loss": 0.2052, + "step": 4563 + }, + { + "epoch": 1.6853766617429837, + "grad_norm": 0.2546060383319855, + "learning_rate": 8.769552900603523e-05, + "loss": 0.2061, + "step": 4564 + }, + { + "epoch": 1.6857459379615953, + "grad_norm": 0.27384698390960693, + "learning_rate": 8.767089543047173e-05, + "loss": 0.1982, + "step": 4565 + }, + { + "epoch": 1.686115214180207, + "grad_norm": 0.22614800930023193, + "learning_rate": 8.764626185490823e-05, + "loss": 0.1787, + "step": 4566 + }, + { + "epoch": 1.6864844903988183, + "grad_norm": 0.22748345136642456, + "learning_rate": 8.762162827934475e-05, + "loss": 0.2267, + "step": 4567 + }, + { + "epoch": 1.6868537666174297, + "grad_norm": 0.2576672434806824, + "learning_rate": 8.759699470378125e-05, + "loss": 0.2177, + "step": 4568 + }, + { + "epoch": 1.6872230428360413, + "grad_norm": 0.1900009661912918, + "learning_rate": 8.757236112821776e-05, + "loss": 0.1648, + "step": 4569 + }, + { + "epoch": 1.687592319054653, + "grad_norm": 0.22448039054870605, + "learning_rate": 8.754772755265427e-05, + "loss": 0.2018, + "step": 4570 + }, + { + "epoch": 1.6879615952732645, + "grad_norm": 0.35891246795654297, + "learning_rate": 8.752309397709078e-05, + "loss": 0.205, + "step": 4571 + }, + { + "epoch": 1.6883308714918759, + "grad_norm": 0.2575661540031433, + "learning_rate": 8.749846040152728e-05, + "loss": 0.1995, + "step": 4572 + }, + { + "epoch": 1.6887001477104875, + "grad_norm": 0.2475462704896927, + "learning_rate": 8.74738268259638e-05, + "loss": 0.2018, + "step": 4573 + }, + { + "epoch": 1.6890694239290989, + "grad_norm": 0.23700089752674103, + "learning_rate": 8.74491932504003e-05, + "loss": 0.185, + "step": 4574 + }, + { + "epoch": 1.6894387001477105, + "grad_norm": 0.21290355920791626, + "learning_rate": 8.74245596748368e-05, + "loss": 0.2088, + "step": 4575 + }, + { + "epoch": 1.689807976366322, + "grad_norm": 0.23290672898292542, + "learning_rate": 8.739992609927331e-05, + "loss": 0.2086, + "step": 4576 + }, + { + "epoch": 1.6901772525849337, + "grad_norm": 0.2843339145183563, + "learning_rate": 8.737529252370981e-05, + "loss": 0.2319, + "step": 4577 + }, + { + "epoch": 1.690546528803545, + "grad_norm": 0.2296927124261856, + "learning_rate": 8.735065894814633e-05, + "loss": 0.2141, + "step": 4578 + }, + { + "epoch": 1.6909158050221564, + "grad_norm": 0.30377277731895447, + "learning_rate": 8.732602537258283e-05, + "loss": 0.2431, + "step": 4579 + }, + { + "epoch": 1.691285081240768, + "grad_norm": 0.2648451626300812, + "learning_rate": 8.730139179701934e-05, + "loss": 0.2224, + "step": 4580 + }, + { + "epoch": 1.6916543574593796, + "grad_norm": 0.22149725258350372, + "learning_rate": 8.727675822145585e-05, + "loss": 0.1822, + "step": 4581 + }, + { + "epoch": 1.6920236336779912, + "grad_norm": 0.2965066134929657, + "learning_rate": 8.725212464589235e-05, + "loss": 0.2592, + "step": 4582 + }, + { + "epoch": 1.6923929098966026, + "grad_norm": 0.21857520937919617, + "learning_rate": 8.722749107032886e-05, + "loss": 0.1954, + "step": 4583 + }, + { + "epoch": 1.6927621861152142, + "grad_norm": 0.32628974318504333, + "learning_rate": 8.720285749476536e-05, + "loss": 0.2329, + "step": 4584 + }, + { + "epoch": 1.6931314623338256, + "grad_norm": 0.2505199909210205, + "learning_rate": 8.717822391920188e-05, + "loss": 0.2236, + "step": 4585 + }, + { + "epoch": 1.6935007385524372, + "grad_norm": 0.2687026262283325, + "learning_rate": 8.715359034363838e-05, + "loss": 0.2288, + "step": 4586 + }, + { + "epoch": 1.6938700147710488, + "grad_norm": 0.20491193234920502, + "learning_rate": 8.712895676807489e-05, + "loss": 0.1793, + "step": 4587 + }, + { + "epoch": 1.6942392909896604, + "grad_norm": 0.29209116101264954, + "learning_rate": 8.71043231925114e-05, + "loss": 0.2214, + "step": 4588 + }, + { + "epoch": 1.6946085672082718, + "grad_norm": 0.3510444462299347, + "learning_rate": 8.707968961694791e-05, + "loss": 0.2423, + "step": 4589 + }, + { + "epoch": 1.6949778434268832, + "grad_norm": 0.266463041305542, + "learning_rate": 8.705505604138441e-05, + "loss": 0.2268, + "step": 4590 + }, + { + "epoch": 1.6953471196454948, + "grad_norm": 0.25104019045829773, + "learning_rate": 8.703042246582091e-05, + "loss": 0.2296, + "step": 4591 + }, + { + "epoch": 1.6957163958641064, + "grad_norm": 0.3233650326728821, + "learning_rate": 8.700578889025742e-05, + "loss": 0.1925, + "step": 4592 + }, + { + "epoch": 1.696085672082718, + "grad_norm": 0.2694375813007355, + "learning_rate": 8.698115531469393e-05, + "loss": 0.2297, + "step": 4593 + }, + { + "epoch": 1.6964549483013294, + "grad_norm": 0.2822797894477844, + "learning_rate": 8.695652173913044e-05, + "loss": 0.243, + "step": 4594 + }, + { + "epoch": 1.696824224519941, + "grad_norm": 0.26656627655029297, + "learning_rate": 8.693188816356694e-05, + "loss": 0.2018, + "step": 4595 + }, + { + "epoch": 1.6971935007385524, + "grad_norm": 0.22680732607841492, + "learning_rate": 8.690725458800346e-05, + "loss": 0.2025, + "step": 4596 + }, + { + "epoch": 1.697562776957164, + "grad_norm": 0.24421238899230957, + "learning_rate": 8.688262101243996e-05, + "loss": 0.2123, + "step": 4597 + }, + { + "epoch": 1.6979320531757756, + "grad_norm": 0.3312530219554901, + "learning_rate": 8.685798743687646e-05, + "loss": 0.2772, + "step": 4598 + }, + { + "epoch": 1.6983013293943872, + "grad_norm": 0.4265027642250061, + "learning_rate": 8.683335386131297e-05, + "loss": 0.247, + "step": 4599 + }, + { + "epoch": 1.6986706056129985, + "grad_norm": 0.26428765058517456, + "learning_rate": 8.680872028574947e-05, + "loss": 0.1901, + "step": 4600 + }, + { + "epoch": 1.6986706056129985, + "eval_loss": 8.581058502197266, + "eval_runtime": 6.9045, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 1.014, + "step": 4600 + }, + { + "epoch": 1.69903988183161, + "grad_norm": 0.25573796033859253, + "learning_rate": 8.678408671018599e-05, + "loss": 0.2326, + "step": 4601 + }, + { + "epoch": 1.6994091580502215, + "grad_norm": 0.2627505660057068, + "learning_rate": 8.675945313462249e-05, + "loss": 0.2314, + "step": 4602 + }, + { + "epoch": 1.6997784342688331, + "grad_norm": 0.22598692774772644, + "learning_rate": 8.6734819559059e-05, + "loss": 0.1983, + "step": 4603 + }, + { + "epoch": 1.7001477104874447, + "grad_norm": 0.26533573865890503, + "learning_rate": 8.67101859834955e-05, + "loss": 0.214, + "step": 4604 + }, + { + "epoch": 1.700516986706056, + "grad_norm": 0.2719341814517975, + "learning_rate": 8.668555240793201e-05, + "loss": 0.2275, + "step": 4605 + }, + { + "epoch": 1.7008862629246675, + "grad_norm": 0.25944045186042786, + "learning_rate": 8.666091883236852e-05, + "loss": 0.2288, + "step": 4606 + }, + { + "epoch": 1.701255539143279, + "grad_norm": 0.266681432723999, + "learning_rate": 8.663628525680502e-05, + "loss": 0.2141, + "step": 4607 + }, + { + "epoch": 1.7016248153618907, + "grad_norm": 0.26570042967796326, + "learning_rate": 8.661165168124154e-05, + "loss": 0.2143, + "step": 4608 + }, + { + "epoch": 1.7019940915805023, + "grad_norm": 0.2453954666852951, + "learning_rate": 8.658701810567804e-05, + "loss": 0.208, + "step": 4609 + }, + { + "epoch": 1.702363367799114, + "grad_norm": 0.25718823075294495, + "learning_rate": 8.656238453011455e-05, + "loss": 0.2079, + "step": 4610 + }, + { + "epoch": 1.7027326440177253, + "grad_norm": 0.3281457722187042, + "learning_rate": 8.653775095455105e-05, + "loss": 0.2506, + "step": 4611 + }, + { + "epoch": 1.7031019202363367, + "grad_norm": 0.24291884899139404, + "learning_rate": 8.651311737898757e-05, + "loss": 0.2224, + "step": 4612 + }, + { + "epoch": 1.7034711964549483, + "grad_norm": 0.232660710811615, + "learning_rate": 8.648848380342407e-05, + "loss": 0.1785, + "step": 4613 + }, + { + "epoch": 1.7038404726735599, + "grad_norm": 0.2992324233055115, + "learning_rate": 8.646385022786057e-05, + "loss": 0.216, + "step": 4614 + }, + { + "epoch": 1.7042097488921715, + "grad_norm": 0.31075674295425415, + "learning_rate": 8.643921665229709e-05, + "loss": 0.2268, + "step": 4615 + }, + { + "epoch": 1.7045790251107829, + "grad_norm": 0.28981921076774597, + "learning_rate": 8.641458307673359e-05, + "loss": 0.221, + "step": 4616 + }, + { + "epoch": 1.7049483013293942, + "grad_norm": 0.24357815086841583, + "learning_rate": 8.63899495011701e-05, + "loss": 0.1974, + "step": 4617 + }, + { + "epoch": 1.7053175775480058, + "grad_norm": 0.3200026750564575, + "learning_rate": 8.63653159256066e-05, + "loss": 0.2495, + "step": 4618 + }, + { + "epoch": 1.7056868537666174, + "grad_norm": 0.29544976353645325, + "learning_rate": 8.634068235004312e-05, + "loss": 0.213, + "step": 4619 + }, + { + "epoch": 1.706056129985229, + "grad_norm": 0.3011132776737213, + "learning_rate": 8.631604877447962e-05, + "loss": 0.2214, + "step": 4620 + }, + { + "epoch": 1.7064254062038404, + "grad_norm": 0.2651318311691284, + "learning_rate": 8.629141519891612e-05, + "loss": 0.1787, + "step": 4621 + }, + { + "epoch": 1.706794682422452, + "grad_norm": 0.21498651802539825, + "learning_rate": 8.626678162335263e-05, + "loss": 0.1743, + "step": 4622 + }, + { + "epoch": 1.7071639586410634, + "grad_norm": 0.2721802294254303, + "learning_rate": 8.624214804778913e-05, + "loss": 0.2079, + "step": 4623 + }, + { + "epoch": 1.707533234859675, + "grad_norm": 0.3063674867153168, + "learning_rate": 8.621751447222565e-05, + "loss": 0.2395, + "step": 4624 + }, + { + "epoch": 1.7079025110782866, + "grad_norm": 0.2962421774864197, + "learning_rate": 8.619288089666215e-05, + "loss": 0.2666, + "step": 4625 + }, + { + "epoch": 1.7082717872968982, + "grad_norm": 0.23558349907398224, + "learning_rate": 8.616824732109867e-05, + "loss": 0.1796, + "step": 4626 + }, + { + "epoch": 1.7086410635155096, + "grad_norm": 0.22817738354206085, + "learning_rate": 8.614361374553517e-05, + "loss": 0.2159, + "step": 4627 + }, + { + "epoch": 1.709010339734121, + "grad_norm": 0.2731928825378418, + "learning_rate": 8.611898016997168e-05, + "loss": 0.2289, + "step": 4628 + }, + { + "epoch": 1.7093796159527326, + "grad_norm": 0.22399276494979858, + "learning_rate": 8.609434659440818e-05, + "loss": 0.1965, + "step": 4629 + }, + { + "epoch": 1.7097488921713442, + "grad_norm": 0.24558617174625397, + "learning_rate": 8.606971301884468e-05, + "loss": 0.2067, + "step": 4630 + }, + { + "epoch": 1.7101181683899558, + "grad_norm": 0.24412156641483307, + "learning_rate": 8.60450794432812e-05, + "loss": 0.2217, + "step": 4631 + }, + { + "epoch": 1.7104874446085672, + "grad_norm": 0.36428219079971313, + "learning_rate": 8.60204458677177e-05, + "loss": 0.253, + "step": 4632 + }, + { + "epoch": 1.7108567208271788, + "grad_norm": 0.2031269520521164, + "learning_rate": 8.599581229215421e-05, + "loss": 0.1767, + "step": 4633 + }, + { + "epoch": 1.7112259970457901, + "grad_norm": 0.27926796674728394, + "learning_rate": 8.597117871659071e-05, + "loss": 0.2396, + "step": 4634 + }, + { + "epoch": 1.7115952732644018, + "grad_norm": 0.29344406723976135, + "learning_rate": 8.594654514102723e-05, + "loss": 0.2115, + "step": 4635 + }, + { + "epoch": 1.7119645494830134, + "grad_norm": 0.2905105650424957, + "learning_rate": 8.592191156546373e-05, + "loss": 0.2589, + "step": 4636 + }, + { + "epoch": 1.712333825701625, + "grad_norm": 0.21833282709121704, + "learning_rate": 8.589727798990023e-05, + "loss": 0.1873, + "step": 4637 + }, + { + "epoch": 1.7127031019202363, + "grad_norm": 0.35743653774261475, + "learning_rate": 8.587264441433675e-05, + "loss": 0.2701, + "step": 4638 + }, + { + "epoch": 1.7130723781388477, + "grad_norm": 0.26741376519203186, + "learning_rate": 8.584801083877325e-05, + "loss": 0.2186, + "step": 4639 + }, + { + "epoch": 1.7134416543574593, + "grad_norm": 0.27695584297180176, + "learning_rate": 8.582337726320976e-05, + "loss": 0.2158, + "step": 4640 + }, + { + "epoch": 1.713810930576071, + "grad_norm": 0.27247166633605957, + "learning_rate": 8.579874368764626e-05, + "loss": 0.2287, + "step": 4641 + }, + { + "epoch": 1.7141802067946825, + "grad_norm": 0.25723299384117126, + "learning_rate": 8.577411011208278e-05, + "loss": 0.2316, + "step": 4642 + }, + { + "epoch": 1.714549483013294, + "grad_norm": 0.2114037722349167, + "learning_rate": 8.574947653651928e-05, + "loss": 0.2004, + "step": 4643 + }, + { + "epoch": 1.7149187592319055, + "grad_norm": 0.2582630217075348, + "learning_rate": 8.57248429609558e-05, + "loss": 0.18, + "step": 4644 + }, + { + "epoch": 1.715288035450517, + "grad_norm": 0.29137298464775085, + "learning_rate": 8.57002093853923e-05, + "loss": 0.2117, + "step": 4645 + }, + { + "epoch": 1.7156573116691285, + "grad_norm": 0.2782554626464844, + "learning_rate": 8.56755758098288e-05, + "loss": 0.2366, + "step": 4646 + }, + { + "epoch": 1.71602658788774, + "grad_norm": 0.25987300276756287, + "learning_rate": 8.565094223426531e-05, + "loss": 0.2202, + "step": 4647 + }, + { + "epoch": 1.7163958641063517, + "grad_norm": 0.3870409429073334, + "learning_rate": 8.562630865870181e-05, + "loss": 0.2684, + "step": 4648 + }, + { + "epoch": 1.716765140324963, + "grad_norm": 0.30918747186660767, + "learning_rate": 8.560167508313833e-05, + "loss": 0.259, + "step": 4649 + }, + { + "epoch": 1.7171344165435745, + "grad_norm": 0.23314863443374634, + "learning_rate": 8.557704150757483e-05, + "loss": 0.1862, + "step": 4650 + }, + { + "epoch": 1.7171344165435745, + "eval_loss": 8.523117065429688, + "eval_runtime": 6.9127, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 1.013, + "step": 4650 + }, + { + "epoch": 1.717503692762186, + "grad_norm": 0.2415207326412201, + "learning_rate": 8.555240793201134e-05, + "loss": 0.1904, + "step": 4651 + }, + { + "epoch": 1.7178729689807977, + "grad_norm": 0.26603949069976807, + "learning_rate": 8.552777435644784e-05, + "loss": 0.2459, + "step": 4652 + }, + { + "epoch": 1.7182422451994093, + "grad_norm": 0.28633639216423035, + "learning_rate": 8.550314078088434e-05, + "loss": 0.2357, + "step": 4653 + }, + { + "epoch": 1.7186115214180206, + "grad_norm": 0.27855318784713745, + "learning_rate": 8.547850720532086e-05, + "loss": 0.2124, + "step": 4654 + }, + { + "epoch": 1.7189807976366323, + "grad_norm": 0.26668715476989746, + "learning_rate": 8.545387362975736e-05, + "loss": 0.1895, + "step": 4655 + }, + { + "epoch": 1.7193500738552436, + "grad_norm": 0.26364853978157043, + "learning_rate": 8.542924005419387e-05, + "loss": 0.2247, + "step": 4656 + }, + { + "epoch": 1.7197193500738552, + "grad_norm": 0.22795814275741577, + "learning_rate": 8.540460647863038e-05, + "loss": 0.1891, + "step": 4657 + }, + { + "epoch": 1.7200886262924668, + "grad_norm": 0.25729796290397644, + "learning_rate": 8.537997290306689e-05, + "loss": 0.2129, + "step": 4658 + }, + { + "epoch": 1.7204579025110784, + "grad_norm": 0.26862889528274536, + "learning_rate": 8.535533932750339e-05, + "loss": 0.2307, + "step": 4659 + }, + { + "epoch": 1.7208271787296898, + "grad_norm": 0.22502557933330536, + "learning_rate": 8.53307057519399e-05, + "loss": 0.1853, + "step": 4660 + }, + { + "epoch": 1.7211964549483012, + "grad_norm": 0.2404657006263733, + "learning_rate": 8.53060721763764e-05, + "loss": 0.2317, + "step": 4661 + }, + { + "epoch": 1.7215657311669128, + "grad_norm": 0.2772236764431, + "learning_rate": 8.528143860081291e-05, + "loss": 0.2205, + "step": 4662 + }, + { + "epoch": 1.7219350073855244, + "grad_norm": 0.22665022313594818, + "learning_rate": 8.525680502524942e-05, + "loss": 0.2114, + "step": 4663 + }, + { + "epoch": 1.722304283604136, + "grad_norm": 0.23656398057937622, + "learning_rate": 8.523217144968592e-05, + "loss": 0.2413, + "step": 4664 + }, + { + "epoch": 1.7226735598227474, + "grad_norm": 0.2562445104122162, + "learning_rate": 8.520753787412244e-05, + "loss": 0.2062, + "step": 4665 + }, + { + "epoch": 1.7230428360413588, + "grad_norm": 0.20661282539367676, + "learning_rate": 8.518290429855894e-05, + "loss": 0.1974, + "step": 4666 + }, + { + "epoch": 1.7234121122599704, + "grad_norm": 0.3172898590564728, + "learning_rate": 8.515827072299545e-05, + "loss": 0.2684, + "step": 4667 + }, + { + "epoch": 1.723781388478582, + "grad_norm": 0.26641401648521423, + "learning_rate": 8.513363714743195e-05, + "loss": 0.2162, + "step": 4668 + }, + { + "epoch": 1.7241506646971936, + "grad_norm": 0.3146515190601349, + "learning_rate": 8.510900357186846e-05, + "loss": 0.2208, + "step": 4669 + }, + { + "epoch": 1.7245199409158052, + "grad_norm": 0.30448660254478455, + "learning_rate": 8.508436999630497e-05, + "loss": 0.2794, + "step": 4670 + }, + { + "epoch": 1.7248892171344166, + "grad_norm": 0.28368067741394043, + "learning_rate": 8.505973642074147e-05, + "loss": 0.2488, + "step": 4671 + }, + { + "epoch": 1.725258493353028, + "grad_norm": 0.30605000257492065, + "learning_rate": 8.503510284517799e-05, + "loss": 0.242, + "step": 4672 + }, + { + "epoch": 1.7256277695716395, + "grad_norm": 0.26712796092033386, + "learning_rate": 8.501046926961449e-05, + "loss": 0.2152, + "step": 4673 + }, + { + "epoch": 1.7259970457902511, + "grad_norm": 0.31008580327033997, + "learning_rate": 8.4985835694051e-05, + "loss": 0.2411, + "step": 4674 + }, + { + "epoch": 1.7263663220088628, + "grad_norm": 0.26372337341308594, + "learning_rate": 8.49612021184875e-05, + "loss": 0.2062, + "step": 4675 + }, + { + "epoch": 1.7267355982274741, + "grad_norm": 0.31291308999061584, + "learning_rate": 8.493656854292402e-05, + "loss": 0.2881, + "step": 4676 + }, + { + "epoch": 1.7271048744460855, + "grad_norm": 0.25760897994041443, + "learning_rate": 8.491193496736052e-05, + "loss": 0.2024, + "step": 4677 + }, + { + "epoch": 1.7274741506646971, + "grad_norm": 0.2914617955684662, + "learning_rate": 8.488730139179702e-05, + "loss": 0.236, + "step": 4678 + }, + { + "epoch": 1.7278434268833087, + "grad_norm": 0.35891687870025635, + "learning_rate": 8.486266781623353e-05, + "loss": 0.2895, + "step": 4679 + }, + { + "epoch": 1.7282127031019203, + "grad_norm": 0.22936636209487915, + "learning_rate": 8.483803424067004e-05, + "loss": 0.184, + "step": 4680 + }, + { + "epoch": 1.7285819793205317, + "grad_norm": 0.27689000964164734, + "learning_rate": 8.481340066510655e-05, + "loss": 0.2291, + "step": 4681 + }, + { + "epoch": 1.7289512555391433, + "grad_norm": 0.2192811667919159, + "learning_rate": 8.478876708954305e-05, + "loss": 0.202, + "step": 4682 + }, + { + "epoch": 1.7293205317577547, + "grad_norm": 0.23268942534923553, + "learning_rate": 8.476413351397957e-05, + "loss": 0.1944, + "step": 4683 + }, + { + "epoch": 1.7296898079763663, + "grad_norm": 0.25791624188423157, + "learning_rate": 8.473949993841607e-05, + "loss": 0.2492, + "step": 4684 + }, + { + "epoch": 1.730059084194978, + "grad_norm": 0.2439473569393158, + "learning_rate": 8.471486636285257e-05, + "loss": 0.1879, + "step": 4685 + }, + { + "epoch": 1.7304283604135895, + "grad_norm": 0.26058387756347656, + "learning_rate": 8.469023278728908e-05, + "loss": 0.2409, + "step": 4686 + }, + { + "epoch": 1.7307976366322009, + "grad_norm": 0.2187359780073166, + "learning_rate": 8.466559921172558e-05, + "loss": 0.2107, + "step": 4687 + }, + { + "epoch": 1.7311669128508123, + "grad_norm": 0.2818317115306854, + "learning_rate": 8.46409656361621e-05, + "loss": 0.2479, + "step": 4688 + }, + { + "epoch": 1.7315361890694239, + "grad_norm": 0.294546902179718, + "learning_rate": 8.46163320605986e-05, + "loss": 0.2268, + "step": 4689 + }, + { + "epoch": 1.7319054652880355, + "grad_norm": 0.24807016551494598, + "learning_rate": 8.459169848503511e-05, + "loss": 0.221, + "step": 4690 + }, + { + "epoch": 1.732274741506647, + "grad_norm": 0.24749433994293213, + "learning_rate": 8.456706490947162e-05, + "loss": 0.2188, + "step": 4691 + }, + { + "epoch": 1.7326440177252584, + "grad_norm": 0.3079688549041748, + "learning_rate": 8.454243133390812e-05, + "loss": 0.2583, + "step": 4692 + }, + { + "epoch": 1.73301329394387, + "grad_norm": 0.2550850212574005, + "learning_rate": 8.451779775834463e-05, + "loss": 0.2135, + "step": 4693 + }, + { + "epoch": 1.7333825701624814, + "grad_norm": 0.25055626034736633, + "learning_rate": 8.449316418278113e-05, + "loss": 0.1936, + "step": 4694 + }, + { + "epoch": 1.733751846381093, + "grad_norm": 0.2375052273273468, + "learning_rate": 8.446853060721765e-05, + "loss": 0.1861, + "step": 4695 + }, + { + "epoch": 1.7341211225997046, + "grad_norm": 0.239014133810997, + "learning_rate": 8.444389703165415e-05, + "loss": 0.1974, + "step": 4696 + }, + { + "epoch": 1.7344903988183162, + "grad_norm": 0.23493143916130066, + "learning_rate": 8.441926345609066e-05, + "loss": 0.2069, + "step": 4697 + }, + { + "epoch": 1.7348596750369276, + "grad_norm": 0.27003049850463867, + "learning_rate": 8.439462988052716e-05, + "loss": 0.1926, + "step": 4698 + }, + { + "epoch": 1.735228951255539, + "grad_norm": 0.2863848805427551, + "learning_rate": 8.436999630496368e-05, + "loss": 0.2666, + "step": 4699 + }, + { + "epoch": 1.7355982274741506, + "grad_norm": 0.23896265029907227, + "learning_rate": 8.434536272940018e-05, + "loss": 0.1886, + "step": 4700 + }, + { + "epoch": 1.7355982274741506, + "eval_loss": 8.436310768127441, + "eval_runtime": 6.9112, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 1.013, + "step": 4700 + }, + { + "epoch": 1.7359675036927622, + "grad_norm": 0.2585345208644867, + "learning_rate": 8.432072915383668e-05, + "loss": 0.2168, + "step": 4701 + }, + { + "epoch": 1.7363367799113738, + "grad_norm": 0.31819796562194824, + "learning_rate": 8.42960955782732e-05, + "loss": 0.2673, + "step": 4702 + }, + { + "epoch": 1.7367060561299852, + "grad_norm": 0.27601850032806396, + "learning_rate": 8.42714620027097e-05, + "loss": 0.1946, + "step": 4703 + }, + { + "epoch": 1.7370753323485968, + "grad_norm": 0.257088840007782, + "learning_rate": 8.424682842714621e-05, + "loss": 0.1843, + "step": 4704 + }, + { + "epoch": 1.7374446085672082, + "grad_norm": 0.3266865313053131, + "learning_rate": 8.422219485158271e-05, + "loss": 0.244, + "step": 4705 + }, + { + "epoch": 1.7378138847858198, + "grad_norm": 0.25994256138801575, + "learning_rate": 8.419756127601923e-05, + "loss": 0.2227, + "step": 4706 + }, + { + "epoch": 1.7381831610044314, + "grad_norm": 0.2578684985637665, + "learning_rate": 8.417292770045573e-05, + "loss": 0.2443, + "step": 4707 + }, + { + "epoch": 1.738552437223043, + "grad_norm": 0.2504384219646454, + "learning_rate": 8.414829412489223e-05, + "loss": 0.2073, + "step": 4708 + }, + { + "epoch": 1.7389217134416544, + "grad_norm": 0.2609635293483734, + "learning_rate": 8.412366054932874e-05, + "loss": 0.2129, + "step": 4709 + }, + { + "epoch": 1.7392909896602657, + "grad_norm": 0.3248125910758972, + "learning_rate": 8.409902697376524e-05, + "loss": 0.2593, + "step": 4710 + }, + { + "epoch": 1.7396602658788773, + "grad_norm": 0.27035218477249146, + "learning_rate": 8.407439339820176e-05, + "loss": 0.2119, + "step": 4711 + }, + { + "epoch": 1.740029542097489, + "grad_norm": 0.2774043083190918, + "learning_rate": 8.404975982263826e-05, + "loss": 0.2125, + "step": 4712 + }, + { + "epoch": 1.7403988183161005, + "grad_norm": 0.23256514966487885, + "learning_rate": 8.402512624707477e-05, + "loss": 0.2085, + "step": 4713 + }, + { + "epoch": 1.740768094534712, + "grad_norm": 0.2681777775287628, + "learning_rate": 8.400049267151128e-05, + "loss": 0.2184, + "step": 4714 + }, + { + "epoch": 1.7411373707533235, + "grad_norm": 0.22253958880901337, + "learning_rate": 8.397585909594779e-05, + "loss": 0.1928, + "step": 4715 + }, + { + "epoch": 1.741506646971935, + "grad_norm": 0.28154608607292175, + "learning_rate": 8.395122552038429e-05, + "loss": 0.2222, + "step": 4716 + }, + { + "epoch": 1.7418759231905465, + "grad_norm": 0.2564038932323456, + "learning_rate": 8.392659194482079e-05, + "loss": 0.2142, + "step": 4717 + }, + { + "epoch": 1.7422451994091581, + "grad_norm": 0.34169241786003113, + "learning_rate": 8.390195836925731e-05, + "loss": 0.23, + "step": 4718 + }, + { + "epoch": 1.7426144756277697, + "grad_norm": 0.23960208892822266, + "learning_rate": 8.387732479369381e-05, + "loss": 0.2058, + "step": 4719 + }, + { + "epoch": 1.742983751846381, + "grad_norm": 0.2392963021993637, + "learning_rate": 8.385269121813032e-05, + "loss": 0.2041, + "step": 4720 + }, + { + "epoch": 1.7433530280649925, + "grad_norm": 0.3414040207862854, + "learning_rate": 8.382805764256682e-05, + "loss": 0.2685, + "step": 4721 + }, + { + "epoch": 1.743722304283604, + "grad_norm": 0.26945218443870544, + "learning_rate": 8.380342406700334e-05, + "loss": 0.2005, + "step": 4722 + }, + { + "epoch": 1.7440915805022157, + "grad_norm": 0.2988106906414032, + "learning_rate": 8.377879049143984e-05, + "loss": 0.2511, + "step": 4723 + }, + { + "epoch": 1.7444608567208273, + "grad_norm": 0.2449643462896347, + "learning_rate": 8.375415691587634e-05, + "loss": 0.2153, + "step": 4724 + }, + { + "epoch": 1.7448301329394387, + "grad_norm": 0.27837881445884705, + "learning_rate": 8.372952334031286e-05, + "loss": 0.2322, + "step": 4725 + }, + { + "epoch": 1.7451994091580503, + "grad_norm": 0.2524349093437195, + "learning_rate": 8.370488976474936e-05, + "loss": 0.1868, + "step": 4726 + }, + { + "epoch": 1.7455686853766617, + "grad_norm": 0.3248385488986969, + "learning_rate": 8.368025618918587e-05, + "loss": 0.3078, + "step": 4727 + }, + { + "epoch": 1.7459379615952733, + "grad_norm": 0.24664220213890076, + "learning_rate": 8.365562261362237e-05, + "loss": 0.2043, + "step": 4728 + }, + { + "epoch": 1.7463072378138849, + "grad_norm": 0.2315950095653534, + "learning_rate": 8.363098903805889e-05, + "loss": 0.2213, + "step": 4729 + }, + { + "epoch": 1.7466765140324965, + "grad_norm": 0.29755106568336487, + "learning_rate": 8.360635546249539e-05, + "loss": 0.2345, + "step": 4730 + }, + { + "epoch": 1.7470457902511078, + "grad_norm": 0.2861720621585846, + "learning_rate": 8.35817218869319e-05, + "loss": 0.2192, + "step": 4731 + }, + { + "epoch": 1.7474150664697192, + "grad_norm": 0.34982576966285706, + "learning_rate": 8.35570883113684e-05, + "loss": 0.2079, + "step": 4732 + }, + { + "epoch": 1.7477843426883308, + "grad_norm": 0.24439233541488647, + "learning_rate": 8.35324547358049e-05, + "loss": 0.2161, + "step": 4733 + }, + { + "epoch": 1.7481536189069424, + "grad_norm": 0.2534776031970978, + "learning_rate": 8.350782116024142e-05, + "loss": 0.212, + "step": 4734 + }, + { + "epoch": 1.748522895125554, + "grad_norm": 0.2761339545249939, + "learning_rate": 8.348318758467792e-05, + "loss": 0.2487, + "step": 4735 + }, + { + "epoch": 1.7488921713441654, + "grad_norm": 0.35301342606544495, + "learning_rate": 8.345855400911444e-05, + "loss": 0.2425, + "step": 4736 + }, + { + "epoch": 1.7492614475627768, + "grad_norm": 0.29787227511405945, + "learning_rate": 8.343392043355094e-05, + "loss": 0.2081, + "step": 4737 + }, + { + "epoch": 1.7496307237813884, + "grad_norm": 0.2650465965270996, + "learning_rate": 8.340928685798745e-05, + "loss": 0.2102, + "step": 4738 + }, + { + "epoch": 1.75, + "grad_norm": 0.2994593381881714, + "learning_rate": 8.338465328242395e-05, + "loss": 0.2551, + "step": 4739 + }, + { + "epoch": 1.7503692762186116, + "grad_norm": 0.26589399576187134, + "learning_rate": 8.336001970686045e-05, + "loss": 0.1922, + "step": 4740 + }, + { + "epoch": 1.7507385524372232, + "grad_norm": 0.25669199228286743, + "learning_rate": 8.333538613129697e-05, + "loss": 0.1883, + "step": 4741 + }, + { + "epoch": 1.7511078286558346, + "grad_norm": 0.2789362072944641, + "learning_rate": 8.331075255573347e-05, + "loss": 0.2402, + "step": 4742 + }, + { + "epoch": 1.751477104874446, + "grad_norm": 0.26320910453796387, + "learning_rate": 8.328611898016998e-05, + "loss": 0.2018, + "step": 4743 + }, + { + "epoch": 1.7518463810930576, + "grad_norm": 0.29045823216438293, + "learning_rate": 8.326148540460648e-05, + "loss": 0.2388, + "step": 4744 + }, + { + "epoch": 1.7522156573116692, + "grad_norm": 0.3354822099208832, + "learning_rate": 8.3236851829043e-05, + "loss": 0.2701, + "step": 4745 + }, + { + "epoch": 1.7525849335302808, + "grad_norm": 0.22395887970924377, + "learning_rate": 8.32122182534795e-05, + "loss": 0.1837, + "step": 4746 + }, + { + "epoch": 1.7529542097488922, + "grad_norm": 0.2769034206867218, + "learning_rate": 8.318758467791602e-05, + "loss": 0.2311, + "step": 4747 + }, + { + "epoch": 1.7533234859675035, + "grad_norm": 0.22001677751541138, + "learning_rate": 8.316295110235252e-05, + "loss": 0.1784, + "step": 4748 + }, + { + "epoch": 1.7536927621861151, + "grad_norm": 0.2750139534473419, + "learning_rate": 8.313831752678902e-05, + "loss": 0.2015, + "step": 4749 + }, + { + "epoch": 1.7540620384047267, + "grad_norm": 0.27339109778404236, + "learning_rate": 8.311368395122553e-05, + "loss": 0.2252, + "step": 4750 + }, + { + "epoch": 1.7540620384047267, + "eval_loss": 8.395513534545898, + "eval_runtime": 6.9126, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 1.013, + "step": 4750 + }, + { + "epoch": 1.7544313146233383, + "grad_norm": 0.27735012769699097, + "learning_rate": 8.308905037566203e-05, + "loss": 0.2064, + "step": 4751 + }, + { + "epoch": 1.7548005908419497, + "grad_norm": 0.24390773475170135, + "learning_rate": 8.306441680009855e-05, + "loss": 0.2048, + "step": 4752 + }, + { + "epoch": 1.7551698670605613, + "grad_norm": 0.2693495750427246, + "learning_rate": 8.303978322453505e-05, + "loss": 0.2129, + "step": 4753 + }, + { + "epoch": 1.7555391432791727, + "grad_norm": 0.26011401414871216, + "learning_rate": 8.301514964897156e-05, + "loss": 0.2158, + "step": 4754 + }, + { + "epoch": 1.7559084194977843, + "grad_norm": 0.2850961685180664, + "learning_rate": 8.299051607340806e-05, + "loss": 0.2376, + "step": 4755 + }, + { + "epoch": 1.756277695716396, + "grad_norm": 0.27132123708724976, + "learning_rate": 8.296588249784457e-05, + "loss": 0.2314, + "step": 4756 + }, + { + "epoch": 1.7566469719350075, + "grad_norm": 0.23719866573810577, + "learning_rate": 8.294124892228108e-05, + "loss": 0.2037, + "step": 4757 + }, + { + "epoch": 1.757016248153619, + "grad_norm": 0.2462550550699234, + "learning_rate": 8.291661534671758e-05, + "loss": 0.2023, + "step": 4758 + }, + { + "epoch": 1.7573855243722303, + "grad_norm": 0.2407626062631607, + "learning_rate": 8.28919817711541e-05, + "loss": 0.2121, + "step": 4759 + }, + { + "epoch": 1.7577548005908419, + "grad_norm": 0.30009201169013977, + "learning_rate": 8.28673481955906e-05, + "loss": 0.2188, + "step": 4760 + }, + { + "epoch": 1.7581240768094535, + "grad_norm": 0.24682067334651947, + "learning_rate": 8.28427146200271e-05, + "loss": 0.1984, + "step": 4761 + }, + { + "epoch": 1.758493353028065, + "grad_norm": 0.269798219203949, + "learning_rate": 8.28180810444636e-05, + "loss": 0.1997, + "step": 4762 + }, + { + "epoch": 1.7588626292466765, + "grad_norm": 0.2574988603591919, + "learning_rate": 8.279344746890011e-05, + "loss": 0.2089, + "step": 4763 + }, + { + "epoch": 1.759231905465288, + "grad_norm": 0.30966028571128845, + "learning_rate": 8.276881389333661e-05, + "loss": 0.2548, + "step": 4764 + }, + { + "epoch": 1.7596011816838995, + "grad_norm": 0.3135741949081421, + "learning_rate": 8.274418031777313e-05, + "loss": 0.2695, + "step": 4765 + }, + { + "epoch": 1.759970457902511, + "grad_norm": 0.28584057092666626, + "learning_rate": 8.271954674220963e-05, + "loss": 0.2065, + "step": 4766 + }, + { + "epoch": 1.7603397341211227, + "grad_norm": 0.27670496702194214, + "learning_rate": 8.269491316664613e-05, + "loss": 0.24, + "step": 4767 + }, + { + "epoch": 1.7607090103397343, + "grad_norm": 0.24431340396404266, + "learning_rate": 8.267027959108265e-05, + "loss": 0.232, + "step": 4768 + }, + { + "epoch": 1.7610782865583456, + "grad_norm": 0.28236982226371765, + "learning_rate": 8.264564601551915e-05, + "loss": 0.2299, + "step": 4769 + }, + { + "epoch": 1.761447562776957, + "grad_norm": 0.21445855498313904, + "learning_rate": 8.262101243995566e-05, + "loss": 0.1876, + "step": 4770 + }, + { + "epoch": 1.7618168389955686, + "grad_norm": 0.2856297791004181, + "learning_rate": 8.259637886439216e-05, + "loss": 0.212, + "step": 4771 + }, + { + "epoch": 1.7621861152141802, + "grad_norm": 0.19477324187755585, + "learning_rate": 8.257174528882868e-05, + "loss": 0.1743, + "step": 4772 + }, + { + "epoch": 1.7625553914327918, + "grad_norm": 0.2618711590766907, + "learning_rate": 8.254711171326518e-05, + "loss": 0.2071, + "step": 4773 + }, + { + "epoch": 1.7629246676514032, + "grad_norm": 0.22237637639045715, + "learning_rate": 8.252247813770168e-05, + "loss": 0.1944, + "step": 4774 + }, + { + "epoch": 1.7632939438700148, + "grad_norm": 0.2223082035779953, + "learning_rate": 8.24978445621382e-05, + "loss": 0.1727, + "step": 4775 + }, + { + "epoch": 1.7636632200886262, + "grad_norm": 0.3005097210407257, + "learning_rate": 8.24732109865747e-05, + "loss": 0.2558, + "step": 4776 + }, + { + "epoch": 1.7640324963072378, + "grad_norm": 0.23957087099552155, + "learning_rate": 8.244857741101121e-05, + "loss": 0.2076, + "step": 4777 + }, + { + "epoch": 1.7644017725258494, + "grad_norm": 0.22704055905342102, + "learning_rate": 8.242394383544771e-05, + "loss": 0.1755, + "step": 4778 + }, + { + "epoch": 1.764771048744461, + "grad_norm": 0.23419803380966187, + "learning_rate": 8.239931025988423e-05, + "loss": 0.202, + "step": 4779 + }, + { + "epoch": 1.7651403249630724, + "grad_norm": 0.25554797053337097, + "learning_rate": 8.237467668432073e-05, + "loss": 0.2133, + "step": 4780 + }, + { + "epoch": 1.7655096011816838, + "grad_norm": 0.24982520937919617, + "learning_rate": 8.235004310875724e-05, + "loss": 0.2177, + "step": 4781 + }, + { + "epoch": 1.7658788774002954, + "grad_norm": 0.3109075427055359, + "learning_rate": 8.232540953319374e-05, + "loss": 0.2316, + "step": 4782 + }, + { + "epoch": 1.766248153618907, + "grad_norm": 0.20598343014717102, + "learning_rate": 8.230077595763024e-05, + "loss": 0.1787, + "step": 4783 + }, + { + "epoch": 1.7666174298375186, + "grad_norm": 0.2667976915836334, + "learning_rate": 8.227614238206676e-05, + "loss": 0.2037, + "step": 4784 + }, + { + "epoch": 1.76698670605613, + "grad_norm": 0.2704645097255707, + "learning_rate": 8.225150880650326e-05, + "loss": 0.2335, + "step": 4785 + }, + { + "epoch": 1.7673559822747416, + "grad_norm": 0.3791755437850952, + "learning_rate": 8.222687523093977e-05, + "loss": 0.2784, + "step": 4786 + }, + { + "epoch": 1.767725258493353, + "grad_norm": 0.2833936810493469, + "learning_rate": 8.220224165537628e-05, + "loss": 0.1924, + "step": 4787 + }, + { + "epoch": 1.7680945347119645, + "grad_norm": 0.24855190515518188, + "learning_rate": 8.217760807981279e-05, + "loss": 0.1978, + "step": 4788 + }, + { + "epoch": 1.7684638109305761, + "grad_norm": 0.2796396017074585, + "learning_rate": 8.215297450424929e-05, + "loss": 0.2497, + "step": 4789 + }, + { + "epoch": 1.7688330871491877, + "grad_norm": 0.23762375116348267, + "learning_rate": 8.212834092868579e-05, + "loss": 0.2037, + "step": 4790 + }, + { + "epoch": 1.7692023633677991, + "grad_norm": 0.2644731402397156, + "learning_rate": 8.210370735312231e-05, + "loss": 0.208, + "step": 4791 + }, + { + "epoch": 1.7695716395864105, + "grad_norm": 0.2457064837217331, + "learning_rate": 8.207907377755881e-05, + "loss": 0.1847, + "step": 4792 + }, + { + "epoch": 1.769940915805022, + "grad_norm": 0.2766428291797638, + "learning_rate": 8.205444020199532e-05, + "loss": 0.242, + "step": 4793 + }, + { + "epoch": 1.7703101920236337, + "grad_norm": 0.2451019138097763, + "learning_rate": 8.202980662643182e-05, + "loss": 0.2224, + "step": 4794 + }, + { + "epoch": 1.7706794682422453, + "grad_norm": 0.2344719022512436, + "learning_rate": 8.200517305086834e-05, + "loss": 0.2342, + "step": 4795 + }, + { + "epoch": 1.7710487444608567, + "grad_norm": 0.3354860544204712, + "learning_rate": 8.198053947530484e-05, + "loss": 0.269, + "step": 4796 + }, + { + "epoch": 1.7714180206794683, + "grad_norm": 0.3599562346935272, + "learning_rate": 8.195590589974135e-05, + "loss": 0.2308, + "step": 4797 + }, + { + "epoch": 1.7717872968980797, + "grad_norm": 0.26370885968208313, + "learning_rate": 8.193127232417786e-05, + "loss": 0.2158, + "step": 4798 + }, + { + "epoch": 1.7721565731166913, + "grad_norm": 0.24752885103225708, + "learning_rate": 8.190663874861436e-05, + "loss": 0.1915, + "step": 4799 + }, + { + "epoch": 1.7725258493353029, + "grad_norm": 0.24951229989528656, + "learning_rate": 8.188200517305087e-05, + "loss": 0.2094, + "step": 4800 + }, + { + "epoch": 1.7725258493353029, + "eval_loss": 8.407576560974121, + "eval_runtime": 6.9106, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 1.013, + "step": 4800 + }, + { + "epoch": 1.7728951255539145, + "grad_norm": 0.2691768705844879, + "learning_rate": 8.185737159748737e-05, + "loss": 0.235, + "step": 4801 + }, + { + "epoch": 1.7732644017725259, + "grad_norm": 0.2421412467956543, + "learning_rate": 8.183273802192389e-05, + "loss": 0.2299, + "step": 4802 + }, + { + "epoch": 1.7736336779911372, + "grad_norm": 0.27343040704727173, + "learning_rate": 8.180810444636039e-05, + "loss": 0.243, + "step": 4803 + }, + { + "epoch": 1.7740029542097489, + "grad_norm": 0.3850736618041992, + "learning_rate": 8.17834708707969e-05, + "loss": 0.2563, + "step": 4804 + }, + { + "epoch": 1.7743722304283605, + "grad_norm": 0.3254380226135254, + "learning_rate": 8.17588372952334e-05, + "loss": 0.2783, + "step": 4805 + }, + { + "epoch": 1.774741506646972, + "grad_norm": 0.31123337149620056, + "learning_rate": 8.17342037196699e-05, + "loss": 0.2009, + "step": 4806 + }, + { + "epoch": 1.7751107828655834, + "grad_norm": 0.2375793606042862, + "learning_rate": 8.170957014410642e-05, + "loss": 0.1914, + "step": 4807 + }, + { + "epoch": 1.7754800590841948, + "grad_norm": 0.2882954478263855, + "learning_rate": 8.168493656854292e-05, + "loss": 0.2159, + "step": 4808 + }, + { + "epoch": 1.7758493353028064, + "grad_norm": 0.23820452392101288, + "learning_rate": 8.166030299297943e-05, + "loss": 0.219, + "step": 4809 + }, + { + "epoch": 1.776218611521418, + "grad_norm": 0.295207142829895, + "learning_rate": 8.163566941741594e-05, + "loss": 0.2675, + "step": 4810 + }, + { + "epoch": 1.7765878877400296, + "grad_norm": 0.23802897334098816, + "learning_rate": 8.161103584185245e-05, + "loss": 0.2021, + "step": 4811 + }, + { + "epoch": 1.7769571639586412, + "grad_norm": 0.28950175642967224, + "learning_rate": 8.158640226628895e-05, + "loss": 0.2474, + "step": 4812 + }, + { + "epoch": 1.7773264401772526, + "grad_norm": 0.31371310353279114, + "learning_rate": 8.156176869072547e-05, + "loss": 0.2187, + "step": 4813 + }, + { + "epoch": 1.777695716395864, + "grad_norm": 0.2886470854282379, + "learning_rate": 8.153713511516197e-05, + "loss": 0.2452, + "step": 4814 + }, + { + "epoch": 1.7780649926144756, + "grad_norm": 0.2744346261024475, + "learning_rate": 8.151250153959847e-05, + "loss": 0.2583, + "step": 4815 + }, + { + "epoch": 1.7784342688330872, + "grad_norm": 0.29489076137542725, + "learning_rate": 8.148786796403498e-05, + "loss": 0.2324, + "step": 4816 + }, + { + "epoch": 1.7788035450516988, + "grad_norm": 0.33477842807769775, + "learning_rate": 8.146323438847148e-05, + "loss": 0.3164, + "step": 4817 + }, + { + "epoch": 1.7791728212703102, + "grad_norm": 0.33081015944480896, + "learning_rate": 8.1438600812908e-05, + "loss": 0.2777, + "step": 4818 + }, + { + "epoch": 1.7795420974889216, + "grad_norm": 0.27487534284591675, + "learning_rate": 8.14139672373445e-05, + "loss": 0.247, + "step": 4819 + }, + { + "epoch": 1.7799113737075332, + "grad_norm": 0.2167029082775116, + "learning_rate": 8.138933366178101e-05, + "loss": 0.2095, + "step": 4820 + }, + { + "epoch": 1.7802806499261448, + "grad_norm": 0.29033416509628296, + "learning_rate": 8.136470008621752e-05, + "loss": 0.2365, + "step": 4821 + }, + { + "epoch": 1.7806499261447564, + "grad_norm": 0.20453786849975586, + "learning_rate": 8.134006651065402e-05, + "loss": 0.1913, + "step": 4822 + }, + { + "epoch": 1.7810192023633677, + "grad_norm": 0.2932835519313812, + "learning_rate": 8.131543293509053e-05, + "loss": 0.2719, + "step": 4823 + }, + { + "epoch": 1.7813884785819794, + "grad_norm": 0.21910883486270905, + "learning_rate": 8.129079935952703e-05, + "loss": 0.2007, + "step": 4824 + }, + { + "epoch": 1.7817577548005907, + "grad_norm": 0.21229508519172668, + "learning_rate": 8.126616578396355e-05, + "loss": 0.1875, + "step": 4825 + }, + { + "epoch": 1.7821270310192023, + "grad_norm": 0.22403907775878906, + "learning_rate": 8.124153220840005e-05, + "loss": 0.204, + "step": 4826 + }, + { + "epoch": 1.782496307237814, + "grad_norm": 0.25153085589408875, + "learning_rate": 8.121689863283656e-05, + "loss": 0.2324, + "step": 4827 + }, + { + "epoch": 1.7828655834564255, + "grad_norm": 0.26192471385002136, + "learning_rate": 8.119226505727306e-05, + "loss": 0.2211, + "step": 4828 + }, + { + "epoch": 1.783234859675037, + "grad_norm": 0.2920961081981659, + "learning_rate": 8.116763148170956e-05, + "loss": 0.2213, + "step": 4829 + }, + { + "epoch": 1.7836041358936483, + "grad_norm": 0.2861524224281311, + "learning_rate": 8.114299790614608e-05, + "loss": 0.2474, + "step": 4830 + }, + { + "epoch": 1.78397341211226, + "grad_norm": 0.2897714376449585, + "learning_rate": 8.111836433058258e-05, + "loss": 0.2176, + "step": 4831 + }, + { + "epoch": 1.7843426883308715, + "grad_norm": 0.27227744460105896, + "learning_rate": 8.10937307550191e-05, + "loss": 0.2292, + "step": 4832 + }, + { + "epoch": 1.784711964549483, + "grad_norm": 0.24579398334026337, + "learning_rate": 8.10690971794556e-05, + "loss": 0.2483, + "step": 4833 + }, + { + "epoch": 1.7850812407680945, + "grad_norm": 0.2334270179271698, + "learning_rate": 8.104446360389211e-05, + "loss": 0.188, + "step": 4834 + }, + { + "epoch": 1.785450516986706, + "grad_norm": 0.22719815373420715, + "learning_rate": 8.101983002832861e-05, + "loss": 0.212, + "step": 4835 + }, + { + "epoch": 1.7858197932053175, + "grad_norm": 0.2677370309829712, + "learning_rate": 8.099519645276513e-05, + "loss": 0.2058, + "step": 4836 + }, + { + "epoch": 1.786189069423929, + "grad_norm": 0.27946871519088745, + "learning_rate": 8.097056287720163e-05, + "loss": 0.1988, + "step": 4837 + }, + { + "epoch": 1.7865583456425407, + "grad_norm": 0.25194233655929565, + "learning_rate": 8.094592930163813e-05, + "loss": 0.2069, + "step": 4838 + }, + { + "epoch": 1.7869276218611523, + "grad_norm": 0.25942498445510864, + "learning_rate": 8.092129572607464e-05, + "loss": 0.2219, + "step": 4839 + }, + { + "epoch": 1.7872968980797637, + "grad_norm": 0.22439908981323242, + "learning_rate": 8.089666215051114e-05, + "loss": 0.195, + "step": 4840 + }, + { + "epoch": 1.787666174298375, + "grad_norm": 0.23637856543064117, + "learning_rate": 8.087202857494766e-05, + "loss": 0.1772, + "step": 4841 + }, + { + "epoch": 1.7880354505169866, + "grad_norm": 0.26319101452827454, + "learning_rate": 8.084739499938416e-05, + "loss": 0.1853, + "step": 4842 + }, + { + "epoch": 1.7884047267355982, + "grad_norm": 0.2574473023414612, + "learning_rate": 8.082276142382068e-05, + "loss": 0.2052, + "step": 4843 + }, + { + "epoch": 1.7887740029542099, + "grad_norm": 0.2488311529159546, + "learning_rate": 8.079812784825718e-05, + "loss": 0.2094, + "step": 4844 + }, + { + "epoch": 1.7891432791728212, + "grad_norm": 0.2719607353210449, + "learning_rate": 8.077349427269368e-05, + "loss": 0.2253, + "step": 4845 + }, + { + "epoch": 1.7895125553914328, + "grad_norm": 0.2649681568145752, + "learning_rate": 8.074886069713019e-05, + "loss": 0.2009, + "step": 4846 + }, + { + "epoch": 1.7898818316100442, + "grad_norm": 0.31188032031059265, + "learning_rate": 8.072422712156669e-05, + "loss": 0.2564, + "step": 4847 + }, + { + "epoch": 1.7902511078286558, + "grad_norm": 0.3002725839614868, + "learning_rate": 8.069959354600321e-05, + "loss": 0.2218, + "step": 4848 + }, + { + "epoch": 1.7906203840472674, + "grad_norm": 0.30102506279945374, + "learning_rate": 8.067495997043971e-05, + "loss": 0.2498, + "step": 4849 + }, + { + "epoch": 1.790989660265879, + "grad_norm": 0.2869955599308014, + "learning_rate": 8.065032639487622e-05, + "loss": 0.2212, + "step": 4850 + }, + { + "epoch": 1.790989660265879, + "eval_loss": 8.425841331481934, + "eval_runtime": 6.9189, + "eval_samples_per_second": 7.227, + "eval_steps_per_second": 1.012, + "step": 4850 + }, + { + "epoch": 1.7913589364844904, + "grad_norm": 0.2678149342536926, + "learning_rate": 8.062569281931272e-05, + "loss": 0.2127, + "step": 4851 + }, + { + "epoch": 1.7917282127031018, + "grad_norm": 0.23937039077281952, + "learning_rate": 8.060105924374924e-05, + "loss": 0.1904, + "step": 4852 + }, + { + "epoch": 1.7920974889217134, + "grad_norm": 0.2650289237499237, + "learning_rate": 8.057642566818574e-05, + "loss": 0.1801, + "step": 4853 + }, + { + "epoch": 1.792466765140325, + "grad_norm": 0.22451524436473846, + "learning_rate": 8.055179209262224e-05, + "loss": 0.2028, + "step": 4854 + }, + { + "epoch": 1.7928360413589366, + "grad_norm": 0.2734116017818451, + "learning_rate": 8.052715851705876e-05, + "loss": 0.2419, + "step": 4855 + }, + { + "epoch": 1.793205317577548, + "grad_norm": 0.26262366771698, + "learning_rate": 8.050252494149526e-05, + "loss": 0.2318, + "step": 4856 + }, + { + "epoch": 1.7935745937961596, + "grad_norm": 0.28790807723999023, + "learning_rate": 8.047789136593177e-05, + "loss": 0.2354, + "step": 4857 + }, + { + "epoch": 1.793943870014771, + "grad_norm": 0.30597084760665894, + "learning_rate": 8.045325779036827e-05, + "loss": 0.2497, + "step": 4858 + }, + { + "epoch": 1.7943131462333826, + "grad_norm": 0.29202237725257874, + "learning_rate": 8.042862421480479e-05, + "loss": 0.2286, + "step": 4859 + }, + { + "epoch": 1.7946824224519942, + "grad_norm": 0.26619091629981995, + "learning_rate": 8.040399063924129e-05, + "loss": 0.2312, + "step": 4860 + }, + { + "epoch": 1.7950516986706058, + "grad_norm": 0.29905468225479126, + "learning_rate": 8.037935706367779e-05, + "loss": 0.225, + "step": 4861 + }, + { + "epoch": 1.7954209748892171, + "grad_norm": 0.26778802275657654, + "learning_rate": 8.03547234881143e-05, + "loss": 0.245, + "step": 4862 + }, + { + "epoch": 1.7957902511078285, + "grad_norm": 0.23977141082286835, + "learning_rate": 8.03300899125508e-05, + "loss": 0.158, + "step": 4863 + }, + { + "epoch": 1.7961595273264401, + "grad_norm": 0.2552523910999298, + "learning_rate": 8.030545633698732e-05, + "loss": 0.2121, + "step": 4864 + }, + { + "epoch": 1.7965288035450517, + "grad_norm": 0.25521668791770935, + "learning_rate": 8.028082276142382e-05, + "loss": 0.2136, + "step": 4865 + }, + { + "epoch": 1.7968980797636633, + "grad_norm": 0.2718730568885803, + "learning_rate": 8.025618918586034e-05, + "loss": 0.2518, + "step": 4866 + }, + { + "epoch": 1.7972673559822747, + "grad_norm": 0.2757890820503235, + "learning_rate": 8.023155561029684e-05, + "loss": 0.2262, + "step": 4867 + }, + { + "epoch": 1.797636632200886, + "grad_norm": 0.3216096758842468, + "learning_rate": 8.020692203473335e-05, + "loss": 0.2269, + "step": 4868 + }, + { + "epoch": 1.7980059084194977, + "grad_norm": 0.28278493881225586, + "learning_rate": 8.018228845916985e-05, + "loss": 0.1984, + "step": 4869 + }, + { + "epoch": 1.7983751846381093, + "grad_norm": 0.26853060722351074, + "learning_rate": 8.015765488360635e-05, + "loss": 0.1987, + "step": 4870 + }, + { + "epoch": 1.798744460856721, + "grad_norm": 0.29373711347579956, + "learning_rate": 8.013302130804287e-05, + "loss": 0.1894, + "step": 4871 + }, + { + "epoch": 1.7991137370753325, + "grad_norm": 0.23830413818359375, + "learning_rate": 8.010838773247937e-05, + "loss": 0.2157, + "step": 4872 + }, + { + "epoch": 1.799483013293944, + "grad_norm": 0.24021795392036438, + "learning_rate": 8.008375415691588e-05, + "loss": 0.2001, + "step": 4873 + }, + { + "epoch": 1.7998522895125553, + "grad_norm": 0.290291428565979, + "learning_rate": 8.005912058135238e-05, + "loss": 0.2528, + "step": 4874 + }, + { + "epoch": 1.8002215657311669, + "grad_norm": 0.26946234703063965, + "learning_rate": 8.00344870057889e-05, + "loss": 0.23, + "step": 4875 + }, + { + "epoch": 1.8005908419497785, + "grad_norm": 0.24534159898757935, + "learning_rate": 8.00098534302254e-05, + "loss": 0.2185, + "step": 4876 + }, + { + "epoch": 1.80096011816839, + "grad_norm": 0.28724390268325806, + "learning_rate": 7.99852198546619e-05, + "loss": 0.2236, + "step": 4877 + }, + { + "epoch": 1.8013293943870015, + "grad_norm": 0.31672096252441406, + "learning_rate": 7.996058627909842e-05, + "loss": 0.2016, + "step": 4878 + }, + { + "epoch": 1.8016986706056128, + "grad_norm": 0.2232753038406372, + "learning_rate": 7.993595270353492e-05, + "loss": 0.1993, + "step": 4879 + }, + { + "epoch": 1.8020679468242244, + "grad_norm": 0.2510431408882141, + "learning_rate": 7.991131912797143e-05, + "loss": 0.1641, + "step": 4880 + }, + { + "epoch": 1.802437223042836, + "grad_norm": 0.23758597671985626, + "learning_rate": 7.988668555240793e-05, + "loss": 0.2004, + "step": 4881 + }, + { + "epoch": 1.8028064992614476, + "grad_norm": 0.2565006911754608, + "learning_rate": 7.986205197684445e-05, + "loss": 0.2071, + "step": 4882 + }, + { + "epoch": 1.803175775480059, + "grad_norm": 0.27667269110679626, + "learning_rate": 7.983741840128095e-05, + "loss": 0.242, + "step": 4883 + }, + { + "epoch": 1.8035450516986706, + "grad_norm": 0.2804733216762543, + "learning_rate": 7.981278482571746e-05, + "loss": 0.2311, + "step": 4884 + }, + { + "epoch": 1.803914327917282, + "grad_norm": 0.24930794537067413, + "learning_rate": 7.978815125015396e-05, + "loss": 0.2092, + "step": 4885 + }, + { + "epoch": 1.8042836041358936, + "grad_norm": 0.25951051712036133, + "learning_rate": 7.976351767459047e-05, + "loss": 0.2201, + "step": 4886 + }, + { + "epoch": 1.8046528803545052, + "grad_norm": 0.2643817663192749, + "learning_rate": 7.973888409902698e-05, + "loss": 0.2552, + "step": 4887 + }, + { + "epoch": 1.8050221565731168, + "grad_norm": 0.2603071331977844, + "learning_rate": 7.971425052346348e-05, + "loss": 0.2065, + "step": 4888 + }, + { + "epoch": 1.8053914327917282, + "grad_norm": 0.26162391901016235, + "learning_rate": 7.96896169479e-05, + "loss": 0.1834, + "step": 4889 + }, + { + "epoch": 1.8057607090103396, + "grad_norm": 0.2756583094596863, + "learning_rate": 7.96649833723365e-05, + "loss": 0.199, + "step": 4890 + }, + { + "epoch": 1.8061299852289512, + "grad_norm": 0.3199632167816162, + "learning_rate": 7.964034979677301e-05, + "loss": 0.2425, + "step": 4891 + }, + { + "epoch": 1.8064992614475628, + "grad_norm": 0.23668265342712402, + "learning_rate": 7.961571622120951e-05, + "loss": 0.1897, + "step": 4892 + }, + { + "epoch": 1.8068685376661744, + "grad_norm": 0.2989102005958557, + "learning_rate": 7.959108264564601e-05, + "loss": 0.201, + "step": 4893 + }, + { + "epoch": 1.8072378138847858, + "grad_norm": 0.25371021032333374, + "learning_rate": 7.956644907008253e-05, + "loss": 0.2087, + "step": 4894 + }, + { + "epoch": 1.8076070901033974, + "grad_norm": 0.2640797793865204, + "learning_rate": 7.954181549451903e-05, + "loss": 0.2091, + "step": 4895 + }, + { + "epoch": 1.8079763663220088, + "grad_norm": 0.2927210330963135, + "learning_rate": 7.951718191895554e-05, + "loss": 0.2631, + "step": 4896 + }, + { + "epoch": 1.8083456425406204, + "grad_norm": 0.2759553790092468, + "learning_rate": 7.949254834339205e-05, + "loss": 0.1957, + "step": 4897 + }, + { + "epoch": 1.808714918759232, + "grad_norm": 0.24155765771865845, + "learning_rate": 7.946791476782856e-05, + "loss": 0.1931, + "step": 4898 + }, + { + "epoch": 1.8090841949778436, + "grad_norm": 0.25750887393951416, + "learning_rate": 7.944328119226506e-05, + "loss": 0.2149, + "step": 4899 + }, + { + "epoch": 1.809453471196455, + "grad_norm": 0.28763970732688904, + "learning_rate": 7.941864761670158e-05, + "loss": 0.219, + "step": 4900 + }, + { + "epoch": 1.809453471196455, + "eval_loss": 8.389344215393066, + "eval_runtime": 6.9087, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 1.013, + "step": 4900 + }, + { + "epoch": 1.8098227474150663, + "grad_norm": 0.2834678292274475, + "learning_rate": 7.939401404113808e-05, + "loss": 0.2655, + "step": 4901 + }, + { + "epoch": 1.810192023633678, + "grad_norm": 0.2416168451309204, + "learning_rate": 7.936938046557458e-05, + "loss": 0.216, + "step": 4902 + }, + { + "epoch": 1.8105612998522895, + "grad_norm": 0.2716910243034363, + "learning_rate": 7.934474689001109e-05, + "loss": 0.2228, + "step": 4903 + }, + { + "epoch": 1.8109305760709011, + "grad_norm": 0.31382885575294495, + "learning_rate": 7.93201133144476e-05, + "loss": 0.254, + "step": 4904 + }, + { + "epoch": 1.8112998522895125, + "grad_norm": 0.3279086947441101, + "learning_rate": 7.929547973888411e-05, + "loss": 0.2321, + "step": 4905 + }, + { + "epoch": 1.8116691285081241, + "grad_norm": 0.27616676688194275, + "learning_rate": 7.927084616332061e-05, + "loss": 0.2222, + "step": 4906 + }, + { + "epoch": 1.8120384047267355, + "grad_norm": 0.24837574362754822, + "learning_rate": 7.924621258775712e-05, + "loss": 0.2216, + "step": 4907 + }, + { + "epoch": 1.812407680945347, + "grad_norm": 0.3547925353050232, + "learning_rate": 7.922157901219363e-05, + "loss": 0.3071, + "step": 4908 + }, + { + "epoch": 1.8127769571639587, + "grad_norm": 0.21705329418182373, + "learning_rate": 7.919694543663013e-05, + "loss": 0.1904, + "step": 4909 + }, + { + "epoch": 1.8131462333825703, + "grad_norm": 0.2728787064552307, + "learning_rate": 7.917231186106664e-05, + "loss": 0.2308, + "step": 4910 + }, + { + "epoch": 1.8135155096011817, + "grad_norm": 0.2706068456172943, + "learning_rate": 7.914767828550314e-05, + "loss": 0.1898, + "step": 4911 + }, + { + "epoch": 1.813884785819793, + "grad_norm": 0.28375810384750366, + "learning_rate": 7.912304470993966e-05, + "loss": 0.2431, + "step": 4912 + }, + { + "epoch": 1.8142540620384047, + "grad_norm": 0.3095022141933441, + "learning_rate": 7.909841113437616e-05, + "loss": 0.2642, + "step": 4913 + }, + { + "epoch": 1.8146233382570163, + "grad_norm": 0.26290616393089294, + "learning_rate": 7.907377755881267e-05, + "loss": 0.2391, + "step": 4914 + }, + { + "epoch": 1.8149926144756279, + "grad_norm": 0.24634678661823273, + "learning_rate": 7.904914398324917e-05, + "loss": 0.2119, + "step": 4915 + }, + { + "epoch": 1.8153618906942393, + "grad_norm": 0.25221940875053406, + "learning_rate": 7.902451040768567e-05, + "loss": 0.1937, + "step": 4916 + }, + { + "epoch": 1.8157311669128509, + "grad_norm": 0.22208702564239502, + "learning_rate": 7.899987683212219e-05, + "loss": 0.2028, + "step": 4917 + }, + { + "epoch": 1.8161004431314622, + "grad_norm": 0.23337139189243317, + "learning_rate": 7.897524325655869e-05, + "loss": 0.1834, + "step": 4918 + }, + { + "epoch": 1.8164697193500738, + "grad_norm": 0.2484610378742218, + "learning_rate": 7.89506096809952e-05, + "loss": 0.1876, + "step": 4919 + }, + { + "epoch": 1.8168389955686854, + "grad_norm": 0.2622143626213074, + "learning_rate": 7.89259761054317e-05, + "loss": 0.1954, + "step": 4920 + }, + { + "epoch": 1.817208271787297, + "grad_norm": 0.29779428243637085, + "learning_rate": 7.890134252986822e-05, + "loss": 0.2177, + "step": 4921 + }, + { + "epoch": 1.8175775480059084, + "grad_norm": 0.2187691330909729, + "learning_rate": 7.887670895430472e-05, + "loss": 0.1975, + "step": 4922 + }, + { + "epoch": 1.8179468242245198, + "grad_norm": 0.25445714592933655, + "learning_rate": 7.885207537874124e-05, + "loss": 0.2046, + "step": 4923 + }, + { + "epoch": 1.8183161004431314, + "grad_norm": 0.3028862476348877, + "learning_rate": 7.882744180317774e-05, + "loss": 0.2128, + "step": 4924 + }, + { + "epoch": 1.818685376661743, + "grad_norm": 0.2241683155298233, + "learning_rate": 7.880280822761424e-05, + "loss": 0.2043, + "step": 4925 + }, + { + "epoch": 1.8190546528803546, + "grad_norm": 0.3093726933002472, + "learning_rate": 7.877817465205075e-05, + "loss": 0.2413, + "step": 4926 + }, + { + "epoch": 1.819423929098966, + "grad_norm": 0.32770586013793945, + "learning_rate": 7.875354107648725e-05, + "loss": 0.2328, + "step": 4927 + }, + { + "epoch": 1.8197932053175776, + "grad_norm": 0.27520349621772766, + "learning_rate": 7.872890750092377e-05, + "loss": 0.2157, + "step": 4928 + }, + { + "epoch": 1.820162481536189, + "grad_norm": 0.28705155849456787, + "learning_rate": 7.870427392536027e-05, + "loss": 0.2423, + "step": 4929 + }, + { + "epoch": 1.8205317577548006, + "grad_norm": 0.23400430381298065, + "learning_rate": 7.867964034979678e-05, + "loss": 0.2119, + "step": 4930 + }, + { + "epoch": 1.8209010339734122, + "grad_norm": 0.29326730966567993, + "learning_rate": 7.865500677423329e-05, + "loss": 0.2109, + "step": 4931 + }, + { + "epoch": 1.8212703101920238, + "grad_norm": 0.3255024254322052, + "learning_rate": 7.863037319866979e-05, + "loss": 0.2397, + "step": 4932 + }, + { + "epoch": 1.8216395864106352, + "grad_norm": 0.24067874252796173, + "learning_rate": 7.86057396231063e-05, + "loss": 0.211, + "step": 4933 + }, + { + "epoch": 1.8220088626292466, + "grad_norm": 0.28856396675109863, + "learning_rate": 7.85811060475428e-05, + "loss": 0.2484, + "step": 4934 + }, + { + "epoch": 1.8223781388478582, + "grad_norm": 0.2879740595817566, + "learning_rate": 7.855647247197932e-05, + "loss": 0.2322, + "step": 4935 + }, + { + "epoch": 1.8227474150664698, + "grad_norm": 0.2875503897666931, + "learning_rate": 7.853183889641582e-05, + "loss": 0.2622, + "step": 4936 + }, + { + "epoch": 1.8231166912850814, + "grad_norm": 0.29062366485595703, + "learning_rate": 7.850720532085233e-05, + "loss": 0.2535, + "step": 4937 + }, + { + "epoch": 1.8234859675036927, + "grad_norm": 0.2513425052165985, + "learning_rate": 7.848257174528883e-05, + "loss": 0.1921, + "step": 4938 + }, + { + "epoch": 1.8238552437223041, + "grad_norm": 0.3021498918533325, + "learning_rate": 7.845793816972535e-05, + "loss": 0.2251, + "step": 4939 + }, + { + "epoch": 1.8242245199409157, + "grad_norm": 0.254720538854599, + "learning_rate": 7.843330459416185e-05, + "loss": 0.1784, + "step": 4940 + }, + { + "epoch": 1.8245937961595273, + "grad_norm": 0.21871700882911682, + "learning_rate": 7.840867101859835e-05, + "loss": 0.1815, + "step": 4941 + }, + { + "epoch": 1.824963072378139, + "grad_norm": 0.2641341984272003, + "learning_rate": 7.838403744303487e-05, + "loss": 0.2071, + "step": 4942 + }, + { + "epoch": 1.8253323485967505, + "grad_norm": 0.21553672850131989, + "learning_rate": 7.835940386747137e-05, + "loss": 0.1873, + "step": 4943 + }, + { + "epoch": 1.825701624815362, + "grad_norm": 0.259754478931427, + "learning_rate": 7.833477029190788e-05, + "loss": 0.1979, + "step": 4944 + }, + { + "epoch": 1.8260709010339733, + "grad_norm": 0.29389017820358276, + "learning_rate": 7.831013671634438e-05, + "loss": 0.2437, + "step": 4945 + }, + { + "epoch": 1.826440177252585, + "grad_norm": 0.26598039269447327, + "learning_rate": 7.82855031407809e-05, + "loss": 0.2129, + "step": 4946 + }, + { + "epoch": 1.8268094534711965, + "grad_norm": 0.3003842234611511, + "learning_rate": 7.82608695652174e-05, + "loss": 0.2419, + "step": 4947 + }, + { + "epoch": 1.827178729689808, + "grad_norm": 0.22277085483074188, + "learning_rate": 7.82362359896539e-05, + "loss": 0.1896, + "step": 4948 + }, + { + "epoch": 1.8275480059084195, + "grad_norm": 0.2221381813287735, + "learning_rate": 7.821160241409041e-05, + "loss": 0.1772, + "step": 4949 + }, + { + "epoch": 1.8279172821270309, + "grad_norm": 0.31694450974464417, + "learning_rate": 7.818696883852691e-05, + "loss": 0.1947, + "step": 4950 + }, + { + "epoch": 1.8279172821270309, + "eval_loss": 8.493175506591797, + "eval_runtime": 6.982, + "eval_samples_per_second": 7.161, + "eval_steps_per_second": 1.003, + "step": 4950 + }, + { + "epoch": 1.8282865583456425, + "grad_norm": 0.22468301653862, + "learning_rate": 7.816233526296343e-05, + "loss": 0.19, + "step": 4951 + }, + { + "epoch": 1.828655834564254, + "grad_norm": 0.2737385630607605, + "learning_rate": 7.813770168739993e-05, + "loss": 0.236, + "step": 4952 + }, + { + "epoch": 1.8290251107828657, + "grad_norm": 0.30210262537002563, + "learning_rate": 7.811306811183645e-05, + "loss": 0.2542, + "step": 4953 + }, + { + "epoch": 1.829394387001477, + "grad_norm": 0.2417190670967102, + "learning_rate": 7.808843453627295e-05, + "loss": 0.1843, + "step": 4954 + }, + { + "epoch": 1.8297636632200887, + "grad_norm": 0.25768449902534485, + "learning_rate": 7.806380096070946e-05, + "loss": 0.2206, + "step": 4955 + }, + { + "epoch": 1.8301329394387, + "grad_norm": 0.29524388909339905, + "learning_rate": 7.803916738514596e-05, + "loss": 0.2069, + "step": 4956 + }, + { + "epoch": 1.8305022156573116, + "grad_norm": 0.32162609696388245, + "learning_rate": 7.801453380958246e-05, + "loss": 0.2285, + "step": 4957 + }, + { + "epoch": 1.8308714918759232, + "grad_norm": 0.2545388638973236, + "learning_rate": 7.798990023401898e-05, + "loss": 0.2502, + "step": 4958 + }, + { + "epoch": 1.8312407680945348, + "grad_norm": 0.23092465102672577, + "learning_rate": 7.796526665845548e-05, + "loss": 0.2012, + "step": 4959 + }, + { + "epoch": 1.8316100443131462, + "grad_norm": 0.22163672745227814, + "learning_rate": 7.7940633082892e-05, + "loss": 0.1968, + "step": 4960 + }, + { + "epoch": 1.8319793205317576, + "grad_norm": 0.2829563021659851, + "learning_rate": 7.79159995073285e-05, + "loss": 0.2097, + "step": 4961 + }, + { + "epoch": 1.8323485967503692, + "grad_norm": 0.2914331257343292, + "learning_rate": 7.789136593176501e-05, + "loss": 0.2329, + "step": 4962 + }, + { + "epoch": 1.8327178729689808, + "grad_norm": 0.22718369960784912, + "learning_rate": 7.786673235620151e-05, + "loss": 0.2073, + "step": 4963 + }, + { + "epoch": 1.8330871491875924, + "grad_norm": 0.2333722859621048, + "learning_rate": 7.784209878063801e-05, + "loss": 0.2073, + "step": 4964 + }, + { + "epoch": 1.8334564254062038, + "grad_norm": 0.35279470682144165, + "learning_rate": 7.781746520507453e-05, + "loss": 0.2774, + "step": 4965 + }, + { + "epoch": 1.8338257016248154, + "grad_norm": 0.19717776775360107, + "learning_rate": 7.779283162951103e-05, + "loss": 0.1622, + "step": 4966 + }, + { + "epoch": 1.8341949778434268, + "grad_norm": 0.21716821193695068, + "learning_rate": 7.776819805394754e-05, + "loss": 0.2016, + "step": 4967 + }, + { + "epoch": 1.8345642540620384, + "grad_norm": 0.20469699800014496, + "learning_rate": 7.774356447838404e-05, + "loss": 0.1898, + "step": 4968 + }, + { + "epoch": 1.83493353028065, + "grad_norm": 0.25230807065963745, + "learning_rate": 7.771893090282056e-05, + "loss": 0.2221, + "step": 4969 + }, + { + "epoch": 1.8353028064992616, + "grad_norm": 0.27665260434150696, + "learning_rate": 7.769429732725706e-05, + "loss": 0.2476, + "step": 4970 + }, + { + "epoch": 1.835672082717873, + "grad_norm": 0.19727131724357605, + "learning_rate": 7.766966375169357e-05, + "loss": 0.1503, + "step": 4971 + }, + { + "epoch": 1.8360413589364843, + "grad_norm": 0.2601411044597626, + "learning_rate": 7.764503017613007e-05, + "loss": 0.2557, + "step": 4972 + }, + { + "epoch": 1.836410635155096, + "grad_norm": 0.2841900587081909, + "learning_rate": 7.762039660056658e-05, + "loss": 0.2348, + "step": 4973 + }, + { + "epoch": 1.8367799113737076, + "grad_norm": 0.2924654483795166, + "learning_rate": 7.759576302500309e-05, + "loss": 0.1987, + "step": 4974 + }, + { + "epoch": 1.8371491875923192, + "grad_norm": 0.2486112266778946, + "learning_rate": 7.757112944943959e-05, + "loss": 0.1875, + "step": 4975 + }, + { + "epoch": 1.8375184638109305, + "grad_norm": 0.27105146646499634, + "learning_rate": 7.75464958738761e-05, + "loss": 0.2347, + "step": 4976 + }, + { + "epoch": 1.8378877400295421, + "grad_norm": 0.2482648640871048, + "learning_rate": 7.752186229831261e-05, + "loss": 0.2321, + "step": 4977 + }, + { + "epoch": 1.8382570162481535, + "grad_norm": 0.28159570693969727, + "learning_rate": 7.749722872274912e-05, + "loss": 0.2224, + "step": 4978 + }, + { + "epoch": 1.8386262924667651, + "grad_norm": 0.28768110275268555, + "learning_rate": 7.747259514718562e-05, + "loss": 0.2505, + "step": 4979 + }, + { + "epoch": 1.8389955686853767, + "grad_norm": 0.2761133015155792, + "learning_rate": 7.744796157162212e-05, + "loss": 0.2079, + "step": 4980 + }, + { + "epoch": 1.8393648449039883, + "grad_norm": 0.26142653822898865, + "learning_rate": 7.742332799605864e-05, + "loss": 0.2028, + "step": 4981 + }, + { + "epoch": 1.8397341211225997, + "grad_norm": 0.2538490295410156, + "learning_rate": 7.739869442049514e-05, + "loss": 0.2248, + "step": 4982 + }, + { + "epoch": 1.840103397341211, + "grad_norm": 0.24057616293430328, + "learning_rate": 7.737406084493165e-05, + "loss": 0.2108, + "step": 4983 + }, + { + "epoch": 1.8404726735598227, + "grad_norm": 0.2461548000574112, + "learning_rate": 7.734942726936816e-05, + "loss": 0.1793, + "step": 4984 + }, + { + "epoch": 1.8408419497784343, + "grad_norm": 0.24027347564697266, + "learning_rate": 7.732479369380467e-05, + "loss": 0.2087, + "step": 4985 + }, + { + "epoch": 1.841211225997046, + "grad_norm": 0.32156988978385925, + "learning_rate": 7.730016011824117e-05, + "loss": 0.2189, + "step": 4986 + }, + { + "epoch": 1.8415805022156573, + "grad_norm": 0.24344180524349213, + "learning_rate": 7.727552654267769e-05, + "loss": 0.1932, + "step": 4987 + }, + { + "epoch": 1.8419497784342689, + "grad_norm": 0.31036823987960815, + "learning_rate": 7.725089296711419e-05, + "loss": 0.2347, + "step": 4988 + }, + { + "epoch": 1.8423190546528803, + "grad_norm": 0.2929305136203766, + "learning_rate": 7.722625939155069e-05, + "loss": 0.2053, + "step": 4989 + }, + { + "epoch": 1.8426883308714919, + "grad_norm": 0.3108742833137512, + "learning_rate": 7.72016258159872e-05, + "loss": 0.2174, + "step": 4990 + }, + { + "epoch": 1.8430576070901035, + "grad_norm": 0.22484628856182098, + "learning_rate": 7.71769922404237e-05, + "loss": 0.198, + "step": 4991 + }, + { + "epoch": 1.843426883308715, + "grad_norm": 0.2797197699546814, + "learning_rate": 7.715235866486022e-05, + "loss": 0.2003, + "step": 4992 + }, + { + "epoch": 1.8437961595273265, + "grad_norm": 0.24485258758068085, + "learning_rate": 7.71277250892967e-05, + "loss": 0.2112, + "step": 4993 + }, + { + "epoch": 1.8441654357459378, + "grad_norm": 0.27039220929145813, + "learning_rate": 7.710309151373322e-05, + "loss": 0.1976, + "step": 4994 + }, + { + "epoch": 1.8445347119645494, + "grad_norm": 0.2981707751750946, + "learning_rate": 7.707845793816972e-05, + "loss": 0.2104, + "step": 4995 + }, + { + "epoch": 1.844903988183161, + "grad_norm": 0.26567134261131287, + "learning_rate": 7.705382436260624e-05, + "loss": 0.2297, + "step": 4996 + }, + { + "epoch": 1.8452732644017726, + "grad_norm": 0.3407639265060425, + "learning_rate": 7.702919078704274e-05, + "loss": 0.2323, + "step": 4997 + }, + { + "epoch": 1.845642540620384, + "grad_norm": 0.2924421727657318, + "learning_rate": 7.700455721147924e-05, + "loss": 0.2604, + "step": 4998 + }, + { + "epoch": 1.8460118168389956, + "grad_norm": 0.3046894073486328, + "learning_rate": 7.697992363591575e-05, + "loss": 0.2278, + "step": 4999 + }, + { + "epoch": 1.846381093057607, + "grad_norm": 0.24182730913162231, + "learning_rate": 7.695529006035225e-05, + "loss": 0.1688, + "step": 5000 + }, + { + "epoch": 1.846381093057607, + "eval_loss": 8.546396255493164, + "eval_runtime": 6.9143, + "eval_samples_per_second": 7.231, + "eval_steps_per_second": 1.012, + "step": 5000 + }, + { + "epoch": 1.8467503692762186, + "grad_norm": 0.25284621119499207, + "learning_rate": 7.693065648478877e-05, + "loss": 0.2165, + "step": 5001 + }, + { + "epoch": 1.8471196454948302, + "grad_norm": 0.23829656839370728, + "learning_rate": 7.690602290922527e-05, + "loss": 0.1971, + "step": 5002 + }, + { + "epoch": 1.8474889217134418, + "grad_norm": 0.2520526647567749, + "learning_rate": 7.688138933366178e-05, + "loss": 0.2361, + "step": 5003 + }, + { + "epoch": 1.8478581979320532, + "grad_norm": 0.2554837763309479, + "learning_rate": 7.685675575809829e-05, + "loss": 0.2239, + "step": 5004 + }, + { + "epoch": 1.8482274741506646, + "grad_norm": 0.23465128242969513, + "learning_rate": 7.68321221825348e-05, + "loss": 0.1939, + "step": 5005 + }, + { + "epoch": 1.8485967503692762, + "grad_norm": 0.2598876953125, + "learning_rate": 7.68074886069713e-05, + "loss": 0.2587, + "step": 5006 + }, + { + "epoch": 1.8489660265878878, + "grad_norm": 0.2522546947002411, + "learning_rate": 7.67828550314078e-05, + "loss": 0.2164, + "step": 5007 + }, + { + "epoch": 1.8493353028064994, + "grad_norm": 0.30059924721717834, + "learning_rate": 7.675822145584432e-05, + "loss": 0.2691, + "step": 5008 + }, + { + "epoch": 1.8497045790251108, + "grad_norm": 0.28205376863479614, + "learning_rate": 7.673358788028082e-05, + "loss": 0.2376, + "step": 5009 + }, + { + "epoch": 1.8500738552437221, + "grad_norm": 0.27052178978919983, + "learning_rate": 7.670895430471733e-05, + "loss": 0.2372, + "step": 5010 + }, + { + "epoch": 1.8504431314623337, + "grad_norm": 0.3611307740211487, + "learning_rate": 7.668432072915383e-05, + "loss": 0.2557, + "step": 5011 + }, + { + "epoch": 1.8508124076809453, + "grad_norm": 0.23945903778076172, + "learning_rate": 7.665968715359035e-05, + "loss": 0.2107, + "step": 5012 + }, + { + "epoch": 1.851181683899557, + "grad_norm": 0.2769378125667572, + "learning_rate": 7.663505357802685e-05, + "loss": 0.2629, + "step": 5013 + }, + { + "epoch": 1.8515509601181686, + "grad_norm": 0.23962347209453583, + "learning_rate": 7.661042000246335e-05, + "loss": 0.1993, + "step": 5014 + }, + { + "epoch": 1.85192023633678, + "grad_norm": 0.24721671640872955, + "learning_rate": 7.658578642689986e-05, + "loss": 0.2139, + "step": 5015 + }, + { + "epoch": 1.8522895125553913, + "grad_norm": 0.24365712702274323, + "learning_rate": 7.656115285133637e-05, + "loss": 0.1932, + "step": 5016 + }, + { + "epoch": 1.852658788774003, + "grad_norm": 0.2605902850627899, + "learning_rate": 7.653651927577288e-05, + "loss": 0.2067, + "step": 5017 + }, + { + "epoch": 1.8530280649926145, + "grad_norm": 0.21690890192985535, + "learning_rate": 7.651188570020938e-05, + "loss": 0.1889, + "step": 5018 + }, + { + "epoch": 1.8533973412112261, + "grad_norm": 0.24974678456783295, + "learning_rate": 7.64872521246459e-05, + "loss": 0.2191, + "step": 5019 + }, + { + "epoch": 1.8537666174298375, + "grad_norm": 0.2690393924713135, + "learning_rate": 7.64626185490824e-05, + "loss": 0.2144, + "step": 5020 + }, + { + "epoch": 1.8541358936484489, + "grad_norm": 0.29325082898139954, + "learning_rate": 7.643798497351891e-05, + "loss": 0.2306, + "step": 5021 + }, + { + "epoch": 1.8545051698670605, + "grad_norm": 0.25871407985687256, + "learning_rate": 7.641335139795541e-05, + "loss": 0.2033, + "step": 5022 + }, + { + "epoch": 1.854874446085672, + "grad_norm": 0.3737225830554962, + "learning_rate": 7.638871782239191e-05, + "loss": 0.2352, + "step": 5023 + }, + { + "epoch": 1.8552437223042837, + "grad_norm": 0.21670207381248474, + "learning_rate": 7.636408424682843e-05, + "loss": 0.2164, + "step": 5024 + }, + { + "epoch": 1.855612998522895, + "grad_norm": 0.24353617429733276, + "learning_rate": 7.633945067126493e-05, + "loss": 0.2049, + "step": 5025 + }, + { + "epoch": 1.8559822747415067, + "grad_norm": 0.29860228300094604, + "learning_rate": 7.631481709570144e-05, + "loss": 0.232, + "step": 5026 + }, + { + "epoch": 1.856351550960118, + "grad_norm": 0.238505020737648, + "learning_rate": 7.629018352013795e-05, + "loss": 0.1928, + "step": 5027 + }, + { + "epoch": 1.8567208271787297, + "grad_norm": 0.318330854177475, + "learning_rate": 7.626554994457446e-05, + "loss": 0.2214, + "step": 5028 + }, + { + "epoch": 1.8570901033973413, + "grad_norm": 0.2553394138813019, + "learning_rate": 7.624091636901096e-05, + "loss": 0.1926, + "step": 5029 + }, + { + "epoch": 1.8574593796159529, + "grad_norm": 0.22410456836223602, + "learning_rate": 7.621628279344746e-05, + "loss": 0.2114, + "step": 5030 + }, + { + "epoch": 1.8578286558345642, + "grad_norm": 0.22183454036712646, + "learning_rate": 7.619164921788398e-05, + "loss": 0.1786, + "step": 5031 + }, + { + "epoch": 1.8581979320531756, + "grad_norm": 0.2652347981929779, + "learning_rate": 7.616701564232048e-05, + "loss": 0.2178, + "step": 5032 + }, + { + "epoch": 1.8585672082717872, + "grad_norm": 0.2816067636013031, + "learning_rate": 7.614238206675699e-05, + "loss": 0.1948, + "step": 5033 + }, + { + "epoch": 1.8589364844903988, + "grad_norm": 0.26688727736473083, + "learning_rate": 7.61177484911935e-05, + "loss": 0.2174, + "step": 5034 + }, + { + "epoch": 1.8593057607090104, + "grad_norm": 0.25919273495674133, + "learning_rate": 7.609311491563001e-05, + "loss": 0.2587, + "step": 5035 + }, + { + "epoch": 1.8596750369276218, + "grad_norm": 0.24714429676532745, + "learning_rate": 7.606848134006651e-05, + "loss": 0.1994, + "step": 5036 + }, + { + "epoch": 1.8600443131462334, + "grad_norm": 0.2927660048007965, + "learning_rate": 7.604384776450302e-05, + "loss": 0.2773, + "step": 5037 + }, + { + "epoch": 1.8604135893648448, + "grad_norm": 0.27404817938804626, + "learning_rate": 7.601921418893953e-05, + "loss": 0.2124, + "step": 5038 + }, + { + "epoch": 1.8607828655834564, + "grad_norm": 0.23629409074783325, + "learning_rate": 7.599458061337603e-05, + "loss": 0.1981, + "step": 5039 + }, + { + "epoch": 1.861152141802068, + "grad_norm": 0.29185813665390015, + "learning_rate": 7.596994703781254e-05, + "loss": 0.2378, + "step": 5040 + }, + { + "epoch": 1.8615214180206796, + "grad_norm": 0.24912190437316895, + "learning_rate": 7.594531346224904e-05, + "loss": 0.2178, + "step": 5041 + }, + { + "epoch": 1.861890694239291, + "grad_norm": 0.2529584467411041, + "learning_rate": 7.592067988668556e-05, + "loss": 0.1932, + "step": 5042 + }, + { + "epoch": 1.8622599704579024, + "grad_norm": 0.2637450098991394, + "learning_rate": 7.589604631112206e-05, + "loss": 0.2092, + "step": 5043 + }, + { + "epoch": 1.862629246676514, + "grad_norm": 0.30589428544044495, + "learning_rate": 7.587141273555857e-05, + "loss": 0.2481, + "step": 5044 + }, + { + "epoch": 1.8629985228951256, + "grad_norm": 0.253279447555542, + "learning_rate": 7.584677915999507e-05, + "loss": 0.2073, + "step": 5045 + }, + { + "epoch": 1.8633677991137372, + "grad_norm": 0.27927908301353455, + "learning_rate": 7.582214558443157e-05, + "loss": 0.2271, + "step": 5046 + }, + { + "epoch": 1.8637370753323486, + "grad_norm": 0.27703237533569336, + "learning_rate": 7.579751200886809e-05, + "loss": 0.2011, + "step": 5047 + }, + { + "epoch": 1.8641063515509602, + "grad_norm": 0.23969240486621857, + "learning_rate": 7.577287843330459e-05, + "loss": 0.197, + "step": 5048 + }, + { + "epoch": 1.8644756277695715, + "grad_norm": 0.2620817720890045, + "learning_rate": 7.57482448577411e-05, + "loss": 0.2097, + "step": 5049 + }, + { + "epoch": 1.8648449039881831, + "grad_norm": 0.2675533592700958, + "learning_rate": 7.57236112821776e-05, + "loss": 0.2241, + "step": 5050 + }, + { + "epoch": 1.8648449039881831, + "eval_loss": 8.633878707885742, + "eval_runtime": 6.9312, + "eval_samples_per_second": 7.214, + "eval_steps_per_second": 1.01, + "step": 5050 + }, + { + "epoch": 1.8652141802067947, + "grad_norm": 0.28639355301856995, + "learning_rate": 7.569897770661412e-05, + "loss": 0.2282, + "step": 5051 + }, + { + "epoch": 1.8655834564254064, + "grad_norm": 0.313135027885437, + "learning_rate": 7.567434413105062e-05, + "loss": 0.2347, + "step": 5052 + }, + { + "epoch": 1.8659527326440177, + "grad_norm": 0.21219402551651, + "learning_rate": 7.564971055548714e-05, + "loss": 0.1979, + "step": 5053 + }, + { + "epoch": 1.8663220088626291, + "grad_norm": 0.2504329979419708, + "learning_rate": 7.562507697992364e-05, + "loss": 0.2105, + "step": 5054 + }, + { + "epoch": 1.8666912850812407, + "grad_norm": 0.2170252501964569, + "learning_rate": 7.560044340436014e-05, + "loss": 0.1736, + "step": 5055 + }, + { + "epoch": 1.8670605612998523, + "grad_norm": 0.2863474488258362, + "learning_rate": 7.557580982879665e-05, + "loss": 0.2376, + "step": 5056 + }, + { + "epoch": 1.867429837518464, + "grad_norm": 0.25895577669143677, + "learning_rate": 7.555117625323315e-05, + "loss": 0.2071, + "step": 5057 + }, + { + "epoch": 1.8677991137370753, + "grad_norm": 0.2513246238231659, + "learning_rate": 7.552654267766967e-05, + "loss": 0.2056, + "step": 5058 + }, + { + "epoch": 1.868168389955687, + "grad_norm": 0.23155029118061066, + "learning_rate": 7.550190910210617e-05, + "loss": 0.2161, + "step": 5059 + }, + { + "epoch": 1.8685376661742983, + "grad_norm": 0.22597303986549377, + "learning_rate": 7.547727552654268e-05, + "loss": 0.2046, + "step": 5060 + }, + { + "epoch": 1.8689069423929099, + "grad_norm": 0.2339690923690796, + "learning_rate": 7.545264195097919e-05, + "loss": 0.1976, + "step": 5061 + }, + { + "epoch": 1.8692762186115215, + "grad_norm": 0.2399640530347824, + "learning_rate": 7.542800837541569e-05, + "loss": 0.2293, + "step": 5062 + }, + { + "epoch": 1.869645494830133, + "grad_norm": 0.30100929737091064, + "learning_rate": 7.54033747998522e-05, + "loss": 0.219, + "step": 5063 + }, + { + "epoch": 1.8700147710487445, + "grad_norm": 0.30362051725387573, + "learning_rate": 7.53787412242887e-05, + "loss": 0.2099, + "step": 5064 + }, + { + "epoch": 1.8703840472673559, + "grad_norm": 0.33054691553115845, + "learning_rate": 7.535410764872522e-05, + "loss": 0.2277, + "step": 5065 + }, + { + "epoch": 1.8707533234859675, + "grad_norm": 0.24478068947792053, + "learning_rate": 7.532947407316172e-05, + "loss": 0.2184, + "step": 5066 + }, + { + "epoch": 1.871122599704579, + "grad_norm": 0.25750991702079773, + "learning_rate": 7.530484049759823e-05, + "loss": 0.2144, + "step": 5067 + }, + { + "epoch": 1.8714918759231907, + "grad_norm": 0.2703867554664612, + "learning_rate": 7.528020692203473e-05, + "loss": 0.238, + "step": 5068 + }, + { + "epoch": 1.871861152141802, + "grad_norm": 0.32197922468185425, + "learning_rate": 7.525557334647124e-05, + "loss": 0.2587, + "step": 5069 + }, + { + "epoch": 1.8722304283604134, + "grad_norm": 0.29007717967033386, + "learning_rate": 7.523093977090775e-05, + "loss": 0.2347, + "step": 5070 + }, + { + "epoch": 1.872599704579025, + "grad_norm": 0.2746235728263855, + "learning_rate": 7.520630619534425e-05, + "loss": 0.2378, + "step": 5071 + }, + { + "epoch": 1.8729689807976366, + "grad_norm": 0.2863379120826721, + "learning_rate": 7.518167261978077e-05, + "loss": 0.215, + "step": 5072 + }, + { + "epoch": 1.8733382570162482, + "grad_norm": 0.2593397796154022, + "learning_rate": 7.515703904421727e-05, + "loss": 0.2682, + "step": 5073 + }, + { + "epoch": 1.8737075332348598, + "grad_norm": 0.28525978326797485, + "learning_rate": 7.513240546865378e-05, + "loss": 0.2311, + "step": 5074 + }, + { + "epoch": 1.8740768094534712, + "grad_norm": 0.31422123312950134, + "learning_rate": 7.510777189309028e-05, + "loss": 0.2592, + "step": 5075 + }, + { + "epoch": 1.8744460856720826, + "grad_norm": 0.2931804358959198, + "learning_rate": 7.50831383175268e-05, + "loss": 0.2329, + "step": 5076 + }, + { + "epoch": 1.8748153618906942, + "grad_norm": 0.24347998201847076, + "learning_rate": 7.50585047419633e-05, + "loss": 0.2012, + "step": 5077 + }, + { + "epoch": 1.8751846381093058, + "grad_norm": 0.2852291762828827, + "learning_rate": 7.50338711663998e-05, + "loss": 0.2108, + "step": 5078 + }, + { + "epoch": 1.8755539143279174, + "grad_norm": 0.25467193126678467, + "learning_rate": 7.500923759083631e-05, + "loss": 0.2038, + "step": 5079 + }, + { + "epoch": 1.8759231905465288, + "grad_norm": 0.33976683020591736, + "learning_rate": 7.498460401527282e-05, + "loss": 0.2846, + "step": 5080 + }, + { + "epoch": 1.8762924667651402, + "grad_norm": 0.2470531314611435, + "learning_rate": 7.495997043970933e-05, + "loss": 0.2099, + "step": 5081 + }, + { + "epoch": 1.8766617429837518, + "grad_norm": 0.2801273465156555, + "learning_rate": 7.493533686414583e-05, + "loss": 0.2309, + "step": 5082 + }, + { + "epoch": 1.8770310192023634, + "grad_norm": 0.23430602252483368, + "learning_rate": 7.491070328858235e-05, + "loss": 0.182, + "step": 5083 + }, + { + "epoch": 1.877400295420975, + "grad_norm": 0.20895348489284515, + "learning_rate": 7.488606971301885e-05, + "loss": 0.18, + "step": 5084 + }, + { + "epoch": 1.8777695716395866, + "grad_norm": 0.26840075850486755, + "learning_rate": 7.486143613745535e-05, + "loss": 0.222, + "step": 5085 + }, + { + "epoch": 1.878138847858198, + "grad_norm": 0.2851375341415405, + "learning_rate": 7.483680256189186e-05, + "loss": 0.2167, + "step": 5086 + }, + { + "epoch": 1.8785081240768093, + "grad_norm": 0.2629348337650299, + "learning_rate": 7.481216898632836e-05, + "loss": 0.1893, + "step": 5087 + }, + { + "epoch": 1.878877400295421, + "grad_norm": 0.25359249114990234, + "learning_rate": 7.478753541076488e-05, + "loss": 0.206, + "step": 5088 + }, + { + "epoch": 1.8792466765140325, + "grad_norm": 0.31631186604499817, + "learning_rate": 7.476290183520138e-05, + "loss": 0.2797, + "step": 5089 + }, + { + "epoch": 1.8796159527326441, + "grad_norm": 0.2985302805900574, + "learning_rate": 7.47382682596379e-05, + "loss": 0.2134, + "step": 5090 + }, + { + "epoch": 1.8799852289512555, + "grad_norm": 0.24728788435459137, + "learning_rate": 7.47136346840744e-05, + "loss": 0.2211, + "step": 5091 + }, + { + "epoch": 1.880354505169867, + "grad_norm": 0.22737756371498108, + "learning_rate": 7.468900110851091e-05, + "loss": 0.1824, + "step": 5092 + }, + { + "epoch": 1.8807237813884785, + "grad_norm": 0.2688554525375366, + "learning_rate": 7.466436753294741e-05, + "loss": 0.219, + "step": 5093 + }, + { + "epoch": 1.8810930576070901, + "grad_norm": 0.2756056785583496, + "learning_rate": 7.463973395738391e-05, + "loss": 0.2542, + "step": 5094 + }, + { + "epoch": 1.8814623338257017, + "grad_norm": 0.2676892578601837, + "learning_rate": 7.461510038182043e-05, + "loss": 0.224, + "step": 5095 + }, + { + "epoch": 1.881831610044313, + "grad_norm": 0.25254231691360474, + "learning_rate": 7.459046680625693e-05, + "loss": 0.2002, + "step": 5096 + }, + { + "epoch": 1.8822008862629247, + "grad_norm": 0.2552568316459656, + "learning_rate": 7.456583323069344e-05, + "loss": 0.2111, + "step": 5097 + }, + { + "epoch": 1.882570162481536, + "grad_norm": 0.4236763119697571, + "learning_rate": 7.454119965512994e-05, + "loss": 0.1806, + "step": 5098 + }, + { + "epoch": 1.8829394387001477, + "grad_norm": 0.31727367639541626, + "learning_rate": 7.451656607956646e-05, + "loss": 0.2609, + "step": 5099 + }, + { + "epoch": 1.8833087149187593, + "grad_norm": 0.21042287349700928, + "learning_rate": 7.449193250400296e-05, + "loss": 0.1714, + "step": 5100 + }, + { + "epoch": 1.8833087149187593, + "eval_loss": 8.615030288696289, + "eval_runtime": 6.9201, + "eval_samples_per_second": 7.225, + "eval_steps_per_second": 1.012, + "step": 5100 + }, + { + "epoch": 1.8836779911373709, + "grad_norm": 0.2589627802371979, + "learning_rate": 7.446729892843946e-05, + "loss": 0.2011, + "step": 5101 + }, + { + "epoch": 1.8840472673559823, + "grad_norm": 0.24489286541938782, + "learning_rate": 7.444266535287597e-05, + "loss": 0.2214, + "step": 5102 + }, + { + "epoch": 1.8844165435745936, + "grad_norm": 0.2737046480178833, + "learning_rate": 7.441803177731248e-05, + "loss": 0.2098, + "step": 5103 + }, + { + "epoch": 1.8847858197932053, + "grad_norm": 0.2817241847515106, + "learning_rate": 7.439339820174899e-05, + "loss": 0.2054, + "step": 5104 + }, + { + "epoch": 1.8851550960118169, + "grad_norm": 0.267406702041626, + "learning_rate": 7.436876462618549e-05, + "loss": 0.2029, + "step": 5105 + }, + { + "epoch": 1.8855243722304285, + "grad_norm": 0.26581209897994995, + "learning_rate": 7.4344131050622e-05, + "loss": 0.2221, + "step": 5106 + }, + { + "epoch": 1.8858936484490398, + "grad_norm": 0.23471567034721375, + "learning_rate": 7.431949747505851e-05, + "loss": 0.1833, + "step": 5107 + }, + { + "epoch": 1.8862629246676514, + "grad_norm": 0.25246110558509827, + "learning_rate": 7.429486389949502e-05, + "loss": 0.1774, + "step": 5108 + }, + { + "epoch": 1.8866322008862628, + "grad_norm": 0.2882575988769531, + "learning_rate": 7.427023032393152e-05, + "loss": 0.2419, + "step": 5109 + }, + { + "epoch": 1.8870014771048744, + "grad_norm": 0.26771315932273865, + "learning_rate": 7.424559674836802e-05, + "loss": 0.2422, + "step": 5110 + }, + { + "epoch": 1.887370753323486, + "grad_norm": 0.2439570277929306, + "learning_rate": 7.422096317280454e-05, + "loss": 0.2042, + "step": 5111 + }, + { + "epoch": 1.8877400295420976, + "grad_norm": 0.255900502204895, + "learning_rate": 7.419632959724104e-05, + "loss": 0.2186, + "step": 5112 + }, + { + "epoch": 1.888109305760709, + "grad_norm": 0.3075030446052551, + "learning_rate": 7.417169602167755e-05, + "loss": 0.2696, + "step": 5113 + }, + { + "epoch": 1.8884785819793204, + "grad_norm": 0.31678903102874756, + "learning_rate": 7.414706244611406e-05, + "loss": 0.1841, + "step": 5114 + }, + { + "epoch": 1.888847858197932, + "grad_norm": 0.2935866415500641, + "learning_rate": 7.412242887055057e-05, + "loss": 0.2421, + "step": 5115 + }, + { + "epoch": 1.8892171344165436, + "grad_norm": 0.24934333562850952, + "learning_rate": 7.409779529498707e-05, + "loss": 0.2091, + "step": 5116 + }, + { + "epoch": 1.8895864106351552, + "grad_norm": 0.28294795751571655, + "learning_rate": 7.407316171942357e-05, + "loss": 0.2107, + "step": 5117 + }, + { + "epoch": 1.8899556868537666, + "grad_norm": 0.23547568917274475, + "learning_rate": 7.404852814386009e-05, + "loss": 0.1731, + "step": 5118 + }, + { + "epoch": 1.8903249630723782, + "grad_norm": 0.26893723011016846, + "learning_rate": 7.402389456829659e-05, + "loss": 0.2197, + "step": 5119 + }, + { + "epoch": 1.8906942392909896, + "grad_norm": 0.2891537547111511, + "learning_rate": 7.39992609927331e-05, + "loss": 0.2562, + "step": 5120 + }, + { + "epoch": 1.8910635155096012, + "grad_norm": 0.23823553323745728, + "learning_rate": 7.39746274171696e-05, + "loss": 0.1801, + "step": 5121 + }, + { + "epoch": 1.8914327917282128, + "grad_norm": 0.23695747554302216, + "learning_rate": 7.394999384160612e-05, + "loss": 0.1884, + "step": 5122 + }, + { + "epoch": 1.8918020679468244, + "grad_norm": 0.27938270568847656, + "learning_rate": 7.392536026604262e-05, + "loss": 0.238, + "step": 5123 + }, + { + "epoch": 1.8921713441654358, + "grad_norm": 0.3660219609737396, + "learning_rate": 7.390072669047913e-05, + "loss": 0.2831, + "step": 5124 + }, + { + "epoch": 1.8925406203840471, + "grad_norm": 0.2791675329208374, + "learning_rate": 7.387609311491564e-05, + "loss": 0.1951, + "step": 5125 + }, + { + "epoch": 1.8929098966026587, + "grad_norm": 0.2358480840921402, + "learning_rate": 7.385145953935214e-05, + "loss": 0.1877, + "step": 5126 + }, + { + "epoch": 1.8932791728212703, + "grad_norm": 0.2654988467693329, + "learning_rate": 7.382682596378865e-05, + "loss": 0.258, + "step": 5127 + }, + { + "epoch": 1.893648449039882, + "grad_norm": 0.2392893135547638, + "learning_rate": 7.380219238822515e-05, + "loss": 0.1952, + "step": 5128 + }, + { + "epoch": 1.8940177252584933, + "grad_norm": 0.2693444490432739, + "learning_rate": 7.377755881266167e-05, + "loss": 0.216, + "step": 5129 + }, + { + "epoch": 1.894387001477105, + "grad_norm": 0.2769111394882202, + "learning_rate": 7.375292523709817e-05, + "loss": 0.1837, + "step": 5130 + }, + { + "epoch": 1.8947562776957163, + "grad_norm": 0.25017088651657104, + "learning_rate": 7.372829166153468e-05, + "loss": 0.2164, + "step": 5131 + }, + { + "epoch": 1.895125553914328, + "grad_norm": 0.27317383885383606, + "learning_rate": 7.370365808597118e-05, + "loss": 0.2244, + "step": 5132 + }, + { + "epoch": 1.8954948301329395, + "grad_norm": 0.3125111162662506, + "learning_rate": 7.367902451040768e-05, + "loss": 0.2032, + "step": 5133 + }, + { + "epoch": 1.8958641063515511, + "grad_norm": 0.27704447507858276, + "learning_rate": 7.36543909348442e-05, + "loss": 0.2494, + "step": 5134 + }, + { + "epoch": 1.8962333825701625, + "grad_norm": 0.2758181691169739, + "learning_rate": 7.36297573592807e-05, + "loss": 0.1946, + "step": 5135 + }, + { + "epoch": 1.8966026587887739, + "grad_norm": 0.24134181439876556, + "learning_rate": 7.360512378371721e-05, + "loss": 0.2156, + "step": 5136 + }, + { + "epoch": 1.8969719350073855, + "grad_norm": 0.24057316780090332, + "learning_rate": 7.358049020815372e-05, + "loss": 0.216, + "step": 5137 + }, + { + "epoch": 1.897341211225997, + "grad_norm": 0.2869103252887726, + "learning_rate": 7.355585663259023e-05, + "loss": 0.2222, + "step": 5138 + }, + { + "epoch": 1.8977104874446087, + "grad_norm": 0.2887077033519745, + "learning_rate": 7.353122305702673e-05, + "loss": 0.2335, + "step": 5139 + }, + { + "epoch": 1.89807976366322, + "grad_norm": 0.23099581897258759, + "learning_rate": 7.350658948146323e-05, + "loss": 0.1801, + "step": 5140 + }, + { + "epoch": 1.8984490398818314, + "grad_norm": 0.2678428888320923, + "learning_rate": 7.348195590589975e-05, + "loss": 0.2274, + "step": 5141 + }, + { + "epoch": 1.898818316100443, + "grad_norm": 0.25359606742858887, + "learning_rate": 7.345732233033625e-05, + "loss": 0.2007, + "step": 5142 + }, + { + "epoch": 1.8991875923190547, + "grad_norm": 0.2669982612133026, + "learning_rate": 7.343268875477276e-05, + "loss": 0.2204, + "step": 5143 + }, + { + "epoch": 1.8995568685376663, + "grad_norm": 0.3419477343559265, + "learning_rate": 7.340805517920926e-05, + "loss": 0.288, + "step": 5144 + }, + { + "epoch": 1.8999261447562779, + "grad_norm": 0.32460084557533264, + "learning_rate": 7.338342160364578e-05, + "loss": 0.2618, + "step": 5145 + }, + { + "epoch": 1.9002954209748892, + "grad_norm": 0.2309187799692154, + "learning_rate": 7.335878802808228e-05, + "loss": 0.1789, + "step": 5146 + }, + { + "epoch": 1.9006646971935006, + "grad_norm": 0.21184204518795013, + "learning_rate": 7.33341544525188e-05, + "loss": 0.1917, + "step": 5147 + }, + { + "epoch": 1.9010339734121122, + "grad_norm": 0.28004971146583557, + "learning_rate": 7.33095208769553e-05, + "loss": 0.2328, + "step": 5148 + }, + { + "epoch": 1.9014032496307238, + "grad_norm": 0.30203181505203247, + "learning_rate": 7.32848873013918e-05, + "loss": 0.2189, + "step": 5149 + }, + { + "epoch": 1.9017725258493354, + "grad_norm": 0.3539334237575531, + "learning_rate": 7.326025372582831e-05, + "loss": 0.2167, + "step": 5150 + }, + { + "epoch": 1.9017725258493354, + "eval_loss": 8.734087944030762, + "eval_runtime": 6.9054, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 1.014, + "step": 5150 + }, + { + "epoch": 1.9021418020679468, + "grad_norm": 0.25480496883392334, + "learning_rate": 7.323562015026481e-05, + "loss": 0.2154, + "step": 5151 + }, + { + "epoch": 1.9025110782865582, + "grad_norm": 0.5700995326042175, + "learning_rate": 7.321098657470133e-05, + "loss": 0.2734, + "step": 5152 + }, + { + "epoch": 1.9028803545051698, + "grad_norm": 0.2859126031398773, + "learning_rate": 7.318635299913783e-05, + "loss": 0.2391, + "step": 5153 + }, + { + "epoch": 1.9032496307237814, + "grad_norm": 0.25656676292419434, + "learning_rate": 7.316171942357434e-05, + "loss": 0.2065, + "step": 5154 + }, + { + "epoch": 1.903618906942393, + "grad_norm": 0.2831866443157196, + "learning_rate": 7.313708584801084e-05, + "loss": 0.2417, + "step": 5155 + }, + { + "epoch": 1.9039881831610044, + "grad_norm": 0.2174830287694931, + "learning_rate": 7.311245227244734e-05, + "loss": 0.1925, + "step": 5156 + }, + { + "epoch": 1.904357459379616, + "grad_norm": 0.2546316087245941, + "learning_rate": 7.308781869688386e-05, + "loss": 0.1728, + "step": 5157 + }, + { + "epoch": 1.9047267355982274, + "grad_norm": 0.19988849759101868, + "learning_rate": 7.306318512132036e-05, + "loss": 0.1754, + "step": 5158 + }, + { + "epoch": 1.905096011816839, + "grad_norm": 0.2537265419960022, + "learning_rate": 7.303855154575688e-05, + "loss": 0.1971, + "step": 5159 + }, + { + "epoch": 1.9054652880354506, + "grad_norm": 0.29449230432510376, + "learning_rate": 7.301391797019338e-05, + "loss": 0.2293, + "step": 5160 + }, + { + "epoch": 1.9058345642540622, + "grad_norm": 0.2413308471441269, + "learning_rate": 7.298928439462989e-05, + "loss": 0.2013, + "step": 5161 + }, + { + "epoch": 1.9062038404726735, + "grad_norm": 0.2884998023509979, + "learning_rate": 7.296465081906639e-05, + "loss": 0.2242, + "step": 5162 + }, + { + "epoch": 1.906573116691285, + "grad_norm": 0.22200524806976318, + "learning_rate": 7.29400172435029e-05, + "loss": 0.1982, + "step": 5163 + }, + { + "epoch": 1.9069423929098965, + "grad_norm": 0.3152354061603546, + "learning_rate": 7.291538366793941e-05, + "loss": 0.2854, + "step": 5164 + }, + { + "epoch": 1.9073116691285081, + "grad_norm": 0.246052548289299, + "learning_rate": 7.289075009237591e-05, + "loss": 0.1698, + "step": 5165 + }, + { + "epoch": 1.9076809453471197, + "grad_norm": 0.2668823003768921, + "learning_rate": 7.286611651681242e-05, + "loss": 0.2077, + "step": 5166 + }, + { + "epoch": 1.9080502215657311, + "grad_norm": 0.287730872631073, + "learning_rate": 7.284148294124892e-05, + "loss": 0.1822, + "step": 5167 + }, + { + "epoch": 1.9084194977843427, + "grad_norm": 0.30587324500083923, + "learning_rate": 7.281684936568544e-05, + "loss": 0.2425, + "step": 5168 + }, + { + "epoch": 1.908788774002954, + "grad_norm": 0.2801584005355835, + "learning_rate": 7.279221579012194e-05, + "loss": 0.1967, + "step": 5169 + }, + { + "epoch": 1.9091580502215657, + "grad_norm": 0.2854374945163727, + "learning_rate": 7.276758221455846e-05, + "loss": 0.2131, + "step": 5170 + }, + { + "epoch": 1.9095273264401773, + "grad_norm": 0.26117175817489624, + "learning_rate": 7.274294863899496e-05, + "loss": 0.1686, + "step": 5171 + }, + { + "epoch": 1.909896602658789, + "grad_norm": 0.1922745704650879, + "learning_rate": 7.271831506343146e-05, + "loss": 0.1589, + "step": 5172 + }, + { + "epoch": 1.9102658788774003, + "grad_norm": 0.2977977693080902, + "learning_rate": 7.269368148786797e-05, + "loss": 0.2018, + "step": 5173 + }, + { + "epoch": 1.9106351550960117, + "grad_norm": 0.35264915227890015, + "learning_rate": 7.266904791230447e-05, + "loss": 0.268, + "step": 5174 + }, + { + "epoch": 1.9110044313146233, + "grad_norm": 0.2662093937397003, + "learning_rate": 7.264441433674099e-05, + "loss": 0.1902, + "step": 5175 + }, + { + "epoch": 1.9113737075332349, + "grad_norm": 0.29595673084259033, + "learning_rate": 7.261978076117749e-05, + "loss": 0.2585, + "step": 5176 + }, + { + "epoch": 1.9117429837518465, + "grad_norm": 0.2660640478134155, + "learning_rate": 7.2595147185614e-05, + "loss": 0.2171, + "step": 5177 + }, + { + "epoch": 1.9121122599704579, + "grad_norm": 0.3448339104652405, + "learning_rate": 7.25705136100505e-05, + "loss": 0.216, + "step": 5178 + }, + { + "epoch": 1.9124815361890695, + "grad_norm": 0.3071405291557312, + "learning_rate": 7.254588003448702e-05, + "loss": 0.2431, + "step": 5179 + }, + { + "epoch": 1.9128508124076808, + "grad_norm": 0.24723811447620392, + "learning_rate": 7.252124645892352e-05, + "loss": 0.2129, + "step": 5180 + }, + { + "epoch": 1.9132200886262924, + "grad_norm": 0.24988505244255066, + "learning_rate": 7.249661288336002e-05, + "loss": 0.1732, + "step": 5181 + }, + { + "epoch": 1.913589364844904, + "grad_norm": 0.2117289900779724, + "learning_rate": 7.247197930779654e-05, + "loss": 0.1804, + "step": 5182 + }, + { + "epoch": 1.9139586410635157, + "grad_norm": 0.23544207215309143, + "learning_rate": 7.244734573223304e-05, + "loss": 0.1913, + "step": 5183 + }, + { + "epoch": 1.914327917282127, + "grad_norm": 0.219760000705719, + "learning_rate": 7.242271215666955e-05, + "loss": 0.1889, + "step": 5184 + }, + { + "epoch": 1.9146971935007384, + "grad_norm": 0.2315712422132492, + "learning_rate": 7.239807858110605e-05, + "loss": 0.1927, + "step": 5185 + }, + { + "epoch": 1.91506646971935, + "grad_norm": 0.27324286103248596, + "learning_rate": 7.237344500554257e-05, + "loss": 0.2145, + "step": 5186 + }, + { + "epoch": 1.9154357459379616, + "grad_norm": 0.2549130320549011, + "learning_rate": 7.234881142997907e-05, + "loss": 0.1971, + "step": 5187 + }, + { + "epoch": 1.9158050221565732, + "grad_norm": 0.2791629433631897, + "learning_rate": 7.232417785441557e-05, + "loss": 0.1992, + "step": 5188 + }, + { + "epoch": 1.9161742983751846, + "grad_norm": 0.28310275077819824, + "learning_rate": 7.229954427885208e-05, + "loss": 0.2539, + "step": 5189 + }, + { + "epoch": 1.9165435745937962, + "grad_norm": 0.3158780634403229, + "learning_rate": 7.227491070328859e-05, + "loss": 0.2392, + "step": 5190 + }, + { + "epoch": 1.9169128508124076, + "grad_norm": 0.22292019426822662, + "learning_rate": 7.22502771277251e-05, + "loss": 0.1772, + "step": 5191 + }, + { + "epoch": 1.9172821270310192, + "grad_norm": 0.2001081258058548, + "learning_rate": 7.22256435521616e-05, + "loss": 0.1664, + "step": 5192 + }, + { + "epoch": 1.9176514032496308, + "grad_norm": 0.24532493948936462, + "learning_rate": 7.220100997659812e-05, + "loss": 0.2084, + "step": 5193 + }, + { + "epoch": 1.9180206794682424, + "grad_norm": 0.2594752013683319, + "learning_rate": 7.217637640103462e-05, + "loss": 0.195, + "step": 5194 + }, + { + "epoch": 1.9183899556868538, + "grad_norm": 0.23948968946933746, + "learning_rate": 7.215174282547113e-05, + "loss": 0.1747, + "step": 5195 + }, + { + "epoch": 1.9187592319054652, + "grad_norm": 0.2977029085159302, + "learning_rate": 7.212710924990763e-05, + "loss": 0.2241, + "step": 5196 + }, + { + "epoch": 1.9191285081240768, + "grad_norm": 0.2505946457386017, + "learning_rate": 7.210247567434413e-05, + "loss": 0.1855, + "step": 5197 + }, + { + "epoch": 1.9194977843426884, + "grad_norm": 0.2547343075275421, + "learning_rate": 7.207784209878065e-05, + "loss": 0.1748, + "step": 5198 + }, + { + "epoch": 1.9198670605613, + "grad_norm": 0.23434215784072876, + "learning_rate": 7.205320852321715e-05, + "loss": 0.1877, + "step": 5199 + }, + { + "epoch": 1.9202363367799113, + "grad_norm": 0.23530955612659454, + "learning_rate": 7.202857494765366e-05, + "loss": 0.1779, + "step": 5200 + }, + { + "epoch": 1.9202363367799113, + "eval_loss": 8.783550262451172, + "eval_runtime": 7.0206, + "eval_samples_per_second": 7.122, + "eval_steps_per_second": 0.997, + "step": 5200 + }, + { + "epoch": 1.920605612998523, + "grad_norm": 0.2563255727291107, + "learning_rate": 7.200394137209016e-05, + "loss": 0.1811, + "step": 5201 + }, + { + "epoch": 1.9209748892171343, + "grad_norm": 0.30660751461982727, + "learning_rate": 7.197930779652668e-05, + "loss": 0.2505, + "step": 5202 + }, + { + "epoch": 1.921344165435746, + "grad_norm": 0.35703355073928833, + "learning_rate": 7.195467422096318e-05, + "loss": 0.2915, + "step": 5203 + }, + { + "epoch": 1.9217134416543575, + "grad_norm": 0.24874936044216156, + "learning_rate": 7.193004064539968e-05, + "loss": 0.1988, + "step": 5204 + }, + { + "epoch": 1.9220827178729691, + "grad_norm": 0.21768781542778015, + "learning_rate": 7.19054070698362e-05, + "loss": 0.1897, + "step": 5205 + }, + { + "epoch": 1.9224519940915805, + "grad_norm": 0.32390838861465454, + "learning_rate": 7.18807734942727e-05, + "loss": 0.2856, + "step": 5206 + }, + { + "epoch": 1.922821270310192, + "grad_norm": 0.2662147879600525, + "learning_rate": 7.185613991870921e-05, + "loss": 0.2107, + "step": 5207 + }, + { + "epoch": 1.9231905465288035, + "grad_norm": 0.2214909940958023, + "learning_rate": 7.183150634314571e-05, + "loss": 0.181, + "step": 5208 + }, + { + "epoch": 1.923559822747415, + "grad_norm": 0.22049537301063538, + "learning_rate": 7.180687276758223e-05, + "loss": 0.1816, + "step": 5209 + }, + { + "epoch": 1.9239290989660267, + "grad_norm": 0.3045891225337982, + "learning_rate": 7.178223919201873e-05, + "loss": 0.2121, + "step": 5210 + }, + { + "epoch": 1.924298375184638, + "grad_norm": 0.2475506216287613, + "learning_rate": 7.175760561645524e-05, + "loss": 0.1996, + "step": 5211 + }, + { + "epoch": 1.9246676514032495, + "grad_norm": 0.2531642019748688, + "learning_rate": 7.173297204089174e-05, + "loss": 0.1859, + "step": 5212 + }, + { + "epoch": 1.925036927621861, + "grad_norm": 0.23886911571025848, + "learning_rate": 7.170833846532825e-05, + "loss": 0.1832, + "step": 5213 + }, + { + "epoch": 1.9254062038404727, + "grad_norm": 0.2570420503616333, + "learning_rate": 7.168370488976476e-05, + "loss": 0.2202, + "step": 5214 + }, + { + "epoch": 1.9257754800590843, + "grad_norm": 0.269432932138443, + "learning_rate": 7.165907131420126e-05, + "loss": 0.2078, + "step": 5215 + }, + { + "epoch": 1.9261447562776959, + "grad_norm": 0.23512187600135803, + "learning_rate": 7.163443773863778e-05, + "loss": 0.1827, + "step": 5216 + }, + { + "epoch": 1.9265140324963073, + "grad_norm": 0.24199458956718445, + "learning_rate": 7.160980416307428e-05, + "loss": 0.1964, + "step": 5217 + }, + { + "epoch": 1.9268833087149186, + "grad_norm": 0.2482496201992035, + "learning_rate": 7.158517058751079e-05, + "loss": 0.1918, + "step": 5218 + }, + { + "epoch": 1.9272525849335302, + "grad_norm": 0.22271081805229187, + "learning_rate": 7.156053701194729e-05, + "loss": 0.1731, + "step": 5219 + }, + { + "epoch": 1.9276218611521418, + "grad_norm": 0.2919629216194153, + "learning_rate": 7.15359034363838e-05, + "loss": 0.2069, + "step": 5220 + }, + { + "epoch": 1.9279911373707534, + "grad_norm": 0.3609210252761841, + "learning_rate": 7.151126986082031e-05, + "loss": 0.2052, + "step": 5221 + }, + { + "epoch": 1.9283604135893648, + "grad_norm": 0.2660120129585266, + "learning_rate": 7.148663628525681e-05, + "loss": 0.2619, + "step": 5222 + }, + { + "epoch": 1.9287296898079762, + "grad_norm": 0.2840423285961151, + "learning_rate": 7.146200270969332e-05, + "loss": 0.2152, + "step": 5223 + }, + { + "epoch": 1.9290989660265878, + "grad_norm": 0.2630270719528198, + "learning_rate": 7.143736913412983e-05, + "loss": 0.2171, + "step": 5224 + }, + { + "epoch": 1.9294682422451994, + "grad_norm": 0.2664338946342468, + "learning_rate": 7.141273555856633e-05, + "loss": 0.2113, + "step": 5225 + }, + { + "epoch": 1.929837518463811, + "grad_norm": 0.22360597550868988, + "learning_rate": 7.138810198300283e-05, + "loss": 0.1856, + "step": 5226 + }, + { + "epoch": 1.9302067946824224, + "grad_norm": 0.2448599487543106, + "learning_rate": 7.136346840743934e-05, + "loss": 0.1803, + "step": 5227 + }, + { + "epoch": 1.930576070901034, + "grad_norm": 0.24953798949718475, + "learning_rate": 7.133883483187584e-05, + "loss": 0.2066, + "step": 5228 + }, + { + "epoch": 1.9309453471196454, + "grad_norm": 0.3000904321670532, + "learning_rate": 7.131420125631236e-05, + "loss": 0.2276, + "step": 5229 + }, + { + "epoch": 1.931314623338257, + "grad_norm": 0.2455965280532837, + "learning_rate": 7.128956768074886e-05, + "loss": 0.1979, + "step": 5230 + }, + { + "epoch": 1.9316838995568686, + "grad_norm": 0.28488701581954956, + "learning_rate": 7.126493410518536e-05, + "loss": 0.2154, + "step": 5231 + }, + { + "epoch": 1.9320531757754802, + "grad_norm": 0.25120359659194946, + "learning_rate": 7.124030052962187e-05, + "loss": 0.1963, + "step": 5232 + }, + { + "epoch": 1.9324224519940916, + "grad_norm": 0.2701643407344818, + "learning_rate": 7.121566695405838e-05, + "loss": 0.2307, + "step": 5233 + }, + { + "epoch": 1.932791728212703, + "grad_norm": 0.2560136616230011, + "learning_rate": 7.119103337849489e-05, + "loss": 0.2306, + "step": 5234 + }, + { + "epoch": 1.9331610044313146, + "grad_norm": 0.28809478878974915, + "learning_rate": 7.116639980293139e-05, + "loss": 0.2152, + "step": 5235 + }, + { + "epoch": 1.9335302806499262, + "grad_norm": 0.2696695923805237, + "learning_rate": 7.11417662273679e-05, + "loss": 0.2381, + "step": 5236 + }, + { + "epoch": 1.9338995568685378, + "grad_norm": 0.3057447373867035, + "learning_rate": 7.111713265180441e-05, + "loss": 0.2468, + "step": 5237 + }, + { + "epoch": 1.9342688330871491, + "grad_norm": 0.40276628732681274, + "learning_rate": 7.109249907624091e-05, + "loss": 0.2443, + "step": 5238 + }, + { + "epoch": 1.9346381093057607, + "grad_norm": 0.33375439047813416, + "learning_rate": 7.106786550067742e-05, + "loss": 0.2559, + "step": 5239 + }, + { + "epoch": 1.9350073855243721, + "grad_norm": 0.27155959606170654, + "learning_rate": 7.104323192511392e-05, + "loss": 0.2337, + "step": 5240 + }, + { + "epoch": 1.9353766617429837, + "grad_norm": 0.28857383131980896, + "learning_rate": 7.101859834955044e-05, + "loss": 0.2332, + "step": 5241 + }, + { + "epoch": 1.9357459379615953, + "grad_norm": 0.2066834717988968, + "learning_rate": 7.099396477398694e-05, + "loss": 0.1602, + "step": 5242 + }, + { + "epoch": 1.936115214180207, + "grad_norm": 0.2341184914112091, + "learning_rate": 7.096933119842345e-05, + "loss": 0.1715, + "step": 5243 + }, + { + "epoch": 1.9364844903988183, + "grad_norm": 0.2721196413040161, + "learning_rate": 7.094469762285996e-05, + "loss": 0.2087, + "step": 5244 + }, + { + "epoch": 1.9368537666174297, + "grad_norm": 0.28074800968170166, + "learning_rate": 7.092006404729647e-05, + "loss": 0.2014, + "step": 5245 + }, + { + "epoch": 1.9372230428360413, + "grad_norm": 0.2780204117298126, + "learning_rate": 7.089543047173297e-05, + "loss": 0.1949, + "step": 5246 + }, + { + "epoch": 1.937592319054653, + "grad_norm": 0.2857365012168884, + "learning_rate": 7.087079689616947e-05, + "loss": 0.2148, + "step": 5247 + }, + { + "epoch": 1.9379615952732645, + "grad_norm": 0.24216902256011963, + "learning_rate": 7.084616332060599e-05, + "loss": 0.1749, + "step": 5248 + }, + { + "epoch": 1.9383308714918759, + "grad_norm": 0.2718616724014282, + "learning_rate": 7.082152974504249e-05, + "loss": 0.2073, + "step": 5249 + }, + { + "epoch": 1.9387001477104875, + "grad_norm": 0.2629314661026001, + "learning_rate": 7.0796896169479e-05, + "loss": 0.2092, + "step": 5250 + }, + { + "epoch": 1.9387001477104875, + "eval_loss": 8.669917106628418, + "eval_runtime": 6.9113, + "eval_samples_per_second": 7.234, + "eval_steps_per_second": 1.013, + "step": 5250 + }, + { + "epoch": 1.9390694239290989, + "grad_norm": 0.23412397503852844, + "learning_rate": 7.07722625939155e-05, + "loss": 0.1978, + "step": 5251 + }, + { + "epoch": 1.9394387001477105, + "grad_norm": 0.263004869222641, + "learning_rate": 7.074762901835202e-05, + "loss": 0.2031, + "step": 5252 + }, + { + "epoch": 1.939807976366322, + "grad_norm": 0.2258167713880539, + "learning_rate": 7.072299544278852e-05, + "loss": 0.1859, + "step": 5253 + }, + { + "epoch": 1.9401772525849337, + "grad_norm": 0.26763907074928284, + "learning_rate": 7.069836186722502e-05, + "loss": 0.2325, + "step": 5254 + }, + { + "epoch": 1.940546528803545, + "grad_norm": 0.22789397835731506, + "learning_rate": 7.067372829166154e-05, + "loss": 0.2081, + "step": 5255 + }, + { + "epoch": 1.9409158050221564, + "grad_norm": 0.3065509796142578, + "learning_rate": 7.064909471609804e-05, + "loss": 0.2147, + "step": 5256 + }, + { + "epoch": 1.941285081240768, + "grad_norm": 0.28669196367263794, + "learning_rate": 7.062446114053455e-05, + "loss": 0.2277, + "step": 5257 + }, + { + "epoch": 1.9416543574593796, + "grad_norm": 0.24266768991947174, + "learning_rate": 7.059982756497105e-05, + "loss": 0.1802, + "step": 5258 + }, + { + "epoch": 1.9420236336779912, + "grad_norm": 0.2694268226623535, + "learning_rate": 7.057519398940757e-05, + "loss": 0.2129, + "step": 5259 + }, + { + "epoch": 1.9423929098966026, + "grad_norm": 0.2952920198440552, + "learning_rate": 7.055056041384407e-05, + "loss": 0.211, + "step": 5260 + }, + { + "epoch": 1.9427621861152142, + "grad_norm": 0.2330179512500763, + "learning_rate": 7.052592683828058e-05, + "loss": 0.185, + "step": 5261 + }, + { + "epoch": 1.9431314623338256, + "grad_norm": 0.25271061062812805, + "learning_rate": 7.050129326271708e-05, + "loss": 0.21, + "step": 5262 + }, + { + "epoch": 1.9435007385524372, + "grad_norm": 0.2830020487308502, + "learning_rate": 7.047665968715358e-05, + "loss": 0.2071, + "step": 5263 + }, + { + "epoch": 1.9438700147710488, + "grad_norm": 0.25679001212120056, + "learning_rate": 7.04520261115901e-05, + "loss": 0.2001, + "step": 5264 + }, + { + "epoch": 1.9442392909896604, + "grad_norm": 0.28757187724113464, + "learning_rate": 7.04273925360266e-05, + "loss": 0.2527, + "step": 5265 + }, + { + "epoch": 1.9446085672082718, + "grad_norm": 0.37781277298927307, + "learning_rate": 7.040275896046312e-05, + "loss": 0.253, + "step": 5266 + }, + { + "epoch": 1.9449778434268832, + "grad_norm": 0.24056147038936615, + "learning_rate": 7.037812538489962e-05, + "loss": 0.1621, + "step": 5267 + }, + { + "epoch": 1.9453471196454948, + "grad_norm": 0.3649381697177887, + "learning_rate": 7.035349180933613e-05, + "loss": 0.2915, + "step": 5268 + }, + { + "epoch": 1.9457163958641064, + "grad_norm": 0.2788947522640228, + "learning_rate": 7.032885823377263e-05, + "loss": 0.2269, + "step": 5269 + }, + { + "epoch": 1.946085672082718, + "grad_norm": 0.26175859570503235, + "learning_rate": 7.030422465820913e-05, + "loss": 0.2197, + "step": 5270 + }, + { + "epoch": 1.9464549483013294, + "grad_norm": 0.6320207118988037, + "learning_rate": 7.027959108264565e-05, + "loss": 0.254, + "step": 5271 + }, + { + "epoch": 1.946824224519941, + "grad_norm": 0.29075735807418823, + "learning_rate": 7.025495750708215e-05, + "loss": 0.2406, + "step": 5272 + }, + { + "epoch": 1.9471935007385524, + "grad_norm": 0.2364053577184677, + "learning_rate": 7.023032393151866e-05, + "loss": 0.2152, + "step": 5273 + }, + { + "epoch": 1.947562776957164, + "grad_norm": 0.26927581429481506, + "learning_rate": 7.020569035595516e-05, + "loss": 0.1935, + "step": 5274 + }, + { + "epoch": 1.9479320531757756, + "grad_norm": 0.28360775113105774, + "learning_rate": 7.018105678039168e-05, + "loss": 0.2062, + "step": 5275 + }, + { + "epoch": 1.9483013293943872, + "grad_norm": 0.24862898886203766, + "learning_rate": 7.015642320482818e-05, + "loss": 0.2247, + "step": 5276 + }, + { + "epoch": 1.9486706056129985, + "grad_norm": 0.23638607561588287, + "learning_rate": 7.01317896292647e-05, + "loss": 0.2396, + "step": 5277 + }, + { + "epoch": 1.94903988183161, + "grad_norm": 0.2960643470287323, + "learning_rate": 7.01071560537012e-05, + "loss": 0.2325, + "step": 5278 + }, + { + "epoch": 1.9494091580502215, + "grad_norm": 0.2552415430545807, + "learning_rate": 7.00825224781377e-05, + "loss": 0.2154, + "step": 5279 + }, + { + "epoch": 1.9497784342688331, + "grad_norm": 0.24241521954536438, + "learning_rate": 7.005788890257421e-05, + "loss": 0.2254, + "step": 5280 + }, + { + "epoch": 1.9501477104874447, + "grad_norm": 0.24517644941806793, + "learning_rate": 7.003325532701071e-05, + "loss": 0.2039, + "step": 5281 + }, + { + "epoch": 1.950516986706056, + "grad_norm": 0.25527405738830566, + "learning_rate": 7.000862175144723e-05, + "loss": 0.1882, + "step": 5282 + }, + { + "epoch": 1.9508862629246675, + "grad_norm": 0.2643362879753113, + "learning_rate": 6.998398817588373e-05, + "loss": 0.2007, + "step": 5283 + }, + { + "epoch": 1.951255539143279, + "grad_norm": 0.27945810556411743, + "learning_rate": 6.995935460032024e-05, + "loss": 0.2589, + "step": 5284 + }, + { + "epoch": 1.9516248153618907, + "grad_norm": 0.25476232171058655, + "learning_rate": 6.993472102475674e-05, + "loss": 0.1947, + "step": 5285 + }, + { + "epoch": 1.9519940915805023, + "grad_norm": 0.27262425422668457, + "learning_rate": 6.991008744919325e-05, + "loss": 0.235, + "step": 5286 + }, + { + "epoch": 1.952363367799114, + "grad_norm": 0.26380208134651184, + "learning_rate": 6.988545387362976e-05, + "loss": 0.1909, + "step": 5287 + }, + { + "epoch": 1.9527326440177253, + "grad_norm": 0.24780826270580292, + "learning_rate": 6.986082029806626e-05, + "loss": 0.1996, + "step": 5288 + }, + { + "epoch": 1.9531019202363367, + "grad_norm": 0.30668652057647705, + "learning_rate": 6.983618672250278e-05, + "loss": 0.2024, + "step": 5289 + }, + { + "epoch": 1.9534711964549483, + "grad_norm": 0.2457285374403, + "learning_rate": 6.981155314693928e-05, + "loss": 0.2182, + "step": 5290 + }, + { + "epoch": 1.9538404726735599, + "grad_norm": 0.2678014039993286, + "learning_rate": 6.978691957137579e-05, + "loss": 0.2194, + "step": 5291 + }, + { + "epoch": 1.9542097488921715, + "grad_norm": 0.37533849477767944, + "learning_rate": 6.976228599581229e-05, + "loss": 0.261, + "step": 5292 + }, + { + "epoch": 1.9545790251107829, + "grad_norm": 0.34228652715682983, + "learning_rate": 6.97376524202488e-05, + "loss": 0.2561, + "step": 5293 + }, + { + "epoch": 1.9549483013293942, + "grad_norm": 0.2933686673641205, + "learning_rate": 6.971301884468531e-05, + "loss": 0.217, + "step": 5294 + }, + { + "epoch": 1.9553175775480058, + "grad_norm": 0.2976348400115967, + "learning_rate": 6.968838526912181e-05, + "loss": 0.1891, + "step": 5295 + }, + { + "epoch": 1.9556868537666174, + "grad_norm": 0.18855692446231842, + "learning_rate": 6.966375169355832e-05, + "loss": 0.1696, + "step": 5296 + }, + { + "epoch": 1.956056129985229, + "grad_norm": 0.27885833382606506, + "learning_rate": 6.963911811799482e-05, + "loss": 0.1809, + "step": 5297 + }, + { + "epoch": 1.9564254062038404, + "grad_norm": 0.27493786811828613, + "learning_rate": 6.961448454243134e-05, + "loss": 0.2063, + "step": 5298 + }, + { + "epoch": 1.956794682422452, + "grad_norm": 0.2346276044845581, + "learning_rate": 6.958985096686784e-05, + "loss": 0.1871, + "step": 5299 + }, + { + "epoch": 1.9571639586410634, + "grad_norm": 0.2509481906890869, + "learning_rate": 6.956521739130436e-05, + "loss": 0.1757, + "step": 5300 + }, + { + "epoch": 1.9571639586410634, + "eval_loss": 8.83945083618164, + "eval_runtime": 6.9064, + "eval_samples_per_second": 7.24, + "eval_steps_per_second": 1.014, + "step": 5300 + }, + { + "epoch": 1.957533234859675, + "grad_norm": 0.2483249008655548, + "learning_rate": 6.954058381574086e-05, + "loss": 0.2068, + "step": 5301 + }, + { + "epoch": 1.9579025110782866, + "grad_norm": 0.24711182713508606, + "learning_rate": 6.951595024017736e-05, + "loss": 0.1872, + "step": 5302 + }, + { + "epoch": 1.9582717872968982, + "grad_norm": 0.2375470995903015, + "learning_rate": 6.949131666461387e-05, + "loss": 0.1739, + "step": 5303 + }, + { + "epoch": 1.9586410635155096, + "grad_norm": 0.29090622067451477, + "learning_rate": 6.946668308905037e-05, + "loss": 0.23, + "step": 5304 + }, + { + "epoch": 1.959010339734121, + "grad_norm": 0.28284651041030884, + "learning_rate": 6.944204951348689e-05, + "loss": 0.219, + "step": 5305 + }, + { + "epoch": 1.9593796159527326, + "grad_norm": 0.24656659364700317, + "learning_rate": 6.941741593792339e-05, + "loss": 0.1853, + "step": 5306 + }, + { + "epoch": 1.9597488921713442, + "grad_norm": 0.26115861535072327, + "learning_rate": 6.93927823623599e-05, + "loss": 0.1766, + "step": 5307 + }, + { + "epoch": 1.9601181683899558, + "grad_norm": 0.2802481949329376, + "learning_rate": 6.93681487867964e-05, + "loss": 0.2559, + "step": 5308 + }, + { + "epoch": 1.9604874446085672, + "grad_norm": 0.2408927083015442, + "learning_rate": 6.93435152112329e-05, + "loss": 0.2084, + "step": 5309 + }, + { + "epoch": 1.9608567208271788, + "grad_norm": 0.2518807649612427, + "learning_rate": 6.931888163566942e-05, + "loss": 0.2076, + "step": 5310 + }, + { + "epoch": 1.9612259970457901, + "grad_norm": 0.26369550824165344, + "learning_rate": 6.929424806010592e-05, + "loss": 0.1891, + "step": 5311 + }, + { + "epoch": 1.9615952732644018, + "grad_norm": 0.2569386959075928, + "learning_rate": 6.926961448454244e-05, + "loss": 0.2264, + "step": 5312 + }, + { + "epoch": 1.9619645494830134, + "grad_norm": 0.27959755063056946, + "learning_rate": 6.924498090897894e-05, + "loss": 0.252, + "step": 5313 + }, + { + "epoch": 1.962333825701625, + "grad_norm": 0.27952420711517334, + "learning_rate": 6.922034733341545e-05, + "loss": 0.2355, + "step": 5314 + }, + { + "epoch": 1.9627031019202363, + "grad_norm": 0.2669624984264374, + "learning_rate": 6.919571375785195e-05, + "loss": 0.2219, + "step": 5315 + }, + { + "epoch": 1.9630723781388477, + "grad_norm": 0.25061479210853577, + "learning_rate": 6.917108018228847e-05, + "loss": 0.2148, + "step": 5316 + }, + { + "epoch": 1.9634416543574593, + "grad_norm": 0.23279403150081635, + "learning_rate": 6.914644660672497e-05, + "loss": 0.1925, + "step": 5317 + }, + { + "epoch": 1.963810930576071, + "grad_norm": 0.3832642138004303, + "learning_rate": 6.912181303116147e-05, + "loss": 0.2735, + "step": 5318 + }, + { + "epoch": 1.9641802067946825, + "grad_norm": 0.28592193126678467, + "learning_rate": 6.909717945559798e-05, + "loss": 0.2298, + "step": 5319 + }, + { + "epoch": 1.964549483013294, + "grad_norm": 0.27921274304389954, + "learning_rate": 6.907254588003449e-05, + "loss": 0.1924, + "step": 5320 + }, + { + "epoch": 1.9649187592319055, + "grad_norm": 0.2360224723815918, + "learning_rate": 6.9047912304471e-05, + "loss": 0.1947, + "step": 5321 + }, + { + "epoch": 1.965288035450517, + "grad_norm": 0.2677941918373108, + "learning_rate": 6.90232787289075e-05, + "loss": 0.2369, + "step": 5322 + }, + { + "epoch": 1.9656573116691285, + "grad_norm": 0.24764162302017212, + "learning_rate": 6.899864515334402e-05, + "loss": 0.1769, + "step": 5323 + }, + { + "epoch": 1.96602658788774, + "grad_norm": 0.2411895990371704, + "learning_rate": 6.897401157778052e-05, + "loss": 0.172, + "step": 5324 + }, + { + "epoch": 1.9663958641063517, + "grad_norm": 0.25845447182655334, + "learning_rate": 6.894937800221702e-05, + "loss": 0.1848, + "step": 5325 + }, + { + "epoch": 1.966765140324963, + "grad_norm": 0.22850818932056427, + "learning_rate": 6.892474442665353e-05, + "loss": 0.175, + "step": 5326 + }, + { + "epoch": 1.9671344165435745, + "grad_norm": 0.2564733326435089, + "learning_rate": 6.890011085109003e-05, + "loss": 0.2258, + "step": 5327 + }, + { + "epoch": 1.967503692762186, + "grad_norm": 0.250072181224823, + "learning_rate": 6.887547727552655e-05, + "loss": 0.1953, + "step": 5328 + }, + { + "epoch": 1.9678729689807977, + "grad_norm": 0.26034319400787354, + "learning_rate": 6.885084369996305e-05, + "loss": 0.2005, + "step": 5329 + }, + { + "epoch": 1.9682422451994093, + "grad_norm": 0.23670229315757751, + "learning_rate": 6.882621012439956e-05, + "loss": 0.191, + "step": 5330 + }, + { + "epoch": 1.9686115214180206, + "grad_norm": 0.2298651933670044, + "learning_rate": 6.880157654883607e-05, + "loss": 0.2058, + "step": 5331 + }, + { + "epoch": 1.9689807976366323, + "grad_norm": 0.2397143691778183, + "learning_rate": 6.877694297327258e-05, + "loss": 0.1965, + "step": 5332 + }, + { + "epoch": 1.9693500738552436, + "grad_norm": 0.3013535439968109, + "learning_rate": 6.875230939770908e-05, + "loss": 0.2156, + "step": 5333 + }, + { + "epoch": 1.9697193500738552, + "grad_norm": 0.22425903379917145, + "learning_rate": 6.872767582214558e-05, + "loss": 0.1794, + "step": 5334 + }, + { + "epoch": 1.9700886262924668, + "grad_norm": 0.21018943190574646, + "learning_rate": 6.87030422465821e-05, + "loss": 0.1715, + "step": 5335 + }, + { + "epoch": 1.9704579025110784, + "grad_norm": 0.3065391480922699, + "learning_rate": 6.86784086710186e-05, + "loss": 0.2392, + "step": 5336 + }, + { + "epoch": 1.9708271787296898, + "grad_norm": 0.3289620876312256, + "learning_rate": 6.865377509545511e-05, + "loss": 0.1721, + "step": 5337 + }, + { + "epoch": 1.9711964549483012, + "grad_norm": 0.25325801968574524, + "learning_rate": 6.862914151989161e-05, + "loss": 0.2013, + "step": 5338 + }, + { + "epoch": 1.9715657311669128, + "grad_norm": 0.3083876669406891, + "learning_rate": 6.860450794432813e-05, + "loss": 0.2428, + "step": 5339 + }, + { + "epoch": 1.9719350073855244, + "grad_norm": 0.2598389983177185, + "learning_rate": 6.857987436876463e-05, + "loss": 0.2037, + "step": 5340 + }, + { + "epoch": 1.972304283604136, + "grad_norm": 0.23687897622585297, + "learning_rate": 6.855524079320113e-05, + "loss": 0.1981, + "step": 5341 + }, + { + "epoch": 1.9726735598227474, + "grad_norm": 0.2632770538330078, + "learning_rate": 6.853060721763764e-05, + "loss": 0.1849, + "step": 5342 + }, + { + "epoch": 1.9730428360413588, + "grad_norm": 0.3879653811454773, + "learning_rate": 6.850597364207415e-05, + "loss": 0.2532, + "step": 5343 + }, + { + "epoch": 1.9734121122599704, + "grad_norm": 0.25035494565963745, + "learning_rate": 6.848134006651066e-05, + "loss": 0.2147, + "step": 5344 + }, + { + "epoch": 1.973781388478582, + "grad_norm": 0.23090393841266632, + "learning_rate": 6.845670649094716e-05, + "loss": 0.1636, + "step": 5345 + }, + { + "epoch": 1.9741506646971936, + "grad_norm": 0.2700568437576294, + "learning_rate": 6.843207291538368e-05, + "loss": 0.226, + "step": 5346 + }, + { + "epoch": 1.9745199409158052, + "grad_norm": 0.2624082863330841, + "learning_rate": 6.840743933982018e-05, + "loss": 0.209, + "step": 5347 + }, + { + "epoch": 1.9748892171344166, + "grad_norm": 0.2354520559310913, + "learning_rate": 6.838280576425669e-05, + "loss": 0.1953, + "step": 5348 + }, + { + "epoch": 1.975258493353028, + "grad_norm": 0.2941135764122009, + "learning_rate": 6.835817218869319e-05, + "loss": 0.1756, + "step": 5349 + }, + { + "epoch": 1.9756277695716395, + "grad_norm": 0.27199020981788635, + "learning_rate": 6.83335386131297e-05, + "loss": 0.2218, + "step": 5350 + }, + { + "epoch": 1.9756277695716395, + "eval_loss": 8.727352142333984, + "eval_runtime": 6.8956, + "eval_samples_per_second": 7.251, + "eval_steps_per_second": 1.015, + "step": 5350 + }, + { + "epoch": 1.9759970457902511, + "grad_norm": 0.29092612862586975, + "learning_rate": 6.830890503756621e-05, + "loss": 0.2334, + "step": 5351 + }, + { + "epoch": 1.9763663220088628, + "grad_norm": 0.39105093479156494, + "learning_rate": 6.828427146200271e-05, + "loss": 0.2567, + "step": 5352 + }, + { + "epoch": 1.9767355982274741, + "grad_norm": 0.3149438500404358, + "learning_rate": 6.825963788643922e-05, + "loss": 0.2078, + "step": 5353 + }, + { + "epoch": 1.9771048744460855, + "grad_norm": 0.22398285567760468, + "learning_rate": 6.823500431087573e-05, + "loss": 0.1754, + "step": 5354 + }, + { + "epoch": 1.9774741506646971, + "grad_norm": 0.3090299367904663, + "learning_rate": 6.821037073531224e-05, + "loss": 0.228, + "step": 5355 + }, + { + "epoch": 1.9778434268833087, + "grad_norm": 0.23092366755008698, + "learning_rate": 6.818573715974874e-05, + "loss": 0.2142, + "step": 5356 + }, + { + "epoch": 1.9782127031019203, + "grad_norm": 0.28547608852386475, + "learning_rate": 6.816110358418524e-05, + "loss": 0.2182, + "step": 5357 + }, + { + "epoch": 1.9785819793205317, + "grad_norm": 0.23349973559379578, + "learning_rate": 6.813647000862176e-05, + "loss": 0.2115, + "step": 5358 + }, + { + "epoch": 1.9789512555391433, + "grad_norm": 0.1864565759897232, + "learning_rate": 6.811183643305826e-05, + "loss": 0.1618, + "step": 5359 + }, + { + "epoch": 1.9793205317577547, + "grad_norm": 0.22898973524570465, + "learning_rate": 6.808720285749477e-05, + "loss": 0.1771, + "step": 5360 + }, + { + "epoch": 1.9796898079763663, + "grad_norm": 0.3169689476490021, + "learning_rate": 6.806256928193127e-05, + "loss": 0.2105, + "step": 5361 + }, + { + "epoch": 1.980059084194978, + "grad_norm": 0.2720812261104584, + "learning_rate": 6.803793570636779e-05, + "loss": 0.1967, + "step": 5362 + }, + { + "epoch": 1.9804283604135895, + "grad_norm": 0.29020199179649353, + "learning_rate": 6.801330213080429e-05, + "loss": 0.2213, + "step": 5363 + }, + { + "epoch": 1.9807976366322009, + "grad_norm": 0.2596421241760254, + "learning_rate": 6.79886685552408e-05, + "loss": 0.2162, + "step": 5364 + }, + { + "epoch": 1.9811669128508123, + "grad_norm": 0.24864576756954193, + "learning_rate": 6.79640349796773e-05, + "loss": 0.2123, + "step": 5365 + }, + { + "epoch": 1.9815361890694239, + "grad_norm": 0.2603173553943634, + "learning_rate": 6.79394014041138e-05, + "loss": 0.1967, + "step": 5366 + }, + { + "epoch": 1.9819054652880355, + "grad_norm": 0.3156823515892029, + "learning_rate": 6.791476782855032e-05, + "loss": 0.2208, + "step": 5367 + }, + { + "epoch": 1.982274741506647, + "grad_norm": 0.24355867505073547, + "learning_rate": 6.789013425298682e-05, + "loss": 0.1769, + "step": 5368 + }, + { + "epoch": 1.9826440177252584, + "grad_norm": 0.2803266942501068, + "learning_rate": 6.786550067742334e-05, + "loss": 0.2127, + "step": 5369 + }, + { + "epoch": 1.98301329394387, + "grad_norm": 0.33743464946746826, + "learning_rate": 6.784086710185984e-05, + "loss": 0.2529, + "step": 5370 + }, + { + "epoch": 1.9833825701624814, + "grad_norm": 0.288459837436676, + "learning_rate": 6.781623352629635e-05, + "loss": 0.2618, + "step": 5371 + }, + { + "epoch": 1.983751846381093, + "grad_norm": 0.33656227588653564, + "learning_rate": 6.779159995073285e-05, + "loss": 0.239, + "step": 5372 + }, + { + "epoch": 1.9841211225997046, + "grad_norm": 0.22855593264102936, + "learning_rate": 6.776696637516935e-05, + "loss": 0.1876, + "step": 5373 + }, + { + "epoch": 1.9844903988183162, + "grad_norm": 0.32245996594429016, + "learning_rate": 6.774233279960587e-05, + "loss": 0.2429, + "step": 5374 + }, + { + "epoch": 1.9848596750369276, + "grad_norm": 0.25610625743865967, + "learning_rate": 6.771769922404237e-05, + "loss": 0.2256, + "step": 5375 + }, + { + "epoch": 1.985228951255539, + "grad_norm": 0.24810069799423218, + "learning_rate": 6.769306564847889e-05, + "loss": 0.2031, + "step": 5376 + }, + { + "epoch": 1.9855982274741506, + "grad_norm": 0.2843506634235382, + "learning_rate": 6.766843207291539e-05, + "loss": 0.2074, + "step": 5377 + }, + { + "epoch": 1.9859675036927622, + "grad_norm": 0.2351301908493042, + "learning_rate": 6.76437984973519e-05, + "loss": 0.1952, + "step": 5378 + }, + { + "epoch": 1.9863367799113738, + "grad_norm": 0.22625257074832916, + "learning_rate": 6.76191649217884e-05, + "loss": 0.2023, + "step": 5379 + }, + { + "epoch": 1.9867060561299852, + "grad_norm": 0.2793484926223755, + "learning_rate": 6.75945313462249e-05, + "loss": 0.2063, + "step": 5380 + }, + { + "epoch": 1.9870753323485968, + "grad_norm": 0.2820974290370941, + "learning_rate": 6.756989777066142e-05, + "loss": 0.2181, + "step": 5381 + }, + { + "epoch": 1.9874446085672082, + "grad_norm": 0.36126255989074707, + "learning_rate": 6.754526419509792e-05, + "loss": 0.2302, + "step": 5382 + }, + { + "epoch": 1.9878138847858198, + "grad_norm": 0.29336369037628174, + "learning_rate": 6.752063061953443e-05, + "loss": 0.2287, + "step": 5383 + }, + { + "epoch": 1.9881831610044314, + "grad_norm": 0.2996385991573334, + "learning_rate": 6.749599704397093e-05, + "loss": 0.2298, + "step": 5384 + }, + { + "epoch": 1.988552437223043, + "grad_norm": 0.24825133383274078, + "learning_rate": 6.747136346840745e-05, + "loss": 0.1969, + "step": 5385 + }, + { + "epoch": 1.9889217134416544, + "grad_norm": 0.2714211344718933, + "learning_rate": 6.744672989284395e-05, + "loss": 0.2341, + "step": 5386 + }, + { + "epoch": 1.9892909896602657, + "grad_norm": 0.2769167721271515, + "learning_rate": 6.742209631728046e-05, + "loss": 0.2211, + "step": 5387 + }, + { + "epoch": 1.9896602658788773, + "grad_norm": 0.2678992748260498, + "learning_rate": 6.739746274171697e-05, + "loss": 0.2375, + "step": 5388 + }, + { + "epoch": 1.990029542097489, + "grad_norm": 0.2509574592113495, + "learning_rate": 6.737282916615347e-05, + "loss": 0.187, + "step": 5389 + }, + { + "epoch": 1.9903988183161005, + "grad_norm": 0.2541256546974182, + "learning_rate": 6.734819559058998e-05, + "loss": 0.2048, + "step": 5390 + }, + { + "epoch": 1.990768094534712, + "grad_norm": 0.3410632908344269, + "learning_rate": 6.732356201502648e-05, + "loss": 0.247, + "step": 5391 + }, + { + "epoch": 1.9911373707533235, + "grad_norm": 0.2736058831214905, + "learning_rate": 6.7298928439463e-05, + "loss": 0.2201, + "step": 5392 + }, + { + "epoch": 1.991506646971935, + "grad_norm": 0.21654681861400604, + "learning_rate": 6.72742948638995e-05, + "loss": 0.2179, + "step": 5393 + }, + { + "epoch": 1.9918759231905465, + "grad_norm": 0.21487143635749817, + "learning_rate": 6.724966128833601e-05, + "loss": 0.1889, + "step": 5394 + }, + { + "epoch": 1.9922451994091581, + "grad_norm": 0.2841379642486572, + "learning_rate": 6.722502771277251e-05, + "loss": 0.214, + "step": 5395 + }, + { + "epoch": 1.9926144756277697, + "grad_norm": 0.3092198669910431, + "learning_rate": 6.720039413720902e-05, + "loss": 0.2083, + "step": 5396 + }, + { + "epoch": 1.992983751846381, + "grad_norm": 0.25814154744148254, + "learning_rate": 6.717576056164553e-05, + "loss": 0.2009, + "step": 5397 + }, + { + "epoch": 1.9933530280649925, + "grad_norm": 0.23792724311351776, + "learning_rate": 6.715112698608203e-05, + "loss": 0.2138, + "step": 5398 + }, + { + "epoch": 1.993722304283604, + "grad_norm": 0.21204932034015656, + "learning_rate": 6.712649341051855e-05, + "loss": 0.1867, + "step": 5399 + }, + { + "epoch": 1.9940915805022157, + "grad_norm": 0.25629615783691406, + "learning_rate": 6.710185983495505e-05, + "loss": 0.2341, + "step": 5400 + }, + { + "epoch": 1.9940915805022157, + "eval_loss": 8.743721008300781, + "eval_runtime": 6.9165, + "eval_samples_per_second": 7.229, + "eval_steps_per_second": 1.012, + "step": 5400 + }, + { + "epoch": 1.9944608567208273, + "grad_norm": 0.25987663865089417, + "learning_rate": 6.707722625939156e-05, + "loss": 0.2523, + "step": 5401 + }, + { + "epoch": 1.9948301329394387, + "grad_norm": 0.26842883229255676, + "learning_rate": 6.705259268382806e-05, + "loss": 0.222, + "step": 5402 + }, + { + "epoch": 1.9951994091580503, + "grad_norm": 0.29410263895988464, + "learning_rate": 6.702795910826458e-05, + "loss": 0.2154, + "step": 5403 + }, + { + "epoch": 1.9955686853766617, + "grad_norm": 0.2571640908718109, + "learning_rate": 6.700332553270108e-05, + "loss": 0.1962, + "step": 5404 + }, + { + "epoch": 1.9959379615952733, + "grad_norm": 0.22956082224845886, + "learning_rate": 6.697869195713758e-05, + "loss": 0.1788, + "step": 5405 + }, + { + "epoch": 1.9963072378138849, + "grad_norm": 0.2511707544326782, + "learning_rate": 6.69540583815741e-05, + "loss": 0.2027, + "step": 5406 + }, + { + "epoch": 1.9966765140324965, + "grad_norm": 0.2535149157047272, + "learning_rate": 6.69294248060106e-05, + "loss": 0.2279, + "step": 5407 + }, + { + "epoch": 1.9970457902511078, + "grad_norm": 0.2487695962190628, + "learning_rate": 6.690479123044711e-05, + "loss": 0.1988, + "step": 5408 + }, + { + "epoch": 1.9974150664697192, + "grad_norm": 0.28799867630004883, + "learning_rate": 6.688015765488361e-05, + "loss": 0.2028, + "step": 5409 + }, + { + "epoch": 1.9977843426883308, + "grad_norm": 0.25841328501701355, + "learning_rate": 6.685552407932013e-05, + "loss": 0.2044, + "step": 5410 + }, + { + "epoch": 1.9981536189069424, + "grad_norm": 0.21879181265830994, + "learning_rate": 6.683089050375663e-05, + "loss": 0.1907, + "step": 5411 + }, + { + "epoch": 1.998522895125554, + "grad_norm": 0.2565045952796936, + "learning_rate": 6.680625692819313e-05, + "loss": 0.1821, + "step": 5412 + }, + { + "epoch": 1.9988921713441654, + "grad_norm": 0.2256195992231369, + "learning_rate": 6.678162335262964e-05, + "loss": 0.2185, + "step": 5413 + }, + { + "epoch": 1.9992614475627768, + "grad_norm": 0.23369260132312775, + "learning_rate": 6.675698977706614e-05, + "loss": 0.1923, + "step": 5414 + }, + { + "epoch": 1.9996307237813884, + "grad_norm": 0.33043569326400757, + "learning_rate": 6.673235620150266e-05, + "loss": 0.2233, + "step": 5415 + }, + { + "epoch": 2.0, + "grad_norm": 0.2837330400943756, + "learning_rate": 6.670772262593916e-05, + "loss": 0.238, + "step": 5416 + }, + { + "epoch": 2.0003692762186116, + "grad_norm": 0.2008206844329834, + "learning_rate": 6.668308905037567e-05, + "loss": 0.1371, + "step": 5417 + }, + { + "epoch": 2.000738552437223, + "grad_norm": 0.2056458592414856, + "learning_rate": 6.665845547481217e-05, + "loss": 0.1671, + "step": 5418 + }, + { + "epoch": 2.0011078286558344, + "grad_norm": 0.21612484753131866, + "learning_rate": 6.663382189924869e-05, + "loss": 0.1765, + "step": 5419 + }, + { + "epoch": 2.001477104874446, + "grad_norm": 0.21130305528640747, + "learning_rate": 6.660918832368519e-05, + "loss": 0.2189, + "step": 5420 + }, + { + "epoch": 2.0018463810930576, + "grad_norm": 0.2064601480960846, + "learning_rate": 6.658455474812169e-05, + "loss": 0.1864, + "step": 5421 + }, + { + "epoch": 2.002215657311669, + "grad_norm": 0.1826343834400177, + "learning_rate": 6.65599211725582e-05, + "loss": 0.1422, + "step": 5422 + }, + { + "epoch": 2.0025849335302808, + "grad_norm": 0.22062867879867554, + "learning_rate": 6.653528759699471e-05, + "loss": 0.1539, + "step": 5423 + }, + { + "epoch": 2.0029542097488924, + "grad_norm": 0.21401406824588776, + "learning_rate": 6.651065402143122e-05, + "loss": 0.1531, + "step": 5424 + }, + { + "epoch": 2.0033234859675035, + "grad_norm": 0.2662554383277893, + "learning_rate": 6.648602044586772e-05, + "loss": 0.2015, + "step": 5425 + }, + { + "epoch": 2.003692762186115, + "grad_norm": 0.23949043452739716, + "learning_rate": 6.646138687030424e-05, + "loss": 0.1538, + "step": 5426 + }, + { + "epoch": 2.0040620384047267, + "grad_norm": 0.21747782826423645, + "learning_rate": 6.643675329474074e-05, + "loss": 0.1593, + "step": 5427 + }, + { + "epoch": 2.0044313146233383, + "grad_norm": 0.1934739053249359, + "learning_rate": 6.641211971917724e-05, + "loss": 0.1225, + "step": 5428 + }, + { + "epoch": 2.00480059084195, + "grad_norm": 0.21713680028915405, + "learning_rate": 6.638748614361375e-05, + "loss": 0.1521, + "step": 5429 + }, + { + "epoch": 2.005169867060561, + "grad_norm": 0.23411938548088074, + "learning_rate": 6.636285256805026e-05, + "loss": 0.1759, + "step": 5430 + }, + { + "epoch": 2.0055391432791727, + "grad_norm": 0.23709343373775482, + "learning_rate": 6.633821899248677e-05, + "loss": 0.1486, + "step": 5431 + }, + { + "epoch": 2.0059084194977843, + "grad_norm": 0.2065429836511612, + "learning_rate": 6.631358541692327e-05, + "loss": 0.1426, + "step": 5432 + }, + { + "epoch": 2.006277695716396, + "grad_norm": 0.20203599333763123, + "learning_rate": 6.628895184135979e-05, + "loss": 0.1493, + "step": 5433 + }, + { + "epoch": 2.0066469719350075, + "grad_norm": 0.25458580255508423, + "learning_rate": 6.626431826579629e-05, + "loss": 0.1802, + "step": 5434 + }, + { + "epoch": 2.007016248153619, + "grad_norm": 0.23985198140144348, + "learning_rate": 6.62396846902328e-05, + "loss": 0.1783, + "step": 5435 + }, + { + "epoch": 2.0073855243722303, + "grad_norm": 0.2161349207162857, + "learning_rate": 6.62150511146693e-05, + "loss": 0.152, + "step": 5436 + }, + { + "epoch": 2.007754800590842, + "grad_norm": 0.39866572618484497, + "learning_rate": 6.61904175391058e-05, + "loss": 0.1503, + "step": 5437 + }, + { + "epoch": 2.0081240768094535, + "grad_norm": 0.24833878874778748, + "learning_rate": 6.616578396354232e-05, + "loss": 0.1753, + "step": 5438 + }, + { + "epoch": 2.008493353028065, + "grad_norm": 0.30654793977737427, + "learning_rate": 6.614115038797882e-05, + "loss": 0.1891, + "step": 5439 + }, + { + "epoch": 2.0088626292466767, + "grad_norm": 0.2662530243396759, + "learning_rate": 6.611651681241533e-05, + "loss": 0.1717, + "step": 5440 + }, + { + "epoch": 2.009231905465288, + "grad_norm": 0.21219800412654877, + "learning_rate": 6.609188323685184e-05, + "loss": 0.1666, + "step": 5441 + }, + { + "epoch": 2.0096011816838995, + "grad_norm": 0.2617355287075043, + "learning_rate": 6.606724966128835e-05, + "loss": 0.179, + "step": 5442 + }, + { + "epoch": 2.009970457902511, + "grad_norm": 0.26514819264411926, + "learning_rate": 6.604261608572485e-05, + "loss": 0.1694, + "step": 5443 + }, + { + "epoch": 2.0103397341211227, + "grad_norm": 0.21686361730098724, + "learning_rate": 6.601798251016135e-05, + "loss": 0.174, + "step": 5444 + }, + { + "epoch": 2.0107090103397343, + "grad_norm": 0.23915503919124603, + "learning_rate": 6.599334893459787e-05, + "loss": 0.1597, + "step": 5445 + }, + { + "epoch": 2.011078286558346, + "grad_norm": 0.267193466424942, + "learning_rate": 6.596871535903437e-05, + "loss": 0.1672, + "step": 5446 + }, + { + "epoch": 2.011447562776957, + "grad_norm": 0.30497974157333374, + "learning_rate": 6.594408178347088e-05, + "loss": 0.191, + "step": 5447 + }, + { + "epoch": 2.0118168389955686, + "grad_norm": 0.27026695013046265, + "learning_rate": 6.591944820790738e-05, + "loss": 0.1819, + "step": 5448 + }, + { + "epoch": 2.0121861152141802, + "grad_norm": 0.28754425048828125, + "learning_rate": 6.58948146323439e-05, + "loss": 0.1611, + "step": 5449 + }, + { + "epoch": 2.012555391432792, + "grad_norm": 0.2765105068683624, + "learning_rate": 6.58701810567804e-05, + "loss": 0.1531, + "step": 5450 + }, + { + "epoch": 2.012555391432792, + "eval_loss": 8.831276893615723, + "eval_runtime": 6.9112, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 1.013, + "step": 5450 + }, + { + "epoch": 2.0129246676514034, + "grad_norm": 0.30785098671913147, + "learning_rate": 6.58455474812169e-05, + "loss": 0.1637, + "step": 5451 + }, + { + "epoch": 2.0132939438700146, + "grad_norm": 0.2837028503417969, + "learning_rate": 6.582091390565341e-05, + "loss": 0.1648, + "step": 5452 + }, + { + "epoch": 2.013663220088626, + "grad_norm": 0.22950854897499084, + "learning_rate": 6.579628033008992e-05, + "loss": 0.1568, + "step": 5453 + }, + { + "epoch": 2.014032496307238, + "grad_norm": 0.258826345205307, + "learning_rate": 6.577164675452643e-05, + "loss": 0.1615, + "step": 5454 + }, + { + "epoch": 2.0144017725258494, + "grad_norm": 0.26013994216918945, + "learning_rate": 6.574701317896293e-05, + "loss": 0.164, + "step": 5455 + }, + { + "epoch": 2.014771048744461, + "grad_norm": 0.2874129116535187, + "learning_rate": 6.572237960339945e-05, + "loss": 0.1572, + "step": 5456 + }, + { + "epoch": 2.015140324963072, + "grad_norm": 0.32115882635116577, + "learning_rate": 6.569774602783593e-05, + "loss": 0.1583, + "step": 5457 + }, + { + "epoch": 2.0155096011816838, + "grad_norm": 0.26224714517593384, + "learning_rate": 6.567311245227245e-05, + "loss": 0.1696, + "step": 5458 + }, + { + "epoch": 2.0158788774002954, + "grad_norm": 0.3201155364513397, + "learning_rate": 6.564847887670895e-05, + "loss": 0.1709, + "step": 5459 + }, + { + "epoch": 2.016248153618907, + "grad_norm": 0.24211126565933228, + "learning_rate": 6.562384530114546e-05, + "loss": 0.1417, + "step": 5460 + }, + { + "epoch": 2.0166174298375186, + "grad_norm": 0.27571773529052734, + "learning_rate": 6.559921172558197e-05, + "loss": 0.1432, + "step": 5461 + }, + { + "epoch": 2.01698670605613, + "grad_norm": 0.26108747720718384, + "learning_rate": 6.557457815001847e-05, + "loss": 0.1819, + "step": 5462 + }, + { + "epoch": 2.0173559822747413, + "grad_norm": 0.22557683289051056, + "learning_rate": 6.554994457445498e-05, + "loss": 0.1493, + "step": 5463 + }, + { + "epoch": 2.017725258493353, + "grad_norm": 0.2540645897388458, + "learning_rate": 6.552531099889148e-05, + "loss": 0.1648, + "step": 5464 + }, + { + "epoch": 2.0180945347119645, + "grad_norm": 0.26069769263267517, + "learning_rate": 6.5500677423328e-05, + "loss": 0.1682, + "step": 5465 + }, + { + "epoch": 2.018463810930576, + "grad_norm": 0.23872898519039154, + "learning_rate": 6.54760438477645e-05, + "loss": 0.1828, + "step": 5466 + }, + { + "epoch": 2.0188330871491877, + "grad_norm": 0.2863729000091553, + "learning_rate": 6.545141027220101e-05, + "loss": 0.1847, + "step": 5467 + }, + { + "epoch": 2.019202363367799, + "grad_norm": 0.2118794322013855, + "learning_rate": 6.542677669663751e-05, + "loss": 0.1496, + "step": 5468 + }, + { + "epoch": 2.0195716395864105, + "grad_norm": 0.23926551640033722, + "learning_rate": 6.540214312107403e-05, + "loss": 0.1788, + "step": 5469 + }, + { + "epoch": 2.019940915805022, + "grad_norm": 0.2735627591609955, + "learning_rate": 6.537750954551053e-05, + "loss": 0.2097, + "step": 5470 + }, + { + "epoch": 2.0203101920236337, + "grad_norm": 0.23740151524543762, + "learning_rate": 6.535287596994703e-05, + "loss": 0.1666, + "step": 5471 + }, + { + "epoch": 2.0206794682422453, + "grad_norm": 0.2518722712993622, + "learning_rate": 6.532824239438355e-05, + "loss": 0.1445, + "step": 5472 + }, + { + "epoch": 2.021048744460857, + "grad_norm": 0.24195218086242676, + "learning_rate": 6.530360881882005e-05, + "loss": 0.1427, + "step": 5473 + }, + { + "epoch": 2.021418020679468, + "grad_norm": 0.26780420541763306, + "learning_rate": 6.527897524325656e-05, + "loss": 0.1484, + "step": 5474 + }, + { + "epoch": 2.0217872968980797, + "grad_norm": 0.22850856184959412, + "learning_rate": 6.525434166769306e-05, + "loss": 0.1411, + "step": 5475 + }, + { + "epoch": 2.0221565731166913, + "grad_norm": 0.2006332278251648, + "learning_rate": 6.522970809212958e-05, + "loss": 0.1414, + "step": 5476 + }, + { + "epoch": 2.022525849335303, + "grad_norm": 0.28092196583747864, + "learning_rate": 6.520507451656608e-05, + "loss": 0.1817, + "step": 5477 + }, + { + "epoch": 2.0228951255539145, + "grad_norm": 0.21626244485378265, + "learning_rate": 6.518044094100258e-05, + "loss": 0.1429, + "step": 5478 + }, + { + "epoch": 2.0232644017725256, + "grad_norm": 0.27406346797943115, + "learning_rate": 6.51558073654391e-05, + "loss": 0.1618, + "step": 5479 + }, + { + "epoch": 2.0236336779911372, + "grad_norm": 0.21502982079982758, + "learning_rate": 6.51311737898756e-05, + "loss": 0.1254, + "step": 5480 + }, + { + "epoch": 2.024002954209749, + "grad_norm": 0.22727058827877045, + "learning_rate": 6.510654021431211e-05, + "loss": 0.1494, + "step": 5481 + }, + { + "epoch": 2.0243722304283605, + "grad_norm": 0.2909197509288788, + "learning_rate": 6.508190663874861e-05, + "loss": 0.1859, + "step": 5482 + }, + { + "epoch": 2.024741506646972, + "grad_norm": 0.204005166888237, + "learning_rate": 6.505727306318512e-05, + "loss": 0.1564, + "step": 5483 + }, + { + "epoch": 2.0251107828655837, + "grad_norm": 0.24216368794441223, + "learning_rate": 6.503263948762163e-05, + "loss": 0.165, + "step": 5484 + }, + { + "epoch": 2.025480059084195, + "grad_norm": 0.2794150114059448, + "learning_rate": 6.500800591205814e-05, + "loss": 0.147, + "step": 5485 + }, + { + "epoch": 2.0258493353028064, + "grad_norm": 0.2236781269311905, + "learning_rate": 6.498337233649464e-05, + "loss": 0.1819, + "step": 5486 + }, + { + "epoch": 2.026218611521418, + "grad_norm": 0.2640867233276367, + "learning_rate": 6.495873876093114e-05, + "loss": 0.1665, + "step": 5487 + }, + { + "epoch": 2.0265878877400296, + "grad_norm": 0.26615992188453674, + "learning_rate": 6.493410518536766e-05, + "loss": 0.1742, + "step": 5488 + }, + { + "epoch": 2.0269571639586412, + "grad_norm": 0.26467806100845337, + "learning_rate": 6.490947160980416e-05, + "loss": 0.1769, + "step": 5489 + }, + { + "epoch": 2.0273264401772524, + "grad_norm": 0.21917937695980072, + "learning_rate": 6.488483803424067e-05, + "loss": 0.1563, + "step": 5490 + }, + { + "epoch": 2.027695716395864, + "grad_norm": 0.32269448041915894, + "learning_rate": 6.486020445867717e-05, + "loss": 0.1903, + "step": 5491 + }, + { + "epoch": 2.0280649926144756, + "grad_norm": 0.26801756024360657, + "learning_rate": 6.483557088311369e-05, + "loss": 0.1631, + "step": 5492 + }, + { + "epoch": 2.028434268833087, + "grad_norm": 0.2210240364074707, + "learning_rate": 6.481093730755019e-05, + "loss": 0.1449, + "step": 5493 + }, + { + "epoch": 2.028803545051699, + "grad_norm": 0.27296894788742065, + "learning_rate": 6.478630373198669e-05, + "loss": 0.1474, + "step": 5494 + }, + { + "epoch": 2.0291728212703104, + "grad_norm": 0.21951426565647125, + "learning_rate": 6.47616701564232e-05, + "loss": 0.1355, + "step": 5495 + }, + { + "epoch": 2.0295420974889216, + "grad_norm": 0.2339065819978714, + "learning_rate": 6.47370365808597e-05, + "loss": 0.1427, + "step": 5496 + }, + { + "epoch": 2.029911373707533, + "grad_norm": 0.23001059889793396, + "learning_rate": 6.471240300529622e-05, + "loss": 0.1601, + "step": 5497 + }, + { + "epoch": 2.0302806499261448, + "grad_norm": 0.2240254133939743, + "learning_rate": 6.468776942973272e-05, + "loss": 0.1455, + "step": 5498 + }, + { + "epoch": 2.0306499261447564, + "grad_norm": 0.31751587986946106, + "learning_rate": 6.466313585416924e-05, + "loss": 0.1939, + "step": 5499 + }, + { + "epoch": 2.031019202363368, + "grad_norm": 0.23716969788074493, + "learning_rate": 6.463850227860574e-05, + "loss": 0.1576, + "step": 5500 + }, + { + "epoch": 2.031019202363368, + "eval_loss": 8.865036010742188, + "eval_runtime": 6.8964, + "eval_samples_per_second": 7.25, + "eval_steps_per_second": 1.015, + "step": 5500 + }, + { + "epoch": 2.031388478581979, + "grad_norm": 0.2523720860481262, + "learning_rate": 6.461386870304225e-05, + "loss": 0.1671, + "step": 5501 + }, + { + "epoch": 2.0317577548005907, + "grad_norm": 0.27259740233421326, + "learning_rate": 6.458923512747875e-05, + "loss": 0.1723, + "step": 5502 + }, + { + "epoch": 2.0321270310192023, + "grad_norm": 0.20139923691749573, + "learning_rate": 6.456460155191526e-05, + "loss": 0.1578, + "step": 5503 + }, + { + "epoch": 2.032496307237814, + "grad_norm": 0.2715934216976166, + "learning_rate": 6.453996797635177e-05, + "loss": 0.1691, + "step": 5504 + }, + { + "epoch": 2.0328655834564255, + "grad_norm": 0.2548423409461975, + "learning_rate": 6.451533440078827e-05, + "loss": 0.1635, + "step": 5505 + }, + { + "epoch": 2.033234859675037, + "grad_norm": 0.24985384941101074, + "learning_rate": 6.449070082522479e-05, + "loss": 0.1601, + "step": 5506 + }, + { + "epoch": 2.0336041358936483, + "grad_norm": 0.24926096200942993, + "learning_rate": 6.446606724966129e-05, + "loss": 0.1414, + "step": 5507 + }, + { + "epoch": 2.03397341211226, + "grad_norm": 0.21222984790802002, + "learning_rate": 6.44414336740978e-05, + "loss": 0.1358, + "step": 5508 + }, + { + "epoch": 2.0343426883308715, + "grad_norm": 0.2673918604850769, + "learning_rate": 6.44168000985343e-05, + "loss": 0.1741, + "step": 5509 + }, + { + "epoch": 2.034711964549483, + "grad_norm": 0.24519532918930054, + "learning_rate": 6.43921665229708e-05, + "loss": 0.1515, + "step": 5510 + }, + { + "epoch": 2.0350812407680947, + "grad_norm": 0.2878076434135437, + "learning_rate": 6.436753294740732e-05, + "loss": 0.1716, + "step": 5511 + }, + { + "epoch": 2.035450516986706, + "grad_norm": 0.23664633929729462, + "learning_rate": 6.434289937184382e-05, + "loss": 0.1627, + "step": 5512 + }, + { + "epoch": 2.0358197932053175, + "grad_norm": 0.24452872574329376, + "learning_rate": 6.431826579628033e-05, + "loss": 0.1566, + "step": 5513 + }, + { + "epoch": 2.036189069423929, + "grad_norm": 0.21907813847064972, + "learning_rate": 6.429363222071683e-05, + "loss": 0.148, + "step": 5514 + }, + { + "epoch": 2.0365583456425407, + "grad_norm": 0.2250799834728241, + "learning_rate": 6.426899864515335e-05, + "loss": 0.1743, + "step": 5515 + }, + { + "epoch": 2.0369276218611523, + "grad_norm": 0.2510002553462982, + "learning_rate": 6.424436506958985e-05, + "loss": 0.1892, + "step": 5516 + }, + { + "epoch": 2.037296898079764, + "grad_norm": 0.1832273155450821, + "learning_rate": 6.421973149402637e-05, + "loss": 0.1396, + "step": 5517 + }, + { + "epoch": 2.037666174298375, + "grad_norm": 0.27862271666526794, + "learning_rate": 6.419509791846287e-05, + "loss": 0.1839, + "step": 5518 + }, + { + "epoch": 2.0380354505169866, + "grad_norm": 0.2576388716697693, + "learning_rate": 6.417046434289937e-05, + "loss": 0.1791, + "step": 5519 + }, + { + "epoch": 2.0384047267355982, + "grad_norm": 0.26902034878730774, + "learning_rate": 6.414583076733588e-05, + "loss": 0.1718, + "step": 5520 + }, + { + "epoch": 2.03877400295421, + "grad_norm": 0.21109691262245178, + "learning_rate": 6.412119719177238e-05, + "loss": 0.1501, + "step": 5521 + }, + { + "epoch": 2.0391432791728215, + "grad_norm": 0.26864200830459595, + "learning_rate": 6.40965636162089e-05, + "loss": 0.1473, + "step": 5522 + }, + { + "epoch": 2.0395125553914326, + "grad_norm": 0.2651221752166748, + "learning_rate": 6.40719300406454e-05, + "loss": 0.1589, + "step": 5523 + }, + { + "epoch": 2.039881831610044, + "grad_norm": 0.2410530298948288, + "learning_rate": 6.404729646508191e-05, + "loss": 0.1771, + "step": 5524 + }, + { + "epoch": 2.040251107828656, + "grad_norm": 0.27192580699920654, + "learning_rate": 6.402266288951841e-05, + "loss": 0.1615, + "step": 5525 + }, + { + "epoch": 2.0406203840472674, + "grad_norm": 0.2844083309173584, + "learning_rate": 6.399802931395492e-05, + "loss": 0.1633, + "step": 5526 + }, + { + "epoch": 2.040989660265879, + "grad_norm": 0.3292931914329529, + "learning_rate": 6.397339573839143e-05, + "loss": 0.155, + "step": 5527 + }, + { + "epoch": 2.04135893648449, + "grad_norm": 0.3270472288131714, + "learning_rate": 6.394876216282793e-05, + "loss": 0.1932, + "step": 5528 + }, + { + "epoch": 2.041728212703102, + "grad_norm": 0.2201695740222931, + "learning_rate": 6.392412858726445e-05, + "loss": 0.1594, + "step": 5529 + }, + { + "epoch": 2.0420974889217134, + "grad_norm": 0.24588629603385925, + "learning_rate": 6.389949501170095e-05, + "loss": 0.1607, + "step": 5530 + }, + { + "epoch": 2.042466765140325, + "grad_norm": 0.2765435576438904, + "learning_rate": 6.387486143613746e-05, + "loss": 0.1737, + "step": 5531 + }, + { + "epoch": 2.0428360413589366, + "grad_norm": 0.20165285468101501, + "learning_rate": 6.385022786057396e-05, + "loss": 0.1505, + "step": 5532 + }, + { + "epoch": 2.043205317577548, + "grad_norm": 0.27548524737358093, + "learning_rate": 6.382559428501046e-05, + "loss": 0.1428, + "step": 5533 + }, + { + "epoch": 2.0435745937961594, + "grad_norm": 0.22964608669281006, + "learning_rate": 6.380096070944698e-05, + "loss": 0.1622, + "step": 5534 + }, + { + "epoch": 2.043943870014771, + "grad_norm": 0.28305673599243164, + "learning_rate": 6.377632713388348e-05, + "loss": 0.1559, + "step": 5535 + }, + { + "epoch": 2.0443131462333826, + "grad_norm": 0.2547336220741272, + "learning_rate": 6.375169355832e-05, + "loss": 0.1638, + "step": 5536 + }, + { + "epoch": 2.044682422451994, + "grad_norm": 0.2725939154624939, + "learning_rate": 6.37270599827565e-05, + "loss": 0.1624, + "step": 5537 + }, + { + "epoch": 2.0450516986706058, + "grad_norm": 0.2612154483795166, + "learning_rate": 6.370242640719301e-05, + "loss": 0.1728, + "step": 5538 + }, + { + "epoch": 2.045420974889217, + "grad_norm": 0.23097579181194305, + "learning_rate": 6.367779283162951e-05, + "loss": 0.1536, + "step": 5539 + }, + { + "epoch": 2.0457902511078285, + "grad_norm": 0.3517185151576996, + "learning_rate": 6.365315925606603e-05, + "loss": 0.1575, + "step": 5540 + }, + { + "epoch": 2.04615952732644, + "grad_norm": 0.253917396068573, + "learning_rate": 6.362852568050253e-05, + "loss": 0.1536, + "step": 5541 + }, + { + "epoch": 2.0465288035450517, + "grad_norm": 0.2539718449115753, + "learning_rate": 6.360389210493903e-05, + "loss": 0.1576, + "step": 5542 + }, + { + "epoch": 2.0468980797636633, + "grad_norm": 0.23898600041866302, + "learning_rate": 6.357925852937554e-05, + "loss": 0.1833, + "step": 5543 + }, + { + "epoch": 2.047267355982275, + "grad_norm": 0.2915307879447937, + "learning_rate": 6.355462495381204e-05, + "loss": 0.1584, + "step": 5544 + }, + { + "epoch": 2.047636632200886, + "grad_norm": 0.2636708617210388, + "learning_rate": 6.352999137824856e-05, + "loss": 0.143, + "step": 5545 + }, + { + "epoch": 2.0480059084194977, + "grad_norm": 0.24808430671691895, + "learning_rate": 6.350535780268506e-05, + "loss": 0.1517, + "step": 5546 + }, + { + "epoch": 2.0483751846381093, + "grad_norm": 0.21386811137199402, + "learning_rate": 6.348072422712157e-05, + "loss": 0.1645, + "step": 5547 + }, + { + "epoch": 2.048744460856721, + "grad_norm": 0.2917517125606537, + "learning_rate": 6.345609065155808e-05, + "loss": 0.186, + "step": 5548 + }, + { + "epoch": 2.0491137370753325, + "grad_norm": 0.27769407629966736, + "learning_rate": 6.343145707599458e-05, + "loss": 0.1675, + "step": 5549 + }, + { + "epoch": 2.0494830132939437, + "grad_norm": 0.23442015051841736, + "learning_rate": 6.340682350043109e-05, + "loss": 0.1528, + "step": 5550 + }, + { + "epoch": 2.0494830132939437, + "eval_loss": 8.974048614501953, + "eval_runtime": 6.9822, + "eval_samples_per_second": 7.161, + "eval_steps_per_second": 1.003, + "step": 5550 + }, + { + "epoch": 2.0498522895125553, + "grad_norm": 0.2265266627073288, + "learning_rate": 6.338218992486759e-05, + "loss": 0.138, + "step": 5551 + }, + { + "epoch": 2.050221565731167, + "grad_norm": 0.25261080265045166, + "learning_rate": 6.33575563493041e-05, + "loss": 0.1868, + "step": 5552 + }, + { + "epoch": 2.0505908419497785, + "grad_norm": 0.24036560952663422, + "learning_rate": 6.333292277374061e-05, + "loss": 0.1597, + "step": 5553 + }, + { + "epoch": 2.05096011816839, + "grad_norm": 0.24662528932094574, + "learning_rate": 6.330828919817712e-05, + "loss": 0.1605, + "step": 5554 + }, + { + "epoch": 2.0513293943870017, + "grad_norm": 0.2829256057739258, + "learning_rate": 6.328365562261362e-05, + "loss": 0.1886, + "step": 5555 + }, + { + "epoch": 2.051698670605613, + "grad_norm": 0.23051898181438446, + "learning_rate": 6.325902204705014e-05, + "loss": 0.1679, + "step": 5556 + }, + { + "epoch": 2.0520679468242244, + "grad_norm": 0.2228373885154724, + "learning_rate": 6.323438847148664e-05, + "loss": 0.1506, + "step": 5557 + }, + { + "epoch": 2.052437223042836, + "grad_norm": 0.27319326996803284, + "learning_rate": 6.320975489592314e-05, + "loss": 0.1588, + "step": 5558 + }, + { + "epoch": 2.0528064992614476, + "grad_norm": 0.3222609758377075, + "learning_rate": 6.318512132035965e-05, + "loss": 0.1889, + "step": 5559 + }, + { + "epoch": 2.0531757754800593, + "grad_norm": 0.23243394494056702, + "learning_rate": 6.316048774479616e-05, + "loss": 0.1625, + "step": 5560 + }, + { + "epoch": 2.0535450516986704, + "grad_norm": 0.2206272929906845, + "learning_rate": 6.313585416923267e-05, + "loss": 0.1555, + "step": 5561 + }, + { + "epoch": 2.053914327917282, + "grad_norm": 0.26515230536460876, + "learning_rate": 6.311122059366917e-05, + "loss": 0.154, + "step": 5562 + }, + { + "epoch": 2.0542836041358936, + "grad_norm": 0.31433629989624023, + "learning_rate": 6.308658701810569e-05, + "loss": 0.1714, + "step": 5563 + }, + { + "epoch": 2.054652880354505, + "grad_norm": 0.26802873611450195, + "learning_rate": 6.306195344254219e-05, + "loss": 0.1718, + "step": 5564 + }, + { + "epoch": 2.055022156573117, + "grad_norm": 0.25002798438072205, + "learning_rate": 6.303731986697869e-05, + "loss": 0.1668, + "step": 5565 + }, + { + "epoch": 2.0553914327917284, + "grad_norm": 0.2200467586517334, + "learning_rate": 6.30126862914152e-05, + "loss": 0.1421, + "step": 5566 + }, + { + "epoch": 2.0557607090103396, + "grad_norm": 0.2514795660972595, + "learning_rate": 6.29880527158517e-05, + "loss": 0.1604, + "step": 5567 + }, + { + "epoch": 2.056129985228951, + "grad_norm": 0.2597319185733795, + "learning_rate": 6.296341914028822e-05, + "loss": 0.1477, + "step": 5568 + }, + { + "epoch": 2.056499261447563, + "grad_norm": 0.2636418342590332, + "learning_rate": 6.293878556472472e-05, + "loss": 0.1667, + "step": 5569 + }, + { + "epoch": 2.0568685376661744, + "grad_norm": 0.23757022619247437, + "learning_rate": 6.291415198916123e-05, + "loss": 0.1607, + "step": 5570 + }, + { + "epoch": 2.057237813884786, + "grad_norm": 0.25267642736434937, + "learning_rate": 6.288951841359774e-05, + "loss": 0.1596, + "step": 5571 + }, + { + "epoch": 2.057607090103397, + "grad_norm": 0.21636489033699036, + "learning_rate": 6.286488483803425e-05, + "loss": 0.1222, + "step": 5572 + }, + { + "epoch": 2.0579763663220088, + "grad_norm": 0.22161704301834106, + "learning_rate": 6.284025126247075e-05, + "loss": 0.1473, + "step": 5573 + }, + { + "epoch": 2.0583456425406204, + "grad_norm": 0.27761852741241455, + "learning_rate": 6.281561768690725e-05, + "loss": 0.1843, + "step": 5574 + }, + { + "epoch": 2.058714918759232, + "grad_norm": 0.26985830068588257, + "learning_rate": 6.279098411134377e-05, + "loss": 0.164, + "step": 5575 + }, + { + "epoch": 2.0590841949778436, + "grad_norm": 0.2994496524333954, + "learning_rate": 6.276635053578027e-05, + "loss": 0.1674, + "step": 5576 + }, + { + "epoch": 2.059453471196455, + "grad_norm": 0.2222304344177246, + "learning_rate": 6.274171696021678e-05, + "loss": 0.1643, + "step": 5577 + }, + { + "epoch": 2.0598227474150663, + "grad_norm": 0.2655903100967407, + "learning_rate": 6.271708338465328e-05, + "loss": 0.1671, + "step": 5578 + }, + { + "epoch": 2.060192023633678, + "grad_norm": 0.22182148694992065, + "learning_rate": 6.26924498090898e-05, + "loss": 0.1532, + "step": 5579 + }, + { + "epoch": 2.0605612998522895, + "grad_norm": 0.2951536476612091, + "learning_rate": 6.26678162335263e-05, + "loss": 0.1572, + "step": 5580 + }, + { + "epoch": 2.060930576070901, + "grad_norm": 0.22108204662799835, + "learning_rate": 6.26431826579628e-05, + "loss": 0.1607, + "step": 5581 + }, + { + "epoch": 2.0612998522895127, + "grad_norm": 0.2341223508119583, + "learning_rate": 6.261854908239932e-05, + "loss": 0.1533, + "step": 5582 + }, + { + "epoch": 2.061669128508124, + "grad_norm": 0.26648667454719543, + "learning_rate": 6.259391550683582e-05, + "loss": 0.1887, + "step": 5583 + }, + { + "epoch": 2.0620384047267355, + "grad_norm": 0.24292941391468048, + "learning_rate": 6.256928193127233e-05, + "loss": 0.1765, + "step": 5584 + }, + { + "epoch": 2.062407680945347, + "grad_norm": 0.2355756014585495, + "learning_rate": 6.254464835570883e-05, + "loss": 0.1585, + "step": 5585 + }, + { + "epoch": 2.0627769571639587, + "grad_norm": 0.27099609375, + "learning_rate": 6.252001478014535e-05, + "loss": 0.1743, + "step": 5586 + }, + { + "epoch": 2.0631462333825703, + "grad_norm": 0.23027196526527405, + "learning_rate": 6.249538120458185e-05, + "loss": 0.1499, + "step": 5587 + }, + { + "epoch": 2.0635155096011815, + "grad_norm": 0.2403704822063446, + "learning_rate": 6.247074762901836e-05, + "loss": 0.1673, + "step": 5588 + }, + { + "epoch": 2.063884785819793, + "grad_norm": 0.2873874008655548, + "learning_rate": 6.244611405345486e-05, + "loss": 0.1911, + "step": 5589 + }, + { + "epoch": 2.0642540620384047, + "grad_norm": 0.21171340346336365, + "learning_rate": 6.242148047789136e-05, + "loss": 0.138, + "step": 5590 + }, + { + "epoch": 2.0646233382570163, + "grad_norm": 0.3022492527961731, + "learning_rate": 6.239684690232788e-05, + "loss": 0.2033, + "step": 5591 + }, + { + "epoch": 2.064992614475628, + "grad_norm": 0.23277626931667328, + "learning_rate": 6.237221332676438e-05, + "loss": 0.1579, + "step": 5592 + }, + { + "epoch": 2.0653618906942395, + "grad_norm": 0.2369033247232437, + "learning_rate": 6.23475797512009e-05, + "loss": 0.1684, + "step": 5593 + }, + { + "epoch": 2.0657311669128506, + "grad_norm": 0.2492230236530304, + "learning_rate": 6.23229461756374e-05, + "loss": 0.154, + "step": 5594 + }, + { + "epoch": 2.0661004431314622, + "grad_norm": 0.24498562514781952, + "learning_rate": 6.229831260007391e-05, + "loss": 0.1564, + "step": 5595 + }, + { + "epoch": 2.066469719350074, + "grad_norm": 0.2393152415752411, + "learning_rate": 6.227367902451041e-05, + "loss": 0.1686, + "step": 5596 + }, + { + "epoch": 2.0668389955686854, + "grad_norm": 0.2348770946264267, + "learning_rate": 6.224904544894691e-05, + "loss": 0.132, + "step": 5597 + }, + { + "epoch": 2.067208271787297, + "grad_norm": 0.209956556558609, + "learning_rate": 6.222441187338343e-05, + "loss": 0.1545, + "step": 5598 + }, + { + "epoch": 2.067577548005908, + "grad_norm": 0.25055035948753357, + "learning_rate": 6.219977829781993e-05, + "loss": 0.1675, + "step": 5599 + }, + { + "epoch": 2.06794682422452, + "grad_norm": 0.236570343375206, + "learning_rate": 6.217514472225644e-05, + "loss": 0.1626, + "step": 5600 + }, + { + "epoch": 2.06794682422452, + "eval_loss": 8.886003494262695, + "eval_runtime": 6.9093, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 1.013, + "step": 5600 + }, + { + "epoch": 2.0683161004431314, + "grad_norm": 0.24847035109996796, + "learning_rate": 6.215051114669294e-05, + "loss": 0.1654, + "step": 5601 + }, + { + "epoch": 2.068685376661743, + "grad_norm": 0.28444361686706543, + "learning_rate": 6.212587757112946e-05, + "loss": 0.1997, + "step": 5602 + }, + { + "epoch": 2.0690546528803546, + "grad_norm": 0.2462826371192932, + "learning_rate": 6.210124399556596e-05, + "loss": 0.1762, + "step": 5603 + }, + { + "epoch": 2.069423929098966, + "grad_norm": 0.2648668587207794, + "learning_rate": 6.207661042000246e-05, + "loss": 0.1873, + "step": 5604 + }, + { + "epoch": 2.0697932053175774, + "grad_norm": 0.26081904768943787, + "learning_rate": 6.205197684443898e-05, + "loss": 0.1823, + "step": 5605 + }, + { + "epoch": 2.070162481536189, + "grad_norm": 0.2584434151649475, + "learning_rate": 6.202734326887548e-05, + "loss": 0.1762, + "step": 5606 + }, + { + "epoch": 2.0705317577548006, + "grad_norm": 0.2575317323207855, + "learning_rate": 6.200270969331199e-05, + "loss": 0.142, + "step": 5607 + }, + { + "epoch": 2.070901033973412, + "grad_norm": 0.28123465180397034, + "learning_rate": 6.197807611774849e-05, + "loss": 0.1829, + "step": 5608 + }, + { + "epoch": 2.071270310192024, + "grad_norm": 0.2610478699207306, + "learning_rate": 6.195344254218501e-05, + "loss": 0.1506, + "step": 5609 + }, + { + "epoch": 2.071639586410635, + "grad_norm": 0.21671868860721588, + "learning_rate": 6.192880896662151e-05, + "loss": 0.1452, + "step": 5610 + }, + { + "epoch": 2.0720088626292466, + "grad_norm": 0.3393421769142151, + "learning_rate": 6.190417539105802e-05, + "loss": 0.1732, + "step": 5611 + }, + { + "epoch": 2.072378138847858, + "grad_norm": 0.2455674409866333, + "learning_rate": 6.187954181549452e-05, + "loss": 0.1906, + "step": 5612 + }, + { + "epoch": 2.0727474150664698, + "grad_norm": 0.5672195553779602, + "learning_rate": 6.185490823993103e-05, + "loss": 0.1597, + "step": 5613 + }, + { + "epoch": 2.0731166912850814, + "grad_norm": 0.2862783968448639, + "learning_rate": 6.183027466436754e-05, + "loss": 0.1666, + "step": 5614 + }, + { + "epoch": 2.073485967503693, + "grad_norm": 0.3376414179801941, + "learning_rate": 6.180564108880404e-05, + "loss": 0.177, + "step": 5615 + }, + { + "epoch": 2.073855243722304, + "grad_norm": 0.32282647490501404, + "learning_rate": 6.178100751324056e-05, + "loss": 0.1739, + "step": 5616 + }, + { + "epoch": 2.0742245199409157, + "grad_norm": 0.24022123217582703, + "learning_rate": 6.175637393767706e-05, + "loss": 0.1602, + "step": 5617 + }, + { + "epoch": 2.0745937961595273, + "grad_norm": 0.2855636179447174, + "learning_rate": 6.173174036211357e-05, + "loss": 0.1641, + "step": 5618 + }, + { + "epoch": 2.074963072378139, + "grad_norm": 0.2610416114330292, + "learning_rate": 6.170710678655007e-05, + "loss": 0.1629, + "step": 5619 + }, + { + "epoch": 2.0753323485967505, + "grad_norm": 0.24738983809947968, + "learning_rate": 6.168247321098657e-05, + "loss": 0.1599, + "step": 5620 + }, + { + "epoch": 2.0757016248153617, + "grad_norm": 0.28171485662460327, + "learning_rate": 6.165783963542309e-05, + "loss": 0.1778, + "step": 5621 + }, + { + "epoch": 2.0760709010339733, + "grad_norm": 0.25921472907066345, + "learning_rate": 6.163320605985959e-05, + "loss": 0.155, + "step": 5622 + }, + { + "epoch": 2.076440177252585, + "grad_norm": 0.2662225067615509, + "learning_rate": 6.16085724842961e-05, + "loss": 0.1419, + "step": 5623 + }, + { + "epoch": 2.0768094534711965, + "grad_norm": 0.2824224531650543, + "learning_rate": 6.15839389087326e-05, + "loss": 0.1576, + "step": 5624 + }, + { + "epoch": 2.077178729689808, + "grad_norm": 0.2524067759513855, + "learning_rate": 6.155930533316912e-05, + "loss": 0.1494, + "step": 5625 + }, + { + "epoch": 2.0775480059084197, + "grad_norm": 0.2609933912754059, + "learning_rate": 6.153467175760562e-05, + "loss": 0.1919, + "step": 5626 + }, + { + "epoch": 2.077917282127031, + "grad_norm": 0.2927902340888977, + "learning_rate": 6.151003818204214e-05, + "loss": 0.1966, + "step": 5627 + }, + { + "epoch": 2.0782865583456425, + "grad_norm": 0.2893534302711487, + "learning_rate": 6.148540460647864e-05, + "loss": 0.1772, + "step": 5628 + }, + { + "epoch": 2.078655834564254, + "grad_norm": 0.24202780425548553, + "learning_rate": 6.146077103091514e-05, + "loss": 0.162, + "step": 5629 + }, + { + "epoch": 2.0790251107828657, + "grad_norm": 0.26749351620674133, + "learning_rate": 6.143613745535165e-05, + "loss": 0.2068, + "step": 5630 + }, + { + "epoch": 2.0793943870014773, + "grad_norm": 0.2624955177307129, + "learning_rate": 6.141150387978815e-05, + "loss": 0.1779, + "step": 5631 + }, + { + "epoch": 2.0797636632200884, + "grad_norm": 0.2747156023979187, + "learning_rate": 6.138687030422467e-05, + "loss": 0.1556, + "step": 5632 + }, + { + "epoch": 2.0801329394387, + "grad_norm": 0.24384087324142456, + "learning_rate": 6.136223672866117e-05, + "loss": 0.1651, + "step": 5633 + }, + { + "epoch": 2.0805022156573116, + "grad_norm": 0.23479653894901276, + "learning_rate": 6.133760315309768e-05, + "loss": 0.1747, + "step": 5634 + }, + { + "epoch": 2.0808714918759232, + "grad_norm": 0.20609234273433685, + "learning_rate": 6.131296957753418e-05, + "loss": 0.1339, + "step": 5635 + }, + { + "epoch": 2.081240768094535, + "grad_norm": 0.2780122756958008, + "learning_rate": 6.128833600197069e-05, + "loss": 0.1578, + "step": 5636 + }, + { + "epoch": 2.0816100443131464, + "grad_norm": 0.27534547448158264, + "learning_rate": 6.12637024264072e-05, + "loss": 0.1592, + "step": 5637 + }, + { + "epoch": 2.0819793205317576, + "grad_norm": 0.244219109416008, + "learning_rate": 6.12390688508437e-05, + "loss": 0.1681, + "step": 5638 + }, + { + "epoch": 2.082348596750369, + "grad_norm": 0.23482748866081238, + "learning_rate": 6.121443527528022e-05, + "loss": 0.1712, + "step": 5639 + }, + { + "epoch": 2.082717872968981, + "grad_norm": 0.3016130030155182, + "learning_rate": 6.118980169971672e-05, + "loss": 0.1615, + "step": 5640 + }, + { + "epoch": 2.0830871491875924, + "grad_norm": 0.23813587427139282, + "learning_rate": 6.116516812415323e-05, + "loss": 0.1669, + "step": 5641 + }, + { + "epoch": 2.083456425406204, + "grad_norm": 0.21675412356853485, + "learning_rate": 6.114053454858973e-05, + "loss": 0.1505, + "step": 5642 + }, + { + "epoch": 2.083825701624815, + "grad_norm": 0.24451133608818054, + "learning_rate": 6.111590097302625e-05, + "loss": 0.1553, + "step": 5643 + }, + { + "epoch": 2.0841949778434268, + "grad_norm": 0.20567579567432404, + "learning_rate": 6.109126739746275e-05, + "loss": 0.1443, + "step": 5644 + }, + { + "epoch": 2.0845642540620384, + "grad_norm": 0.2865724265575409, + "learning_rate": 6.106663382189925e-05, + "loss": 0.1689, + "step": 5645 + }, + { + "epoch": 2.08493353028065, + "grad_norm": 0.23653779923915863, + "learning_rate": 6.104200024633576e-05, + "loss": 0.1549, + "step": 5646 + }, + { + "epoch": 2.0853028064992616, + "grad_norm": 0.30944138765335083, + "learning_rate": 6.101736667077227e-05, + "loss": 0.1765, + "step": 5647 + }, + { + "epoch": 2.085672082717873, + "grad_norm": 0.20806938409805298, + "learning_rate": 6.099273309520877e-05, + "loss": 0.1303, + "step": 5648 + }, + { + "epoch": 2.0860413589364843, + "grad_norm": 0.2285393476486206, + "learning_rate": 6.096809951964528e-05, + "loss": 0.1712, + "step": 5649 + }, + { + "epoch": 2.086410635155096, + "grad_norm": 0.2759014070034027, + "learning_rate": 6.094346594408179e-05, + "loss": 0.1905, + "step": 5650 + }, + { + "epoch": 2.086410635155096, + "eval_loss": 8.949413299560547, + "eval_runtime": 6.912, + "eval_samples_per_second": 7.234, + "eval_steps_per_second": 1.013, + "step": 5650 + }, + { + "epoch": 2.0867799113737076, + "grad_norm": 0.2335757464170456, + "learning_rate": 6.09188323685183e-05, + "loss": 0.1568, + "step": 5651 + }, + { + "epoch": 2.087149187592319, + "grad_norm": 0.2460927963256836, + "learning_rate": 6.0894198792954805e-05, + "loss": 0.17, + "step": 5652 + }, + { + "epoch": 2.0875184638109308, + "grad_norm": 0.22263208031654358, + "learning_rate": 6.086956521739131e-05, + "loss": 0.1526, + "step": 5653 + }, + { + "epoch": 2.087887740029542, + "grad_norm": 0.2463398277759552, + "learning_rate": 6.084493164182782e-05, + "loss": 0.1554, + "step": 5654 + }, + { + "epoch": 2.0882570162481535, + "grad_norm": 0.23850668966770172, + "learning_rate": 6.082029806626433e-05, + "loss": 0.1526, + "step": 5655 + }, + { + "epoch": 2.088626292466765, + "grad_norm": 0.22471396625041962, + "learning_rate": 6.079566449070083e-05, + "loss": 0.1725, + "step": 5656 + }, + { + "epoch": 2.0889955686853767, + "grad_norm": 0.2538785934448242, + "learning_rate": 6.077103091513734e-05, + "loss": 0.1645, + "step": 5657 + }, + { + "epoch": 2.0893648449039883, + "grad_norm": 0.24892370402812958, + "learning_rate": 6.0746397339573845e-05, + "loss": 0.1709, + "step": 5658 + }, + { + "epoch": 2.0897341211226, + "grad_norm": 0.24924823641777039, + "learning_rate": 6.072176376401035e-05, + "loss": 0.1859, + "step": 5659 + }, + { + "epoch": 2.090103397341211, + "grad_norm": 0.25672534108161926, + "learning_rate": 6.069713018844686e-05, + "loss": 0.1745, + "step": 5660 + }, + { + "epoch": 2.0904726735598227, + "grad_norm": 0.298927366733551, + "learning_rate": 6.067249661288337e-05, + "loss": 0.1691, + "step": 5661 + }, + { + "epoch": 2.0908419497784343, + "grad_norm": 0.27035221457481384, + "learning_rate": 6.0647863037319877e-05, + "loss": 0.1676, + "step": 5662 + }, + { + "epoch": 2.091211225997046, + "grad_norm": 0.29977741837501526, + "learning_rate": 6.0623229461756384e-05, + "loss": 0.1722, + "step": 5663 + }, + { + "epoch": 2.0915805022156575, + "grad_norm": 0.2617363929748535, + "learning_rate": 6.0598595886192886e-05, + "loss": 0.1736, + "step": 5664 + }, + { + "epoch": 2.0919497784342687, + "grad_norm": 0.22223763167858124, + "learning_rate": 6.057396231062939e-05, + "loss": 0.1533, + "step": 5665 + }, + { + "epoch": 2.0923190546528803, + "grad_norm": 0.3501130938529968, + "learning_rate": 6.05493287350659e-05, + "loss": 0.213, + "step": 5666 + }, + { + "epoch": 2.092688330871492, + "grad_norm": 0.23561058938503265, + "learning_rate": 6.052469515950241e-05, + "loss": 0.174, + "step": 5667 + }, + { + "epoch": 2.0930576070901035, + "grad_norm": 0.269911527633667, + "learning_rate": 6.050006158393892e-05, + "loss": 0.1592, + "step": 5668 + }, + { + "epoch": 2.093426883308715, + "grad_norm": 0.27687492966651917, + "learning_rate": 6.0475428008375425e-05, + "loss": 0.1786, + "step": 5669 + }, + { + "epoch": 2.0937961595273262, + "grad_norm": 0.22382964193820953, + "learning_rate": 6.045079443281193e-05, + "loss": 0.1644, + "step": 5670 + }, + { + "epoch": 2.094165435745938, + "grad_norm": 0.22125090658664703, + "learning_rate": 6.042616085724844e-05, + "loss": 0.1622, + "step": 5671 + }, + { + "epoch": 2.0945347119645494, + "grad_norm": 0.2765488922595978, + "learning_rate": 6.040152728168494e-05, + "loss": 0.1792, + "step": 5672 + }, + { + "epoch": 2.094903988183161, + "grad_norm": 0.2499363124370575, + "learning_rate": 6.037689370612145e-05, + "loss": 0.1536, + "step": 5673 + }, + { + "epoch": 2.0952732644017726, + "grad_norm": 0.24733877182006836, + "learning_rate": 6.035226013055796e-05, + "loss": 0.1547, + "step": 5674 + }, + { + "epoch": 2.0956425406203842, + "grad_norm": 0.23976095020771027, + "learning_rate": 6.0327626554994465e-05, + "loss": 0.1588, + "step": 5675 + }, + { + "epoch": 2.0960118168389954, + "grad_norm": 0.30741435289382935, + "learning_rate": 6.030299297943097e-05, + "loss": 0.196, + "step": 5676 + }, + { + "epoch": 2.096381093057607, + "grad_norm": 0.2492215484380722, + "learning_rate": 6.027835940386748e-05, + "loss": 0.1606, + "step": 5677 + }, + { + "epoch": 2.0967503692762186, + "grad_norm": 0.263040155172348, + "learning_rate": 6.025372582830399e-05, + "loss": 0.1592, + "step": 5678 + }, + { + "epoch": 2.09711964549483, + "grad_norm": 0.2523921728134155, + "learning_rate": 6.022909225274049e-05, + "loss": 0.1538, + "step": 5679 + }, + { + "epoch": 2.097488921713442, + "grad_norm": 0.23371194303035736, + "learning_rate": 6.0204458677177e-05, + "loss": 0.151, + "step": 5680 + }, + { + "epoch": 2.097858197932053, + "grad_norm": 0.29407617449760437, + "learning_rate": 6.0179825101613506e-05, + "loss": 0.1707, + "step": 5681 + }, + { + "epoch": 2.0982274741506646, + "grad_norm": 0.29304641485214233, + "learning_rate": 6.0155191526050014e-05, + "loss": 0.1446, + "step": 5682 + }, + { + "epoch": 2.098596750369276, + "grad_norm": 0.23968440294265747, + "learning_rate": 6.013055795048652e-05, + "loss": 0.1678, + "step": 5683 + }, + { + "epoch": 2.098966026587888, + "grad_norm": 0.268049031496048, + "learning_rate": 6.010592437492303e-05, + "loss": 0.1609, + "step": 5684 + }, + { + "epoch": 2.0993353028064994, + "grad_norm": 0.26140445470809937, + "learning_rate": 6.008129079935954e-05, + "loss": 0.1621, + "step": 5685 + }, + { + "epoch": 2.099704579025111, + "grad_norm": 0.25041842460632324, + "learning_rate": 6.0056657223796045e-05, + "loss": 0.1529, + "step": 5686 + }, + { + "epoch": 2.100073855243722, + "grad_norm": 0.2577163577079773, + "learning_rate": 6.0032023648232546e-05, + "loss": 0.1665, + "step": 5687 + }, + { + "epoch": 2.1004431314623337, + "grad_norm": 0.2427249401807785, + "learning_rate": 6.0007390072669054e-05, + "loss": 0.1507, + "step": 5688 + }, + { + "epoch": 2.1008124076809453, + "grad_norm": 0.2372257262468338, + "learning_rate": 5.9982756497105555e-05, + "loss": 0.1435, + "step": 5689 + }, + { + "epoch": 2.101181683899557, + "grad_norm": 0.21788518130779266, + "learning_rate": 5.9958122921542056e-05, + "loss": 0.16, + "step": 5690 + }, + { + "epoch": 2.1015509601181686, + "grad_norm": 0.25974151492118835, + "learning_rate": 5.9933489345978564e-05, + "loss": 0.1539, + "step": 5691 + }, + { + "epoch": 2.1019202363367797, + "grad_norm": 0.27588191628456116, + "learning_rate": 5.990885577041507e-05, + "loss": 0.1714, + "step": 5692 + }, + { + "epoch": 2.1022895125553913, + "grad_norm": 0.226209357380867, + "learning_rate": 5.988422219485158e-05, + "loss": 0.1781, + "step": 5693 + }, + { + "epoch": 2.102658788774003, + "grad_norm": 0.2654951512813568, + "learning_rate": 5.985958861928809e-05, + "loss": 0.1693, + "step": 5694 + }, + { + "epoch": 2.1030280649926145, + "grad_norm": 0.24550700187683105, + "learning_rate": 5.9834955043724595e-05, + "loss": 0.1431, + "step": 5695 + }, + { + "epoch": 2.103397341211226, + "grad_norm": 0.30725374817848206, + "learning_rate": 5.98103214681611e-05, + "loss": 0.2038, + "step": 5696 + }, + { + "epoch": 2.1037666174298377, + "grad_norm": 0.3099895119667053, + "learning_rate": 5.978568789259761e-05, + "loss": 0.1594, + "step": 5697 + }, + { + "epoch": 2.104135893648449, + "grad_norm": 0.26871299743652344, + "learning_rate": 5.976105431703411e-05, + "loss": 0.1511, + "step": 5698 + }, + { + "epoch": 2.1045051698670605, + "grad_norm": 0.25275158882141113, + "learning_rate": 5.973642074147062e-05, + "loss": 0.1474, + "step": 5699 + }, + { + "epoch": 2.104874446085672, + "grad_norm": 0.2868088483810425, + "learning_rate": 5.971178716590713e-05, + "loss": 0.2003, + "step": 5700 + }, + { + "epoch": 2.104874446085672, + "eval_loss": 8.939833641052246, + "eval_runtime": 6.9056, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 1.014, + "step": 5700 + }, + { + "epoch": 2.1052437223042837, + "grad_norm": 0.22058777511119843, + "learning_rate": 5.9687153590343636e-05, + "loss": 0.1565, + "step": 5701 + }, + { + "epoch": 2.1056129985228953, + "grad_norm": 0.2508271038532257, + "learning_rate": 5.9662520014780144e-05, + "loss": 0.1647, + "step": 5702 + }, + { + "epoch": 2.1059822747415065, + "grad_norm": 0.26392272114753723, + "learning_rate": 5.963788643921665e-05, + "loss": 0.1822, + "step": 5703 + }, + { + "epoch": 2.106351550960118, + "grad_norm": 0.4531320631504059, + "learning_rate": 5.961325286365316e-05, + "loss": 0.2001, + "step": 5704 + }, + { + "epoch": 2.1067208271787297, + "grad_norm": 0.26225924491882324, + "learning_rate": 5.958861928808967e-05, + "loss": 0.1904, + "step": 5705 + }, + { + "epoch": 2.1070901033973413, + "grad_norm": 0.320968896150589, + "learning_rate": 5.956398571252617e-05, + "loss": 0.1756, + "step": 5706 + }, + { + "epoch": 2.107459379615953, + "grad_norm": 0.24713966250419617, + "learning_rate": 5.9539352136962676e-05, + "loss": 0.1917, + "step": 5707 + }, + { + "epoch": 2.1078286558345645, + "grad_norm": 0.3021135628223419, + "learning_rate": 5.9514718561399184e-05, + "loss": 0.1574, + "step": 5708 + }, + { + "epoch": 2.1081979320531756, + "grad_norm": 0.23941320180892944, + "learning_rate": 5.949008498583569e-05, + "loss": 0.1438, + "step": 5709 + }, + { + "epoch": 2.1085672082717872, + "grad_norm": 0.26765161752700806, + "learning_rate": 5.94654514102722e-05, + "loss": 0.1481, + "step": 5710 + }, + { + "epoch": 2.108936484490399, + "grad_norm": 0.2889775335788727, + "learning_rate": 5.944081783470871e-05, + "loss": 0.1827, + "step": 5711 + }, + { + "epoch": 2.1093057607090104, + "grad_norm": 0.26539409160614014, + "learning_rate": 5.9416184259145216e-05, + "loss": 0.1708, + "step": 5712 + }, + { + "epoch": 2.109675036927622, + "grad_norm": 0.26203033328056335, + "learning_rate": 5.939155068358172e-05, + "loss": 0.1734, + "step": 5713 + }, + { + "epoch": 2.110044313146233, + "grad_norm": 0.2789088487625122, + "learning_rate": 5.9366917108018224e-05, + "loss": 0.1719, + "step": 5714 + }, + { + "epoch": 2.110413589364845, + "grad_norm": 0.2513851225376129, + "learning_rate": 5.934228353245473e-05, + "loss": 0.1481, + "step": 5715 + }, + { + "epoch": 2.1107828655834564, + "grad_norm": 0.24327149987220764, + "learning_rate": 5.931764995689124e-05, + "loss": 0.1665, + "step": 5716 + }, + { + "epoch": 2.111152141802068, + "grad_norm": 0.2595502734184265, + "learning_rate": 5.929301638132775e-05, + "loss": 0.1675, + "step": 5717 + }, + { + "epoch": 2.1115214180206796, + "grad_norm": 0.23620468378067017, + "learning_rate": 5.9268382805764256e-05, + "loss": 0.1634, + "step": 5718 + }, + { + "epoch": 2.1118906942392908, + "grad_norm": 0.2348773032426834, + "learning_rate": 5.9243749230200764e-05, + "loss": 0.1573, + "step": 5719 + }, + { + "epoch": 2.1122599704579024, + "grad_norm": 0.22741945087909698, + "learning_rate": 5.921911565463727e-05, + "loss": 0.1529, + "step": 5720 + }, + { + "epoch": 2.112629246676514, + "grad_norm": 0.2487422674894333, + "learning_rate": 5.919448207907378e-05, + "loss": 0.1511, + "step": 5721 + }, + { + "epoch": 2.1129985228951256, + "grad_norm": 0.24365821480751038, + "learning_rate": 5.916984850351028e-05, + "loss": 0.1675, + "step": 5722 + }, + { + "epoch": 2.113367799113737, + "grad_norm": 0.2260286659002304, + "learning_rate": 5.914521492794679e-05, + "loss": 0.1415, + "step": 5723 + }, + { + "epoch": 2.113737075332349, + "grad_norm": 0.3071775436401367, + "learning_rate": 5.9120581352383296e-05, + "loss": 0.1776, + "step": 5724 + }, + { + "epoch": 2.11410635155096, + "grad_norm": 0.27772781252861023, + "learning_rate": 5.9095947776819804e-05, + "loss": 0.1744, + "step": 5725 + }, + { + "epoch": 2.1144756277695715, + "grad_norm": 0.2565504014492035, + "learning_rate": 5.907131420125631e-05, + "loss": 0.153, + "step": 5726 + }, + { + "epoch": 2.114844903988183, + "grad_norm": 0.27050095796585083, + "learning_rate": 5.904668062569282e-05, + "loss": 0.1501, + "step": 5727 + }, + { + "epoch": 2.1152141802067947, + "grad_norm": 0.24788367748260498, + "learning_rate": 5.902204705012933e-05, + "loss": 0.1606, + "step": 5728 + }, + { + "epoch": 2.1155834564254064, + "grad_norm": 0.19395440816879272, + "learning_rate": 5.8997413474565836e-05, + "loss": 0.1363, + "step": 5729 + }, + { + "epoch": 2.1159527326440175, + "grad_norm": 0.25060683488845825, + "learning_rate": 5.897277989900234e-05, + "loss": 0.1573, + "step": 5730 + }, + { + "epoch": 2.116322008862629, + "grad_norm": 0.21095387637615204, + "learning_rate": 5.8948146323438845e-05, + "loss": 0.1449, + "step": 5731 + }, + { + "epoch": 2.1166912850812407, + "grad_norm": 0.2954171895980835, + "learning_rate": 5.892351274787535e-05, + "loss": 0.1781, + "step": 5732 + }, + { + "epoch": 2.1170605612998523, + "grad_norm": 0.31764087080955505, + "learning_rate": 5.889887917231186e-05, + "loss": 0.1807, + "step": 5733 + }, + { + "epoch": 2.117429837518464, + "grad_norm": 0.23707197606563568, + "learning_rate": 5.887424559674837e-05, + "loss": 0.1571, + "step": 5734 + }, + { + "epoch": 2.1177991137370755, + "grad_norm": 0.23075878620147705, + "learning_rate": 5.8849612021184876e-05, + "loss": 0.1633, + "step": 5735 + }, + { + "epoch": 2.1181683899556867, + "grad_norm": 0.3358774483203888, + "learning_rate": 5.8824978445621384e-05, + "loss": 0.1609, + "step": 5736 + }, + { + "epoch": 2.1185376661742983, + "grad_norm": 0.28955793380737305, + "learning_rate": 5.880034487005789e-05, + "loss": 0.1624, + "step": 5737 + }, + { + "epoch": 2.11890694239291, + "grad_norm": 0.25172266364097595, + "learning_rate": 5.877571129449439e-05, + "loss": 0.1493, + "step": 5738 + }, + { + "epoch": 2.1192762186115215, + "grad_norm": 0.28911444544792175, + "learning_rate": 5.87510777189309e-05, + "loss": 0.1481, + "step": 5739 + }, + { + "epoch": 2.119645494830133, + "grad_norm": 0.31172704696655273, + "learning_rate": 5.872644414336741e-05, + "loss": 0.1823, + "step": 5740 + }, + { + "epoch": 2.1200147710487443, + "grad_norm": 0.2168399840593338, + "learning_rate": 5.8701810567803916e-05, + "loss": 0.1602, + "step": 5741 + }, + { + "epoch": 2.120384047267356, + "grad_norm": 0.28895869851112366, + "learning_rate": 5.8677176992240424e-05, + "loss": 0.1487, + "step": 5742 + }, + { + "epoch": 2.1207533234859675, + "grad_norm": 0.21684803068637848, + "learning_rate": 5.865254341667693e-05, + "loss": 0.1404, + "step": 5743 + }, + { + "epoch": 2.121122599704579, + "grad_norm": 0.27986645698547363, + "learning_rate": 5.862790984111344e-05, + "loss": 0.1718, + "step": 5744 + }, + { + "epoch": 2.1214918759231907, + "grad_norm": 0.2666008472442627, + "learning_rate": 5.860327626554995e-05, + "loss": 0.1677, + "step": 5745 + }, + { + "epoch": 2.1218611521418023, + "grad_norm": 0.2622698247432709, + "learning_rate": 5.857864268998645e-05, + "loss": 0.1779, + "step": 5746 + }, + { + "epoch": 2.1222304283604134, + "grad_norm": 0.25940361618995667, + "learning_rate": 5.855400911442296e-05, + "loss": 0.1787, + "step": 5747 + }, + { + "epoch": 2.122599704579025, + "grad_norm": 0.27325671911239624, + "learning_rate": 5.8529375538859465e-05, + "loss": 0.1756, + "step": 5748 + }, + { + "epoch": 2.1229689807976366, + "grad_norm": 0.24460530281066895, + "learning_rate": 5.850474196329597e-05, + "loss": 0.1614, + "step": 5749 + }, + { + "epoch": 2.1233382570162482, + "grad_norm": 0.2554047703742981, + "learning_rate": 5.848010838773248e-05, + "loss": 0.1613, + "step": 5750 + }, + { + "epoch": 2.1233382570162482, + "eval_loss": 9.041410446166992, + "eval_runtime": 6.9239, + "eval_samples_per_second": 7.221, + "eval_steps_per_second": 1.011, + "step": 5750 + }, + { + "epoch": 2.12370753323486, + "grad_norm": 0.2444206327199936, + "learning_rate": 5.845547481216899e-05, + "loss": 0.1659, + "step": 5751 + }, + { + "epoch": 2.124076809453471, + "grad_norm": 0.24779276549816132, + "learning_rate": 5.8430841236605496e-05, + "loss": 0.1551, + "step": 5752 + }, + { + "epoch": 2.1244460856720826, + "grad_norm": 0.2467038333415985, + "learning_rate": 5.8406207661042e-05, + "loss": 0.1547, + "step": 5753 + }, + { + "epoch": 2.124815361890694, + "grad_norm": 0.2746858298778534, + "learning_rate": 5.8381574085478505e-05, + "loss": 0.1611, + "step": 5754 + }, + { + "epoch": 2.125184638109306, + "grad_norm": 0.2260943502187729, + "learning_rate": 5.835694050991501e-05, + "loss": 0.1435, + "step": 5755 + }, + { + "epoch": 2.1255539143279174, + "grad_norm": 0.2710299491882324, + "learning_rate": 5.833230693435152e-05, + "loss": 0.1664, + "step": 5756 + }, + { + "epoch": 2.125923190546529, + "grad_norm": 0.2729892432689667, + "learning_rate": 5.830767335878803e-05, + "loss": 0.2115, + "step": 5757 + }, + { + "epoch": 2.12629246676514, + "grad_norm": 0.24607224762439728, + "learning_rate": 5.8283039783224537e-05, + "loss": 0.1531, + "step": 5758 + }, + { + "epoch": 2.1266617429837518, + "grad_norm": 0.2646127939224243, + "learning_rate": 5.8258406207661044e-05, + "loss": 0.1674, + "step": 5759 + }, + { + "epoch": 2.1270310192023634, + "grad_norm": 0.2609061300754547, + "learning_rate": 5.823377263209755e-05, + "loss": 0.1871, + "step": 5760 + }, + { + "epoch": 2.127400295420975, + "grad_norm": 0.23839209973812103, + "learning_rate": 5.8209139056534053e-05, + "loss": 0.1645, + "step": 5761 + }, + { + "epoch": 2.1277695716395866, + "grad_norm": 0.22638250887393951, + "learning_rate": 5.818450548097056e-05, + "loss": 0.1554, + "step": 5762 + }, + { + "epoch": 2.1281388478581977, + "grad_norm": 0.257668137550354, + "learning_rate": 5.815987190540707e-05, + "loss": 0.1702, + "step": 5763 + }, + { + "epoch": 2.1285081240768093, + "grad_norm": 0.2174331396818161, + "learning_rate": 5.813523832984358e-05, + "loss": 0.1617, + "step": 5764 + }, + { + "epoch": 2.128877400295421, + "grad_norm": 0.30169010162353516, + "learning_rate": 5.8110604754280085e-05, + "loss": 0.1788, + "step": 5765 + }, + { + "epoch": 2.1292466765140325, + "grad_norm": 0.22729873657226562, + "learning_rate": 5.808597117871659e-05, + "loss": 0.1784, + "step": 5766 + }, + { + "epoch": 2.129615952732644, + "grad_norm": 0.2566404640674591, + "learning_rate": 5.80613376031531e-05, + "loss": 0.1582, + "step": 5767 + }, + { + "epoch": 2.1299852289512557, + "grad_norm": 0.2525211274623871, + "learning_rate": 5.803670402758961e-05, + "loss": 0.193, + "step": 5768 + }, + { + "epoch": 2.130354505169867, + "grad_norm": 0.28858911991119385, + "learning_rate": 5.801207045202611e-05, + "loss": 0.2081, + "step": 5769 + }, + { + "epoch": 2.1307237813884785, + "grad_norm": 0.24410678446292877, + "learning_rate": 5.798743687646262e-05, + "loss": 0.1825, + "step": 5770 + }, + { + "epoch": 2.13109305760709, + "grad_norm": 0.20956659317016602, + "learning_rate": 5.7962803300899125e-05, + "loss": 0.1428, + "step": 5771 + }, + { + "epoch": 2.1314623338257017, + "grad_norm": 0.24196945130825043, + "learning_rate": 5.793816972533563e-05, + "loss": 0.1521, + "step": 5772 + }, + { + "epoch": 2.1318316100443133, + "grad_norm": 0.20474930107593536, + "learning_rate": 5.791353614977214e-05, + "loss": 0.1369, + "step": 5773 + }, + { + "epoch": 2.1322008862629245, + "grad_norm": 0.27425020933151245, + "learning_rate": 5.788890257420865e-05, + "loss": 0.1794, + "step": 5774 + }, + { + "epoch": 2.132570162481536, + "grad_norm": 0.22237613797187805, + "learning_rate": 5.786426899864516e-05, + "loss": 0.168, + "step": 5775 + }, + { + "epoch": 2.1329394387001477, + "grad_norm": 0.28125712275505066, + "learning_rate": 5.7839635423081665e-05, + "loss": 0.1829, + "step": 5776 + }, + { + "epoch": 2.1333087149187593, + "grad_norm": 0.21498674154281616, + "learning_rate": 5.7815001847518166e-05, + "loss": 0.1465, + "step": 5777 + }, + { + "epoch": 2.133677991137371, + "grad_norm": 0.2870471775531769, + "learning_rate": 5.7790368271954674e-05, + "loss": 0.1688, + "step": 5778 + }, + { + "epoch": 2.1340472673559825, + "grad_norm": 0.27845078706741333, + "learning_rate": 5.776573469639118e-05, + "loss": 0.155, + "step": 5779 + }, + { + "epoch": 2.1344165435745936, + "grad_norm": 0.2511069178581238, + "learning_rate": 5.774110112082769e-05, + "loss": 0.1743, + "step": 5780 + }, + { + "epoch": 2.1347858197932053, + "grad_norm": 0.26359841227531433, + "learning_rate": 5.77164675452642e-05, + "loss": 0.1775, + "step": 5781 + }, + { + "epoch": 2.135155096011817, + "grad_norm": 0.20309565961360931, + "learning_rate": 5.7691833969700705e-05, + "loss": 0.1399, + "step": 5782 + }, + { + "epoch": 2.1355243722304285, + "grad_norm": 0.24620765447616577, + "learning_rate": 5.766720039413721e-05, + "loss": 0.1777, + "step": 5783 + }, + { + "epoch": 2.13589364844904, + "grad_norm": 0.28663280606269836, + "learning_rate": 5.764256681857372e-05, + "loss": 0.1588, + "step": 5784 + }, + { + "epoch": 2.136262924667651, + "grad_norm": 0.239238440990448, + "learning_rate": 5.761793324301022e-05, + "loss": 0.1758, + "step": 5785 + }, + { + "epoch": 2.136632200886263, + "grad_norm": 0.2247789204120636, + "learning_rate": 5.759329966744673e-05, + "loss": 0.1272, + "step": 5786 + }, + { + "epoch": 2.1370014771048744, + "grad_norm": 0.2647974193096161, + "learning_rate": 5.756866609188324e-05, + "loss": 0.1846, + "step": 5787 + }, + { + "epoch": 2.137370753323486, + "grad_norm": 0.206637442111969, + "learning_rate": 5.7544032516319745e-05, + "loss": 0.1457, + "step": 5788 + }, + { + "epoch": 2.1377400295420976, + "grad_norm": 0.2614061236381531, + "learning_rate": 5.751939894075625e-05, + "loss": 0.1774, + "step": 5789 + }, + { + "epoch": 2.1381093057607092, + "grad_norm": 0.2558061182498932, + "learning_rate": 5.749476536519276e-05, + "loss": 0.1636, + "step": 5790 + }, + { + "epoch": 2.1384785819793204, + "grad_norm": 0.2514142096042633, + "learning_rate": 5.747013178962927e-05, + "loss": 0.1656, + "step": 5791 + }, + { + "epoch": 2.138847858197932, + "grad_norm": 0.2914920449256897, + "learning_rate": 5.744549821406578e-05, + "loss": 0.1738, + "step": 5792 + }, + { + "epoch": 2.1392171344165436, + "grad_norm": 0.25123879313468933, + "learning_rate": 5.742086463850228e-05, + "loss": 0.1805, + "step": 5793 + }, + { + "epoch": 2.139586410635155, + "grad_norm": 0.2547469139099121, + "learning_rate": 5.7396231062938786e-05, + "loss": 0.1688, + "step": 5794 + }, + { + "epoch": 2.139955686853767, + "grad_norm": 0.2881069779396057, + "learning_rate": 5.7371597487375294e-05, + "loss": 0.1687, + "step": 5795 + }, + { + "epoch": 2.140324963072378, + "grad_norm": 0.2524515986442566, + "learning_rate": 5.73469639118118e-05, + "loss": 0.1595, + "step": 5796 + }, + { + "epoch": 2.1406942392909896, + "grad_norm": 0.24153421819210052, + "learning_rate": 5.732233033624831e-05, + "loss": 0.1471, + "step": 5797 + }, + { + "epoch": 2.141063515509601, + "grad_norm": 0.2270592898130417, + "learning_rate": 5.729769676068482e-05, + "loss": 0.1493, + "step": 5798 + }, + { + "epoch": 2.1414327917282128, + "grad_norm": 0.25704890489578247, + "learning_rate": 5.7273063185121325e-05, + "loss": 0.1548, + "step": 5799 + }, + { + "epoch": 2.1418020679468244, + "grad_norm": 0.24989096820354462, + "learning_rate": 5.724842960955783e-05, + "loss": 0.1719, + "step": 5800 + }, + { + "epoch": 2.1418020679468244, + "eval_loss": 8.96154499053955, + "eval_runtime": 7.0644, + "eval_samples_per_second": 7.078, + "eval_steps_per_second": 0.991, + "step": 5800 + }, + { + "epoch": 2.142171344165436, + "grad_norm": 0.2344249188899994, + "learning_rate": 5.7223796033994334e-05, + "loss": 0.1622, + "step": 5801 + }, + { + "epoch": 2.142540620384047, + "grad_norm": 0.26691779494285583, + "learning_rate": 5.719916245843084e-05, + "loss": 0.1631, + "step": 5802 + }, + { + "epoch": 2.1429098966026587, + "grad_norm": 0.21069104969501495, + "learning_rate": 5.717452888286735e-05, + "loss": 0.1488, + "step": 5803 + }, + { + "epoch": 2.1432791728212703, + "grad_norm": 0.21314620971679688, + "learning_rate": 5.714989530730386e-05, + "loss": 0.1454, + "step": 5804 + }, + { + "epoch": 2.143648449039882, + "grad_norm": 0.24766193330287933, + "learning_rate": 5.7125261731740365e-05, + "loss": 0.1602, + "step": 5805 + }, + { + "epoch": 2.1440177252584935, + "grad_norm": 0.23377639055252075, + "learning_rate": 5.710062815617687e-05, + "loss": 0.1753, + "step": 5806 + }, + { + "epoch": 2.1443870014771047, + "grad_norm": 0.308245986700058, + "learning_rate": 5.707599458061338e-05, + "loss": 0.1609, + "step": 5807 + }, + { + "epoch": 2.1447562776957163, + "grad_norm": 0.23733939230442047, + "learning_rate": 5.705136100504989e-05, + "loss": 0.1566, + "step": 5808 + }, + { + "epoch": 2.145125553914328, + "grad_norm": 0.27068260312080383, + "learning_rate": 5.702672742948639e-05, + "loss": 0.1489, + "step": 5809 + }, + { + "epoch": 2.1454948301329395, + "grad_norm": 0.23649336397647858, + "learning_rate": 5.70020938539229e-05, + "loss": 0.1421, + "step": 5810 + }, + { + "epoch": 2.145864106351551, + "grad_norm": 0.22239355742931366, + "learning_rate": 5.6977460278359406e-05, + "loss": 0.1559, + "step": 5811 + }, + { + "epoch": 2.1462333825701623, + "grad_norm": 0.2244912087917328, + "learning_rate": 5.6952826702795914e-05, + "loss": 0.1477, + "step": 5812 + }, + { + "epoch": 2.146602658788774, + "grad_norm": 0.2682928442955017, + "learning_rate": 5.692819312723242e-05, + "loss": 0.1758, + "step": 5813 + }, + { + "epoch": 2.1469719350073855, + "grad_norm": 0.2725262939929962, + "learning_rate": 5.690355955166893e-05, + "loss": 0.1778, + "step": 5814 + }, + { + "epoch": 2.147341211225997, + "grad_norm": 0.2450253814458847, + "learning_rate": 5.687892597610544e-05, + "loss": 0.1605, + "step": 5815 + }, + { + "epoch": 2.1477104874446087, + "grad_norm": 0.23808549344539642, + "learning_rate": 5.6854292400541945e-05, + "loss": 0.1321, + "step": 5816 + }, + { + "epoch": 2.1480797636632203, + "grad_norm": 0.25841447710990906, + "learning_rate": 5.6829658824978446e-05, + "loss": 0.177, + "step": 5817 + }, + { + "epoch": 2.1484490398818314, + "grad_norm": 0.23997928202152252, + "learning_rate": 5.6805025249414954e-05, + "loss": 0.1556, + "step": 5818 + }, + { + "epoch": 2.148818316100443, + "grad_norm": 0.2319837361574173, + "learning_rate": 5.678039167385146e-05, + "loss": 0.1686, + "step": 5819 + }, + { + "epoch": 2.1491875923190547, + "grad_norm": 0.21900928020477295, + "learning_rate": 5.675575809828797e-05, + "loss": 0.1373, + "step": 5820 + }, + { + "epoch": 2.1495568685376663, + "grad_norm": 0.2606322467327118, + "learning_rate": 5.673112452272448e-05, + "loss": 0.1659, + "step": 5821 + }, + { + "epoch": 2.149926144756278, + "grad_norm": 0.18810664117336273, + "learning_rate": 5.6706490947160986e-05, + "loss": 0.1486, + "step": 5822 + }, + { + "epoch": 2.150295420974889, + "grad_norm": 0.27491050958633423, + "learning_rate": 5.6681857371597493e-05, + "loss": 0.1813, + "step": 5823 + }, + { + "epoch": 2.1506646971935006, + "grad_norm": 0.22454038262367249, + "learning_rate": 5.6657223796034e-05, + "loss": 0.1444, + "step": 5824 + }, + { + "epoch": 2.151033973412112, + "grad_norm": 0.2611875832080841, + "learning_rate": 5.66325902204705e-05, + "loss": 0.1564, + "step": 5825 + }, + { + "epoch": 2.151403249630724, + "grad_norm": 0.28003430366516113, + "learning_rate": 5.660795664490701e-05, + "loss": 0.1745, + "step": 5826 + }, + { + "epoch": 2.1517725258493354, + "grad_norm": 0.2715948224067688, + "learning_rate": 5.658332306934352e-05, + "loss": 0.1796, + "step": 5827 + }, + { + "epoch": 2.152141802067947, + "grad_norm": 0.2846525013446808, + "learning_rate": 5.6558689493780026e-05, + "loss": 0.1866, + "step": 5828 + }, + { + "epoch": 2.152511078286558, + "grad_norm": 0.2380266636610031, + "learning_rate": 5.6534055918216534e-05, + "loss": 0.1689, + "step": 5829 + }, + { + "epoch": 2.15288035450517, + "grad_norm": 0.27683040499687195, + "learning_rate": 5.650942234265304e-05, + "loss": 0.1842, + "step": 5830 + }, + { + "epoch": 2.1532496307237814, + "grad_norm": 0.3287939727306366, + "learning_rate": 5.648478876708955e-05, + "loss": 0.1837, + "step": 5831 + }, + { + "epoch": 2.153618906942393, + "grad_norm": 0.27074283361434937, + "learning_rate": 5.646015519152605e-05, + "loss": 0.1848, + "step": 5832 + }, + { + "epoch": 2.1539881831610046, + "grad_norm": 0.24749475717544556, + "learning_rate": 5.643552161596256e-05, + "loss": 0.1495, + "step": 5833 + }, + { + "epoch": 2.1543574593796158, + "grad_norm": 0.23741114139556885, + "learning_rate": 5.6410888040399066e-05, + "loss": 0.1589, + "step": 5834 + }, + { + "epoch": 2.1547267355982274, + "grad_norm": 0.25389036536216736, + "learning_rate": 5.6386254464835574e-05, + "loss": 0.1728, + "step": 5835 + }, + { + "epoch": 2.155096011816839, + "grad_norm": 0.33868157863616943, + "learning_rate": 5.636162088927208e-05, + "loss": 0.19, + "step": 5836 + }, + { + "epoch": 2.1554652880354506, + "grad_norm": 0.2652325928211212, + "learning_rate": 5.633698731370859e-05, + "loss": 0.1778, + "step": 5837 + }, + { + "epoch": 2.155834564254062, + "grad_norm": 0.30319836735725403, + "learning_rate": 5.63123537381451e-05, + "loss": 0.1627, + "step": 5838 + }, + { + "epoch": 2.1562038404726733, + "grad_norm": 0.46601060032844543, + "learning_rate": 5.6287720162581606e-05, + "loss": 0.175, + "step": 5839 + }, + { + "epoch": 2.156573116691285, + "grad_norm": 0.26574864983558655, + "learning_rate": 5.626308658701811e-05, + "loss": 0.1483, + "step": 5840 + }, + { + "epoch": 2.1569423929098965, + "grad_norm": 0.2571249008178711, + "learning_rate": 5.6238453011454615e-05, + "loss": 0.1787, + "step": 5841 + }, + { + "epoch": 2.157311669128508, + "grad_norm": 0.2515318691730499, + "learning_rate": 5.621381943589112e-05, + "loss": 0.1747, + "step": 5842 + }, + { + "epoch": 2.1576809453471197, + "grad_norm": 0.2893151044845581, + "learning_rate": 5.618918586032763e-05, + "loss": 0.1731, + "step": 5843 + }, + { + "epoch": 2.1580502215657313, + "grad_norm": 0.21122747659683228, + "learning_rate": 5.616455228476414e-05, + "loss": 0.1526, + "step": 5844 + }, + { + "epoch": 2.1584194977843425, + "grad_norm": 0.26366689801216125, + "learning_rate": 5.6139918709200646e-05, + "loss": 0.1835, + "step": 5845 + }, + { + "epoch": 2.158788774002954, + "grad_norm": 0.2680884897708893, + "learning_rate": 5.6115285133637154e-05, + "loss": 0.1716, + "step": 5846 + }, + { + "epoch": 2.1591580502215657, + "grad_norm": 0.21523651480674744, + "learning_rate": 5.609065155807366e-05, + "loss": 0.1439, + "step": 5847 + }, + { + "epoch": 2.1595273264401773, + "grad_norm": 0.2940189242362976, + "learning_rate": 5.606601798251016e-05, + "loss": 0.1683, + "step": 5848 + }, + { + "epoch": 2.159896602658789, + "grad_norm": 0.2173256278038025, + "learning_rate": 5.604138440694667e-05, + "loss": 0.1621, + "step": 5849 + }, + { + "epoch": 2.1602658788774, + "grad_norm": 0.24084913730621338, + "learning_rate": 5.601675083138318e-05, + "loss": 0.1609, + "step": 5850 + }, + { + "epoch": 2.1602658788774, + "eval_loss": 8.892232894897461, + "eval_runtime": 6.9033, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 1.014, + "step": 5850 + }, + { + "epoch": 2.1606351550960117, + "grad_norm": 0.2267000377178192, + "learning_rate": 5.5992117255819687e-05, + "loss": 0.1595, + "step": 5851 + }, + { + "epoch": 2.1610044313146233, + "grad_norm": 0.23268462717533112, + "learning_rate": 5.5967483680256194e-05, + "loss": 0.1517, + "step": 5852 + }, + { + "epoch": 2.161373707533235, + "grad_norm": 0.23246663808822632, + "learning_rate": 5.59428501046927e-05, + "loss": 0.1631, + "step": 5853 + }, + { + "epoch": 2.1617429837518465, + "grad_norm": 0.19398702681064606, + "learning_rate": 5.591821652912921e-05, + "loss": 0.1386, + "step": 5854 + }, + { + "epoch": 2.162112259970458, + "grad_norm": 0.2237091362476349, + "learning_rate": 5.589358295356572e-05, + "loss": 0.164, + "step": 5855 + }, + { + "epoch": 2.1624815361890692, + "grad_norm": 0.2152923196554184, + "learning_rate": 5.586894937800222e-05, + "loss": 0.1738, + "step": 5856 + }, + { + "epoch": 2.162850812407681, + "grad_norm": 0.2877909243106842, + "learning_rate": 5.584431580243873e-05, + "loss": 0.1664, + "step": 5857 + }, + { + "epoch": 2.1632200886262924, + "grad_norm": 0.2521835267543793, + "learning_rate": 5.5819682226875235e-05, + "loss": 0.1641, + "step": 5858 + }, + { + "epoch": 2.163589364844904, + "grad_norm": 0.2644844949245453, + "learning_rate": 5.579504865131174e-05, + "loss": 0.166, + "step": 5859 + }, + { + "epoch": 2.1639586410635157, + "grad_norm": 0.2460847795009613, + "learning_rate": 5.577041507574825e-05, + "loss": 0.1689, + "step": 5860 + }, + { + "epoch": 2.164327917282127, + "grad_norm": 0.24734726548194885, + "learning_rate": 5.574578150018476e-05, + "loss": 0.1604, + "step": 5861 + }, + { + "epoch": 2.1646971935007384, + "grad_norm": 0.2415541261434555, + "learning_rate": 5.5721147924621266e-05, + "loss": 0.1511, + "step": 5862 + }, + { + "epoch": 2.16506646971935, + "grad_norm": 0.3435579240322113, + "learning_rate": 5.5696514349057774e-05, + "loss": 0.1782, + "step": 5863 + }, + { + "epoch": 2.1654357459379616, + "grad_norm": 0.2413255125284195, + "learning_rate": 5.5671880773494275e-05, + "loss": 0.1656, + "step": 5864 + }, + { + "epoch": 2.1658050221565732, + "grad_norm": 0.28339019417762756, + "learning_rate": 5.564724719793078e-05, + "loss": 0.1649, + "step": 5865 + }, + { + "epoch": 2.166174298375185, + "grad_norm": 0.28899264335632324, + "learning_rate": 5.562261362236729e-05, + "loss": 0.1615, + "step": 5866 + }, + { + "epoch": 2.166543574593796, + "grad_norm": 0.26845651865005493, + "learning_rate": 5.55979800468038e-05, + "loss": 0.1631, + "step": 5867 + }, + { + "epoch": 2.1669128508124076, + "grad_norm": 0.32803604006767273, + "learning_rate": 5.557334647124031e-05, + "loss": 0.1812, + "step": 5868 + }, + { + "epoch": 2.167282127031019, + "grad_norm": 0.28242191672325134, + "learning_rate": 5.5548712895676815e-05, + "loss": 0.1494, + "step": 5869 + }, + { + "epoch": 2.167651403249631, + "grad_norm": 0.2370707392692566, + "learning_rate": 5.552407932011332e-05, + "loss": 0.1596, + "step": 5870 + }, + { + "epoch": 2.1680206794682424, + "grad_norm": 0.24583463370800018, + "learning_rate": 5.549944574454983e-05, + "loss": 0.1719, + "step": 5871 + }, + { + "epoch": 2.1683899556868536, + "grad_norm": 0.2284587025642395, + "learning_rate": 5.547481216898633e-05, + "loss": 0.1369, + "step": 5872 + }, + { + "epoch": 2.168759231905465, + "grad_norm": 0.2632143199443817, + "learning_rate": 5.545017859342284e-05, + "loss": 0.1645, + "step": 5873 + }, + { + "epoch": 2.1691285081240768, + "grad_norm": 0.28839612007141113, + "learning_rate": 5.542554501785935e-05, + "loss": 0.1473, + "step": 5874 + }, + { + "epoch": 2.1694977843426884, + "grad_norm": 0.2777511477470398, + "learning_rate": 5.5400911442295855e-05, + "loss": 0.1573, + "step": 5875 + }, + { + "epoch": 2.1698670605613, + "grad_norm": 0.2656247615814209, + "learning_rate": 5.537627786673236e-05, + "loss": 0.1496, + "step": 5876 + }, + { + "epoch": 2.1702363367799116, + "grad_norm": 0.28096097707748413, + "learning_rate": 5.535164429116887e-05, + "loss": 0.1847, + "step": 5877 + }, + { + "epoch": 2.1706056129985227, + "grad_norm": 0.29756811261177063, + "learning_rate": 5.532701071560538e-05, + "loss": 0.1728, + "step": 5878 + }, + { + "epoch": 2.1709748892171343, + "grad_norm": 0.2655864357948303, + "learning_rate": 5.5302377140041886e-05, + "loss": 0.1989, + "step": 5879 + }, + { + "epoch": 2.171344165435746, + "grad_norm": 0.28334152698516846, + "learning_rate": 5.527774356447839e-05, + "loss": 0.1546, + "step": 5880 + }, + { + "epoch": 2.1717134416543575, + "grad_norm": 0.23415227234363556, + "learning_rate": 5.5253109988914895e-05, + "loss": 0.1408, + "step": 5881 + }, + { + "epoch": 2.172082717872969, + "grad_norm": 0.26028910279273987, + "learning_rate": 5.52284764133514e-05, + "loss": 0.175, + "step": 5882 + }, + { + "epoch": 2.1724519940915803, + "grad_norm": 0.2106216996908188, + "learning_rate": 5.520384283778791e-05, + "loss": 0.1472, + "step": 5883 + }, + { + "epoch": 2.172821270310192, + "grad_norm": 0.24899208545684814, + "learning_rate": 5.517920926222442e-05, + "loss": 0.17, + "step": 5884 + }, + { + "epoch": 2.1731905465288035, + "grad_norm": 0.2773733139038086, + "learning_rate": 5.515457568666093e-05, + "loss": 0.1633, + "step": 5885 + }, + { + "epoch": 2.173559822747415, + "grad_norm": 0.2654591202735901, + "learning_rate": 5.5129942111097435e-05, + "loss": 0.1674, + "step": 5886 + }, + { + "epoch": 2.1739290989660267, + "grad_norm": 0.3114142417907715, + "learning_rate": 5.510530853553394e-05, + "loss": 0.2023, + "step": 5887 + }, + { + "epoch": 2.1742983751846383, + "grad_norm": 0.2511567771434784, + "learning_rate": 5.5080674959970444e-05, + "loss": 0.1661, + "step": 5888 + }, + { + "epoch": 2.1746676514032495, + "grad_norm": 0.26515957713127136, + "learning_rate": 5.505604138440695e-05, + "loss": 0.1712, + "step": 5889 + }, + { + "epoch": 2.175036927621861, + "grad_norm": 0.23603031039237976, + "learning_rate": 5.503140780884346e-05, + "loss": 0.1493, + "step": 5890 + }, + { + "epoch": 2.1754062038404727, + "grad_norm": 0.2390149086713791, + "learning_rate": 5.500677423327997e-05, + "loss": 0.16, + "step": 5891 + }, + { + "epoch": 2.1757754800590843, + "grad_norm": 0.28174513578414917, + "learning_rate": 5.4982140657716475e-05, + "loss": 0.1681, + "step": 5892 + }, + { + "epoch": 2.176144756277696, + "grad_norm": 0.23254455626010895, + "learning_rate": 5.495750708215298e-05, + "loss": 0.174, + "step": 5893 + }, + { + "epoch": 2.176514032496307, + "grad_norm": 0.2533921003341675, + "learning_rate": 5.493287350658949e-05, + "loss": 0.1485, + "step": 5894 + }, + { + "epoch": 2.1768833087149186, + "grad_norm": 0.24506518244743347, + "learning_rate": 5.4908239931026e-05, + "loss": 0.1488, + "step": 5895 + }, + { + "epoch": 2.1772525849335302, + "grad_norm": 0.2007390409708023, + "learning_rate": 5.48836063554625e-05, + "loss": 0.1412, + "step": 5896 + }, + { + "epoch": 2.177621861152142, + "grad_norm": 0.2906361520290375, + "learning_rate": 5.485897277989901e-05, + "loss": 0.1676, + "step": 5897 + }, + { + "epoch": 2.1779911373707534, + "grad_norm": 0.23762249946594238, + "learning_rate": 5.4834339204335515e-05, + "loss": 0.1525, + "step": 5898 + }, + { + "epoch": 2.178360413589365, + "grad_norm": 0.2197323590517044, + "learning_rate": 5.480970562877202e-05, + "loss": 0.1456, + "step": 5899 + }, + { + "epoch": 2.178729689807976, + "grad_norm": 0.2336753010749817, + "learning_rate": 5.478507205320853e-05, + "loss": 0.1489, + "step": 5900 + }, + { + "epoch": 2.178729689807976, + "eval_loss": 8.96964168548584, + "eval_runtime": 6.9135, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 1.013, + "step": 5900 + }, + { + "epoch": 2.179098966026588, + "grad_norm": 0.21716323494911194, + "learning_rate": 5.476043847764504e-05, + "loss": 0.1671, + "step": 5901 + }, + { + "epoch": 2.1794682422451994, + "grad_norm": 0.26916810870170593, + "learning_rate": 5.473580490208155e-05, + "loss": 0.1618, + "step": 5902 + }, + { + "epoch": 2.179837518463811, + "grad_norm": 0.26589736342430115, + "learning_rate": 5.4711171326518055e-05, + "loss": 0.1551, + "step": 5903 + }, + { + "epoch": 2.1802067946824226, + "grad_norm": 0.28738972544670105, + "learning_rate": 5.4686537750954556e-05, + "loss": 0.1772, + "step": 5904 + }, + { + "epoch": 2.180576070901034, + "grad_norm": 0.2636337876319885, + "learning_rate": 5.4661904175391064e-05, + "loss": 0.1666, + "step": 5905 + }, + { + "epoch": 2.1809453471196454, + "grad_norm": 0.23394522070884705, + "learning_rate": 5.463727059982757e-05, + "loss": 0.1608, + "step": 5906 + }, + { + "epoch": 2.181314623338257, + "grad_norm": 0.25700148940086365, + "learning_rate": 5.461263702426408e-05, + "loss": 0.1794, + "step": 5907 + }, + { + "epoch": 2.1816838995568686, + "grad_norm": 0.2849240005016327, + "learning_rate": 5.458800344870059e-05, + "loss": 0.1686, + "step": 5908 + }, + { + "epoch": 2.18205317577548, + "grad_norm": 0.2967907190322876, + "learning_rate": 5.4563369873137095e-05, + "loss": 0.1908, + "step": 5909 + }, + { + "epoch": 2.182422451994092, + "grad_norm": 0.29086798429489136, + "learning_rate": 5.45387362975736e-05, + "loss": 0.1731, + "step": 5910 + }, + { + "epoch": 2.182791728212703, + "grad_norm": 0.22889888286590576, + "learning_rate": 5.4514102722010104e-05, + "loss": 0.1608, + "step": 5911 + }, + { + "epoch": 2.1831610044313146, + "grad_norm": 0.27961036562919617, + "learning_rate": 5.448946914644661e-05, + "loss": 0.1679, + "step": 5912 + }, + { + "epoch": 2.183530280649926, + "grad_norm": 0.2329602986574173, + "learning_rate": 5.446483557088312e-05, + "loss": 0.1463, + "step": 5913 + }, + { + "epoch": 2.1838995568685378, + "grad_norm": 0.36494356393814087, + "learning_rate": 5.444020199531963e-05, + "loss": 0.1713, + "step": 5914 + }, + { + "epoch": 2.1842688330871494, + "grad_norm": 0.2212795615196228, + "learning_rate": 5.4415568419756136e-05, + "loss": 0.1451, + "step": 5915 + }, + { + "epoch": 2.1846381093057605, + "grad_norm": 0.2720382809638977, + "learning_rate": 5.4390934844192643e-05, + "loss": 0.154, + "step": 5916 + }, + { + "epoch": 2.185007385524372, + "grad_norm": 0.21868468821048737, + "learning_rate": 5.436630126862915e-05, + "loss": 0.1668, + "step": 5917 + }, + { + "epoch": 2.1853766617429837, + "grad_norm": 0.25078967213630676, + "learning_rate": 5.434166769306566e-05, + "loss": 0.1591, + "step": 5918 + }, + { + "epoch": 2.1857459379615953, + "grad_norm": 0.2894357740879059, + "learning_rate": 5.431703411750216e-05, + "loss": 0.1557, + "step": 5919 + }, + { + "epoch": 2.186115214180207, + "grad_norm": 0.2771257162094116, + "learning_rate": 5.429240054193867e-05, + "loss": 0.1782, + "step": 5920 + }, + { + "epoch": 2.1864844903988185, + "grad_norm": 0.22870942950248718, + "learning_rate": 5.426776696637517e-05, + "loss": 0.1531, + "step": 5921 + }, + { + "epoch": 2.1868537666174297, + "grad_norm": 0.21660159528255463, + "learning_rate": 5.424313339081167e-05, + "loss": 0.1426, + "step": 5922 + }, + { + "epoch": 2.1872230428360413, + "grad_norm": 0.22871670126914978, + "learning_rate": 5.421849981524818e-05, + "loss": 0.1402, + "step": 5923 + }, + { + "epoch": 2.187592319054653, + "grad_norm": 0.429335355758667, + "learning_rate": 5.4193866239684686e-05, + "loss": 0.1901, + "step": 5924 + }, + { + "epoch": 2.1879615952732645, + "grad_norm": 0.27655285596847534, + "learning_rate": 5.4169232664121194e-05, + "loss": 0.1601, + "step": 5925 + }, + { + "epoch": 2.188330871491876, + "grad_norm": 0.25964802503585815, + "learning_rate": 5.41445990885577e-05, + "loss": 0.1764, + "step": 5926 + }, + { + "epoch": 2.1887001477104873, + "grad_norm": 0.23877084255218506, + "learning_rate": 5.411996551299421e-05, + "loss": 0.1626, + "step": 5927 + }, + { + "epoch": 2.189069423929099, + "grad_norm": 0.2237912267446518, + "learning_rate": 5.409533193743072e-05, + "loss": 0.1692, + "step": 5928 + }, + { + "epoch": 2.1894387001477105, + "grad_norm": 0.26796409487724304, + "learning_rate": 5.4070698361867225e-05, + "loss": 0.1942, + "step": 5929 + }, + { + "epoch": 2.189807976366322, + "grad_norm": 0.2429514080286026, + "learning_rate": 5.4046064786303726e-05, + "loss": 0.1568, + "step": 5930 + }, + { + "epoch": 2.1901772525849337, + "grad_norm": 0.29652291536331177, + "learning_rate": 5.4021431210740234e-05, + "loss": 0.165, + "step": 5931 + }, + { + "epoch": 2.1905465288035453, + "grad_norm": 0.28298914432525635, + "learning_rate": 5.399679763517674e-05, + "loss": 0.1801, + "step": 5932 + }, + { + "epoch": 2.1909158050221564, + "grad_norm": 0.20710934698581696, + "learning_rate": 5.397216405961325e-05, + "loss": 0.1392, + "step": 5933 + }, + { + "epoch": 2.191285081240768, + "grad_norm": 0.2547735571861267, + "learning_rate": 5.394753048404976e-05, + "loss": 0.1713, + "step": 5934 + }, + { + "epoch": 2.1916543574593796, + "grad_norm": 0.26793715357780457, + "learning_rate": 5.3922896908486266e-05, + "loss": 0.1732, + "step": 5935 + }, + { + "epoch": 2.1920236336779912, + "grad_norm": 0.22657029330730438, + "learning_rate": 5.3898263332922774e-05, + "loss": 0.1569, + "step": 5936 + }, + { + "epoch": 2.192392909896603, + "grad_norm": 0.28506356477737427, + "learning_rate": 5.387362975735928e-05, + "loss": 0.1977, + "step": 5937 + }, + { + "epoch": 2.192762186115214, + "grad_norm": 0.2525278329849243, + "learning_rate": 5.384899618179578e-05, + "loss": 0.1714, + "step": 5938 + }, + { + "epoch": 2.1931314623338256, + "grad_norm": 0.2511444687843323, + "learning_rate": 5.382436260623229e-05, + "loss": 0.1694, + "step": 5939 + }, + { + "epoch": 2.193500738552437, + "grad_norm": 0.26502445340156555, + "learning_rate": 5.37997290306688e-05, + "loss": 0.1729, + "step": 5940 + }, + { + "epoch": 2.193870014771049, + "grad_norm": 0.24522586166858673, + "learning_rate": 5.3775095455105306e-05, + "loss": 0.1567, + "step": 5941 + }, + { + "epoch": 2.1942392909896604, + "grad_norm": 0.21384859085083008, + "learning_rate": 5.3750461879541814e-05, + "loss": 0.1346, + "step": 5942 + }, + { + "epoch": 2.194608567208272, + "grad_norm": 0.30786994099617004, + "learning_rate": 5.372582830397832e-05, + "loss": 0.1837, + "step": 5943 + }, + { + "epoch": 2.194977843426883, + "grad_norm": 0.22025233507156372, + "learning_rate": 5.370119472841483e-05, + "loss": 0.1642, + "step": 5944 + }, + { + "epoch": 2.195347119645495, + "grad_norm": 0.2457989752292633, + "learning_rate": 5.367656115285134e-05, + "loss": 0.1485, + "step": 5945 + }, + { + "epoch": 2.1957163958641064, + "grad_norm": 0.2296011596918106, + "learning_rate": 5.365192757728784e-05, + "loss": 0.171, + "step": 5946 + }, + { + "epoch": 2.196085672082718, + "grad_norm": 0.2841695547103882, + "learning_rate": 5.3627294001724347e-05, + "loss": 0.1501, + "step": 5947 + }, + { + "epoch": 2.1964549483013296, + "grad_norm": 0.3371691107749939, + "learning_rate": 5.3602660426160854e-05, + "loss": 0.1589, + "step": 5948 + }, + { + "epoch": 2.1968242245199407, + "grad_norm": 0.3208991587162018, + "learning_rate": 5.357802685059736e-05, + "loss": 0.1816, + "step": 5949 + }, + { + "epoch": 2.1971935007385524, + "grad_norm": 0.31153982877731323, + "learning_rate": 5.355339327503387e-05, + "loss": 0.1654, + "step": 5950 + }, + { + "epoch": 2.1971935007385524, + "eval_loss": 8.967092514038086, + "eval_runtime": 6.908, + "eval_samples_per_second": 7.238, + "eval_steps_per_second": 1.013, + "step": 5950 + }, + { + "epoch": 2.197562776957164, + "grad_norm": 0.23863151669502258, + "learning_rate": 5.352875969947038e-05, + "loss": 0.1507, + "step": 5951 + }, + { + "epoch": 2.1979320531757756, + "grad_norm": 0.29910215735435486, + "learning_rate": 5.3504126123906886e-05, + "loss": 0.1819, + "step": 5952 + }, + { + "epoch": 2.198301329394387, + "grad_norm": 0.2546464204788208, + "learning_rate": 5.3479492548343394e-05, + "loss": 0.168, + "step": 5953 + }, + { + "epoch": 2.1986706056129983, + "grad_norm": 0.3151361346244812, + "learning_rate": 5.3454858972779895e-05, + "loss": 0.1727, + "step": 5954 + }, + { + "epoch": 2.19903988183161, + "grad_norm": 0.2756534814834595, + "learning_rate": 5.34302253972164e-05, + "loss": 0.1462, + "step": 5955 + }, + { + "epoch": 2.1994091580502215, + "grad_norm": 0.2369263768196106, + "learning_rate": 5.340559182165291e-05, + "loss": 0.1634, + "step": 5956 + }, + { + "epoch": 2.199778434268833, + "grad_norm": 0.25451090931892395, + "learning_rate": 5.338095824608942e-05, + "loss": 0.1713, + "step": 5957 + }, + { + "epoch": 2.2001477104874447, + "grad_norm": 0.2651297152042389, + "learning_rate": 5.3356324670525926e-05, + "loss": 0.177, + "step": 5958 + }, + { + "epoch": 2.2005169867060563, + "grad_norm": 0.2679627537727356, + "learning_rate": 5.3331691094962434e-05, + "loss": 0.1468, + "step": 5959 + }, + { + "epoch": 2.2008862629246675, + "grad_norm": 0.269664466381073, + "learning_rate": 5.330705751939894e-05, + "loss": 0.1617, + "step": 5960 + }, + { + "epoch": 2.201255539143279, + "grad_norm": 0.2481663078069687, + "learning_rate": 5.328242394383545e-05, + "loss": 0.154, + "step": 5961 + }, + { + "epoch": 2.2016248153618907, + "grad_norm": 0.27390938997268677, + "learning_rate": 5.325779036827195e-05, + "loss": 0.165, + "step": 5962 + }, + { + "epoch": 2.2019940915805023, + "grad_norm": 0.2476736605167389, + "learning_rate": 5.323315679270846e-05, + "loss": 0.1716, + "step": 5963 + }, + { + "epoch": 2.202363367799114, + "grad_norm": 0.23046623170375824, + "learning_rate": 5.320852321714497e-05, + "loss": 0.1634, + "step": 5964 + }, + { + "epoch": 2.202732644017725, + "grad_norm": 0.3048005700111389, + "learning_rate": 5.3183889641581475e-05, + "loss": 0.177, + "step": 5965 + }, + { + "epoch": 2.2031019202363367, + "grad_norm": 0.21594463288784027, + "learning_rate": 5.315925606601798e-05, + "loss": 0.1373, + "step": 5966 + }, + { + "epoch": 2.2034711964549483, + "grad_norm": 0.23990744352340698, + "learning_rate": 5.313462249045449e-05, + "loss": 0.1642, + "step": 5967 + }, + { + "epoch": 2.20384047267356, + "grad_norm": 0.25273141264915466, + "learning_rate": 5.3109988914891e-05, + "loss": 0.1473, + "step": 5968 + }, + { + "epoch": 2.2042097488921715, + "grad_norm": 0.3145434558391571, + "learning_rate": 5.3085355339327506e-05, + "loss": 0.1894, + "step": 5969 + }, + { + "epoch": 2.2045790251107826, + "grad_norm": 0.24428804218769073, + "learning_rate": 5.306072176376401e-05, + "loss": 0.1602, + "step": 5970 + }, + { + "epoch": 2.2049483013293942, + "grad_norm": 0.2531551122665405, + "learning_rate": 5.3036088188200515e-05, + "loss": 0.1541, + "step": 5971 + }, + { + "epoch": 2.205317577548006, + "grad_norm": 0.235373392701149, + "learning_rate": 5.301145461263702e-05, + "loss": 0.1596, + "step": 5972 + }, + { + "epoch": 2.2056868537666174, + "grad_norm": 0.23054225742816925, + "learning_rate": 5.298682103707353e-05, + "loss": 0.1386, + "step": 5973 + }, + { + "epoch": 2.206056129985229, + "grad_norm": 0.20615850389003754, + "learning_rate": 5.296218746151004e-05, + "loss": 0.1499, + "step": 5974 + }, + { + "epoch": 2.2064254062038406, + "grad_norm": 0.21254844963550568, + "learning_rate": 5.2937553885946546e-05, + "loss": 0.1349, + "step": 5975 + }, + { + "epoch": 2.206794682422452, + "grad_norm": 0.25328245759010315, + "learning_rate": 5.2912920310383054e-05, + "loss": 0.1699, + "step": 5976 + }, + { + "epoch": 2.2071639586410634, + "grad_norm": 0.28251996636390686, + "learning_rate": 5.288828673481956e-05, + "loss": 0.156, + "step": 5977 + }, + { + "epoch": 2.207533234859675, + "grad_norm": 0.24897782504558563, + "learning_rate": 5.286365315925606e-05, + "loss": 0.1822, + "step": 5978 + }, + { + "epoch": 2.2079025110782866, + "grad_norm": 0.240656316280365, + "learning_rate": 5.283901958369257e-05, + "loss": 0.1604, + "step": 5979 + }, + { + "epoch": 2.208271787296898, + "grad_norm": 0.23177017271518707, + "learning_rate": 5.281438600812908e-05, + "loss": 0.1447, + "step": 5980 + }, + { + "epoch": 2.2086410635155094, + "grad_norm": 0.2735021710395813, + "learning_rate": 5.278975243256559e-05, + "loss": 0.1745, + "step": 5981 + }, + { + "epoch": 2.209010339734121, + "grad_norm": 0.271853506565094, + "learning_rate": 5.2765118857002095e-05, + "loss": 0.1752, + "step": 5982 + }, + { + "epoch": 2.2093796159527326, + "grad_norm": 0.23105774819850922, + "learning_rate": 5.27404852814386e-05, + "loss": 0.1565, + "step": 5983 + }, + { + "epoch": 2.209748892171344, + "grad_norm": 0.3136729896068573, + "learning_rate": 5.271585170587511e-05, + "loss": 0.1664, + "step": 5984 + }, + { + "epoch": 2.210118168389956, + "grad_norm": 0.26528844237327576, + "learning_rate": 5.269121813031161e-05, + "loss": 0.1631, + "step": 5985 + }, + { + "epoch": 2.2104874446085674, + "grad_norm": 0.25443562865257263, + "learning_rate": 5.266658455474812e-05, + "loss": 0.1683, + "step": 5986 + }, + { + "epoch": 2.2108567208271785, + "grad_norm": 0.31721702218055725, + "learning_rate": 5.264195097918463e-05, + "loss": 0.1668, + "step": 5987 + }, + { + "epoch": 2.21122599704579, + "grad_norm": 0.2350839078426361, + "learning_rate": 5.2617317403621135e-05, + "loss": 0.1679, + "step": 5988 + }, + { + "epoch": 2.2115952732644018, + "grad_norm": 0.26622480154037476, + "learning_rate": 5.259268382805764e-05, + "loss": 0.1757, + "step": 5989 + }, + { + "epoch": 2.2119645494830134, + "grad_norm": 0.23592734336853027, + "learning_rate": 5.256805025249415e-05, + "loss": 0.1406, + "step": 5990 + }, + { + "epoch": 2.212333825701625, + "grad_norm": 0.25126147270202637, + "learning_rate": 5.254341667693066e-05, + "loss": 0.1697, + "step": 5991 + }, + { + "epoch": 2.212703101920236, + "grad_norm": 0.24029427766799927, + "learning_rate": 5.2518783101367167e-05, + "loss": 0.1539, + "step": 5992 + }, + { + "epoch": 2.2130723781388477, + "grad_norm": 0.24290092289447784, + "learning_rate": 5.249414952580367e-05, + "loss": 0.1632, + "step": 5993 + }, + { + "epoch": 2.2134416543574593, + "grad_norm": 0.22171983122825623, + "learning_rate": 5.2469515950240175e-05, + "loss": 0.1535, + "step": 5994 + }, + { + "epoch": 2.213810930576071, + "grad_norm": 0.266553670167923, + "learning_rate": 5.244488237467668e-05, + "loss": 0.1559, + "step": 5995 + }, + { + "epoch": 2.2141802067946825, + "grad_norm": 0.2581496238708496, + "learning_rate": 5.242024879911319e-05, + "loss": 0.1408, + "step": 5996 + }, + { + "epoch": 2.214549483013294, + "grad_norm": 0.2394719272851944, + "learning_rate": 5.23956152235497e-05, + "loss": 0.1478, + "step": 5997 + }, + { + "epoch": 2.2149187592319053, + "grad_norm": 0.3085376024246216, + "learning_rate": 5.237098164798621e-05, + "loss": 0.1916, + "step": 5998 + }, + { + "epoch": 2.215288035450517, + "grad_norm": 0.25496068596839905, + "learning_rate": 5.2346348072422715e-05, + "loss": 0.1539, + "step": 5999 + }, + { + "epoch": 2.2156573116691285, + "grad_norm": 0.33565932512283325, + "learning_rate": 5.232171449685922e-05, + "loss": 0.1734, + "step": 6000 + }, + { + "epoch": 2.2156573116691285, + "eval_loss": 9.012953758239746, + "eval_runtime": 6.9942, + "eval_samples_per_second": 7.149, + "eval_steps_per_second": 1.001, + "step": 6000 + }, + { + "epoch": 2.21602658788774, + "grad_norm": 0.2226184904575348, + "learning_rate": 5.2297080921295724e-05, + "loss": 0.1716, + "step": 6001 + }, + { + "epoch": 2.2163958641063517, + "grad_norm": 0.2914566695690155, + "learning_rate": 5.227244734573223e-05, + "loss": 0.1689, + "step": 6002 + }, + { + "epoch": 2.216765140324963, + "grad_norm": 0.22030657529830933, + "learning_rate": 5.224781377016874e-05, + "loss": 0.1469, + "step": 6003 + }, + { + "epoch": 2.2171344165435745, + "grad_norm": 0.21239815652370453, + "learning_rate": 5.222318019460525e-05, + "loss": 0.1457, + "step": 6004 + }, + { + "epoch": 2.217503692762186, + "grad_norm": 0.21844951808452606, + "learning_rate": 5.2198546619041755e-05, + "loss": 0.1444, + "step": 6005 + }, + { + "epoch": 2.2178729689807977, + "grad_norm": 0.24911431968212128, + "learning_rate": 5.217391304347826e-05, + "loss": 0.1789, + "step": 6006 + }, + { + "epoch": 2.2182422451994093, + "grad_norm": 0.21257571876049042, + "learning_rate": 5.214927946791477e-05, + "loss": 0.1443, + "step": 6007 + }, + { + "epoch": 2.218611521418021, + "grad_norm": 0.2328416407108307, + "learning_rate": 5.212464589235128e-05, + "loss": 0.1611, + "step": 6008 + }, + { + "epoch": 2.218980797636632, + "grad_norm": 0.22937153279781342, + "learning_rate": 5.210001231678778e-05, + "loss": 0.1646, + "step": 6009 + }, + { + "epoch": 2.2193500738552436, + "grad_norm": 0.2553964853286743, + "learning_rate": 5.207537874122429e-05, + "loss": 0.1629, + "step": 6010 + }, + { + "epoch": 2.2197193500738552, + "grad_norm": 0.23443114757537842, + "learning_rate": 5.2050745165660796e-05, + "loss": 0.1468, + "step": 6011 + }, + { + "epoch": 2.220088626292467, + "grad_norm": 0.22487518191337585, + "learning_rate": 5.2026111590097303e-05, + "loss": 0.1536, + "step": 6012 + }, + { + "epoch": 2.2204579025110784, + "grad_norm": 0.2230817973613739, + "learning_rate": 5.200147801453381e-05, + "loss": 0.1555, + "step": 6013 + }, + { + "epoch": 2.2208271787296896, + "grad_norm": 0.24336868524551392, + "learning_rate": 5.197684443897032e-05, + "loss": 0.1454, + "step": 6014 + }, + { + "epoch": 2.221196454948301, + "grad_norm": 0.2095605731010437, + "learning_rate": 5.195221086340683e-05, + "loss": 0.1663, + "step": 6015 + }, + { + "epoch": 2.221565731166913, + "grad_norm": 0.25796231627464294, + "learning_rate": 5.1927577287843335e-05, + "loss": 0.1601, + "step": 6016 + }, + { + "epoch": 2.2219350073855244, + "grad_norm": 0.2913053333759308, + "learning_rate": 5.1902943712279836e-05, + "loss": 0.1665, + "step": 6017 + }, + { + "epoch": 2.222304283604136, + "grad_norm": 0.23500728607177734, + "learning_rate": 5.1878310136716344e-05, + "loss": 0.1473, + "step": 6018 + }, + { + "epoch": 2.2226735598227476, + "grad_norm": 0.28088438510894775, + "learning_rate": 5.185367656115285e-05, + "loss": 0.1645, + "step": 6019 + }, + { + "epoch": 2.2230428360413588, + "grad_norm": 0.27012303471565247, + "learning_rate": 5.182904298558936e-05, + "loss": 0.1475, + "step": 6020 + }, + { + "epoch": 2.2234121122599704, + "grad_norm": 0.23929768800735474, + "learning_rate": 5.180440941002587e-05, + "loss": 0.1499, + "step": 6021 + }, + { + "epoch": 2.223781388478582, + "grad_norm": 0.25012245774269104, + "learning_rate": 5.1779775834462375e-05, + "loss": 0.1685, + "step": 6022 + }, + { + "epoch": 2.2241506646971936, + "grad_norm": 0.23309791088104248, + "learning_rate": 5.175514225889888e-05, + "loss": 0.1571, + "step": 6023 + }, + { + "epoch": 2.224519940915805, + "grad_norm": 0.24119238555431366, + "learning_rate": 5.173050868333539e-05, + "loss": 0.1571, + "step": 6024 + }, + { + "epoch": 2.2248892171344163, + "grad_norm": 0.22112759947776794, + "learning_rate": 5.170587510777189e-05, + "loss": 0.1493, + "step": 6025 + }, + { + "epoch": 2.225258493353028, + "grad_norm": 0.2211490273475647, + "learning_rate": 5.16812415322084e-05, + "loss": 0.1504, + "step": 6026 + }, + { + "epoch": 2.2256277695716395, + "grad_norm": 0.24335838854312897, + "learning_rate": 5.165660795664491e-05, + "loss": 0.158, + "step": 6027 + }, + { + "epoch": 2.225997045790251, + "grad_norm": 0.29804491996765137, + "learning_rate": 5.1631974381081416e-05, + "loss": 0.1716, + "step": 6028 + }, + { + "epoch": 2.2263663220088628, + "grad_norm": 0.24547550082206726, + "learning_rate": 5.1607340805517924e-05, + "loss": 0.1628, + "step": 6029 + }, + { + "epoch": 2.2267355982274744, + "grad_norm": 0.24935106933116913, + "learning_rate": 5.158270722995443e-05, + "loss": 0.1671, + "step": 6030 + }, + { + "epoch": 2.2271048744460855, + "grad_norm": 0.30223795771598816, + "learning_rate": 5.155807365439094e-05, + "loss": 0.1837, + "step": 6031 + }, + { + "epoch": 2.227474150664697, + "grad_norm": 0.23574461042881012, + "learning_rate": 5.153344007882745e-05, + "loss": 0.1514, + "step": 6032 + }, + { + "epoch": 2.2278434268833087, + "grad_norm": 0.28690627217292786, + "learning_rate": 5.150880650326395e-05, + "loss": 0.1711, + "step": 6033 + }, + { + "epoch": 2.2282127031019203, + "grad_norm": 0.27055296301841736, + "learning_rate": 5.1484172927700456e-05, + "loss": 0.1772, + "step": 6034 + }, + { + "epoch": 2.228581979320532, + "grad_norm": 0.24811138212680817, + "learning_rate": 5.1459539352136964e-05, + "loss": 0.1422, + "step": 6035 + }, + { + "epoch": 2.228951255539143, + "grad_norm": 0.2620033025741577, + "learning_rate": 5.143490577657347e-05, + "loss": 0.1528, + "step": 6036 + }, + { + "epoch": 2.2293205317577547, + "grad_norm": 0.31062081456184387, + "learning_rate": 5.141027220100998e-05, + "loss": 0.1884, + "step": 6037 + }, + { + "epoch": 2.2296898079763663, + "grad_norm": 0.25006842613220215, + "learning_rate": 5.138563862544649e-05, + "loss": 0.1481, + "step": 6038 + }, + { + "epoch": 2.230059084194978, + "grad_norm": 0.30625444650650024, + "learning_rate": 5.1361005049882995e-05, + "loss": 0.1746, + "step": 6039 + }, + { + "epoch": 2.2304283604135895, + "grad_norm": 0.30305784940719604, + "learning_rate": 5.13363714743195e-05, + "loss": 0.1607, + "step": 6040 + }, + { + "epoch": 2.230797636632201, + "grad_norm": 0.3416635990142822, + "learning_rate": 5.1311737898756004e-05, + "loss": 0.1644, + "step": 6041 + }, + { + "epoch": 2.2311669128508123, + "grad_norm": 0.27630504965782166, + "learning_rate": 5.128710432319251e-05, + "loss": 0.1785, + "step": 6042 + }, + { + "epoch": 2.231536189069424, + "grad_norm": 0.2725943922996521, + "learning_rate": 5.126247074762902e-05, + "loss": 0.1626, + "step": 6043 + }, + { + "epoch": 2.2319054652880355, + "grad_norm": 0.2947699725627899, + "learning_rate": 5.123783717206553e-05, + "loss": 0.1653, + "step": 6044 + }, + { + "epoch": 2.232274741506647, + "grad_norm": 0.18387740850448608, + "learning_rate": 5.1213203596502036e-05, + "loss": 0.1301, + "step": 6045 + }, + { + "epoch": 2.2326440177252587, + "grad_norm": 0.3253576159477234, + "learning_rate": 5.1188570020938544e-05, + "loss": 0.2083, + "step": 6046 + }, + { + "epoch": 2.23301329394387, + "grad_norm": 0.23650340735912323, + "learning_rate": 5.116393644537505e-05, + "loss": 0.1599, + "step": 6047 + }, + { + "epoch": 2.2333825701624814, + "grad_norm": 0.3004239499568939, + "learning_rate": 5.113930286981156e-05, + "loss": 0.1615, + "step": 6048 + }, + { + "epoch": 2.233751846381093, + "grad_norm": 0.21608321368694305, + "learning_rate": 5.111466929424806e-05, + "loss": 0.1532, + "step": 6049 + }, + { + "epoch": 2.2341211225997046, + "grad_norm": 0.25923851132392883, + "learning_rate": 5.109003571868457e-05, + "loss": 0.1548, + "step": 6050 + }, + { + "epoch": 2.2341211225997046, + "eval_loss": 9.0487642288208, + "eval_runtime": 6.9127, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 1.013, + "step": 6050 + }, + { + "epoch": 2.2344903988183162, + "grad_norm": 0.255472332239151, + "learning_rate": 5.1065402143121076e-05, + "loss": 0.1551, + "step": 6051 + }, + { + "epoch": 2.234859675036928, + "grad_norm": 0.23181411623954773, + "learning_rate": 5.1040768567557584e-05, + "loss": 0.1399, + "step": 6052 + }, + { + "epoch": 2.235228951255539, + "grad_norm": 0.24558599293231964, + "learning_rate": 5.101613499199409e-05, + "loss": 0.1548, + "step": 6053 + }, + { + "epoch": 2.2355982274741506, + "grad_norm": 0.242776021361351, + "learning_rate": 5.09915014164306e-05, + "loss": 0.1454, + "step": 6054 + }, + { + "epoch": 2.235967503692762, + "grad_norm": 0.22378981113433838, + "learning_rate": 5.096686784086711e-05, + "loss": 0.1622, + "step": 6055 + }, + { + "epoch": 2.236336779911374, + "grad_norm": 0.26467400789260864, + "learning_rate": 5.0942234265303616e-05, + "loss": 0.1524, + "step": 6056 + }, + { + "epoch": 2.2367060561299854, + "grad_norm": 0.2187795639038086, + "learning_rate": 5.091760068974012e-05, + "loss": 0.1584, + "step": 6057 + }, + { + "epoch": 2.2370753323485966, + "grad_norm": 0.2122139185667038, + "learning_rate": 5.0892967114176625e-05, + "loss": 0.1714, + "step": 6058 + }, + { + "epoch": 2.237444608567208, + "grad_norm": 0.23067109286785126, + "learning_rate": 5.086833353861313e-05, + "loss": 0.1455, + "step": 6059 + }, + { + "epoch": 2.2378138847858198, + "grad_norm": 0.22823211550712585, + "learning_rate": 5.084369996304964e-05, + "loss": 0.1612, + "step": 6060 + }, + { + "epoch": 2.2381831610044314, + "grad_norm": 0.2676428556442261, + "learning_rate": 5.081906638748615e-05, + "loss": 0.1724, + "step": 6061 + }, + { + "epoch": 2.238552437223043, + "grad_norm": 0.23254908621311188, + "learning_rate": 5.0794432811922656e-05, + "loss": 0.1521, + "step": 6062 + }, + { + "epoch": 2.2389217134416546, + "grad_norm": 0.25061461329460144, + "learning_rate": 5.0769799236359164e-05, + "loss": 0.1608, + "step": 6063 + }, + { + "epoch": 2.2392909896602657, + "grad_norm": 0.2964935600757599, + "learning_rate": 5.0745165660795665e-05, + "loss": 0.1633, + "step": 6064 + }, + { + "epoch": 2.2396602658788773, + "grad_norm": 0.2764359414577484, + "learning_rate": 5.072053208523217e-05, + "loss": 0.1817, + "step": 6065 + }, + { + "epoch": 2.240029542097489, + "grad_norm": 0.26771143078804016, + "learning_rate": 5.069589850966868e-05, + "loss": 0.1806, + "step": 6066 + }, + { + "epoch": 2.2403988183161005, + "grad_norm": 0.23228208720684052, + "learning_rate": 5.067126493410519e-05, + "loss": 0.1512, + "step": 6067 + }, + { + "epoch": 2.240768094534712, + "grad_norm": 0.2726166248321533, + "learning_rate": 5.0646631358541696e-05, + "loss": 0.164, + "step": 6068 + }, + { + "epoch": 2.2411373707533233, + "grad_norm": 0.2592065632343292, + "learning_rate": 5.0621997782978204e-05, + "loss": 0.1586, + "step": 6069 + }, + { + "epoch": 2.241506646971935, + "grad_norm": 0.23448434472084045, + "learning_rate": 5.059736420741471e-05, + "loss": 0.1507, + "step": 6070 + }, + { + "epoch": 2.2418759231905465, + "grad_norm": 0.26998454332351685, + "learning_rate": 5.057273063185122e-05, + "loss": 0.1545, + "step": 6071 + }, + { + "epoch": 2.242245199409158, + "grad_norm": 0.25542882084846497, + "learning_rate": 5.054809705628772e-05, + "loss": 0.1652, + "step": 6072 + }, + { + "epoch": 2.2426144756277697, + "grad_norm": 0.22866836190223694, + "learning_rate": 5.052346348072423e-05, + "loss": 0.1453, + "step": 6073 + }, + { + "epoch": 2.2429837518463813, + "grad_norm": 0.2814345359802246, + "learning_rate": 5.049882990516074e-05, + "loss": 0.1837, + "step": 6074 + }, + { + "epoch": 2.2433530280649925, + "grad_norm": 0.2058611363172531, + "learning_rate": 5.0474196329597245e-05, + "loss": 0.1564, + "step": 6075 + }, + { + "epoch": 2.243722304283604, + "grad_norm": 0.2800319790840149, + "learning_rate": 5.044956275403375e-05, + "loss": 0.1536, + "step": 6076 + }, + { + "epoch": 2.2440915805022157, + "grad_norm": 0.24157284200191498, + "learning_rate": 5.042492917847026e-05, + "loss": 0.1792, + "step": 6077 + }, + { + "epoch": 2.2444608567208273, + "grad_norm": 0.2736259400844574, + "learning_rate": 5.040029560290677e-05, + "loss": 0.1734, + "step": 6078 + }, + { + "epoch": 2.244830132939439, + "grad_norm": 0.2631182074546814, + "learning_rate": 5.0375662027343276e-05, + "loss": 0.1648, + "step": 6079 + }, + { + "epoch": 2.24519940915805, + "grad_norm": 0.3254713714122772, + "learning_rate": 5.035102845177978e-05, + "loss": 0.1825, + "step": 6080 + }, + { + "epoch": 2.2455686853766617, + "grad_norm": 0.25539466738700867, + "learning_rate": 5.0326394876216285e-05, + "loss": 0.1556, + "step": 6081 + }, + { + "epoch": 2.2459379615952733, + "grad_norm": 0.24149106442928314, + "learning_rate": 5.030176130065279e-05, + "loss": 0.1554, + "step": 6082 + }, + { + "epoch": 2.246307237813885, + "grad_norm": 0.23104673624038696, + "learning_rate": 5.02771277250893e-05, + "loss": 0.158, + "step": 6083 + }, + { + "epoch": 2.2466765140324965, + "grad_norm": 0.24095751345157623, + "learning_rate": 5.025249414952581e-05, + "loss": 0.1615, + "step": 6084 + }, + { + "epoch": 2.2470457902511076, + "grad_norm": 0.2606332004070282, + "learning_rate": 5.0227860573962317e-05, + "loss": 0.1719, + "step": 6085 + }, + { + "epoch": 2.2474150664697192, + "grad_norm": 0.2455698549747467, + "learning_rate": 5.0203226998398824e-05, + "loss": 0.1739, + "step": 6086 + }, + { + "epoch": 2.247784342688331, + "grad_norm": 0.2728544771671295, + "learning_rate": 5.017859342283533e-05, + "loss": 0.1681, + "step": 6087 + }, + { + "epoch": 2.2481536189069424, + "grad_norm": 0.31489235162734985, + "learning_rate": 5.015395984727183e-05, + "loss": 0.1822, + "step": 6088 + }, + { + "epoch": 2.248522895125554, + "grad_norm": 0.25471094250679016, + "learning_rate": 5.012932627170834e-05, + "loss": 0.1465, + "step": 6089 + }, + { + "epoch": 2.2488921713441656, + "grad_norm": 0.21755622327327728, + "learning_rate": 5.010469269614485e-05, + "loss": 0.1486, + "step": 6090 + }, + { + "epoch": 2.249261447562777, + "grad_norm": 0.25896358489990234, + "learning_rate": 5.008005912058136e-05, + "loss": 0.1557, + "step": 6091 + }, + { + "epoch": 2.2496307237813884, + "grad_norm": 0.25539758801460266, + "learning_rate": 5.0055425545017865e-05, + "loss": 0.1775, + "step": 6092 + }, + { + "epoch": 2.25, + "grad_norm": 0.2679508626461029, + "learning_rate": 5.003079196945437e-05, + "loss": 0.1897, + "step": 6093 + }, + { + "epoch": 2.2503692762186116, + "grad_norm": 0.2183191478252411, + "learning_rate": 5.000615839389088e-05, + "loss": 0.1683, + "step": 6094 + }, + { + "epoch": 2.250738552437223, + "grad_norm": 0.23494145274162292, + "learning_rate": 4.998152481832738e-05, + "loss": 0.1625, + "step": 6095 + }, + { + "epoch": 2.251107828655835, + "grad_norm": 0.2737155854701996, + "learning_rate": 4.995689124276389e-05, + "loss": 0.1696, + "step": 6096 + }, + { + "epoch": 2.251477104874446, + "grad_norm": 0.266712486743927, + "learning_rate": 4.993225766720039e-05, + "loss": 0.1539, + "step": 6097 + }, + { + "epoch": 2.2518463810930576, + "grad_norm": 0.2086418718099594, + "learning_rate": 4.99076240916369e-05, + "loss": 0.1491, + "step": 6098 + }, + { + "epoch": 2.252215657311669, + "grad_norm": 0.23411017656326294, + "learning_rate": 4.9882990516073406e-05, + "loss": 0.1672, + "step": 6099 + }, + { + "epoch": 2.2525849335302808, + "grad_norm": 0.27064037322998047, + "learning_rate": 4.9858356940509914e-05, + "loss": 0.1584, + "step": 6100 + }, + { + "epoch": 2.2525849335302808, + "eval_loss": 9.036221504211426, + "eval_runtime": 6.9064, + "eval_samples_per_second": 7.24, + "eval_steps_per_second": 1.014, + "step": 6100 + }, + { + "epoch": 2.252954209748892, + "grad_norm": 0.30951258540153503, + "learning_rate": 4.983372336494642e-05, + "loss": 0.1739, + "step": 6101 + }, + { + "epoch": 2.2533234859675035, + "grad_norm": 0.21860671043395996, + "learning_rate": 4.980908978938293e-05, + "loss": 0.1627, + "step": 6102 + }, + { + "epoch": 2.253692762186115, + "grad_norm": 0.2377181351184845, + "learning_rate": 4.978445621381944e-05, + "loss": 0.1474, + "step": 6103 + }, + { + "epoch": 2.2540620384047267, + "grad_norm": 0.2517457604408264, + "learning_rate": 4.9759822638255946e-05, + "loss": 0.1596, + "step": 6104 + }, + { + "epoch": 2.2544313146233383, + "grad_norm": 0.3329029977321625, + "learning_rate": 4.973518906269245e-05, + "loss": 0.1779, + "step": 6105 + }, + { + "epoch": 2.25480059084195, + "grad_norm": 0.24127554893493652, + "learning_rate": 4.9710555487128955e-05, + "loss": 0.1524, + "step": 6106 + }, + { + "epoch": 2.255169867060561, + "grad_norm": 0.23684749007225037, + "learning_rate": 4.968592191156546e-05, + "loss": 0.151, + "step": 6107 + }, + { + "epoch": 2.2555391432791727, + "grad_norm": 0.2457515001296997, + "learning_rate": 4.966128833600197e-05, + "loss": 0.1816, + "step": 6108 + }, + { + "epoch": 2.2559084194977843, + "grad_norm": 0.25103387236595154, + "learning_rate": 4.963665476043848e-05, + "loss": 0.1625, + "step": 6109 + }, + { + "epoch": 2.256277695716396, + "grad_norm": 0.24644960463047028, + "learning_rate": 4.9612021184874986e-05, + "loss": 0.1749, + "step": 6110 + }, + { + "epoch": 2.2566469719350075, + "grad_norm": 0.24130894243717194, + "learning_rate": 4.9587387609311494e-05, + "loss": 0.1556, + "step": 6111 + }, + { + "epoch": 2.2570162481536187, + "grad_norm": 0.2687907814979553, + "learning_rate": 4.9562754033748e-05, + "loss": 0.1564, + "step": 6112 + }, + { + "epoch": 2.2573855243722303, + "grad_norm": 0.2514500021934509, + "learning_rate": 4.95381204581845e-05, + "loss": 0.1442, + "step": 6113 + }, + { + "epoch": 2.257754800590842, + "grad_norm": 0.28516241908073425, + "learning_rate": 4.951348688262101e-05, + "loss": 0.1817, + "step": 6114 + }, + { + "epoch": 2.2581240768094535, + "grad_norm": 0.29722079634666443, + "learning_rate": 4.948885330705752e-05, + "loss": 0.1815, + "step": 6115 + }, + { + "epoch": 2.258493353028065, + "grad_norm": 0.23556283116340637, + "learning_rate": 4.9464219731494026e-05, + "loss": 0.1803, + "step": 6116 + }, + { + "epoch": 2.2588626292466767, + "grad_norm": 0.34509795904159546, + "learning_rate": 4.9439586155930534e-05, + "loss": 0.2042, + "step": 6117 + }, + { + "epoch": 2.259231905465288, + "grad_norm": 0.26305586099624634, + "learning_rate": 4.941495258036704e-05, + "loss": 0.1727, + "step": 6118 + }, + { + "epoch": 2.2596011816838995, + "grad_norm": 0.24487519264221191, + "learning_rate": 4.939031900480355e-05, + "loss": 0.1778, + "step": 6119 + }, + { + "epoch": 2.259970457902511, + "grad_norm": 0.22971828281879425, + "learning_rate": 4.936568542924006e-05, + "loss": 0.1479, + "step": 6120 + }, + { + "epoch": 2.2603397341211227, + "grad_norm": 0.25407010316848755, + "learning_rate": 4.934105185367656e-05, + "loss": 0.1662, + "step": 6121 + }, + { + "epoch": 2.2607090103397343, + "grad_norm": 0.22423158586025238, + "learning_rate": 4.931641827811307e-05, + "loss": 0.1527, + "step": 6122 + }, + { + "epoch": 2.2610782865583454, + "grad_norm": 0.28695300221443176, + "learning_rate": 4.9291784702549575e-05, + "loss": 0.1892, + "step": 6123 + }, + { + "epoch": 2.261447562776957, + "grad_norm": 0.23394016921520233, + "learning_rate": 4.926715112698608e-05, + "loss": 0.157, + "step": 6124 + }, + { + "epoch": 2.2618168389955686, + "grad_norm": 0.21605585515499115, + "learning_rate": 4.924251755142259e-05, + "loss": 0.1503, + "step": 6125 + }, + { + "epoch": 2.2621861152141802, + "grad_norm": 0.34260499477386475, + "learning_rate": 4.92178839758591e-05, + "loss": 0.1776, + "step": 6126 + }, + { + "epoch": 2.262555391432792, + "grad_norm": 0.2344532310962677, + "learning_rate": 4.9193250400295606e-05, + "loss": 0.1874, + "step": 6127 + }, + { + "epoch": 2.2629246676514034, + "grad_norm": 0.2453799992799759, + "learning_rate": 4.9168616824732114e-05, + "loss": 0.1499, + "step": 6128 + }, + { + "epoch": 2.2632939438700146, + "grad_norm": 0.21677950024604797, + "learning_rate": 4.9143983249168615e-05, + "loss": 0.155, + "step": 6129 + }, + { + "epoch": 2.263663220088626, + "grad_norm": 0.24246056377887726, + "learning_rate": 4.911934967360512e-05, + "loss": 0.1683, + "step": 6130 + }, + { + "epoch": 2.264032496307238, + "grad_norm": 0.23486801981925964, + "learning_rate": 4.909471609804163e-05, + "loss": 0.1704, + "step": 6131 + }, + { + "epoch": 2.2644017725258494, + "grad_norm": 0.24547888338565826, + "learning_rate": 4.907008252247814e-05, + "loss": 0.1525, + "step": 6132 + }, + { + "epoch": 2.264771048744461, + "grad_norm": 0.23221251368522644, + "learning_rate": 4.9045448946914647e-05, + "loss": 0.1475, + "step": 6133 + }, + { + "epoch": 2.265140324963072, + "grad_norm": 0.2550663352012634, + "learning_rate": 4.9020815371351154e-05, + "loss": 0.1695, + "step": 6134 + }, + { + "epoch": 2.2655096011816838, + "grad_norm": 0.234259694814682, + "learning_rate": 4.899618179578766e-05, + "loss": 0.1502, + "step": 6135 + }, + { + "epoch": 2.2658788774002954, + "grad_norm": 0.29918527603149414, + "learning_rate": 4.897154822022417e-05, + "loss": 0.1867, + "step": 6136 + }, + { + "epoch": 2.266248153618907, + "grad_norm": 0.2615757882595062, + "learning_rate": 4.894691464466067e-05, + "loss": 0.1606, + "step": 6137 + }, + { + "epoch": 2.2666174298375186, + "grad_norm": 0.2693590521812439, + "learning_rate": 4.892228106909718e-05, + "loss": 0.1709, + "step": 6138 + }, + { + "epoch": 2.26698670605613, + "grad_norm": 0.25830090045928955, + "learning_rate": 4.889764749353369e-05, + "loss": 0.1633, + "step": 6139 + }, + { + "epoch": 2.2673559822747413, + "grad_norm": 0.273037314414978, + "learning_rate": 4.8873013917970195e-05, + "loss": 0.1577, + "step": 6140 + }, + { + "epoch": 2.267725258493353, + "grad_norm": 0.23333625495433807, + "learning_rate": 4.88483803424067e-05, + "loss": 0.1868, + "step": 6141 + }, + { + "epoch": 2.2680945347119645, + "grad_norm": 0.2749931216239929, + "learning_rate": 4.882374676684321e-05, + "loss": 0.1669, + "step": 6142 + }, + { + "epoch": 2.268463810930576, + "grad_norm": 0.24854588508605957, + "learning_rate": 4.879911319127972e-05, + "loss": 0.1638, + "step": 6143 + }, + { + "epoch": 2.2688330871491877, + "grad_norm": 0.25309163331985474, + "learning_rate": 4.8774479615716226e-05, + "loss": 0.1692, + "step": 6144 + }, + { + "epoch": 2.269202363367799, + "grad_norm": 0.2585795819759369, + "learning_rate": 4.874984604015273e-05, + "loss": 0.1584, + "step": 6145 + }, + { + "epoch": 2.2695716395864105, + "grad_norm": 0.23847173154354095, + "learning_rate": 4.8725212464589235e-05, + "loss": 0.1404, + "step": 6146 + }, + { + "epoch": 2.269940915805022, + "grad_norm": 0.2689782977104187, + "learning_rate": 4.870057888902574e-05, + "loss": 0.1813, + "step": 6147 + }, + { + "epoch": 2.2703101920236337, + "grad_norm": 0.26980170607566833, + "learning_rate": 4.867594531346225e-05, + "loss": 0.154, + "step": 6148 + }, + { + "epoch": 2.2706794682422453, + "grad_norm": 0.2620565593242645, + "learning_rate": 4.865131173789876e-05, + "loss": 0.1584, + "step": 6149 + }, + { + "epoch": 2.271048744460857, + "grad_norm": 0.2884005010128021, + "learning_rate": 4.862667816233527e-05, + "loss": 0.1761, + "step": 6150 + }, + { + "epoch": 2.271048744460857, + "eval_loss": 8.939491271972656, + "eval_runtime": 6.9082, + "eval_samples_per_second": 7.238, + "eval_steps_per_second": 1.013, + "step": 6150 + }, + { + "epoch": 2.271418020679468, + "grad_norm": 0.23792384564876556, + "learning_rate": 4.8602044586771775e-05, + "loss": 0.1658, + "step": 6151 + }, + { + "epoch": 2.2717872968980797, + "grad_norm": 0.31145840883255005, + "learning_rate": 4.857741101120828e-05, + "loss": 0.1635, + "step": 6152 + }, + { + "epoch": 2.2721565731166913, + "grad_norm": 0.23386354744434357, + "learning_rate": 4.8552777435644783e-05, + "loss": 0.1534, + "step": 6153 + }, + { + "epoch": 2.272525849335303, + "grad_norm": 0.2860638201236725, + "learning_rate": 4.852814386008129e-05, + "loss": 0.1566, + "step": 6154 + }, + { + "epoch": 2.2728951255539145, + "grad_norm": 0.25955700874328613, + "learning_rate": 4.85035102845178e-05, + "loss": 0.1596, + "step": 6155 + }, + { + "epoch": 2.2732644017725256, + "grad_norm": 0.21067415177822113, + "learning_rate": 4.847887670895431e-05, + "loss": 0.1625, + "step": 6156 + }, + { + "epoch": 2.2736336779911372, + "grad_norm": 0.2681225538253784, + "learning_rate": 4.8454243133390815e-05, + "loss": 0.1733, + "step": 6157 + }, + { + "epoch": 2.274002954209749, + "grad_norm": 0.24804915487766266, + "learning_rate": 4.842960955782732e-05, + "loss": 0.1653, + "step": 6158 + }, + { + "epoch": 2.2743722304283605, + "grad_norm": 0.40237942337989807, + "learning_rate": 4.840497598226383e-05, + "loss": 0.1649, + "step": 6159 + }, + { + "epoch": 2.274741506646972, + "grad_norm": 0.33787861466407776, + "learning_rate": 4.838034240670034e-05, + "loss": 0.1891, + "step": 6160 + }, + { + "epoch": 2.2751107828655837, + "grad_norm": 0.23810726404190063, + "learning_rate": 4.835570883113684e-05, + "loss": 0.1467, + "step": 6161 + }, + { + "epoch": 2.275480059084195, + "grad_norm": 0.2722361385822296, + "learning_rate": 4.833107525557335e-05, + "loss": 0.1812, + "step": 6162 + }, + { + "epoch": 2.2758493353028064, + "grad_norm": 0.23594729602336884, + "learning_rate": 4.8306441680009855e-05, + "loss": 0.1815, + "step": 6163 + }, + { + "epoch": 2.276218611521418, + "grad_norm": 0.21067774295806885, + "learning_rate": 4.828180810444636e-05, + "loss": 0.1409, + "step": 6164 + }, + { + "epoch": 2.2765878877400296, + "grad_norm": 0.25318580865859985, + "learning_rate": 4.825717452888287e-05, + "loss": 0.1515, + "step": 6165 + }, + { + "epoch": 2.2769571639586412, + "grad_norm": 0.21054166555404663, + "learning_rate": 4.823254095331938e-05, + "loss": 0.1402, + "step": 6166 + }, + { + "epoch": 2.2773264401772524, + "grad_norm": 0.2487645298242569, + "learning_rate": 4.820790737775589e-05, + "loss": 0.1677, + "step": 6167 + }, + { + "epoch": 2.277695716395864, + "grad_norm": 0.2914375960826874, + "learning_rate": 4.8183273802192395e-05, + "loss": 0.1527, + "step": 6168 + }, + { + "epoch": 2.2780649926144756, + "grad_norm": 0.3064052164554596, + "learning_rate": 4.8158640226628896e-05, + "loss": 0.1746, + "step": 6169 + }, + { + "epoch": 2.278434268833087, + "grad_norm": 0.2510773539543152, + "learning_rate": 4.8134006651065404e-05, + "loss": 0.1553, + "step": 6170 + }, + { + "epoch": 2.278803545051699, + "grad_norm": 0.24469999969005585, + "learning_rate": 4.810937307550191e-05, + "loss": 0.1374, + "step": 6171 + }, + { + "epoch": 2.2791728212703104, + "grad_norm": 0.2897234857082367, + "learning_rate": 4.808473949993842e-05, + "loss": 0.1855, + "step": 6172 + }, + { + "epoch": 2.2795420974889216, + "grad_norm": 0.2664544880390167, + "learning_rate": 4.806010592437493e-05, + "loss": 0.1662, + "step": 6173 + }, + { + "epoch": 2.279911373707533, + "grad_norm": 0.2467261552810669, + "learning_rate": 4.8035472348811435e-05, + "loss": 0.1601, + "step": 6174 + }, + { + "epoch": 2.2802806499261448, + "grad_norm": 0.28341439366340637, + "learning_rate": 4.801083877324794e-05, + "loss": 0.1702, + "step": 6175 + }, + { + "epoch": 2.2806499261447564, + "grad_norm": 0.29802432656288147, + "learning_rate": 4.7986205197684444e-05, + "loss": 0.1814, + "step": 6176 + }, + { + "epoch": 2.281019202363368, + "grad_norm": 0.263931542634964, + "learning_rate": 4.796157162212095e-05, + "loss": 0.175, + "step": 6177 + }, + { + "epoch": 2.281388478581979, + "grad_norm": 0.2403264194726944, + "learning_rate": 4.793693804655746e-05, + "loss": 0.1544, + "step": 6178 + }, + { + "epoch": 2.2817577548005907, + "grad_norm": 0.21426749229431152, + "learning_rate": 4.791230447099397e-05, + "loss": 0.1536, + "step": 6179 + }, + { + "epoch": 2.2821270310192023, + "grad_norm": 0.21598756313323975, + "learning_rate": 4.7887670895430475e-05, + "loss": 0.1464, + "step": 6180 + }, + { + "epoch": 2.282496307237814, + "grad_norm": 0.22816595435142517, + "learning_rate": 4.786303731986698e-05, + "loss": 0.1552, + "step": 6181 + }, + { + "epoch": 2.2828655834564255, + "grad_norm": 0.25365588068962097, + "learning_rate": 4.783840374430349e-05, + "loss": 0.1697, + "step": 6182 + }, + { + "epoch": 2.283234859675037, + "grad_norm": 0.22077427804470062, + "learning_rate": 4.781377016874e-05, + "loss": 0.1355, + "step": 6183 + }, + { + "epoch": 2.2836041358936483, + "grad_norm": 0.20823974907398224, + "learning_rate": 4.77891365931765e-05, + "loss": 0.14, + "step": 6184 + }, + { + "epoch": 2.28397341211226, + "grad_norm": 0.24294820427894592, + "learning_rate": 4.776450301761301e-05, + "loss": 0.1857, + "step": 6185 + }, + { + "epoch": 2.2843426883308715, + "grad_norm": 0.23693466186523438, + "learning_rate": 4.7739869442049516e-05, + "loss": 0.1634, + "step": 6186 + }, + { + "epoch": 2.284711964549483, + "grad_norm": 0.22597266733646393, + "learning_rate": 4.7715235866486024e-05, + "loss": 0.148, + "step": 6187 + }, + { + "epoch": 2.2850812407680947, + "grad_norm": 0.2577482759952545, + "learning_rate": 4.769060229092253e-05, + "loss": 0.1702, + "step": 6188 + }, + { + "epoch": 2.285450516986706, + "grad_norm": 0.24337798357009888, + "learning_rate": 4.766596871535904e-05, + "loss": 0.1689, + "step": 6189 + }, + { + "epoch": 2.2858197932053175, + "grad_norm": 0.2971801161766052, + "learning_rate": 4.764133513979555e-05, + "loss": 0.1732, + "step": 6190 + }, + { + "epoch": 2.286189069423929, + "grad_norm": 0.3394518196582794, + "learning_rate": 4.7616701564232055e-05, + "loss": 0.1799, + "step": 6191 + }, + { + "epoch": 2.2865583456425407, + "grad_norm": 0.2792631983757019, + "learning_rate": 4.7592067988668556e-05, + "loss": 0.1571, + "step": 6192 + }, + { + "epoch": 2.2869276218611523, + "grad_norm": 0.2522904872894287, + "learning_rate": 4.7567434413105064e-05, + "loss": 0.1659, + "step": 6193 + }, + { + "epoch": 2.287296898079764, + "grad_norm": 0.25679874420166016, + "learning_rate": 4.754280083754157e-05, + "loss": 0.1534, + "step": 6194 + }, + { + "epoch": 2.287666174298375, + "grad_norm": 0.22991381585597992, + "learning_rate": 4.751816726197808e-05, + "loss": 0.148, + "step": 6195 + }, + { + "epoch": 2.2880354505169866, + "grad_norm": 0.31186822056770325, + "learning_rate": 4.749353368641459e-05, + "loss": 0.1934, + "step": 6196 + }, + { + "epoch": 2.2884047267355982, + "grad_norm": 0.20539291203022003, + "learning_rate": 4.7468900110851096e-05, + "loss": 0.1454, + "step": 6197 + }, + { + "epoch": 2.28877400295421, + "grad_norm": 0.24370451271533966, + "learning_rate": 4.7444266535287603e-05, + "loss": 0.1445, + "step": 6198 + }, + { + "epoch": 2.2891432791728215, + "grad_norm": 0.24340815842151642, + "learning_rate": 4.741963295972411e-05, + "loss": 0.1668, + "step": 6199 + }, + { + "epoch": 2.2895125553914326, + "grad_norm": 0.2982277274131775, + "learning_rate": 4.739499938416061e-05, + "loss": 0.1626, + "step": 6200 + }, + { + "epoch": 2.2895125553914326, + "eval_loss": 8.9998140335083, + "eval_runtime": 7.123, + "eval_samples_per_second": 7.02, + "eval_steps_per_second": 0.983, + "step": 6200 + }, + { + "epoch": 2.289881831610044, + "grad_norm": 0.22899173200130463, + "learning_rate": 4.737036580859712e-05, + "loss": 0.1749, + "step": 6201 + }, + { + "epoch": 2.290251107828656, + "grad_norm": 0.25553759932518005, + "learning_rate": 4.734573223303363e-05, + "loss": 0.1528, + "step": 6202 + }, + { + "epoch": 2.2906203840472674, + "grad_norm": 0.27195289731025696, + "learning_rate": 4.7321098657470136e-05, + "loss": 0.1713, + "step": 6203 + }, + { + "epoch": 2.290989660265879, + "grad_norm": 0.2217801958322525, + "learning_rate": 4.7296465081906644e-05, + "loss": 0.1426, + "step": 6204 + }, + { + "epoch": 2.2913589364844906, + "grad_norm": 0.21889138221740723, + "learning_rate": 4.727183150634315e-05, + "loss": 0.16, + "step": 6205 + }, + { + "epoch": 2.291728212703102, + "grad_norm": 0.258178174495697, + "learning_rate": 4.724719793077966e-05, + "loss": 0.1549, + "step": 6206 + }, + { + "epoch": 2.2920974889217134, + "grad_norm": 0.2608153820037842, + "learning_rate": 4.722256435521617e-05, + "loss": 0.1654, + "step": 6207 + }, + { + "epoch": 2.292466765140325, + "grad_norm": 0.2767244875431061, + "learning_rate": 4.719793077965267e-05, + "loss": 0.143, + "step": 6208 + }, + { + "epoch": 2.2928360413589366, + "grad_norm": 0.24206216633319855, + "learning_rate": 4.7173297204089176e-05, + "loss": 0.1486, + "step": 6209 + }, + { + "epoch": 2.293205317577548, + "grad_norm": 0.26780402660369873, + "learning_rate": 4.7148663628525684e-05, + "loss": 0.1584, + "step": 6210 + }, + { + "epoch": 2.2935745937961594, + "grad_norm": 0.3337785005569458, + "learning_rate": 4.7124030052962185e-05, + "loss": 0.1543, + "step": 6211 + }, + { + "epoch": 2.293943870014771, + "grad_norm": 0.27934199571609497, + "learning_rate": 4.709939647739869e-05, + "loss": 0.1612, + "step": 6212 + }, + { + "epoch": 2.2943131462333826, + "grad_norm": 0.24472008645534515, + "learning_rate": 4.70747629018352e-05, + "loss": 0.1394, + "step": 6213 + }, + { + "epoch": 2.294682422451994, + "grad_norm": 0.35752809047698975, + "learning_rate": 4.705012932627171e-05, + "loss": 0.1994, + "step": 6214 + }, + { + "epoch": 2.2950516986706058, + "grad_norm": 0.2711879014968872, + "learning_rate": 4.702549575070822e-05, + "loss": 0.1739, + "step": 6215 + }, + { + "epoch": 2.2954209748892174, + "grad_norm": 0.25454115867614746, + "learning_rate": 4.7000862175144725e-05, + "loss": 0.1533, + "step": 6216 + }, + { + "epoch": 2.2957902511078285, + "grad_norm": 0.24883179366588593, + "learning_rate": 4.6976228599581226e-05, + "loss": 0.1535, + "step": 6217 + }, + { + "epoch": 2.29615952732644, + "grad_norm": 0.26722273230552673, + "learning_rate": 4.6951595024017734e-05, + "loss": 0.1759, + "step": 6218 + }, + { + "epoch": 2.2965288035450517, + "grad_norm": 0.25976893305778503, + "learning_rate": 4.692696144845424e-05, + "loss": 0.1509, + "step": 6219 + }, + { + "epoch": 2.2968980797636633, + "grad_norm": 0.19868265092372894, + "learning_rate": 4.690232787289075e-05, + "loss": 0.1534, + "step": 6220 + }, + { + "epoch": 2.2972673559822745, + "grad_norm": 0.36545178294181824, + "learning_rate": 4.687769429732726e-05, + "loss": 0.1972, + "step": 6221 + }, + { + "epoch": 2.297636632200886, + "grad_norm": 0.33269011974334717, + "learning_rate": 4.6853060721763765e-05, + "loss": 0.1821, + "step": 6222 + }, + { + "epoch": 2.2980059084194977, + "grad_norm": 0.27616527676582336, + "learning_rate": 4.682842714620027e-05, + "loss": 0.1485, + "step": 6223 + }, + { + "epoch": 2.2983751846381093, + "grad_norm": 0.27548670768737793, + "learning_rate": 4.680379357063678e-05, + "loss": 0.1498, + "step": 6224 + }, + { + "epoch": 2.298744460856721, + "grad_norm": 0.24453361332416534, + "learning_rate": 4.677915999507328e-05, + "loss": 0.1837, + "step": 6225 + }, + { + "epoch": 2.2991137370753325, + "grad_norm": 0.26872798800468445, + "learning_rate": 4.675452641950979e-05, + "loss": 0.18, + "step": 6226 + }, + { + "epoch": 2.299483013293944, + "grad_norm": 0.24633759260177612, + "learning_rate": 4.67298928439463e-05, + "loss": 0.1746, + "step": 6227 + }, + { + "epoch": 2.2998522895125553, + "grad_norm": 0.26272451877593994, + "learning_rate": 4.6705259268382805e-05, + "loss": 0.1662, + "step": 6228 + }, + { + "epoch": 2.300221565731167, + "grad_norm": 0.20913535356521606, + "learning_rate": 4.668062569281931e-05, + "loss": 0.1586, + "step": 6229 + }, + { + "epoch": 2.3005908419497785, + "grad_norm": 0.2730373740196228, + "learning_rate": 4.665599211725582e-05, + "loss": 0.1917, + "step": 6230 + }, + { + "epoch": 2.30096011816839, + "grad_norm": 0.22976040840148926, + "learning_rate": 4.663135854169233e-05, + "loss": 0.1449, + "step": 6231 + }, + { + "epoch": 2.3013293943870012, + "grad_norm": 0.2512296140193939, + "learning_rate": 4.660672496612884e-05, + "loss": 0.17, + "step": 6232 + }, + { + "epoch": 2.301698670605613, + "grad_norm": 0.2268838882446289, + "learning_rate": 4.658209139056534e-05, + "loss": 0.1674, + "step": 6233 + }, + { + "epoch": 2.3020679468242244, + "grad_norm": 0.2965036928653717, + "learning_rate": 4.6557457815001846e-05, + "loss": 0.172, + "step": 6234 + }, + { + "epoch": 2.302437223042836, + "grad_norm": 0.30569368600845337, + "learning_rate": 4.6532824239438354e-05, + "loss": 0.155, + "step": 6235 + }, + { + "epoch": 2.3028064992614476, + "grad_norm": 0.2479228973388672, + "learning_rate": 4.650819066387486e-05, + "loss": 0.1486, + "step": 6236 + }, + { + "epoch": 2.3031757754800593, + "grad_norm": 0.2889600992202759, + "learning_rate": 4.648355708831137e-05, + "loss": 0.1859, + "step": 6237 + }, + { + "epoch": 2.303545051698671, + "grad_norm": 0.2949424386024475, + "learning_rate": 4.645892351274788e-05, + "loss": 0.1645, + "step": 6238 + }, + { + "epoch": 2.303914327917282, + "grad_norm": 0.2466827630996704, + "learning_rate": 4.6434289937184385e-05, + "loss": 0.139, + "step": 6239 + }, + { + "epoch": 2.3042836041358936, + "grad_norm": 0.288570761680603, + "learning_rate": 4.640965636162089e-05, + "loss": 0.1843, + "step": 6240 + }, + { + "epoch": 2.304652880354505, + "grad_norm": 0.4031515419483185, + "learning_rate": 4.6385022786057394e-05, + "loss": 0.1649, + "step": 6241 + }, + { + "epoch": 2.305022156573117, + "grad_norm": 0.3085927665233612, + "learning_rate": 4.63603892104939e-05, + "loss": 0.1768, + "step": 6242 + }, + { + "epoch": 2.305391432791728, + "grad_norm": 0.2230338454246521, + "learning_rate": 4.633575563493041e-05, + "loss": 0.1705, + "step": 6243 + }, + { + "epoch": 2.3057607090103396, + "grad_norm": 0.21577639877796173, + "learning_rate": 4.631112205936692e-05, + "loss": 0.1493, + "step": 6244 + }, + { + "epoch": 2.306129985228951, + "grad_norm": 0.38364434242248535, + "learning_rate": 4.6286488483803426e-05, + "loss": 0.1575, + "step": 6245 + }, + { + "epoch": 2.306499261447563, + "grad_norm": 0.2201223224401474, + "learning_rate": 4.6261854908239933e-05, + "loss": 0.1383, + "step": 6246 + }, + { + "epoch": 2.3068685376661744, + "grad_norm": 0.26706117391586304, + "learning_rate": 4.623722133267644e-05, + "loss": 0.1596, + "step": 6247 + }, + { + "epoch": 2.307237813884786, + "grad_norm": 0.28090304136276245, + "learning_rate": 4.621258775711295e-05, + "loss": 0.1702, + "step": 6248 + }, + { + "epoch": 2.307607090103397, + "grad_norm": 0.26446276903152466, + "learning_rate": 4.618795418154945e-05, + "loss": 0.1631, + "step": 6249 + }, + { + "epoch": 2.3079763663220088, + "grad_norm": 0.23596444725990295, + "learning_rate": 4.616332060598596e-05, + "loss": 0.1654, + "step": 6250 + }, + { + "epoch": 2.3079763663220088, + "eval_loss": 8.906844139099121, + "eval_runtime": 6.9404, + "eval_samples_per_second": 7.204, + "eval_steps_per_second": 1.009, + "step": 6250 + }, + { + "epoch": 2.3083456425406204, + "grad_norm": 0.23197369277477264, + "learning_rate": 4.6138687030422466e-05, + "loss": 0.1447, + "step": 6251 + }, + { + "epoch": 2.308714918759232, + "grad_norm": 0.25353261828422546, + "learning_rate": 4.6114053454858974e-05, + "loss": 0.1609, + "step": 6252 + }, + { + "epoch": 2.3090841949778436, + "grad_norm": 0.2803304195404053, + "learning_rate": 4.608941987929548e-05, + "loss": 0.1748, + "step": 6253 + }, + { + "epoch": 2.3094534711964547, + "grad_norm": 0.220031276345253, + "learning_rate": 4.606478630373199e-05, + "loss": 0.1537, + "step": 6254 + }, + { + "epoch": 2.3098227474150663, + "grad_norm": 0.2545096278190613, + "learning_rate": 4.60401527281685e-05, + "loss": 0.168, + "step": 6255 + }, + { + "epoch": 2.310192023633678, + "grad_norm": 0.2613220810890198, + "learning_rate": 4.6015519152605005e-05, + "loss": 0.1829, + "step": 6256 + }, + { + "epoch": 2.3105612998522895, + "grad_norm": 0.24163620173931122, + "learning_rate": 4.5990885577041506e-05, + "loss": 0.1522, + "step": 6257 + }, + { + "epoch": 2.310930576070901, + "grad_norm": 0.2600167393684387, + "learning_rate": 4.5966252001478014e-05, + "loss": 0.1694, + "step": 6258 + }, + { + "epoch": 2.3112998522895127, + "grad_norm": 0.2799946367740631, + "learning_rate": 4.594161842591452e-05, + "loss": 0.1565, + "step": 6259 + }, + { + "epoch": 2.311669128508124, + "grad_norm": 0.22951145470142365, + "learning_rate": 4.591698485035103e-05, + "loss": 0.1541, + "step": 6260 + }, + { + "epoch": 2.3120384047267355, + "grad_norm": 0.2867526412010193, + "learning_rate": 4.589235127478754e-05, + "loss": 0.17, + "step": 6261 + }, + { + "epoch": 2.312407680945347, + "grad_norm": 0.24332208931446075, + "learning_rate": 4.5867717699224046e-05, + "loss": 0.1516, + "step": 6262 + }, + { + "epoch": 2.3127769571639587, + "grad_norm": 0.2777426540851593, + "learning_rate": 4.5843084123660554e-05, + "loss": 0.1697, + "step": 6263 + }, + { + "epoch": 2.3131462333825703, + "grad_norm": 0.23616191744804382, + "learning_rate": 4.581845054809706e-05, + "loss": 0.141, + "step": 6264 + }, + { + "epoch": 2.3135155096011815, + "grad_norm": 0.2526165246963501, + "learning_rate": 4.579381697253356e-05, + "loss": 0.1598, + "step": 6265 + }, + { + "epoch": 2.313884785819793, + "grad_norm": 0.28287896513938904, + "learning_rate": 4.576918339697007e-05, + "loss": 0.2286, + "step": 6266 + }, + { + "epoch": 2.3142540620384047, + "grad_norm": 0.2726123034954071, + "learning_rate": 4.574454982140658e-05, + "loss": 0.1675, + "step": 6267 + }, + { + "epoch": 2.3146233382570163, + "grad_norm": 0.28874048590660095, + "learning_rate": 4.5719916245843086e-05, + "loss": 0.2006, + "step": 6268 + }, + { + "epoch": 2.314992614475628, + "grad_norm": 0.22696749866008759, + "learning_rate": 4.5695282670279594e-05, + "loss": 0.1613, + "step": 6269 + }, + { + "epoch": 2.3153618906942395, + "grad_norm": 0.23049207031726837, + "learning_rate": 4.56706490947161e-05, + "loss": 0.1483, + "step": 6270 + }, + { + "epoch": 2.3157311669128506, + "grad_norm": 0.2424476146697998, + "learning_rate": 4.564601551915261e-05, + "loss": 0.168, + "step": 6271 + }, + { + "epoch": 2.3161004431314622, + "grad_norm": 0.2698257863521576, + "learning_rate": 4.562138194358912e-05, + "loss": 0.1804, + "step": 6272 + }, + { + "epoch": 2.316469719350074, + "grad_norm": 0.24617724120616913, + "learning_rate": 4.559674836802562e-05, + "loss": 0.1505, + "step": 6273 + }, + { + "epoch": 2.3168389955686854, + "grad_norm": 0.31448906660079956, + "learning_rate": 4.5572114792462127e-05, + "loss": 0.1592, + "step": 6274 + }, + { + "epoch": 2.317208271787297, + "grad_norm": 0.27043938636779785, + "learning_rate": 4.5547481216898634e-05, + "loss": 0.165, + "step": 6275 + }, + { + "epoch": 2.317577548005908, + "grad_norm": 0.32566961646080017, + "learning_rate": 4.552284764133514e-05, + "loss": 0.2052, + "step": 6276 + }, + { + "epoch": 2.31794682422452, + "grad_norm": 0.36077582836151123, + "learning_rate": 4.549821406577165e-05, + "loss": 0.1874, + "step": 6277 + }, + { + "epoch": 2.3183161004431314, + "grad_norm": 0.25400418043136597, + "learning_rate": 4.547358049020816e-05, + "loss": 0.1634, + "step": 6278 + }, + { + "epoch": 2.318685376661743, + "grad_norm": 0.2810487747192383, + "learning_rate": 4.5448946914644666e-05, + "loss": 0.1782, + "step": 6279 + }, + { + "epoch": 2.3190546528803546, + "grad_norm": 0.2795398533344269, + "learning_rate": 4.5424313339081174e-05, + "loss": 0.1677, + "step": 6280 + }, + { + "epoch": 2.319423929098966, + "grad_norm": 0.2827003598213196, + "learning_rate": 4.5399679763517675e-05, + "loss": 0.1766, + "step": 6281 + }, + { + "epoch": 2.3197932053175774, + "grad_norm": 0.2028317004442215, + "learning_rate": 4.537504618795418e-05, + "loss": 0.147, + "step": 6282 + }, + { + "epoch": 2.320162481536189, + "grad_norm": 0.2884708344936371, + "learning_rate": 4.535041261239069e-05, + "loss": 0.1901, + "step": 6283 + }, + { + "epoch": 2.3205317577548006, + "grad_norm": 0.24429160356521606, + "learning_rate": 4.53257790368272e-05, + "loss": 0.1758, + "step": 6284 + }, + { + "epoch": 2.320901033973412, + "grad_norm": 0.25365304946899414, + "learning_rate": 4.5301145461263706e-05, + "loss": 0.1414, + "step": 6285 + }, + { + "epoch": 2.321270310192024, + "grad_norm": 0.277723491191864, + "learning_rate": 4.5276511885700214e-05, + "loss": 0.1864, + "step": 6286 + }, + { + "epoch": 2.321639586410635, + "grad_norm": 0.28453826904296875, + "learning_rate": 4.525187831013672e-05, + "loss": 0.1426, + "step": 6287 + }, + { + "epoch": 2.3220088626292466, + "grad_norm": 0.28116118907928467, + "learning_rate": 4.522724473457322e-05, + "loss": 0.1598, + "step": 6288 + }, + { + "epoch": 2.322378138847858, + "grad_norm": 0.2716774046421051, + "learning_rate": 4.520261115900973e-05, + "loss": 0.1496, + "step": 6289 + }, + { + "epoch": 2.3227474150664698, + "grad_norm": 0.3424721658229828, + "learning_rate": 4.517797758344624e-05, + "loss": 0.1795, + "step": 6290 + }, + { + "epoch": 2.3231166912850814, + "grad_norm": 0.2584627568721771, + "learning_rate": 4.515334400788275e-05, + "loss": 0.1809, + "step": 6291 + }, + { + "epoch": 2.323485967503693, + "grad_norm": 0.25674933195114136, + "learning_rate": 4.5128710432319254e-05, + "loss": 0.1513, + "step": 6292 + }, + { + "epoch": 2.323855243722304, + "grad_norm": 0.42123061418533325, + "learning_rate": 4.510407685675576e-05, + "loss": 0.1932, + "step": 6293 + }, + { + "epoch": 2.3242245199409157, + "grad_norm": 0.242903470993042, + "learning_rate": 4.507944328119227e-05, + "loss": 0.1636, + "step": 6294 + }, + { + "epoch": 2.3245937961595273, + "grad_norm": 0.29942288994789124, + "learning_rate": 4.505480970562878e-05, + "loss": 0.1879, + "step": 6295 + }, + { + "epoch": 2.324963072378139, + "grad_norm": 0.26634547114372253, + "learning_rate": 4.503017613006528e-05, + "loss": 0.1706, + "step": 6296 + }, + { + "epoch": 2.3253323485967505, + "grad_norm": 0.2594495415687561, + "learning_rate": 4.500554255450179e-05, + "loss": 0.2041, + "step": 6297 + }, + { + "epoch": 2.3257016248153617, + "grad_norm": 0.2658875286579132, + "learning_rate": 4.4980908978938295e-05, + "loss": 0.1674, + "step": 6298 + }, + { + "epoch": 2.3260709010339733, + "grad_norm": 0.2579420506954193, + "learning_rate": 4.49562754033748e-05, + "loss": 0.1538, + "step": 6299 + }, + { + "epoch": 2.326440177252585, + "grad_norm": 0.23655763268470764, + "learning_rate": 4.493164182781131e-05, + "loss": 0.1602, + "step": 6300 + }, + { + "epoch": 2.326440177252585, + "eval_loss": 8.906062126159668, + "eval_runtime": 6.9038, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 1.014, + "step": 6300 + }, + { + "epoch": 2.3268094534711965, + "grad_norm": 0.21723045408725739, + "learning_rate": 4.490700825224782e-05, + "loss": 0.1608, + "step": 6301 + }, + { + "epoch": 2.327178729689808, + "grad_norm": 0.25965991616249084, + "learning_rate": 4.4882374676684326e-05, + "loss": 0.1639, + "step": 6302 + }, + { + "epoch": 2.3275480059084197, + "grad_norm": 0.2066727876663208, + "learning_rate": 4.4857741101120834e-05, + "loss": 0.1574, + "step": 6303 + }, + { + "epoch": 2.327917282127031, + "grad_norm": 0.2679888904094696, + "learning_rate": 4.4833107525557335e-05, + "loss": 0.1677, + "step": 6304 + }, + { + "epoch": 2.3282865583456425, + "grad_norm": 0.28912490606307983, + "learning_rate": 4.480847394999384e-05, + "loss": 0.1871, + "step": 6305 + }, + { + "epoch": 2.328655834564254, + "grad_norm": 0.29908832907676697, + "learning_rate": 4.478384037443035e-05, + "loss": 0.1628, + "step": 6306 + }, + { + "epoch": 2.3290251107828657, + "grad_norm": 0.24215038120746613, + "learning_rate": 4.475920679886686e-05, + "loss": 0.1568, + "step": 6307 + }, + { + "epoch": 2.3293943870014773, + "grad_norm": 0.2995264232158661, + "learning_rate": 4.473457322330337e-05, + "loss": 0.1662, + "step": 6308 + }, + { + "epoch": 2.3297636632200884, + "grad_norm": 0.27667415142059326, + "learning_rate": 4.4709939647739875e-05, + "loss": 0.1943, + "step": 6309 + }, + { + "epoch": 2.3301329394387, + "grad_norm": 0.27024340629577637, + "learning_rate": 4.468530607217638e-05, + "loss": 0.1622, + "step": 6310 + }, + { + "epoch": 2.3305022156573116, + "grad_norm": 0.24666908383369446, + "learning_rate": 4.466067249661289e-05, + "loss": 0.1405, + "step": 6311 + }, + { + "epoch": 2.3308714918759232, + "grad_norm": 0.27632877230644226, + "learning_rate": 4.463603892104939e-05, + "loss": 0.1389, + "step": 6312 + }, + { + "epoch": 2.331240768094535, + "grad_norm": 0.23172470927238464, + "learning_rate": 4.46114053454859e-05, + "loss": 0.1553, + "step": 6313 + }, + { + "epoch": 2.3316100443131464, + "grad_norm": 0.29325976967811584, + "learning_rate": 4.458677176992241e-05, + "loss": 0.1857, + "step": 6314 + }, + { + "epoch": 2.3319793205317576, + "grad_norm": 0.2602393627166748, + "learning_rate": 4.4562138194358915e-05, + "loss": 0.1499, + "step": 6315 + }, + { + "epoch": 2.332348596750369, + "grad_norm": 0.31172099709510803, + "learning_rate": 4.453750461879542e-05, + "loss": 0.1577, + "step": 6316 + }, + { + "epoch": 2.332717872968981, + "grad_norm": 0.25432848930358887, + "learning_rate": 4.451287104323193e-05, + "loss": 0.1688, + "step": 6317 + }, + { + "epoch": 2.3330871491875924, + "grad_norm": 0.28142818808555603, + "learning_rate": 4.448823746766844e-05, + "loss": 0.1692, + "step": 6318 + }, + { + "epoch": 2.333456425406204, + "grad_norm": 0.3042151629924774, + "learning_rate": 4.4463603892104946e-05, + "loss": 0.1891, + "step": 6319 + }, + { + "epoch": 2.333825701624815, + "grad_norm": 0.2881665825843811, + "learning_rate": 4.443897031654145e-05, + "loss": 0.172, + "step": 6320 + }, + { + "epoch": 2.3341949778434268, + "grad_norm": 0.2515096068382263, + "learning_rate": 4.4414336740977955e-05, + "loss": 0.1403, + "step": 6321 + }, + { + "epoch": 2.3345642540620384, + "grad_norm": 0.2738690972328186, + "learning_rate": 4.438970316541446e-05, + "loss": 0.1694, + "step": 6322 + }, + { + "epoch": 2.33493353028065, + "grad_norm": 0.2799984812736511, + "learning_rate": 4.436506958985097e-05, + "loss": 0.1723, + "step": 6323 + }, + { + "epoch": 2.3353028064992616, + "grad_norm": 0.2334475964307785, + "learning_rate": 4.434043601428748e-05, + "loss": 0.1584, + "step": 6324 + }, + { + "epoch": 2.335672082717873, + "grad_norm": 0.2593860924243927, + "learning_rate": 4.431580243872399e-05, + "loss": 0.1523, + "step": 6325 + }, + { + "epoch": 2.3360413589364843, + "grad_norm": 0.19172629714012146, + "learning_rate": 4.4291168863160495e-05, + "loss": 0.1322, + "step": 6326 + }, + { + "epoch": 2.336410635155096, + "grad_norm": 0.2948048412799835, + "learning_rate": 4.4266535287596996e-05, + "loss": 0.1652, + "step": 6327 + }, + { + "epoch": 2.3367799113737076, + "grad_norm": 0.2443239390850067, + "learning_rate": 4.4241901712033504e-05, + "loss": 0.1529, + "step": 6328 + }, + { + "epoch": 2.337149187592319, + "grad_norm": 0.20666438341140747, + "learning_rate": 4.4217268136470005e-05, + "loss": 0.1273, + "step": 6329 + }, + { + "epoch": 2.3375184638109308, + "grad_norm": 0.25608763098716736, + "learning_rate": 4.419263456090651e-05, + "loss": 0.1549, + "step": 6330 + }, + { + "epoch": 2.337887740029542, + "grad_norm": 0.24947820603847504, + "learning_rate": 4.416800098534302e-05, + "loss": 0.18, + "step": 6331 + }, + { + "epoch": 2.3382570162481535, + "grad_norm": 0.21155625581741333, + "learning_rate": 4.414336740977953e-05, + "loss": 0.151, + "step": 6332 + }, + { + "epoch": 2.338626292466765, + "grad_norm": 0.21592651307582855, + "learning_rate": 4.4118733834216036e-05, + "loss": 0.1289, + "step": 6333 + }, + { + "epoch": 2.3389955686853767, + "grad_norm": 0.2873937487602234, + "learning_rate": 4.4094100258652544e-05, + "loss": 0.1717, + "step": 6334 + }, + { + "epoch": 2.3393648449039883, + "grad_norm": 0.23449206352233887, + "learning_rate": 4.406946668308905e-05, + "loss": 0.1483, + "step": 6335 + }, + { + "epoch": 2.3397341211226, + "grad_norm": 0.23663613200187683, + "learning_rate": 4.404483310752556e-05, + "loss": 0.1679, + "step": 6336 + }, + { + "epoch": 2.340103397341211, + "grad_norm": 0.2152911275625229, + "learning_rate": 4.402019953196206e-05, + "loss": 0.166, + "step": 6337 + }, + { + "epoch": 2.3404726735598227, + "grad_norm": 0.22667697072029114, + "learning_rate": 4.399556595639857e-05, + "loss": 0.1594, + "step": 6338 + }, + { + "epoch": 2.3408419497784343, + "grad_norm": 0.22826431691646576, + "learning_rate": 4.397093238083508e-05, + "loss": 0.1543, + "step": 6339 + }, + { + "epoch": 2.341211225997046, + "grad_norm": 0.27181118726730347, + "learning_rate": 4.3946298805271585e-05, + "loss": 0.1504, + "step": 6340 + }, + { + "epoch": 2.3415805022156575, + "grad_norm": 0.2781362533569336, + "learning_rate": 4.392166522970809e-05, + "loss": 0.1746, + "step": 6341 + }, + { + "epoch": 2.3419497784342687, + "grad_norm": 0.3055467903614044, + "learning_rate": 4.38970316541446e-05, + "loss": 0.1585, + "step": 6342 + }, + { + "epoch": 2.3423190546528803, + "grad_norm": 0.3723445236682892, + "learning_rate": 4.387239807858111e-05, + "loss": 0.1512, + "step": 6343 + }, + { + "epoch": 2.342688330871492, + "grad_norm": 0.24662333726882935, + "learning_rate": 4.3847764503017616e-05, + "loss": 0.1495, + "step": 6344 + }, + { + "epoch": 2.3430576070901035, + "grad_norm": 0.2578054368495941, + "learning_rate": 4.382313092745412e-05, + "loss": 0.1579, + "step": 6345 + }, + { + "epoch": 2.343426883308715, + "grad_norm": 0.3027632534503937, + "learning_rate": 4.3798497351890625e-05, + "loss": 0.1819, + "step": 6346 + }, + { + "epoch": 2.3437961595273267, + "grad_norm": 0.2887378931045532, + "learning_rate": 4.377386377632713e-05, + "loss": 0.1624, + "step": 6347 + }, + { + "epoch": 2.344165435745938, + "grad_norm": 0.26808691024780273, + "learning_rate": 4.374923020076364e-05, + "loss": 0.1664, + "step": 6348 + }, + { + "epoch": 2.3445347119645494, + "grad_norm": 0.228684663772583, + "learning_rate": 4.372459662520015e-05, + "loss": 0.1326, + "step": 6349 + }, + { + "epoch": 2.344903988183161, + "grad_norm": 0.246864914894104, + "learning_rate": 4.3699963049636656e-05, + "loss": 0.1695, + "step": 6350 + }, + { + "epoch": 2.344903988183161, + "eval_loss": 8.847007751464844, + "eval_runtime": 6.9096, + "eval_samples_per_second": 7.236, + "eval_steps_per_second": 1.013, + "step": 6350 + }, + { + "epoch": 2.3452732644017726, + "grad_norm": 0.2502978444099426, + "learning_rate": 4.3675329474073164e-05, + "loss": 0.1558, + "step": 6351 + }, + { + "epoch": 2.345642540620384, + "grad_norm": 0.23514792323112488, + "learning_rate": 4.365069589850967e-05, + "loss": 0.158, + "step": 6352 + }, + { + "epoch": 2.3460118168389954, + "grad_norm": 0.2528262734413147, + "learning_rate": 4.362606232294617e-05, + "loss": 0.1601, + "step": 6353 + }, + { + "epoch": 2.346381093057607, + "grad_norm": 0.2973999083042145, + "learning_rate": 4.360142874738268e-05, + "loss": 0.1612, + "step": 6354 + }, + { + "epoch": 2.3467503692762186, + "grad_norm": 0.30021506547927856, + "learning_rate": 4.357679517181919e-05, + "loss": 0.1701, + "step": 6355 + }, + { + "epoch": 2.34711964549483, + "grad_norm": 0.27526965737342834, + "learning_rate": 4.35521615962557e-05, + "loss": 0.1775, + "step": 6356 + }, + { + "epoch": 2.347488921713442, + "grad_norm": 0.27101030945777893, + "learning_rate": 4.3527528020692205e-05, + "loss": 0.1578, + "step": 6357 + }, + { + "epoch": 2.3478581979320534, + "grad_norm": 0.20426113903522491, + "learning_rate": 4.350289444512871e-05, + "loss": 0.1321, + "step": 6358 + }, + { + "epoch": 2.3482274741506646, + "grad_norm": 0.2943769097328186, + "learning_rate": 4.347826086956522e-05, + "loss": 0.1722, + "step": 6359 + }, + { + "epoch": 2.348596750369276, + "grad_norm": 0.2364971786737442, + "learning_rate": 4.345362729400173e-05, + "loss": 0.1364, + "step": 6360 + }, + { + "epoch": 2.348966026587888, + "grad_norm": 0.29631149768829346, + "learning_rate": 4.342899371843823e-05, + "loss": 0.1931, + "step": 6361 + }, + { + "epoch": 2.3493353028064994, + "grad_norm": 0.25259944796562195, + "learning_rate": 4.340436014287474e-05, + "loss": 0.1565, + "step": 6362 + }, + { + "epoch": 2.3497045790251105, + "grad_norm": 0.2911149561405182, + "learning_rate": 4.3379726567311245e-05, + "loss": 0.1669, + "step": 6363 + }, + { + "epoch": 2.350073855243722, + "grad_norm": 0.21698953211307526, + "learning_rate": 4.335509299174775e-05, + "loss": 0.1537, + "step": 6364 + }, + { + "epoch": 2.3504431314623337, + "grad_norm": 0.32403573393821716, + "learning_rate": 4.333045941618426e-05, + "loss": 0.1712, + "step": 6365 + }, + { + "epoch": 2.3508124076809453, + "grad_norm": 0.3207538425922394, + "learning_rate": 4.330582584062077e-05, + "loss": 0.1778, + "step": 6366 + }, + { + "epoch": 2.351181683899557, + "grad_norm": 0.26004767417907715, + "learning_rate": 4.3281192265057276e-05, + "loss": 0.1726, + "step": 6367 + }, + { + "epoch": 2.3515509601181686, + "grad_norm": 0.27153441309928894, + "learning_rate": 4.3256558689493784e-05, + "loss": 0.1702, + "step": 6368 + }, + { + "epoch": 2.35192023633678, + "grad_norm": 0.26396921277046204, + "learning_rate": 4.3231925113930285e-05, + "loss": 0.1885, + "step": 6369 + }, + { + "epoch": 2.3522895125553913, + "grad_norm": 0.22462014853954315, + "learning_rate": 4.320729153836679e-05, + "loss": 0.1754, + "step": 6370 + }, + { + "epoch": 2.352658788774003, + "grad_norm": 0.24633799493312836, + "learning_rate": 4.31826579628033e-05, + "loss": 0.1511, + "step": 6371 + }, + { + "epoch": 2.3530280649926145, + "grad_norm": 0.24897989630699158, + "learning_rate": 4.315802438723981e-05, + "loss": 0.1561, + "step": 6372 + }, + { + "epoch": 2.353397341211226, + "grad_norm": 0.23479872941970825, + "learning_rate": 4.313339081167632e-05, + "loss": 0.1494, + "step": 6373 + }, + { + "epoch": 2.3537666174298373, + "grad_norm": 0.2528115510940552, + "learning_rate": 4.3108757236112825e-05, + "loss": 0.1681, + "step": 6374 + }, + { + "epoch": 2.354135893648449, + "grad_norm": 0.23565773665905, + "learning_rate": 4.308412366054933e-05, + "loss": 0.1717, + "step": 6375 + }, + { + "epoch": 2.3545051698670605, + "grad_norm": 0.21669965982437134, + "learning_rate": 4.305949008498584e-05, + "loss": 0.1411, + "step": 6376 + }, + { + "epoch": 2.354874446085672, + "grad_norm": 0.23970626294612885, + "learning_rate": 4.303485650942234e-05, + "loss": 0.1423, + "step": 6377 + }, + { + "epoch": 2.3552437223042837, + "grad_norm": 0.24445860087871552, + "learning_rate": 4.301022293385885e-05, + "loss": 0.1687, + "step": 6378 + }, + { + "epoch": 2.3556129985228953, + "grad_norm": 0.2811559736728668, + "learning_rate": 4.298558935829536e-05, + "loss": 0.1577, + "step": 6379 + }, + { + "epoch": 2.3559822747415065, + "grad_norm": 0.24085548520088196, + "learning_rate": 4.2960955782731865e-05, + "loss": 0.1465, + "step": 6380 + }, + { + "epoch": 2.356351550960118, + "grad_norm": 0.258731871843338, + "learning_rate": 4.293632220716837e-05, + "loss": 0.1547, + "step": 6381 + }, + { + "epoch": 2.3567208271787297, + "grad_norm": 0.2683509886264801, + "learning_rate": 4.291168863160488e-05, + "loss": 0.1538, + "step": 6382 + }, + { + "epoch": 2.3570901033973413, + "grad_norm": 0.35104984045028687, + "learning_rate": 4.288705505604139e-05, + "loss": 0.1515, + "step": 6383 + }, + { + "epoch": 2.357459379615953, + "grad_norm": 0.22519122064113617, + "learning_rate": 4.28624214804779e-05, + "loss": 0.1524, + "step": 6384 + }, + { + "epoch": 2.357828655834564, + "grad_norm": 0.26378434896469116, + "learning_rate": 4.28377879049144e-05, + "loss": 0.1331, + "step": 6385 + }, + { + "epoch": 2.3581979320531756, + "grad_norm": 0.24841849505901337, + "learning_rate": 4.2813154329350906e-05, + "loss": 0.1531, + "step": 6386 + }, + { + "epoch": 2.3585672082717872, + "grad_norm": 0.22380207479000092, + "learning_rate": 4.2788520753787413e-05, + "loss": 0.1535, + "step": 6387 + }, + { + "epoch": 2.358936484490399, + "grad_norm": 0.3005625605583191, + "learning_rate": 4.276388717822392e-05, + "loss": 0.1701, + "step": 6388 + }, + { + "epoch": 2.3593057607090104, + "grad_norm": 0.2842036187648773, + "learning_rate": 4.273925360266043e-05, + "loss": 0.1869, + "step": 6389 + }, + { + "epoch": 2.359675036927622, + "grad_norm": 0.2560485601425171, + "learning_rate": 4.271462002709694e-05, + "loss": 0.1535, + "step": 6390 + }, + { + "epoch": 2.360044313146233, + "grad_norm": 0.26803532242774963, + "learning_rate": 4.2689986451533445e-05, + "loss": 0.176, + "step": 6391 + }, + { + "epoch": 2.360413589364845, + "grad_norm": 0.24676063656806946, + "learning_rate": 4.266535287596995e-05, + "loss": 0.1674, + "step": 6392 + }, + { + "epoch": 2.3607828655834564, + "grad_norm": 0.30460405349731445, + "learning_rate": 4.2640719300406454e-05, + "loss": 0.196, + "step": 6393 + }, + { + "epoch": 2.361152141802068, + "grad_norm": 0.21518395841121674, + "learning_rate": 4.261608572484296e-05, + "loss": 0.1498, + "step": 6394 + }, + { + "epoch": 2.3615214180206796, + "grad_norm": 0.23324288427829742, + "learning_rate": 4.259145214927947e-05, + "loss": 0.1563, + "step": 6395 + }, + { + "epoch": 2.3618906942392908, + "grad_norm": 0.21555882692337036, + "learning_rate": 4.256681857371598e-05, + "loss": 0.1455, + "step": 6396 + }, + { + "epoch": 2.3622599704579024, + "grad_norm": 0.38672226667404175, + "learning_rate": 4.2542184998152485e-05, + "loss": 0.1806, + "step": 6397 + }, + { + "epoch": 2.362629246676514, + "grad_norm": 0.2865959703922272, + "learning_rate": 4.251755142258899e-05, + "loss": 0.1643, + "step": 6398 + }, + { + "epoch": 2.3629985228951256, + "grad_norm": 0.2968366742134094, + "learning_rate": 4.24929178470255e-05, + "loss": 0.1619, + "step": 6399 + }, + { + "epoch": 2.363367799113737, + "grad_norm": 0.28235292434692383, + "learning_rate": 4.246828427146201e-05, + "loss": 0.1764, + "step": 6400 + }, + { + "epoch": 2.363367799113737, + "eval_loss": 8.831260681152344, + "eval_runtime": 7.0456, + "eval_samples_per_second": 7.097, + "eval_steps_per_second": 0.994, + "step": 6400 + }, + { + "epoch": 2.363737075332349, + "grad_norm": 0.28704458475112915, + "learning_rate": 4.244365069589851e-05, + "loss": 0.1575, + "step": 6401 + }, + { + "epoch": 2.36410635155096, + "grad_norm": 0.27540290355682373, + "learning_rate": 4.241901712033502e-05, + "loss": 0.1863, + "step": 6402 + }, + { + "epoch": 2.3644756277695715, + "grad_norm": 0.22649957239627838, + "learning_rate": 4.2394383544771526e-05, + "loss": 0.1443, + "step": 6403 + }, + { + "epoch": 2.364844903988183, + "grad_norm": 0.2559596002101898, + "learning_rate": 4.2369749969208034e-05, + "loss": 0.1472, + "step": 6404 + }, + { + "epoch": 2.3652141802067947, + "grad_norm": 0.23610402643680573, + "learning_rate": 4.234511639364454e-05, + "loss": 0.1668, + "step": 6405 + }, + { + "epoch": 2.3655834564254064, + "grad_norm": 0.2959190309047699, + "learning_rate": 4.232048281808105e-05, + "loss": 0.1738, + "step": 6406 + }, + { + "epoch": 2.3659527326440175, + "grad_norm": 0.24099315702915192, + "learning_rate": 4.229584924251756e-05, + "loss": 0.1453, + "step": 6407 + }, + { + "epoch": 2.366322008862629, + "grad_norm": 0.2292603999376297, + "learning_rate": 4.227121566695406e-05, + "loss": 0.1401, + "step": 6408 + }, + { + "epoch": 2.3666912850812407, + "grad_norm": 0.2881355285644531, + "learning_rate": 4.2246582091390566e-05, + "loss": 0.1782, + "step": 6409 + }, + { + "epoch": 2.3670605612998523, + "grad_norm": 0.23953141272068024, + "learning_rate": 4.2221948515827074e-05, + "loss": 0.1453, + "step": 6410 + }, + { + "epoch": 2.367429837518464, + "grad_norm": 0.23177595436573029, + "learning_rate": 4.219731494026358e-05, + "loss": 0.1565, + "step": 6411 + }, + { + "epoch": 2.3677991137370755, + "grad_norm": 0.27608489990234375, + "learning_rate": 4.217268136470009e-05, + "loss": 0.1684, + "step": 6412 + }, + { + "epoch": 2.3681683899556867, + "grad_norm": 0.24285469949245453, + "learning_rate": 4.21480477891366e-05, + "loss": 0.1657, + "step": 6413 + }, + { + "epoch": 2.3685376661742983, + "grad_norm": 0.21426056325435638, + "learning_rate": 4.2123414213573105e-05, + "loss": 0.1473, + "step": 6414 + }, + { + "epoch": 2.36890694239291, + "grad_norm": 0.2743415832519531, + "learning_rate": 4.209878063800961e-05, + "loss": 0.1518, + "step": 6415 + }, + { + "epoch": 2.3692762186115215, + "grad_norm": 0.2567157745361328, + "learning_rate": 4.2074147062446114e-05, + "loss": 0.1846, + "step": 6416 + }, + { + "epoch": 2.369645494830133, + "grad_norm": 0.2379358857870102, + "learning_rate": 4.204951348688262e-05, + "loss": 0.1779, + "step": 6417 + }, + { + "epoch": 2.3700147710487443, + "grad_norm": 0.3673871159553528, + "learning_rate": 4.202487991131913e-05, + "loss": 0.1825, + "step": 6418 + }, + { + "epoch": 2.370384047267356, + "grad_norm": 0.2811566889286041, + "learning_rate": 4.200024633575564e-05, + "loss": 0.1664, + "step": 6419 + }, + { + "epoch": 2.3707533234859675, + "grad_norm": 0.2604074478149414, + "learning_rate": 4.1975612760192146e-05, + "loss": 0.1712, + "step": 6420 + }, + { + "epoch": 2.371122599704579, + "grad_norm": 0.19485639035701752, + "learning_rate": 4.1950979184628654e-05, + "loss": 0.1404, + "step": 6421 + }, + { + "epoch": 2.3714918759231907, + "grad_norm": 0.2568664848804474, + "learning_rate": 4.192634560906516e-05, + "loss": 0.1506, + "step": 6422 + }, + { + "epoch": 2.3718611521418023, + "grad_norm": 0.25063255429267883, + "learning_rate": 4.190171203350167e-05, + "loss": 0.1573, + "step": 6423 + }, + { + "epoch": 2.3722304283604134, + "grad_norm": 0.26220011711120605, + "learning_rate": 4.187707845793817e-05, + "loss": 0.1671, + "step": 6424 + }, + { + "epoch": 2.372599704579025, + "grad_norm": 0.2678585350513458, + "learning_rate": 4.185244488237468e-05, + "loss": 0.1826, + "step": 6425 + }, + { + "epoch": 2.3729689807976366, + "grad_norm": 0.30907028913497925, + "learning_rate": 4.1827811306811186e-05, + "loss": 0.19, + "step": 6426 + }, + { + "epoch": 2.3733382570162482, + "grad_norm": 0.265207976102829, + "learning_rate": 4.1803177731247694e-05, + "loss": 0.1843, + "step": 6427 + }, + { + "epoch": 2.37370753323486, + "grad_norm": 0.23149374127388, + "learning_rate": 4.17785441556842e-05, + "loss": 0.1616, + "step": 6428 + }, + { + "epoch": 2.374076809453471, + "grad_norm": 0.2719758450984955, + "learning_rate": 4.175391058012071e-05, + "loss": 0.181, + "step": 6429 + }, + { + "epoch": 2.3744460856720826, + "grad_norm": 0.3266640305519104, + "learning_rate": 4.172927700455722e-05, + "loss": 0.1771, + "step": 6430 + }, + { + "epoch": 2.374815361890694, + "grad_norm": 0.25291907787323, + "learning_rate": 4.1704643428993726e-05, + "loss": 0.1549, + "step": 6431 + }, + { + "epoch": 2.375184638109306, + "grad_norm": 0.2400006502866745, + "learning_rate": 4.168000985343023e-05, + "loss": 0.1511, + "step": 6432 + }, + { + "epoch": 2.3755539143279174, + "grad_norm": 0.2594726085662842, + "learning_rate": 4.1655376277866734e-05, + "loss": 0.1456, + "step": 6433 + }, + { + "epoch": 2.375923190546529, + "grad_norm": 0.2261277139186859, + "learning_rate": 4.163074270230324e-05, + "loss": 0.145, + "step": 6434 + }, + { + "epoch": 2.37629246676514, + "grad_norm": 0.23132039606571198, + "learning_rate": 4.160610912673975e-05, + "loss": 0.1596, + "step": 6435 + }, + { + "epoch": 2.3766617429837518, + "grad_norm": 0.24883174896240234, + "learning_rate": 4.158147555117626e-05, + "loss": 0.1594, + "step": 6436 + }, + { + "epoch": 2.3770310192023634, + "grad_norm": 0.3022156357765198, + "learning_rate": 4.1556841975612766e-05, + "loss": 0.1573, + "step": 6437 + }, + { + "epoch": 2.377400295420975, + "grad_norm": 0.22615794837474823, + "learning_rate": 4.1532208400049274e-05, + "loss": 0.1515, + "step": 6438 + }, + { + "epoch": 2.3777695716395866, + "grad_norm": 0.24303200840950012, + "learning_rate": 4.150757482448578e-05, + "loss": 0.1605, + "step": 6439 + }, + { + "epoch": 2.3781388478581977, + "grad_norm": 0.2612857222557068, + "learning_rate": 4.148294124892228e-05, + "loss": 0.1477, + "step": 6440 + }, + { + "epoch": 2.3785081240768093, + "grad_norm": 0.22234545648097992, + "learning_rate": 4.145830767335879e-05, + "loss": 0.1564, + "step": 6441 + }, + { + "epoch": 2.378877400295421, + "grad_norm": 0.23377478122711182, + "learning_rate": 4.14336740977953e-05, + "loss": 0.1623, + "step": 6442 + }, + { + "epoch": 2.3792466765140325, + "grad_norm": 0.30916154384613037, + "learning_rate": 4.14090405222318e-05, + "loss": 0.1629, + "step": 6443 + }, + { + "epoch": 2.379615952732644, + "grad_norm": 0.27756446599960327, + "learning_rate": 4.138440694666831e-05, + "loss": 0.1788, + "step": 6444 + }, + { + "epoch": 2.3799852289512557, + "grad_norm": 0.2724912762641907, + "learning_rate": 4.1359773371104815e-05, + "loss": 0.1794, + "step": 6445 + }, + { + "epoch": 2.380354505169867, + "grad_norm": 0.24997062981128693, + "learning_rate": 4.133513979554132e-05, + "loss": 0.1613, + "step": 6446 + }, + { + "epoch": 2.3807237813884785, + "grad_norm": 0.26392772793769836, + "learning_rate": 4.131050621997783e-05, + "loss": 0.1603, + "step": 6447 + }, + { + "epoch": 2.38109305760709, + "grad_norm": 0.2502446472644806, + "learning_rate": 4.128587264441434e-05, + "loss": 0.1565, + "step": 6448 + }, + { + "epoch": 2.3814623338257017, + "grad_norm": 0.26440832018852234, + "learning_rate": 4.126123906885084e-05, + "loss": 0.1698, + "step": 6449 + }, + { + "epoch": 2.3818316100443133, + "grad_norm": 0.2729743719100952, + "learning_rate": 4.123660549328735e-05, + "loss": 0.1778, + "step": 6450 + }, + { + "epoch": 2.3818316100443133, + "eval_loss": 8.802284240722656, + "eval_runtime": 6.9022, + "eval_samples_per_second": 7.244, + "eval_steps_per_second": 1.014, + "step": 6450 + }, + { + "epoch": 2.3822008862629245, + "grad_norm": 0.2942679226398468, + "learning_rate": 4.1211971917723856e-05, + "loss": 0.1834, + "step": 6451 + }, + { + "epoch": 2.382570162481536, + "grad_norm": 0.30930110812187195, + "learning_rate": 4.1187338342160364e-05, + "loss": 0.1671, + "step": 6452 + }, + { + "epoch": 2.3829394387001477, + "grad_norm": 0.2370852530002594, + "learning_rate": 4.116270476659687e-05, + "loss": 0.163, + "step": 6453 + }, + { + "epoch": 2.3833087149187593, + "grad_norm": 0.2752065658569336, + "learning_rate": 4.113807119103338e-05, + "loss": 0.1387, + "step": 6454 + }, + { + "epoch": 2.383677991137371, + "grad_norm": 0.3003709316253662, + "learning_rate": 4.111343761546989e-05, + "loss": 0.1597, + "step": 6455 + }, + { + "epoch": 2.3840472673559825, + "grad_norm": 0.30252590775489807, + "learning_rate": 4.1088804039906395e-05, + "loss": 0.1661, + "step": 6456 + }, + { + "epoch": 2.3844165435745936, + "grad_norm": 0.2542496919631958, + "learning_rate": 4.1064170464342896e-05, + "loss": 0.1418, + "step": 6457 + }, + { + "epoch": 2.3847858197932053, + "grad_norm": 0.2691532373428345, + "learning_rate": 4.1039536888779404e-05, + "loss": 0.1805, + "step": 6458 + }, + { + "epoch": 2.385155096011817, + "grad_norm": 0.21074455976486206, + "learning_rate": 4.101490331321591e-05, + "loss": 0.1336, + "step": 6459 + }, + { + "epoch": 2.3855243722304285, + "grad_norm": 0.32025569677352905, + "learning_rate": 4.099026973765242e-05, + "loss": 0.1486, + "step": 6460 + }, + { + "epoch": 2.38589364844904, + "grad_norm": 0.29690447449684143, + "learning_rate": 4.096563616208893e-05, + "loss": 0.1636, + "step": 6461 + }, + { + "epoch": 2.386262924667651, + "grad_norm": 0.21675901114940643, + "learning_rate": 4.0941002586525435e-05, + "loss": 0.1424, + "step": 6462 + }, + { + "epoch": 2.386632200886263, + "grad_norm": 0.2632540762424469, + "learning_rate": 4.091636901096194e-05, + "loss": 0.1457, + "step": 6463 + }, + { + "epoch": 2.3870014771048744, + "grad_norm": 0.242710679769516, + "learning_rate": 4.089173543539845e-05, + "loss": 0.1455, + "step": 6464 + }, + { + "epoch": 2.387370753323486, + "grad_norm": 0.40179622173309326, + "learning_rate": 4.086710185983495e-05, + "loss": 0.1568, + "step": 6465 + }, + { + "epoch": 2.3877400295420976, + "grad_norm": 0.29929208755493164, + "learning_rate": 4.084246828427146e-05, + "loss": 0.1838, + "step": 6466 + }, + { + "epoch": 2.3881093057607092, + "grad_norm": 0.29828721284866333, + "learning_rate": 4.081783470870797e-05, + "loss": 0.1482, + "step": 6467 + }, + { + "epoch": 2.3884785819793204, + "grad_norm": 0.3187955617904663, + "learning_rate": 4.0793201133144476e-05, + "loss": 0.1868, + "step": 6468 + }, + { + "epoch": 2.388847858197932, + "grad_norm": 0.28751951456069946, + "learning_rate": 4.0768567557580984e-05, + "loss": 0.1597, + "step": 6469 + }, + { + "epoch": 2.3892171344165436, + "grad_norm": 0.21898925304412842, + "learning_rate": 4.074393398201749e-05, + "loss": 0.1673, + "step": 6470 + }, + { + "epoch": 2.389586410635155, + "grad_norm": 0.3151208758354187, + "learning_rate": 4.0719300406454e-05, + "loss": 0.195, + "step": 6471 + }, + { + "epoch": 2.389955686853767, + "grad_norm": 0.2791444957256317, + "learning_rate": 4.069466683089051e-05, + "loss": 0.1506, + "step": 6472 + }, + { + "epoch": 2.390324963072378, + "grad_norm": 0.28420600295066833, + "learning_rate": 4.067003325532701e-05, + "loss": 0.152, + "step": 6473 + }, + { + "epoch": 2.3906942392909896, + "grad_norm": 0.23713400959968567, + "learning_rate": 4.0645399679763516e-05, + "loss": 0.1643, + "step": 6474 + }, + { + "epoch": 2.391063515509601, + "grad_norm": 0.2570776641368866, + "learning_rate": 4.0620766104200024e-05, + "loss": 0.165, + "step": 6475 + }, + { + "epoch": 2.3914327917282128, + "grad_norm": 0.2677764296531677, + "learning_rate": 4.059613252863653e-05, + "loss": 0.1504, + "step": 6476 + }, + { + "epoch": 2.3918020679468244, + "grad_norm": 0.293143093585968, + "learning_rate": 4.057149895307304e-05, + "loss": 0.1687, + "step": 6477 + }, + { + "epoch": 2.392171344165436, + "grad_norm": 0.24383391439914703, + "learning_rate": 4.054686537750955e-05, + "loss": 0.1551, + "step": 6478 + }, + { + "epoch": 2.392540620384047, + "grad_norm": 0.26155272126197815, + "learning_rate": 4.0522231801946056e-05, + "loss": 0.1774, + "step": 6479 + }, + { + "epoch": 2.3929098966026587, + "grad_norm": 0.362370103597641, + "learning_rate": 4.0497598226382563e-05, + "loss": 0.1817, + "step": 6480 + }, + { + "epoch": 2.3932791728212703, + "grad_norm": 0.2734926640987396, + "learning_rate": 4.0472964650819064e-05, + "loss": 0.1722, + "step": 6481 + }, + { + "epoch": 2.393648449039882, + "grad_norm": 0.4516774117946625, + "learning_rate": 4.044833107525557e-05, + "loss": 0.21, + "step": 6482 + }, + { + "epoch": 2.3940177252584935, + "grad_norm": 0.2987980544567108, + "learning_rate": 4.042369749969208e-05, + "loss": 0.1752, + "step": 6483 + }, + { + "epoch": 2.3943870014771047, + "grad_norm": 0.2669636905193329, + "learning_rate": 4.039906392412859e-05, + "loss": 0.149, + "step": 6484 + }, + { + "epoch": 2.3947562776957163, + "grad_norm": 0.277078241109848, + "learning_rate": 4.0374430348565096e-05, + "loss": 0.1532, + "step": 6485 + }, + { + "epoch": 2.395125553914328, + "grad_norm": 0.2960759103298187, + "learning_rate": 4.0349796773001604e-05, + "loss": 0.1679, + "step": 6486 + }, + { + "epoch": 2.3954948301329395, + "grad_norm": 0.23781336843967438, + "learning_rate": 4.032516319743811e-05, + "loss": 0.1394, + "step": 6487 + }, + { + "epoch": 2.395864106351551, + "grad_norm": 0.25501328706741333, + "learning_rate": 4.030052962187462e-05, + "loss": 0.1662, + "step": 6488 + }, + { + "epoch": 2.3962333825701627, + "grad_norm": 0.2502579987049103, + "learning_rate": 4.027589604631112e-05, + "loss": 0.1541, + "step": 6489 + }, + { + "epoch": 2.396602658788774, + "grad_norm": 0.2476811707019806, + "learning_rate": 4.025126247074763e-05, + "loss": 0.1673, + "step": 6490 + }, + { + "epoch": 2.3969719350073855, + "grad_norm": 0.2287880927324295, + "learning_rate": 4.0226628895184136e-05, + "loss": 0.1655, + "step": 6491 + }, + { + "epoch": 2.397341211225997, + "grad_norm": 0.2902968227863312, + "learning_rate": 4.0201995319620644e-05, + "loss": 0.1503, + "step": 6492 + }, + { + "epoch": 2.3977104874446087, + "grad_norm": 0.21620434522628784, + "learning_rate": 4.017736174405715e-05, + "loss": 0.1464, + "step": 6493 + }, + { + "epoch": 2.39807976366322, + "grad_norm": 0.2628920078277588, + "learning_rate": 4.015272816849366e-05, + "loss": 0.164, + "step": 6494 + }, + { + "epoch": 2.3984490398818314, + "grad_norm": 0.21965809166431427, + "learning_rate": 4.012809459293017e-05, + "loss": 0.1486, + "step": 6495 + }, + { + "epoch": 2.398818316100443, + "grad_norm": 0.2914850413799286, + "learning_rate": 4.0103461017366676e-05, + "loss": 0.1686, + "step": 6496 + }, + { + "epoch": 2.3991875923190547, + "grad_norm": 0.2575790286064148, + "learning_rate": 4.007882744180318e-05, + "loss": 0.1632, + "step": 6497 + }, + { + "epoch": 2.3995568685376663, + "grad_norm": 0.2682785391807556, + "learning_rate": 4.0054193866239685e-05, + "loss": 0.1539, + "step": 6498 + }, + { + "epoch": 2.399926144756278, + "grad_norm": 0.21142053604125977, + "learning_rate": 4.002956029067619e-05, + "loss": 0.1394, + "step": 6499 + }, + { + "epoch": 2.4002954209748895, + "grad_norm": 0.26053109765052795, + "learning_rate": 4.00049267151127e-05, + "loss": 0.1567, + "step": 6500 + }, + { + "epoch": 2.4002954209748895, + "eval_loss": 8.855240821838379, + "eval_runtime": 6.9007, + "eval_samples_per_second": 7.246, + "eval_steps_per_second": 1.014, + "step": 6500 + }, + { + "epoch": 2.4006646971935006, + "grad_norm": 0.27471908926963806, + "learning_rate": 3.998029313954921e-05, + "loss": 0.1668, + "step": 6501 + }, + { + "epoch": 2.401033973412112, + "grad_norm": 0.26036402583122253, + "learning_rate": 3.9955659563985716e-05, + "loss": 0.1442, + "step": 6502 + }, + { + "epoch": 2.401403249630724, + "grad_norm": 0.2875773012638092, + "learning_rate": 3.9931025988422224e-05, + "loss": 0.1785, + "step": 6503 + }, + { + "epoch": 2.4017725258493354, + "grad_norm": 0.2657027840614319, + "learning_rate": 3.990639241285873e-05, + "loss": 0.1317, + "step": 6504 + }, + { + "epoch": 2.4021418020679466, + "grad_norm": 0.28986603021621704, + "learning_rate": 3.988175883729523e-05, + "loss": 0.1666, + "step": 6505 + }, + { + "epoch": 2.402511078286558, + "grad_norm": 0.22228588163852692, + "learning_rate": 3.985712526173174e-05, + "loss": 0.1554, + "step": 6506 + }, + { + "epoch": 2.40288035450517, + "grad_norm": 0.261974960565567, + "learning_rate": 3.983249168616825e-05, + "loss": 0.1575, + "step": 6507 + }, + { + "epoch": 2.4032496307237814, + "grad_norm": 0.30089473724365234, + "learning_rate": 3.9807858110604756e-05, + "loss": 0.1774, + "step": 6508 + }, + { + "epoch": 2.403618906942393, + "grad_norm": 0.2583668828010559, + "learning_rate": 3.9783224535041264e-05, + "loss": 0.1637, + "step": 6509 + }, + { + "epoch": 2.4039881831610046, + "grad_norm": 0.27829140424728394, + "learning_rate": 3.975859095947777e-05, + "loss": 0.169, + "step": 6510 + }, + { + "epoch": 2.404357459379616, + "grad_norm": 0.22981473803520203, + "learning_rate": 3.973395738391428e-05, + "loss": 0.1397, + "step": 6511 + }, + { + "epoch": 2.4047267355982274, + "grad_norm": 0.28555071353912354, + "learning_rate": 3.970932380835079e-05, + "loss": 0.1785, + "step": 6512 + }, + { + "epoch": 2.405096011816839, + "grad_norm": 0.28578755259513855, + "learning_rate": 3.968469023278729e-05, + "loss": 0.1585, + "step": 6513 + }, + { + "epoch": 2.4054652880354506, + "grad_norm": 0.25342684984207153, + "learning_rate": 3.96600566572238e-05, + "loss": 0.1308, + "step": 6514 + }, + { + "epoch": 2.405834564254062, + "grad_norm": 0.29168635606765747, + "learning_rate": 3.9635423081660305e-05, + "loss": 0.177, + "step": 6515 + }, + { + "epoch": 2.4062038404726733, + "grad_norm": 0.23816536366939545, + "learning_rate": 3.961078950609681e-05, + "loss": 0.1485, + "step": 6516 + }, + { + "epoch": 2.406573116691285, + "grad_norm": 0.24338112771511078, + "learning_rate": 3.958615593053332e-05, + "loss": 0.1526, + "step": 6517 + }, + { + "epoch": 2.4069423929098965, + "grad_norm": 0.24337272346019745, + "learning_rate": 3.956152235496983e-05, + "loss": 0.1549, + "step": 6518 + }, + { + "epoch": 2.407311669128508, + "grad_norm": 0.22050930559635162, + "learning_rate": 3.9536888779406336e-05, + "loss": 0.1599, + "step": 6519 + }, + { + "epoch": 2.4076809453471197, + "grad_norm": 0.2595660090446472, + "learning_rate": 3.951225520384284e-05, + "loss": 0.1461, + "step": 6520 + }, + { + "epoch": 2.4080502215657313, + "grad_norm": 0.23879896104335785, + "learning_rate": 3.9487621628279345e-05, + "loss": 0.1446, + "step": 6521 + }, + { + "epoch": 2.4084194977843425, + "grad_norm": 0.32556548714637756, + "learning_rate": 3.946298805271585e-05, + "loss": 0.1701, + "step": 6522 + }, + { + "epoch": 2.408788774002954, + "grad_norm": 0.235628142952919, + "learning_rate": 3.943835447715236e-05, + "loss": 0.1559, + "step": 6523 + }, + { + "epoch": 2.4091580502215657, + "grad_norm": 0.30522412061691284, + "learning_rate": 3.941372090158887e-05, + "loss": 0.1594, + "step": 6524 + }, + { + "epoch": 2.4095273264401773, + "grad_norm": 0.32828763127326965, + "learning_rate": 3.9389087326025377e-05, + "loss": 0.2085, + "step": 6525 + }, + { + "epoch": 2.409896602658789, + "grad_norm": 0.339797705411911, + "learning_rate": 3.9364453750461884e-05, + "loss": 0.1719, + "step": 6526 + }, + { + "epoch": 2.4102658788774, + "grad_norm": 0.263662725687027, + "learning_rate": 3.933982017489839e-05, + "loss": 0.157, + "step": 6527 + }, + { + "epoch": 2.4106351550960117, + "grad_norm": 0.2801797091960907, + "learning_rate": 3.9315186599334893e-05, + "loss": 0.1915, + "step": 6528 + }, + { + "epoch": 2.4110044313146233, + "grad_norm": 0.25512397289276123, + "learning_rate": 3.92905530237714e-05, + "loss": 0.1613, + "step": 6529 + }, + { + "epoch": 2.411373707533235, + "grad_norm": 0.22137978672981262, + "learning_rate": 3.926591944820791e-05, + "loss": 0.1499, + "step": 6530 + }, + { + "epoch": 2.4117429837518465, + "grad_norm": 0.30446574091911316, + "learning_rate": 3.924128587264442e-05, + "loss": 0.1724, + "step": 6531 + }, + { + "epoch": 2.412112259970458, + "grad_norm": 0.37286803126335144, + "learning_rate": 3.9216652297080925e-05, + "loss": 0.156, + "step": 6532 + }, + { + "epoch": 2.4124815361890692, + "grad_norm": 0.299200177192688, + "learning_rate": 3.919201872151743e-05, + "loss": 0.1816, + "step": 6533 + }, + { + "epoch": 2.412850812407681, + "grad_norm": 0.3659515976905823, + "learning_rate": 3.916738514595394e-05, + "loss": 0.1729, + "step": 6534 + }, + { + "epoch": 2.4132200886262924, + "grad_norm": 0.24710507690906525, + "learning_rate": 3.914275157039045e-05, + "loss": 0.1797, + "step": 6535 + }, + { + "epoch": 2.413589364844904, + "grad_norm": 0.25688138604164124, + "learning_rate": 3.911811799482695e-05, + "loss": 0.1592, + "step": 6536 + }, + { + "epoch": 2.4139586410635157, + "grad_norm": 0.2344970703125, + "learning_rate": 3.909348441926346e-05, + "loss": 0.1389, + "step": 6537 + }, + { + "epoch": 2.414327917282127, + "grad_norm": 0.28085607290267944, + "learning_rate": 3.9068850843699965e-05, + "loss": 0.1665, + "step": 6538 + }, + { + "epoch": 2.4146971935007384, + "grad_norm": 0.3381539285182953, + "learning_rate": 3.904421726813647e-05, + "loss": 0.18, + "step": 6539 + }, + { + "epoch": 2.41506646971935, + "grad_norm": 0.2785937488079071, + "learning_rate": 3.901958369257298e-05, + "loss": 0.1634, + "step": 6540 + }, + { + "epoch": 2.4154357459379616, + "grad_norm": 0.24598821997642517, + "learning_rate": 3.899495011700949e-05, + "loss": 0.1528, + "step": 6541 + }, + { + "epoch": 2.4158050221565732, + "grad_norm": 0.23714780807495117, + "learning_rate": 3.8970316541446e-05, + "loss": 0.1437, + "step": 6542 + }, + { + "epoch": 2.416174298375185, + "grad_norm": 0.2520083487033844, + "learning_rate": 3.8945682965882505e-05, + "loss": 0.1558, + "step": 6543 + }, + { + "epoch": 2.416543574593796, + "grad_norm": 0.25688642263412476, + "learning_rate": 3.8921049390319006e-05, + "loss": 0.1536, + "step": 6544 + }, + { + "epoch": 2.4169128508124076, + "grad_norm": 0.32345423102378845, + "learning_rate": 3.8896415814755514e-05, + "loss": 0.1655, + "step": 6545 + }, + { + "epoch": 2.417282127031019, + "grad_norm": 0.28547775745391846, + "learning_rate": 3.887178223919202e-05, + "loss": 0.1714, + "step": 6546 + }, + { + "epoch": 2.417651403249631, + "grad_norm": 0.26908496022224426, + "learning_rate": 3.884714866362853e-05, + "loss": 0.1608, + "step": 6547 + }, + { + "epoch": 2.4180206794682424, + "grad_norm": 0.26942598819732666, + "learning_rate": 3.882251508806504e-05, + "loss": 0.1684, + "step": 6548 + }, + { + "epoch": 2.4183899556868536, + "grad_norm": 0.23765675723552704, + "learning_rate": 3.8797881512501545e-05, + "loss": 0.1436, + "step": 6549 + }, + { + "epoch": 2.418759231905465, + "grad_norm": 0.25074639916419983, + "learning_rate": 3.877324793693805e-05, + "loss": 0.1889, + "step": 6550 + }, + { + "epoch": 2.418759231905465, + "eval_loss": 8.749035835266113, + "eval_runtime": 6.9095, + "eval_samples_per_second": 7.236, + "eval_steps_per_second": 1.013, + "step": 6550 + }, + { + "epoch": 2.4191285081240768, + "grad_norm": 0.23251919448375702, + "learning_rate": 3.874861436137456e-05, + "loss": 0.1661, + "step": 6551 + }, + { + "epoch": 2.4194977843426884, + "grad_norm": 0.26775816082954407, + "learning_rate": 3.872398078581106e-05, + "loss": 0.1649, + "step": 6552 + }, + { + "epoch": 2.4198670605613, + "grad_norm": 0.2593441307544708, + "learning_rate": 3.869934721024757e-05, + "loss": 0.1558, + "step": 6553 + }, + { + "epoch": 2.4202363367799116, + "grad_norm": 0.32847246527671814, + "learning_rate": 3.867471363468408e-05, + "loss": 0.1574, + "step": 6554 + }, + { + "epoch": 2.4206056129985227, + "grad_norm": 0.2546409070491791, + "learning_rate": 3.8650080059120585e-05, + "loss": 0.1738, + "step": 6555 + }, + { + "epoch": 2.4209748892171343, + "grad_norm": 0.27371150255203247, + "learning_rate": 3.862544648355709e-05, + "loss": 0.163, + "step": 6556 + }, + { + "epoch": 2.421344165435746, + "grad_norm": 0.3180791139602661, + "learning_rate": 3.86008129079936e-05, + "loss": 0.1598, + "step": 6557 + }, + { + "epoch": 2.4217134416543575, + "grad_norm": 0.2851298153400421, + "learning_rate": 3.857617933243011e-05, + "loss": 0.1592, + "step": 6558 + }, + { + "epoch": 2.422082717872969, + "grad_norm": 0.26532068848609924, + "learning_rate": 3.855154575686661e-05, + "loss": 0.1631, + "step": 6559 + }, + { + "epoch": 2.4224519940915803, + "grad_norm": 0.21851901710033417, + "learning_rate": 3.852691218130312e-05, + "loss": 0.1593, + "step": 6560 + }, + { + "epoch": 2.422821270310192, + "grad_norm": 0.255950927734375, + "learning_rate": 3.850227860573962e-05, + "loss": 0.1635, + "step": 6561 + }, + { + "epoch": 2.4231905465288035, + "grad_norm": 0.22938866913318634, + "learning_rate": 3.847764503017613e-05, + "loss": 0.1455, + "step": 6562 + }, + { + "epoch": 2.423559822747415, + "grad_norm": 0.22640900313854218, + "learning_rate": 3.8453011454612635e-05, + "loss": 0.1328, + "step": 6563 + }, + { + "epoch": 2.4239290989660267, + "grad_norm": 0.2631516456604004, + "learning_rate": 3.842837787904914e-05, + "loss": 0.1802, + "step": 6564 + }, + { + "epoch": 2.4242983751846383, + "grad_norm": 0.2598891258239746, + "learning_rate": 3.840374430348565e-05, + "loss": 0.1537, + "step": 6565 + }, + { + "epoch": 2.4246676514032495, + "grad_norm": 0.20260311663150787, + "learning_rate": 3.837911072792216e-05, + "loss": 0.1435, + "step": 6566 + }, + { + "epoch": 2.425036927621861, + "grad_norm": 0.24862119555473328, + "learning_rate": 3.8354477152358666e-05, + "loss": 0.158, + "step": 6567 + }, + { + "epoch": 2.4254062038404727, + "grad_norm": 0.24671316146850586, + "learning_rate": 3.8329843576795174e-05, + "loss": 0.1646, + "step": 6568 + }, + { + "epoch": 2.4257754800590843, + "grad_norm": 0.2880328595638275, + "learning_rate": 3.8305210001231675e-05, + "loss": 0.1646, + "step": 6569 + }, + { + "epoch": 2.426144756277696, + "grad_norm": 0.25308993458747864, + "learning_rate": 3.828057642566818e-05, + "loss": 0.1502, + "step": 6570 + }, + { + "epoch": 2.426514032496307, + "grad_norm": 0.33136507868766785, + "learning_rate": 3.825594285010469e-05, + "loss": 0.2125, + "step": 6571 + }, + { + "epoch": 2.4268833087149186, + "grad_norm": 0.2482517957687378, + "learning_rate": 3.82313092745412e-05, + "loss": 0.136, + "step": 6572 + }, + { + "epoch": 2.4272525849335302, + "grad_norm": 0.23653635382652283, + "learning_rate": 3.8206675698977707e-05, + "loss": 0.1374, + "step": 6573 + }, + { + "epoch": 2.427621861152142, + "grad_norm": 0.2695724070072174, + "learning_rate": 3.8182042123414214e-05, + "loss": 0.1673, + "step": 6574 + }, + { + "epoch": 2.4279911373707534, + "grad_norm": 0.2372472584247589, + "learning_rate": 3.815740854785072e-05, + "loss": 0.1552, + "step": 6575 + }, + { + "epoch": 2.428360413589365, + "grad_norm": 0.23868004977703094, + "learning_rate": 3.813277497228723e-05, + "loss": 0.1471, + "step": 6576 + }, + { + "epoch": 2.428729689807976, + "grad_norm": 0.25767219066619873, + "learning_rate": 3.810814139672373e-05, + "loss": 0.1635, + "step": 6577 + }, + { + "epoch": 2.429098966026588, + "grad_norm": 0.27408942580223083, + "learning_rate": 3.808350782116024e-05, + "loss": 0.1398, + "step": 6578 + }, + { + "epoch": 2.4294682422451994, + "grad_norm": 0.21813571453094482, + "learning_rate": 3.805887424559675e-05, + "loss": 0.1551, + "step": 6579 + }, + { + "epoch": 2.429837518463811, + "grad_norm": 0.28335994482040405, + "learning_rate": 3.8034240670033255e-05, + "loss": 0.1678, + "step": 6580 + }, + { + "epoch": 2.4302067946824226, + "grad_norm": 0.2098926156759262, + "learning_rate": 3.800960709446976e-05, + "loss": 0.1356, + "step": 6581 + }, + { + "epoch": 2.430576070901034, + "grad_norm": 0.2750248610973358, + "learning_rate": 3.798497351890627e-05, + "loss": 0.1786, + "step": 6582 + }, + { + "epoch": 2.4309453471196454, + "grad_norm": 0.24234561622142792, + "learning_rate": 3.796033994334278e-05, + "loss": 0.1417, + "step": 6583 + }, + { + "epoch": 2.431314623338257, + "grad_norm": 0.30867087841033936, + "learning_rate": 3.7935706367779286e-05, + "loss": 0.1984, + "step": 6584 + }, + { + "epoch": 2.4316838995568686, + "grad_norm": 0.2483898252248764, + "learning_rate": 3.791107279221579e-05, + "loss": 0.1586, + "step": 6585 + }, + { + "epoch": 2.43205317577548, + "grad_norm": 0.2643324136734009, + "learning_rate": 3.7886439216652295e-05, + "loss": 0.1545, + "step": 6586 + }, + { + "epoch": 2.432422451994092, + "grad_norm": 0.2337154597043991, + "learning_rate": 3.78618056410888e-05, + "loss": 0.141, + "step": 6587 + }, + { + "epoch": 2.432791728212703, + "grad_norm": 0.21573297679424286, + "learning_rate": 3.783717206552531e-05, + "loss": 0.1572, + "step": 6588 + }, + { + "epoch": 2.4331610044313146, + "grad_norm": 0.2413255274295807, + "learning_rate": 3.781253848996182e-05, + "loss": 0.1871, + "step": 6589 + }, + { + "epoch": 2.433530280649926, + "grad_norm": 0.3112795948982239, + "learning_rate": 3.778790491439833e-05, + "loss": 0.1651, + "step": 6590 + }, + { + "epoch": 2.4338995568685378, + "grad_norm": 0.2840538024902344, + "learning_rate": 3.7763271338834835e-05, + "loss": 0.1606, + "step": 6591 + }, + { + "epoch": 2.4342688330871494, + "grad_norm": 0.2429855763912201, + "learning_rate": 3.773863776327134e-05, + "loss": 0.1379, + "step": 6592 + }, + { + "epoch": 2.4346381093057605, + "grad_norm": 0.27231860160827637, + "learning_rate": 3.7714004187707844e-05, + "loss": 0.174, + "step": 6593 + }, + { + "epoch": 2.435007385524372, + "grad_norm": 0.22264686226844788, + "learning_rate": 3.768937061214435e-05, + "loss": 0.1645, + "step": 6594 + }, + { + "epoch": 2.4353766617429837, + "grad_norm": 0.30021628737449646, + "learning_rate": 3.766473703658086e-05, + "loss": 0.1814, + "step": 6595 + }, + { + "epoch": 2.4357459379615953, + "grad_norm": 0.3207149803638458, + "learning_rate": 3.764010346101737e-05, + "loss": 0.1484, + "step": 6596 + }, + { + "epoch": 2.436115214180207, + "grad_norm": 0.23627181351184845, + "learning_rate": 3.7615469885453875e-05, + "loss": 0.1668, + "step": 6597 + }, + { + "epoch": 2.4364844903988185, + "grad_norm": 0.23027193546295166, + "learning_rate": 3.759083630989038e-05, + "loss": 0.17, + "step": 6598 + }, + { + "epoch": 2.4368537666174297, + "grad_norm": 0.2505633234977722, + "learning_rate": 3.756620273432689e-05, + "loss": 0.1619, + "step": 6599 + }, + { + "epoch": 2.4372230428360413, + "grad_norm": 0.2686895430088043, + "learning_rate": 3.75415691587634e-05, + "loss": 0.1609, + "step": 6600 + }, + { + "epoch": 2.4372230428360413, + "eval_loss": 8.820206642150879, + "eval_runtime": 6.9048, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 1.014, + "step": 6600 + }, + { + "epoch": 2.437592319054653, + "grad_norm": 0.2675122916698456, + "learning_rate": 3.75169355831999e-05, + "loss": 0.1577, + "step": 6601 + }, + { + "epoch": 2.4379615952732645, + "grad_norm": 0.23315240442752838, + "learning_rate": 3.749230200763641e-05, + "loss": 0.1526, + "step": 6602 + }, + { + "epoch": 2.438330871491876, + "grad_norm": 0.25693339109420776, + "learning_rate": 3.7467668432072915e-05, + "loss": 0.1366, + "step": 6603 + }, + { + "epoch": 2.4387001477104873, + "grad_norm": 0.26732125878334045, + "learning_rate": 3.744303485650942e-05, + "loss": 0.1772, + "step": 6604 + }, + { + "epoch": 2.439069423929099, + "grad_norm": 0.2606602609157562, + "learning_rate": 3.741840128094593e-05, + "loss": 0.1795, + "step": 6605 + }, + { + "epoch": 2.4394387001477105, + "grad_norm": 0.25766611099243164, + "learning_rate": 3.739376770538244e-05, + "loss": 0.1531, + "step": 6606 + }, + { + "epoch": 2.439807976366322, + "grad_norm": 0.28651708364486694, + "learning_rate": 3.736913412981895e-05, + "loss": 0.1614, + "step": 6607 + }, + { + "epoch": 2.4401772525849337, + "grad_norm": 0.26084449887275696, + "learning_rate": 3.7344500554255455e-05, + "loss": 0.1508, + "step": 6608 + }, + { + "epoch": 2.4405465288035453, + "grad_norm": 0.23077642917633057, + "learning_rate": 3.7319866978691956e-05, + "loss": 0.1316, + "step": 6609 + }, + { + "epoch": 2.4409158050221564, + "grad_norm": 0.25899550318717957, + "learning_rate": 3.7295233403128464e-05, + "loss": 0.1624, + "step": 6610 + }, + { + "epoch": 2.441285081240768, + "grad_norm": 0.2986014187335968, + "learning_rate": 3.727059982756497e-05, + "loss": 0.156, + "step": 6611 + }, + { + "epoch": 2.4416543574593796, + "grad_norm": 0.28071168065071106, + "learning_rate": 3.724596625200148e-05, + "loss": 0.1766, + "step": 6612 + }, + { + "epoch": 2.4420236336779912, + "grad_norm": 0.25692906975746155, + "learning_rate": 3.722133267643799e-05, + "loss": 0.1642, + "step": 6613 + }, + { + "epoch": 2.442392909896603, + "grad_norm": 0.25577986240386963, + "learning_rate": 3.7196699100874495e-05, + "loss": 0.148, + "step": 6614 + }, + { + "epoch": 2.442762186115214, + "grad_norm": 0.2964719831943512, + "learning_rate": 3.7172065525311e-05, + "loss": 0.175, + "step": 6615 + }, + { + "epoch": 2.4431314623338256, + "grad_norm": 0.2849692404270172, + "learning_rate": 3.714743194974751e-05, + "loss": 0.1601, + "step": 6616 + }, + { + "epoch": 2.443500738552437, + "grad_norm": 0.26236188411712646, + "learning_rate": 3.712279837418401e-05, + "loss": 0.1501, + "step": 6617 + }, + { + "epoch": 2.443870014771049, + "grad_norm": 0.2684587836265564, + "learning_rate": 3.709816479862052e-05, + "loss": 0.161, + "step": 6618 + }, + { + "epoch": 2.4442392909896604, + "grad_norm": 0.27292928099632263, + "learning_rate": 3.707353122305703e-05, + "loss": 0.1501, + "step": 6619 + }, + { + "epoch": 2.444608567208272, + "grad_norm": 0.3116298019886017, + "learning_rate": 3.7048897647493536e-05, + "loss": 0.1766, + "step": 6620 + }, + { + "epoch": 2.444977843426883, + "grad_norm": 0.2640453279018402, + "learning_rate": 3.7024264071930043e-05, + "loss": 0.1627, + "step": 6621 + }, + { + "epoch": 2.445347119645495, + "grad_norm": 0.3244304656982422, + "learning_rate": 3.699963049636655e-05, + "loss": 0.1965, + "step": 6622 + }, + { + "epoch": 2.4457163958641064, + "grad_norm": 0.28618332743644714, + "learning_rate": 3.697499692080306e-05, + "loss": 0.1732, + "step": 6623 + }, + { + "epoch": 2.446085672082718, + "grad_norm": 0.27196013927459717, + "learning_rate": 3.695036334523957e-05, + "loss": 0.1616, + "step": 6624 + }, + { + "epoch": 2.446454948301329, + "grad_norm": 0.265786349773407, + "learning_rate": 3.692572976967607e-05, + "loss": 0.1632, + "step": 6625 + }, + { + "epoch": 2.4468242245199407, + "grad_norm": 0.3124234676361084, + "learning_rate": 3.6901096194112576e-05, + "loss": 0.189, + "step": 6626 + }, + { + "epoch": 2.4471935007385524, + "grad_norm": 0.2570928931236267, + "learning_rate": 3.6876462618549084e-05, + "loss": 0.1646, + "step": 6627 + }, + { + "epoch": 2.447562776957164, + "grad_norm": 0.2625611126422882, + "learning_rate": 3.685182904298559e-05, + "loss": 0.1524, + "step": 6628 + }, + { + "epoch": 2.4479320531757756, + "grad_norm": 0.24514485895633698, + "learning_rate": 3.68271954674221e-05, + "loss": 0.1432, + "step": 6629 + }, + { + "epoch": 2.448301329394387, + "grad_norm": 0.3152150511741638, + "learning_rate": 3.680256189185861e-05, + "loss": 0.167, + "step": 6630 + }, + { + "epoch": 2.4486706056129988, + "grad_norm": 0.2918526828289032, + "learning_rate": 3.6777928316295115e-05, + "loss": 0.1966, + "step": 6631 + }, + { + "epoch": 2.44903988183161, + "grad_norm": 0.30037373304367065, + "learning_rate": 3.6753294740731616e-05, + "loss": 0.1829, + "step": 6632 + }, + { + "epoch": 2.4494091580502215, + "grad_norm": 0.34445443749427795, + "learning_rate": 3.6728661165168124e-05, + "loss": 0.1862, + "step": 6633 + }, + { + "epoch": 2.449778434268833, + "grad_norm": 0.26773616671562195, + "learning_rate": 3.670402758960463e-05, + "loss": 0.1411, + "step": 6634 + }, + { + "epoch": 2.4501477104874447, + "grad_norm": 0.29953983426094055, + "learning_rate": 3.667939401404114e-05, + "loss": 0.16, + "step": 6635 + }, + { + "epoch": 2.450516986706056, + "grad_norm": 0.25814399123191833, + "learning_rate": 3.665476043847765e-05, + "loss": 0.1409, + "step": 6636 + }, + { + "epoch": 2.4508862629246675, + "grad_norm": 0.2499862015247345, + "learning_rate": 3.6630126862914156e-05, + "loss": 0.1573, + "step": 6637 + }, + { + "epoch": 2.451255539143279, + "grad_norm": 0.23134571313858032, + "learning_rate": 3.6605493287350664e-05, + "loss": 0.1522, + "step": 6638 + }, + { + "epoch": 2.4516248153618907, + "grad_norm": 0.27558833360671997, + "learning_rate": 3.658085971178717e-05, + "loss": 0.149, + "step": 6639 + }, + { + "epoch": 2.4519940915805023, + "grad_norm": 0.24139514565467834, + "learning_rate": 3.655622613622367e-05, + "loss": 0.1543, + "step": 6640 + }, + { + "epoch": 2.452363367799114, + "grad_norm": 0.3197338283061981, + "learning_rate": 3.653159256066018e-05, + "loss": 0.1915, + "step": 6641 + }, + { + "epoch": 2.4527326440177255, + "grad_norm": 0.2726379632949829, + "learning_rate": 3.650695898509669e-05, + "loss": 0.1776, + "step": 6642 + }, + { + "epoch": 2.4531019202363367, + "grad_norm": 0.2774439752101898, + "learning_rate": 3.6482325409533196e-05, + "loss": 0.1584, + "step": 6643 + }, + { + "epoch": 2.4534711964549483, + "grad_norm": 0.22459927201271057, + "learning_rate": 3.6457691833969704e-05, + "loss": 0.1583, + "step": 6644 + }, + { + "epoch": 2.45384047267356, + "grad_norm": 0.23228324949741364, + "learning_rate": 3.643305825840621e-05, + "loss": 0.1463, + "step": 6645 + }, + { + "epoch": 2.4542097488921715, + "grad_norm": 0.29190313816070557, + "learning_rate": 3.640842468284272e-05, + "loss": 0.1591, + "step": 6646 + }, + { + "epoch": 2.4545790251107826, + "grad_norm": 0.24514329433441162, + "learning_rate": 3.638379110727923e-05, + "loss": 0.1717, + "step": 6647 + }, + { + "epoch": 2.4549483013293942, + "grad_norm": 0.26786360144615173, + "learning_rate": 3.635915753171573e-05, + "loss": 0.1714, + "step": 6648 + }, + { + "epoch": 2.455317577548006, + "grad_norm": 0.3906300961971283, + "learning_rate": 3.6334523956152236e-05, + "loss": 0.1792, + "step": 6649 + }, + { + "epoch": 2.4556868537666174, + "grad_norm": 0.32624369859695435, + "learning_rate": 3.6309890380588744e-05, + "loss": 0.2215, + "step": 6650 + }, + { + "epoch": 2.4556868537666174, + "eval_loss": 8.874140739440918, + "eval_runtime": 7.0464, + "eval_samples_per_second": 7.096, + "eval_steps_per_second": 0.993, + "step": 6650 + }, + { + "epoch": 2.456056129985229, + "grad_norm": 0.28335246443748474, + "learning_rate": 3.628525680502525e-05, + "loss": 0.1592, + "step": 6651 + }, + { + "epoch": 2.4564254062038406, + "grad_norm": 0.2917601764202118, + "learning_rate": 3.626062322946176e-05, + "loss": 0.1779, + "step": 6652 + }, + { + "epoch": 2.456794682422452, + "grad_norm": 0.2804676592350006, + "learning_rate": 3.623598965389827e-05, + "loss": 0.1772, + "step": 6653 + }, + { + "epoch": 2.4571639586410634, + "grad_norm": 0.27422645688056946, + "learning_rate": 3.6211356078334776e-05, + "loss": 0.1562, + "step": 6654 + }, + { + "epoch": 2.457533234859675, + "grad_norm": 0.21687686443328857, + "learning_rate": 3.6186722502771284e-05, + "loss": 0.1551, + "step": 6655 + }, + { + "epoch": 2.4579025110782866, + "grad_norm": 0.22367054224014282, + "learning_rate": 3.6162088927207785e-05, + "loss": 0.1546, + "step": 6656 + }, + { + "epoch": 2.458271787296898, + "grad_norm": 0.24385707080364227, + "learning_rate": 3.613745535164429e-05, + "loss": 0.1678, + "step": 6657 + }, + { + "epoch": 2.4586410635155094, + "grad_norm": 0.25708070397377014, + "learning_rate": 3.61128217760808e-05, + "loss": 0.1489, + "step": 6658 + }, + { + "epoch": 2.459010339734121, + "grad_norm": 0.2841649651527405, + "learning_rate": 3.608818820051731e-05, + "loss": 0.1632, + "step": 6659 + }, + { + "epoch": 2.4593796159527326, + "grad_norm": 0.26423361897468567, + "learning_rate": 3.6063554624953816e-05, + "loss": 0.1504, + "step": 6660 + }, + { + "epoch": 2.459748892171344, + "grad_norm": 0.22195886075496674, + "learning_rate": 3.6038921049390324e-05, + "loss": 0.1499, + "step": 6661 + }, + { + "epoch": 2.460118168389956, + "grad_norm": 0.286496639251709, + "learning_rate": 3.601428747382683e-05, + "loss": 0.1642, + "step": 6662 + }, + { + "epoch": 2.4604874446085674, + "grad_norm": 0.22343029081821442, + "learning_rate": 3.598965389826334e-05, + "loss": 0.1501, + "step": 6663 + }, + { + "epoch": 2.4608567208271785, + "grad_norm": 0.24169427156448364, + "learning_rate": 3.596502032269984e-05, + "loss": 0.1511, + "step": 6664 + }, + { + "epoch": 2.46122599704579, + "grad_norm": 0.277081698179245, + "learning_rate": 3.594038674713635e-05, + "loss": 0.1698, + "step": 6665 + }, + { + "epoch": 2.4615952732644018, + "grad_norm": 0.24551615118980408, + "learning_rate": 3.5915753171572857e-05, + "loss": 0.1659, + "step": 6666 + }, + { + "epoch": 2.4619645494830134, + "grad_norm": 0.2621482312679291, + "learning_rate": 3.5891119596009364e-05, + "loss": 0.1668, + "step": 6667 + }, + { + "epoch": 2.462333825701625, + "grad_norm": 0.25901490449905396, + "learning_rate": 3.586648602044587e-05, + "loss": 0.1603, + "step": 6668 + }, + { + "epoch": 2.462703101920236, + "grad_norm": 0.21770717203617096, + "learning_rate": 3.584185244488238e-05, + "loss": 0.1578, + "step": 6669 + }, + { + "epoch": 2.4630723781388477, + "grad_norm": 0.24670730531215668, + "learning_rate": 3.581721886931889e-05, + "loss": 0.1763, + "step": 6670 + }, + { + "epoch": 2.4634416543574593, + "grad_norm": 0.23688511550426483, + "learning_rate": 3.5792585293755396e-05, + "loss": 0.1438, + "step": 6671 + }, + { + "epoch": 2.463810930576071, + "grad_norm": 0.2523290812969208, + "learning_rate": 3.57679517181919e-05, + "loss": 0.1579, + "step": 6672 + }, + { + "epoch": 2.4641802067946825, + "grad_norm": 0.277069628238678, + "learning_rate": 3.5743318142628405e-05, + "loss": 0.1785, + "step": 6673 + }, + { + "epoch": 2.464549483013294, + "grad_norm": 0.23258350789546967, + "learning_rate": 3.571868456706491e-05, + "loss": 0.1411, + "step": 6674 + }, + { + "epoch": 2.4649187592319053, + "grad_norm": 0.2636997699737549, + "learning_rate": 3.5694050991501414e-05, + "loss": 0.1815, + "step": 6675 + }, + { + "epoch": 2.465288035450517, + "grad_norm": 0.2412007749080658, + "learning_rate": 3.566941741593792e-05, + "loss": 0.1541, + "step": 6676 + }, + { + "epoch": 2.4656573116691285, + "grad_norm": 0.258176326751709, + "learning_rate": 3.564478384037443e-05, + "loss": 0.1576, + "step": 6677 + }, + { + "epoch": 2.46602658788774, + "grad_norm": 0.25865378975868225, + "learning_rate": 3.562015026481094e-05, + "loss": 0.1414, + "step": 6678 + }, + { + "epoch": 2.4663958641063517, + "grad_norm": 0.2801617681980133, + "learning_rate": 3.5595516689247445e-05, + "loss": 0.1779, + "step": 6679 + }, + { + "epoch": 2.466765140324963, + "grad_norm": 0.26042041182518005, + "learning_rate": 3.557088311368395e-05, + "loss": 0.1565, + "step": 6680 + }, + { + "epoch": 2.4671344165435745, + "grad_norm": 0.2893603444099426, + "learning_rate": 3.5546249538120454e-05, + "loss": 0.1868, + "step": 6681 + }, + { + "epoch": 2.467503692762186, + "grad_norm": 0.2607285678386688, + "learning_rate": 3.552161596255696e-05, + "loss": 0.1602, + "step": 6682 + }, + { + "epoch": 2.4678729689807977, + "grad_norm": 0.3251848518848419, + "learning_rate": 3.549698238699347e-05, + "loss": 0.1926, + "step": 6683 + }, + { + "epoch": 2.4682422451994093, + "grad_norm": 0.2854210436344147, + "learning_rate": 3.547234881142998e-05, + "loss": 0.1996, + "step": 6684 + }, + { + "epoch": 2.468611521418021, + "grad_norm": 0.2856009304523468, + "learning_rate": 3.5447715235866486e-05, + "loss": 0.166, + "step": 6685 + }, + { + "epoch": 2.468980797636632, + "grad_norm": 0.23951977491378784, + "learning_rate": 3.5423081660302994e-05, + "loss": 0.1473, + "step": 6686 + }, + { + "epoch": 2.4693500738552436, + "grad_norm": 0.22596383094787598, + "learning_rate": 3.53984480847395e-05, + "loss": 0.1466, + "step": 6687 + }, + { + "epoch": 2.4697193500738552, + "grad_norm": 0.3350682556629181, + "learning_rate": 3.537381450917601e-05, + "loss": 0.1949, + "step": 6688 + }, + { + "epoch": 2.470088626292467, + "grad_norm": 0.22620722651481628, + "learning_rate": 3.534918093361251e-05, + "loss": 0.1612, + "step": 6689 + }, + { + "epoch": 2.4704579025110784, + "grad_norm": 0.2508251368999481, + "learning_rate": 3.532454735804902e-05, + "loss": 0.1569, + "step": 6690 + }, + { + "epoch": 2.4708271787296896, + "grad_norm": 0.23386171460151672, + "learning_rate": 3.5299913782485526e-05, + "loss": 0.1642, + "step": 6691 + }, + { + "epoch": 2.471196454948301, + "grad_norm": 0.29422685503959656, + "learning_rate": 3.5275280206922034e-05, + "loss": 0.1694, + "step": 6692 + }, + { + "epoch": 2.471565731166913, + "grad_norm": 0.3166307210922241, + "learning_rate": 3.525064663135854e-05, + "loss": 0.1779, + "step": 6693 + }, + { + "epoch": 2.4719350073855244, + "grad_norm": 0.2737945318222046, + "learning_rate": 3.522601305579505e-05, + "loss": 0.1501, + "step": 6694 + }, + { + "epoch": 2.472304283604136, + "grad_norm": 0.2566399574279785, + "learning_rate": 3.520137948023156e-05, + "loss": 0.1773, + "step": 6695 + }, + { + "epoch": 2.4726735598227476, + "grad_norm": 0.21264776587486267, + "learning_rate": 3.5176745904668065e-05, + "loss": 0.1362, + "step": 6696 + }, + { + "epoch": 2.4730428360413588, + "grad_norm": 0.2826724648475647, + "learning_rate": 3.5152112329104566e-05, + "loss": 0.1568, + "step": 6697 + }, + { + "epoch": 2.4734121122599704, + "grad_norm": 0.23783594369888306, + "learning_rate": 3.5127478753541074e-05, + "loss": 0.1485, + "step": 6698 + }, + { + "epoch": 2.473781388478582, + "grad_norm": 0.2671682834625244, + "learning_rate": 3.510284517797758e-05, + "loss": 0.1573, + "step": 6699 + }, + { + "epoch": 2.4741506646971936, + "grad_norm": 0.29095926880836487, + "learning_rate": 3.507821160241409e-05, + "loss": 0.1814, + "step": 6700 + }, + { + "epoch": 2.4741506646971936, + "eval_loss": 8.811786651611328, + "eval_runtime": 6.9228, + "eval_samples_per_second": 7.223, + "eval_steps_per_second": 1.011, + "step": 6700 + }, + { + "epoch": 2.474519940915805, + "grad_norm": 0.21582616865634918, + "learning_rate": 3.50535780268506e-05, + "loss": 0.1496, + "step": 6701 + }, + { + "epoch": 2.4748892171344163, + "grad_norm": 0.21327340602874756, + "learning_rate": 3.5028944451287106e-05, + "loss": 0.1312, + "step": 6702 + }, + { + "epoch": 2.475258493353028, + "grad_norm": 0.24499481916427612, + "learning_rate": 3.5004310875723614e-05, + "loss": 0.1587, + "step": 6703 + }, + { + "epoch": 2.4756277695716395, + "grad_norm": 0.22406260669231415, + "learning_rate": 3.497967730016012e-05, + "loss": 0.1431, + "step": 6704 + }, + { + "epoch": 2.475997045790251, + "grad_norm": 0.28711971640586853, + "learning_rate": 3.495504372459662e-05, + "loss": 0.1704, + "step": 6705 + }, + { + "epoch": 2.4763663220088628, + "grad_norm": 0.3145855665206909, + "learning_rate": 3.493041014903313e-05, + "loss": 0.1747, + "step": 6706 + }, + { + "epoch": 2.4767355982274744, + "grad_norm": 0.25399473309516907, + "learning_rate": 3.490577657346964e-05, + "loss": 0.1794, + "step": 6707 + }, + { + "epoch": 2.4771048744460855, + "grad_norm": 0.24246756732463837, + "learning_rate": 3.4881142997906146e-05, + "loss": 0.1598, + "step": 6708 + }, + { + "epoch": 2.477474150664697, + "grad_norm": 0.24297937750816345, + "learning_rate": 3.4856509422342654e-05, + "loss": 0.151, + "step": 6709 + }, + { + "epoch": 2.4778434268833087, + "grad_norm": 0.24939250946044922, + "learning_rate": 3.483187584677916e-05, + "loss": 0.163, + "step": 6710 + }, + { + "epoch": 2.4782127031019203, + "grad_norm": 0.2589324414730072, + "learning_rate": 3.480724227121567e-05, + "loss": 0.1629, + "step": 6711 + }, + { + "epoch": 2.478581979320532, + "grad_norm": 0.22194306552410126, + "learning_rate": 3.478260869565218e-05, + "loss": 0.1608, + "step": 6712 + }, + { + "epoch": 2.478951255539143, + "grad_norm": 0.27636653184890747, + "learning_rate": 3.475797512008868e-05, + "loss": 0.1853, + "step": 6713 + }, + { + "epoch": 2.4793205317577547, + "grad_norm": 0.4495850205421448, + "learning_rate": 3.4733341544525187e-05, + "loss": 0.2005, + "step": 6714 + }, + { + "epoch": 2.4796898079763663, + "grad_norm": 0.26991376280784607, + "learning_rate": 3.4708707968961694e-05, + "loss": 0.1667, + "step": 6715 + }, + { + "epoch": 2.480059084194978, + "grad_norm": 0.2566462755203247, + "learning_rate": 3.46840743933982e-05, + "loss": 0.1723, + "step": 6716 + }, + { + "epoch": 2.4804283604135895, + "grad_norm": 0.24712300300598145, + "learning_rate": 3.465944081783471e-05, + "loss": 0.1774, + "step": 6717 + }, + { + "epoch": 2.480797636632201, + "grad_norm": 0.28069454431533813, + "learning_rate": 3.463480724227122e-05, + "loss": 0.1572, + "step": 6718 + }, + { + "epoch": 2.4811669128508123, + "grad_norm": 0.2797495126724243, + "learning_rate": 3.4610173666707726e-05, + "loss": 0.1734, + "step": 6719 + }, + { + "epoch": 2.481536189069424, + "grad_norm": 0.27008047699928284, + "learning_rate": 3.4585540091144234e-05, + "loss": 0.1539, + "step": 6720 + }, + { + "epoch": 2.4819054652880355, + "grad_norm": 0.24062402546405792, + "learning_rate": 3.4560906515580735e-05, + "loss": 0.1589, + "step": 6721 + }, + { + "epoch": 2.482274741506647, + "grad_norm": 0.2584007978439331, + "learning_rate": 3.453627294001724e-05, + "loss": 0.1546, + "step": 6722 + }, + { + "epoch": 2.4826440177252587, + "grad_norm": 0.25408095121383667, + "learning_rate": 3.451163936445375e-05, + "loss": 0.143, + "step": 6723 + }, + { + "epoch": 2.48301329394387, + "grad_norm": 0.34458717703819275, + "learning_rate": 3.448700578889026e-05, + "loss": 0.1511, + "step": 6724 + }, + { + "epoch": 2.4833825701624814, + "grad_norm": 0.2452273964881897, + "learning_rate": 3.4462372213326766e-05, + "loss": 0.1578, + "step": 6725 + }, + { + "epoch": 2.483751846381093, + "grad_norm": 0.26042330265045166, + "learning_rate": 3.4437738637763274e-05, + "loss": 0.1503, + "step": 6726 + }, + { + "epoch": 2.4841211225997046, + "grad_norm": 0.26157912611961365, + "learning_rate": 3.441310506219978e-05, + "loss": 0.1323, + "step": 6727 + }, + { + "epoch": 2.4844903988183162, + "grad_norm": 0.25223198533058167, + "learning_rate": 3.438847148663629e-05, + "loss": 0.1824, + "step": 6728 + }, + { + "epoch": 2.484859675036928, + "grad_norm": 0.34162068367004395, + "learning_rate": 3.436383791107279e-05, + "loss": 0.1553, + "step": 6729 + }, + { + "epoch": 2.485228951255539, + "grad_norm": 0.27832159399986267, + "learning_rate": 3.43392043355093e-05, + "loss": 0.1608, + "step": 6730 + }, + { + "epoch": 2.4855982274741506, + "grad_norm": 0.24827948212623596, + "learning_rate": 3.431457075994581e-05, + "loss": 0.1521, + "step": 6731 + }, + { + "epoch": 2.485967503692762, + "grad_norm": 0.3118678331375122, + "learning_rate": 3.4289937184382315e-05, + "loss": 0.179, + "step": 6732 + }, + { + "epoch": 2.486336779911374, + "grad_norm": 0.254609078168869, + "learning_rate": 3.426530360881882e-05, + "loss": 0.1711, + "step": 6733 + }, + { + "epoch": 2.4867060561299854, + "grad_norm": 0.2571640610694885, + "learning_rate": 3.424067003325533e-05, + "loss": 0.1536, + "step": 6734 + }, + { + "epoch": 2.4870753323485966, + "grad_norm": 0.2874715030193329, + "learning_rate": 3.421603645769184e-05, + "loss": 0.1765, + "step": 6735 + }, + { + "epoch": 2.487444608567208, + "grad_norm": 0.2608601450920105, + "learning_rate": 3.4191402882128346e-05, + "loss": 0.1658, + "step": 6736 + }, + { + "epoch": 2.4878138847858198, + "grad_norm": 0.22421011328697205, + "learning_rate": 3.416676930656485e-05, + "loss": 0.1443, + "step": 6737 + }, + { + "epoch": 2.4881831610044314, + "grad_norm": 0.2357330322265625, + "learning_rate": 3.4142135731001355e-05, + "loss": 0.1575, + "step": 6738 + }, + { + "epoch": 2.488552437223043, + "grad_norm": 0.3060683608055115, + "learning_rate": 3.411750215543786e-05, + "loss": 0.1793, + "step": 6739 + }, + { + "epoch": 2.4889217134416546, + "grad_norm": 0.27132004499435425, + "learning_rate": 3.409286857987437e-05, + "loss": 0.1702, + "step": 6740 + }, + { + "epoch": 2.4892909896602657, + "grad_norm": 0.290730744600296, + "learning_rate": 3.406823500431088e-05, + "loss": 0.1651, + "step": 6741 + }, + { + "epoch": 2.4896602658788773, + "grad_norm": 0.24577991664409637, + "learning_rate": 3.4043601428747386e-05, + "loss": 0.1424, + "step": 6742 + }, + { + "epoch": 2.490029542097489, + "grad_norm": 0.2577519714832306, + "learning_rate": 3.4018967853183894e-05, + "loss": 0.1603, + "step": 6743 + }, + { + "epoch": 2.4903988183161005, + "grad_norm": 0.2726038694381714, + "learning_rate": 3.39943342776204e-05, + "loss": 0.1571, + "step": 6744 + }, + { + "epoch": 2.490768094534712, + "grad_norm": 0.26232436299324036, + "learning_rate": 3.39697007020569e-05, + "loss": 0.1829, + "step": 6745 + }, + { + "epoch": 2.4911373707533233, + "grad_norm": 0.3218645453453064, + "learning_rate": 3.394506712649341e-05, + "loss": 0.172, + "step": 6746 + }, + { + "epoch": 2.491506646971935, + "grad_norm": 0.24506910145282745, + "learning_rate": 3.392043355092992e-05, + "loss": 0.147, + "step": 6747 + }, + { + "epoch": 2.4918759231905465, + "grad_norm": 0.22498810291290283, + "learning_rate": 3.389579997536643e-05, + "loss": 0.1589, + "step": 6748 + }, + { + "epoch": 2.492245199409158, + "grad_norm": 0.3201974332332611, + "learning_rate": 3.3871166399802935e-05, + "loss": 0.1788, + "step": 6749 + }, + { + "epoch": 2.4926144756277697, + "grad_norm": 0.25516536831855774, + "learning_rate": 3.384653282423944e-05, + "loss": 0.1878, + "step": 6750 + }, + { + "epoch": 2.4926144756277697, + "eval_loss": 8.830473899841309, + "eval_runtime": 7.0208, + "eval_samples_per_second": 7.122, + "eval_steps_per_second": 0.997, + "step": 6750 + }, + { + "epoch": 2.4929837518463813, + "grad_norm": 0.2762545645236969, + "learning_rate": 3.382189924867595e-05, + "loss": 0.146, + "step": 6751 + }, + { + "epoch": 2.4933530280649925, + "grad_norm": 0.2369060516357422, + "learning_rate": 3.379726567311245e-05, + "loss": 0.1654, + "step": 6752 + }, + { + "epoch": 2.493722304283604, + "grad_norm": 0.299869567155838, + "learning_rate": 3.377263209754896e-05, + "loss": 0.1695, + "step": 6753 + }, + { + "epoch": 2.4940915805022157, + "grad_norm": 0.2743680477142334, + "learning_rate": 3.374799852198547e-05, + "loss": 0.1669, + "step": 6754 + }, + { + "epoch": 2.4944608567208273, + "grad_norm": 0.2761988639831543, + "learning_rate": 3.3723364946421975e-05, + "loss": 0.1721, + "step": 6755 + }, + { + "epoch": 2.494830132939439, + "grad_norm": 0.2742190957069397, + "learning_rate": 3.369873137085848e-05, + "loss": 0.1591, + "step": 6756 + }, + { + "epoch": 2.49519940915805, + "grad_norm": 0.2773594260215759, + "learning_rate": 3.367409779529499e-05, + "loss": 0.1455, + "step": 6757 + }, + { + "epoch": 2.4955686853766617, + "grad_norm": 0.22165824472904205, + "learning_rate": 3.36494642197315e-05, + "loss": 0.1388, + "step": 6758 + }, + { + "epoch": 2.4959379615952733, + "grad_norm": 0.27791768312454224, + "learning_rate": 3.3624830644168007e-05, + "loss": 0.1596, + "step": 6759 + }, + { + "epoch": 2.496307237813885, + "grad_norm": 0.29881131649017334, + "learning_rate": 3.360019706860451e-05, + "loss": 0.1772, + "step": 6760 + }, + { + "epoch": 2.4966765140324965, + "grad_norm": 0.2577894628047943, + "learning_rate": 3.3575563493041016e-05, + "loss": 0.1585, + "step": 6761 + }, + { + "epoch": 2.497045790251108, + "grad_norm": 0.2929326891899109, + "learning_rate": 3.355092991747752e-05, + "loss": 0.1449, + "step": 6762 + }, + { + "epoch": 2.4974150664697192, + "grad_norm": 0.3003034293651581, + "learning_rate": 3.352629634191403e-05, + "loss": 0.1676, + "step": 6763 + }, + { + "epoch": 2.497784342688331, + "grad_norm": 0.2359735667705536, + "learning_rate": 3.350166276635054e-05, + "loss": 0.1501, + "step": 6764 + }, + { + "epoch": 2.4981536189069424, + "grad_norm": 0.18972964584827423, + "learning_rate": 3.347702919078705e-05, + "loss": 0.1107, + "step": 6765 + }, + { + "epoch": 2.498522895125554, + "grad_norm": 0.23935313522815704, + "learning_rate": 3.3452395615223555e-05, + "loss": 0.1477, + "step": 6766 + }, + { + "epoch": 2.498892171344165, + "grad_norm": 0.2084641009569168, + "learning_rate": 3.342776203966006e-05, + "loss": 0.1462, + "step": 6767 + }, + { + "epoch": 2.499261447562777, + "grad_norm": 0.26808738708496094, + "learning_rate": 3.3403128464096564e-05, + "loss": 0.1477, + "step": 6768 + }, + { + "epoch": 2.4996307237813884, + "grad_norm": 0.2533901035785675, + "learning_rate": 3.337849488853307e-05, + "loss": 0.1533, + "step": 6769 + }, + { + "epoch": 2.5, + "grad_norm": 0.27896246314048767, + "learning_rate": 3.335386131296958e-05, + "loss": 0.1765, + "step": 6770 + }, + { + "epoch": 2.5003692762186116, + "grad_norm": 0.2557832896709442, + "learning_rate": 3.332922773740609e-05, + "loss": 0.1503, + "step": 6771 + }, + { + "epoch": 2.500738552437223, + "grad_norm": 0.29989272356033325, + "learning_rate": 3.3304594161842595e-05, + "loss": 0.1628, + "step": 6772 + }, + { + "epoch": 2.501107828655835, + "grad_norm": 0.2863641679286957, + "learning_rate": 3.32799605862791e-05, + "loss": 0.1667, + "step": 6773 + }, + { + "epoch": 2.501477104874446, + "grad_norm": 0.23772463202476501, + "learning_rate": 3.325532701071561e-05, + "loss": 0.161, + "step": 6774 + }, + { + "epoch": 2.5018463810930576, + "grad_norm": 0.25158512592315674, + "learning_rate": 3.323069343515212e-05, + "loss": 0.1691, + "step": 6775 + }, + { + "epoch": 2.502215657311669, + "grad_norm": 0.23783046007156372, + "learning_rate": 3.320605985958862e-05, + "loss": 0.1568, + "step": 6776 + }, + { + "epoch": 2.5025849335302808, + "grad_norm": 0.25783777236938477, + "learning_rate": 3.318142628402513e-05, + "loss": 0.1741, + "step": 6777 + }, + { + "epoch": 2.502954209748892, + "grad_norm": 0.2358204424381256, + "learning_rate": 3.3156792708461636e-05, + "loss": 0.1513, + "step": 6778 + }, + { + "epoch": 2.5033234859675035, + "grad_norm": 0.5107343792915344, + "learning_rate": 3.3132159132898144e-05, + "loss": 0.1544, + "step": 6779 + }, + { + "epoch": 2.503692762186115, + "grad_norm": 0.23131924867630005, + "learning_rate": 3.310752555733465e-05, + "loss": 0.1497, + "step": 6780 + }, + { + "epoch": 2.5040620384047267, + "grad_norm": 0.3461349606513977, + "learning_rate": 3.308289198177116e-05, + "loss": 0.174, + "step": 6781 + }, + { + "epoch": 2.5044313146233383, + "grad_norm": 0.28275614976882935, + "learning_rate": 3.305825840620767e-05, + "loss": 0.1447, + "step": 6782 + }, + { + "epoch": 2.50480059084195, + "grad_norm": 0.5512775182723999, + "learning_rate": 3.3033624830644175e-05, + "loss": 0.1544, + "step": 6783 + }, + { + "epoch": 2.5051698670605616, + "grad_norm": 0.21180443465709686, + "learning_rate": 3.3008991255080676e-05, + "loss": 0.1454, + "step": 6784 + }, + { + "epoch": 2.5055391432791727, + "grad_norm": 0.2845575511455536, + "learning_rate": 3.2984357679517184e-05, + "loss": 0.1562, + "step": 6785 + }, + { + "epoch": 2.5059084194977843, + "grad_norm": 0.23931200802326202, + "learning_rate": 3.295972410395369e-05, + "loss": 0.1599, + "step": 6786 + }, + { + "epoch": 2.506277695716396, + "grad_norm": 0.23637863993644714, + "learning_rate": 3.29350905283902e-05, + "loss": 0.1502, + "step": 6787 + }, + { + "epoch": 2.5066469719350075, + "grad_norm": 0.24636603891849518, + "learning_rate": 3.291045695282671e-05, + "loss": 0.1443, + "step": 6788 + }, + { + "epoch": 2.5070162481536187, + "grad_norm": 0.2545834481716156, + "learning_rate": 3.2885823377263215e-05, + "loss": 0.1449, + "step": 6789 + }, + { + "epoch": 2.5073855243722303, + "grad_norm": 0.25286591053009033, + "learning_rate": 3.286118980169972e-05, + "loss": 0.1583, + "step": 6790 + }, + { + "epoch": 2.507754800590842, + "grad_norm": 0.20704224705696106, + "learning_rate": 3.2836556226136224e-05, + "loss": 0.1414, + "step": 6791 + }, + { + "epoch": 2.5081240768094535, + "grad_norm": 0.22646434605121613, + "learning_rate": 3.281192265057273e-05, + "loss": 0.1659, + "step": 6792 + }, + { + "epoch": 2.508493353028065, + "grad_norm": 0.3176220655441284, + "learning_rate": 3.278728907500923e-05, + "loss": 0.1582, + "step": 6793 + }, + { + "epoch": 2.5088626292466767, + "grad_norm": 0.19691136479377747, + "learning_rate": 3.276265549944574e-05, + "loss": 0.1344, + "step": 6794 + }, + { + "epoch": 2.5092319054652883, + "grad_norm": 0.25748586654663086, + "learning_rate": 3.273802192388225e-05, + "loss": 0.1388, + "step": 6795 + }, + { + "epoch": 2.5096011816838995, + "grad_norm": 0.2530933618545532, + "learning_rate": 3.271338834831876e-05, + "loss": 0.1668, + "step": 6796 + }, + { + "epoch": 2.509970457902511, + "grad_norm": 0.2544432580471039, + "learning_rate": 3.2688754772755265e-05, + "loss": 0.1716, + "step": 6797 + }, + { + "epoch": 2.5103397341211227, + "grad_norm": 0.2255215346813202, + "learning_rate": 3.266412119719177e-05, + "loss": 0.1469, + "step": 6798 + }, + { + "epoch": 2.5107090103397343, + "grad_norm": 0.2419660985469818, + "learning_rate": 3.263948762162828e-05, + "loss": 0.1519, + "step": 6799 + }, + { + "epoch": 2.5110782865583454, + "grad_norm": 0.2823397219181061, + "learning_rate": 3.261485404606479e-05, + "loss": 0.1813, + "step": 6800 + }, + { + "epoch": 2.5110782865583454, + "eval_loss": 8.859306335449219, + "eval_runtime": 6.8994, + "eval_samples_per_second": 7.247, + "eval_steps_per_second": 1.015, + "step": 6800 + }, + { + "epoch": 2.511447562776957, + "grad_norm": 0.23401859402656555, + "learning_rate": 3.259022047050129e-05, + "loss": 0.1483, + "step": 6801 + }, + { + "epoch": 2.5118168389955686, + "grad_norm": 0.28507429361343384, + "learning_rate": 3.25655868949378e-05, + "loss": 0.1584, + "step": 6802 + }, + { + "epoch": 2.5121861152141802, + "grad_norm": 0.2034989893436432, + "learning_rate": 3.2540953319374305e-05, + "loss": 0.1268, + "step": 6803 + }, + { + "epoch": 2.512555391432792, + "grad_norm": 0.2903268337249756, + "learning_rate": 3.251631974381081e-05, + "loss": 0.1607, + "step": 6804 + }, + { + "epoch": 2.5129246676514034, + "grad_norm": 0.26071372628211975, + "learning_rate": 3.249168616824732e-05, + "loss": 0.1679, + "step": 6805 + }, + { + "epoch": 2.513293943870015, + "grad_norm": 0.24649475514888763, + "learning_rate": 3.246705259268383e-05, + "loss": 0.1566, + "step": 6806 + }, + { + "epoch": 2.513663220088626, + "grad_norm": 0.24315010011196136, + "learning_rate": 3.2442419017120337e-05, + "loss": 0.157, + "step": 6807 + }, + { + "epoch": 2.514032496307238, + "grad_norm": 0.22827361524105072, + "learning_rate": 3.2417785441556844e-05, + "loss": 0.1459, + "step": 6808 + }, + { + "epoch": 2.5144017725258494, + "grad_norm": 0.2596340477466583, + "learning_rate": 3.2393151865993346e-05, + "loss": 0.192, + "step": 6809 + }, + { + "epoch": 2.514771048744461, + "grad_norm": 0.23717932403087616, + "learning_rate": 3.236851829042985e-05, + "loss": 0.1496, + "step": 6810 + }, + { + "epoch": 2.515140324963072, + "grad_norm": 0.23985400795936584, + "learning_rate": 3.234388471486636e-05, + "loss": 0.1547, + "step": 6811 + }, + { + "epoch": 2.5155096011816838, + "grad_norm": 0.21521057188510895, + "learning_rate": 3.231925113930287e-05, + "loss": 0.1372, + "step": 6812 + }, + { + "epoch": 2.5158788774002954, + "grad_norm": 0.24097222089767456, + "learning_rate": 3.229461756373938e-05, + "loss": 0.1318, + "step": 6813 + }, + { + "epoch": 2.516248153618907, + "grad_norm": 0.256532222032547, + "learning_rate": 3.2269983988175885e-05, + "loss": 0.1485, + "step": 6814 + }, + { + "epoch": 2.5166174298375186, + "grad_norm": 0.2318410575389862, + "learning_rate": 3.224535041261239e-05, + "loss": 0.1316, + "step": 6815 + }, + { + "epoch": 2.51698670605613, + "grad_norm": 0.24775995314121246, + "learning_rate": 3.22207168370489e-05, + "loss": 0.1727, + "step": 6816 + }, + { + "epoch": 2.5173559822747418, + "grad_norm": 0.2640649676322937, + "learning_rate": 3.21960832614854e-05, + "loss": 0.1635, + "step": 6817 + }, + { + "epoch": 2.517725258493353, + "grad_norm": 0.26359236240386963, + "learning_rate": 3.217144968592191e-05, + "loss": 0.1918, + "step": 6818 + }, + { + "epoch": 2.5180945347119645, + "grad_norm": 0.27677151560783386, + "learning_rate": 3.214681611035842e-05, + "loss": 0.1612, + "step": 6819 + }, + { + "epoch": 2.518463810930576, + "grad_norm": 0.24239493906497955, + "learning_rate": 3.2122182534794925e-05, + "loss": 0.1494, + "step": 6820 + }, + { + "epoch": 2.5188330871491877, + "grad_norm": 0.2830536365509033, + "learning_rate": 3.209754895923143e-05, + "loss": 0.1953, + "step": 6821 + }, + { + "epoch": 2.519202363367799, + "grad_norm": 0.237086221575737, + "learning_rate": 3.207291538366794e-05, + "loss": 0.155, + "step": 6822 + }, + { + "epoch": 2.5195716395864105, + "grad_norm": 0.20839619636535645, + "learning_rate": 3.204828180810445e-05, + "loss": 0.159, + "step": 6823 + }, + { + "epoch": 2.519940915805022, + "grad_norm": 0.2132973074913025, + "learning_rate": 3.202364823254096e-05, + "loss": 0.1653, + "step": 6824 + }, + { + "epoch": 2.5203101920236337, + "grad_norm": 0.25597721338272095, + "learning_rate": 3.199901465697746e-05, + "loss": 0.1588, + "step": 6825 + }, + { + "epoch": 2.5206794682422453, + "grad_norm": 0.2560158967971802, + "learning_rate": 3.1974381081413966e-05, + "loss": 0.1533, + "step": 6826 + }, + { + "epoch": 2.521048744460857, + "grad_norm": 0.2048921138048172, + "learning_rate": 3.1949747505850474e-05, + "loss": 0.1575, + "step": 6827 + }, + { + "epoch": 2.5214180206794685, + "grad_norm": 0.22768859565258026, + "learning_rate": 3.192511393028698e-05, + "loss": 0.1439, + "step": 6828 + }, + { + "epoch": 2.5217872968980797, + "grad_norm": 0.3183407485485077, + "learning_rate": 3.190048035472349e-05, + "loss": 0.1781, + "step": 6829 + }, + { + "epoch": 2.5221565731166913, + "grad_norm": 0.21865598857402802, + "learning_rate": 3.187584677916e-05, + "loss": 0.1443, + "step": 6830 + }, + { + "epoch": 2.522525849335303, + "grad_norm": 0.264291912317276, + "learning_rate": 3.1851213203596505e-05, + "loss": 0.1632, + "step": 6831 + }, + { + "epoch": 2.5228951255539145, + "grad_norm": 0.2688767611980438, + "learning_rate": 3.182657962803301e-05, + "loss": 0.1551, + "step": 6832 + }, + { + "epoch": 2.5232644017725256, + "grad_norm": 0.23574908077716827, + "learning_rate": 3.1801946052469514e-05, + "loss": 0.1538, + "step": 6833 + }, + { + "epoch": 2.5236336779911372, + "grad_norm": 0.2748833894729614, + "learning_rate": 3.177731247690602e-05, + "loss": 0.1792, + "step": 6834 + }, + { + "epoch": 2.524002954209749, + "grad_norm": 0.25703164935112, + "learning_rate": 3.175267890134253e-05, + "loss": 0.1796, + "step": 6835 + }, + { + "epoch": 2.5243722304283605, + "grad_norm": 0.29320061206817627, + "learning_rate": 3.172804532577904e-05, + "loss": 0.181, + "step": 6836 + }, + { + "epoch": 2.524741506646972, + "grad_norm": 0.27898555994033813, + "learning_rate": 3.1703411750215545e-05, + "loss": 0.1411, + "step": 6837 + }, + { + "epoch": 2.5251107828655837, + "grad_norm": 0.20139668881893158, + "learning_rate": 3.167877817465205e-05, + "loss": 0.1356, + "step": 6838 + }, + { + "epoch": 2.525480059084195, + "grad_norm": 0.26012757420539856, + "learning_rate": 3.165414459908856e-05, + "loss": 0.1566, + "step": 6839 + }, + { + "epoch": 2.5258493353028064, + "grad_norm": 0.266966849565506, + "learning_rate": 3.162951102352507e-05, + "loss": 0.1569, + "step": 6840 + }, + { + "epoch": 2.526218611521418, + "grad_norm": 0.2548103332519531, + "learning_rate": 3.160487744796157e-05, + "loss": 0.152, + "step": 6841 + }, + { + "epoch": 2.5265878877400296, + "grad_norm": 0.2481808215379715, + "learning_rate": 3.158024387239808e-05, + "loss": 0.1747, + "step": 6842 + }, + { + "epoch": 2.5269571639586412, + "grad_norm": 0.2738502025604248, + "learning_rate": 3.1555610296834586e-05, + "loss": 0.1971, + "step": 6843 + }, + { + "epoch": 2.5273264401772524, + "grad_norm": 0.3156393766403198, + "learning_rate": 3.1530976721271094e-05, + "loss": 0.168, + "step": 6844 + }, + { + "epoch": 2.527695716395864, + "grad_norm": 0.28380993008613586, + "learning_rate": 3.15063431457076e-05, + "loss": 0.1579, + "step": 6845 + }, + { + "epoch": 2.5280649926144756, + "grad_norm": 0.2537134289741516, + "learning_rate": 3.148170957014411e-05, + "loss": 0.154, + "step": 6846 + }, + { + "epoch": 2.528434268833087, + "grad_norm": 0.27439358830451965, + "learning_rate": 3.145707599458062e-05, + "loss": 0.1558, + "step": 6847 + }, + { + "epoch": 2.528803545051699, + "grad_norm": 0.26842087507247925, + "learning_rate": 3.1432442419017125e-05, + "loss": 0.1839, + "step": 6848 + }, + { + "epoch": 2.5291728212703104, + "grad_norm": 0.339095801115036, + "learning_rate": 3.1407808843453626e-05, + "loss": 0.1529, + "step": 6849 + }, + { + "epoch": 2.5295420974889216, + "grad_norm": 0.2560567259788513, + "learning_rate": 3.1383175267890134e-05, + "loss": 0.1766, + "step": 6850 + }, + { + "epoch": 2.5295420974889216, + "eval_loss": 8.870379447937012, + "eval_runtime": 6.9043, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 1.014, + "step": 6850 + }, + { + "epoch": 2.529911373707533, + "grad_norm": 0.24540522694587708, + "learning_rate": 3.135854169232664e-05, + "loss": 0.1613, + "step": 6851 + }, + { + "epoch": 2.5302806499261448, + "grad_norm": 0.2619554102420807, + "learning_rate": 3.133390811676315e-05, + "loss": 0.156, + "step": 6852 + }, + { + "epoch": 2.5306499261447564, + "grad_norm": 0.25315919518470764, + "learning_rate": 3.130927454119966e-05, + "loss": 0.1542, + "step": 6853 + }, + { + "epoch": 2.5310192023633675, + "grad_norm": 0.226703479886055, + "learning_rate": 3.1284640965636165e-05, + "loss": 0.1671, + "step": 6854 + }, + { + "epoch": 2.531388478581979, + "grad_norm": 0.25560879707336426, + "learning_rate": 3.126000739007267e-05, + "loss": 0.1602, + "step": 6855 + }, + { + "epoch": 2.5317577548005907, + "grad_norm": 0.277718186378479, + "learning_rate": 3.123537381450918e-05, + "loss": 0.1584, + "step": 6856 + }, + { + "epoch": 2.5321270310192023, + "grad_norm": 0.2714185118675232, + "learning_rate": 3.121074023894568e-05, + "loss": 0.1437, + "step": 6857 + }, + { + "epoch": 2.532496307237814, + "grad_norm": 0.26761338114738464, + "learning_rate": 3.118610666338219e-05, + "loss": 0.1621, + "step": 6858 + }, + { + "epoch": 2.5328655834564255, + "grad_norm": 0.26824235916137695, + "learning_rate": 3.11614730878187e-05, + "loss": 0.1624, + "step": 6859 + }, + { + "epoch": 2.533234859675037, + "grad_norm": 0.2979130446910858, + "learning_rate": 3.1136839512255206e-05, + "loss": 0.1705, + "step": 6860 + }, + { + "epoch": 2.5336041358936483, + "grad_norm": 0.29294320940971375, + "learning_rate": 3.1112205936691714e-05, + "loss": 0.1551, + "step": 6861 + }, + { + "epoch": 2.53397341211226, + "grad_norm": 0.23470883071422577, + "learning_rate": 3.108757236112822e-05, + "loss": 0.1516, + "step": 6862 + }, + { + "epoch": 2.5343426883308715, + "grad_norm": 0.23841427266597748, + "learning_rate": 3.106293878556473e-05, + "loss": 0.1416, + "step": 6863 + }, + { + "epoch": 2.534711964549483, + "grad_norm": 0.22230716049671173, + "learning_rate": 3.103830521000123e-05, + "loss": 0.1398, + "step": 6864 + }, + { + "epoch": 2.5350812407680943, + "grad_norm": 0.29489973187446594, + "learning_rate": 3.101367163443774e-05, + "loss": 0.1744, + "step": 6865 + }, + { + "epoch": 2.535450516986706, + "grad_norm": 0.24104700982570648, + "learning_rate": 3.0989038058874246e-05, + "loss": 0.1346, + "step": 6866 + }, + { + "epoch": 2.5358197932053175, + "grad_norm": 0.3276706337928772, + "learning_rate": 3.0964404483310754e-05, + "loss": 0.1381, + "step": 6867 + }, + { + "epoch": 2.536189069423929, + "grad_norm": 0.26013678312301636, + "learning_rate": 3.093977090774726e-05, + "loss": 0.1498, + "step": 6868 + }, + { + "epoch": 2.5365583456425407, + "grad_norm": 0.28557059168815613, + "learning_rate": 3.091513733218377e-05, + "loss": 0.1656, + "step": 6869 + }, + { + "epoch": 2.5369276218611523, + "grad_norm": 0.2536207437515259, + "learning_rate": 3.089050375662028e-05, + "loss": 0.1556, + "step": 6870 + }, + { + "epoch": 2.537296898079764, + "grad_norm": 0.285200297832489, + "learning_rate": 3.0865870181056786e-05, + "loss": 0.1643, + "step": 6871 + }, + { + "epoch": 2.537666174298375, + "grad_norm": 0.23827961087226868, + "learning_rate": 3.084123660549329e-05, + "loss": 0.1512, + "step": 6872 + }, + { + "epoch": 2.5380354505169866, + "grad_norm": 0.2906091511249542, + "learning_rate": 3.0816603029929795e-05, + "loss": 0.1533, + "step": 6873 + }, + { + "epoch": 2.5384047267355982, + "grad_norm": 0.23596736788749695, + "learning_rate": 3.07919694543663e-05, + "loss": 0.1827, + "step": 6874 + }, + { + "epoch": 2.53877400295421, + "grad_norm": 0.2751716375350952, + "learning_rate": 3.076733587880281e-05, + "loss": 0.1647, + "step": 6875 + }, + { + "epoch": 2.539143279172821, + "grad_norm": 0.311139851808548, + "learning_rate": 3.074270230323932e-05, + "loss": 0.1731, + "step": 6876 + }, + { + "epoch": 2.5395125553914326, + "grad_norm": 0.29657259583473206, + "learning_rate": 3.0718068727675826e-05, + "loss": 0.1834, + "step": 6877 + }, + { + "epoch": 2.539881831610044, + "grad_norm": 0.34616801142692566, + "learning_rate": 3.0693435152112334e-05, + "loss": 0.1879, + "step": 6878 + }, + { + "epoch": 2.540251107828656, + "grad_norm": 0.26304179430007935, + "learning_rate": 3.066880157654884e-05, + "loss": 0.1543, + "step": 6879 + }, + { + "epoch": 2.5406203840472674, + "grad_norm": 0.2697470784187317, + "learning_rate": 3.064416800098534e-05, + "loss": 0.138, + "step": 6880 + }, + { + "epoch": 2.540989660265879, + "grad_norm": 0.2795422077178955, + "learning_rate": 3.061953442542185e-05, + "loss": 0.1525, + "step": 6881 + }, + { + "epoch": 2.5413589364844906, + "grad_norm": 0.254029244184494, + "learning_rate": 3.059490084985836e-05, + "loss": 0.1505, + "step": 6882 + }, + { + "epoch": 2.541728212703102, + "grad_norm": 0.23972146213054657, + "learning_rate": 3.0570267274294866e-05, + "loss": 0.1325, + "step": 6883 + }, + { + "epoch": 2.5420974889217134, + "grad_norm": 0.24427391588687897, + "learning_rate": 3.0545633698731374e-05, + "loss": 0.1575, + "step": 6884 + }, + { + "epoch": 2.542466765140325, + "grad_norm": 0.25511693954467773, + "learning_rate": 3.052100012316788e-05, + "loss": 0.1753, + "step": 6885 + }, + { + "epoch": 2.5428360413589366, + "grad_norm": 0.2553637623786926, + "learning_rate": 3.0496366547604387e-05, + "loss": 0.1433, + "step": 6886 + }, + { + "epoch": 2.5432053175775478, + "grad_norm": 0.2689611613750458, + "learning_rate": 3.0471732972040894e-05, + "loss": 0.1537, + "step": 6887 + }, + { + "epoch": 2.5435745937961594, + "grad_norm": 0.2893809378147125, + "learning_rate": 3.0447099396477402e-05, + "loss": 0.1627, + "step": 6888 + }, + { + "epoch": 2.543943870014771, + "grad_norm": 0.34930410981178284, + "learning_rate": 3.042246582091391e-05, + "loss": 0.1885, + "step": 6889 + }, + { + "epoch": 2.5443131462333826, + "grad_norm": 0.22365716099739075, + "learning_rate": 3.0397832245350415e-05, + "loss": 0.1376, + "step": 6890 + }, + { + "epoch": 2.544682422451994, + "grad_norm": 0.2957543730735779, + "learning_rate": 3.0373198669786923e-05, + "loss": 0.1895, + "step": 6891 + }, + { + "epoch": 2.5450516986706058, + "grad_norm": 0.271034836769104, + "learning_rate": 3.034856509422343e-05, + "loss": 0.1476, + "step": 6892 + }, + { + "epoch": 2.5454209748892174, + "grad_norm": 0.23931564390659332, + "learning_rate": 3.0323931518659938e-05, + "loss": 0.1468, + "step": 6893 + }, + { + "epoch": 2.5457902511078285, + "grad_norm": 0.2622028589248657, + "learning_rate": 3.0299297943096443e-05, + "loss": 0.1744, + "step": 6894 + }, + { + "epoch": 2.54615952732644, + "grad_norm": 0.2857055962085724, + "learning_rate": 3.027466436753295e-05, + "loss": 0.1367, + "step": 6895 + }, + { + "epoch": 2.5465288035450517, + "grad_norm": 0.2550969123840332, + "learning_rate": 3.025003079196946e-05, + "loss": 0.1487, + "step": 6896 + }, + { + "epoch": 2.5468980797636633, + "grad_norm": 0.2802126407623291, + "learning_rate": 3.0225397216405966e-05, + "loss": 0.1451, + "step": 6897 + }, + { + "epoch": 2.5472673559822745, + "grad_norm": 0.31413230299949646, + "learning_rate": 3.020076364084247e-05, + "loss": 0.1722, + "step": 6898 + }, + { + "epoch": 2.547636632200886, + "grad_norm": 0.3008919358253479, + "learning_rate": 3.017613006527898e-05, + "loss": 0.1716, + "step": 6899 + }, + { + "epoch": 2.5480059084194977, + "grad_norm": 0.28647610545158386, + "learning_rate": 3.0151496489715487e-05, + "loss": 0.1691, + "step": 6900 + }, + { + "epoch": 2.5480059084194977, + "eval_loss": 8.829212188720703, + "eval_runtime": 6.9048, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 1.014, + "step": 6900 + }, + { + "epoch": 2.5483751846381093, + "grad_norm": 0.265356183052063, + "learning_rate": 3.0126862914151994e-05, + "loss": 0.1654, + "step": 6901 + }, + { + "epoch": 2.548744460856721, + "grad_norm": 0.2522689402103424, + "learning_rate": 3.01022293385885e-05, + "loss": 0.1536, + "step": 6902 + }, + { + "epoch": 2.5491137370753325, + "grad_norm": 0.2832428514957428, + "learning_rate": 3.0077595763025007e-05, + "loss": 0.1765, + "step": 6903 + }, + { + "epoch": 2.549483013293944, + "grad_norm": 0.271251380443573, + "learning_rate": 3.0052962187461515e-05, + "loss": 0.1756, + "step": 6904 + }, + { + "epoch": 2.5498522895125553, + "grad_norm": 0.27654266357421875, + "learning_rate": 3.0028328611898022e-05, + "loss": 0.1697, + "step": 6905 + }, + { + "epoch": 2.550221565731167, + "grad_norm": 0.2686871290206909, + "learning_rate": 3.0003695036334527e-05, + "loss": 0.1636, + "step": 6906 + }, + { + "epoch": 2.5505908419497785, + "grad_norm": 0.24213500320911407, + "learning_rate": 2.9979061460771028e-05, + "loss": 0.157, + "step": 6907 + }, + { + "epoch": 2.55096011816839, + "grad_norm": 0.2184593379497528, + "learning_rate": 2.9954427885207536e-05, + "loss": 0.1574, + "step": 6908 + }, + { + "epoch": 2.5513293943870012, + "grad_norm": 0.23742301762104034, + "learning_rate": 2.9929794309644044e-05, + "loss": 0.1422, + "step": 6909 + }, + { + "epoch": 2.551698670605613, + "grad_norm": 0.31525370478630066, + "learning_rate": 2.990516073408055e-05, + "loss": 0.1797, + "step": 6910 + }, + { + "epoch": 2.5520679468242244, + "grad_norm": 0.24646587669849396, + "learning_rate": 2.9880527158517056e-05, + "loss": 0.1577, + "step": 6911 + }, + { + "epoch": 2.552437223042836, + "grad_norm": 0.24509663879871368, + "learning_rate": 2.9855893582953564e-05, + "loss": 0.154, + "step": 6912 + }, + { + "epoch": 2.5528064992614476, + "grad_norm": 0.22193372249603271, + "learning_rate": 2.9831260007390072e-05, + "loss": 0.15, + "step": 6913 + }, + { + "epoch": 2.5531757754800593, + "grad_norm": 0.2361707091331482, + "learning_rate": 2.980662643182658e-05, + "loss": 0.1569, + "step": 6914 + }, + { + "epoch": 2.553545051698671, + "grad_norm": 0.27979180216789246, + "learning_rate": 2.9781992856263084e-05, + "loss": 0.1666, + "step": 6915 + }, + { + "epoch": 2.553914327917282, + "grad_norm": 0.2396395057439804, + "learning_rate": 2.9757359280699592e-05, + "loss": 0.1559, + "step": 6916 + }, + { + "epoch": 2.5542836041358936, + "grad_norm": 0.30098122358322144, + "learning_rate": 2.97327257051361e-05, + "loss": 0.1686, + "step": 6917 + }, + { + "epoch": 2.554652880354505, + "grad_norm": 0.2770708501338959, + "learning_rate": 2.9708092129572608e-05, + "loss": 0.1752, + "step": 6918 + }, + { + "epoch": 2.555022156573117, + "grad_norm": 0.2261212319135666, + "learning_rate": 2.9683458554009112e-05, + "loss": 0.1305, + "step": 6919 + }, + { + "epoch": 2.555391432791728, + "grad_norm": 0.24221518635749817, + "learning_rate": 2.965882497844562e-05, + "loss": 0.1499, + "step": 6920 + }, + { + "epoch": 2.5557607090103396, + "grad_norm": 0.2868638336658478, + "learning_rate": 2.9634191402882128e-05, + "loss": 0.1593, + "step": 6921 + }, + { + "epoch": 2.556129985228951, + "grad_norm": 0.2641688883304596, + "learning_rate": 2.9609557827318636e-05, + "loss": 0.1613, + "step": 6922 + }, + { + "epoch": 2.556499261447563, + "grad_norm": 0.20018653571605682, + "learning_rate": 2.958492425175514e-05, + "loss": 0.1382, + "step": 6923 + }, + { + "epoch": 2.5568685376661744, + "grad_norm": 0.26979711651802063, + "learning_rate": 2.9560290676191648e-05, + "loss": 0.174, + "step": 6924 + }, + { + "epoch": 2.557237813884786, + "grad_norm": 0.2975309491157532, + "learning_rate": 2.9535657100628156e-05, + "loss": 0.1443, + "step": 6925 + }, + { + "epoch": 2.5576070901033976, + "grad_norm": 0.26325398683547974, + "learning_rate": 2.9511023525064664e-05, + "loss": 0.1768, + "step": 6926 + }, + { + "epoch": 2.5579763663220088, + "grad_norm": 0.26743263006210327, + "learning_rate": 2.948638994950117e-05, + "loss": 0.1649, + "step": 6927 + }, + { + "epoch": 2.5583456425406204, + "grad_norm": 0.2848571538925171, + "learning_rate": 2.9461756373937676e-05, + "loss": 0.1596, + "step": 6928 + }, + { + "epoch": 2.558714918759232, + "grad_norm": 0.312482625246048, + "learning_rate": 2.9437122798374184e-05, + "loss": 0.175, + "step": 6929 + }, + { + "epoch": 2.5590841949778436, + "grad_norm": 0.2455243021249771, + "learning_rate": 2.9412489222810692e-05, + "loss": 0.1469, + "step": 6930 + }, + { + "epoch": 2.5594534711964547, + "grad_norm": 0.26892316341400146, + "learning_rate": 2.9387855647247196e-05, + "loss": 0.1683, + "step": 6931 + }, + { + "epoch": 2.5598227474150663, + "grad_norm": 0.2560502588748932, + "learning_rate": 2.9363222071683704e-05, + "loss": 0.1284, + "step": 6932 + }, + { + "epoch": 2.560192023633678, + "grad_norm": 0.2688573896884918, + "learning_rate": 2.9338588496120212e-05, + "loss": 0.1675, + "step": 6933 + }, + { + "epoch": 2.5605612998522895, + "grad_norm": 0.2911495864391327, + "learning_rate": 2.931395492055672e-05, + "loss": 0.1645, + "step": 6934 + }, + { + "epoch": 2.560930576070901, + "grad_norm": 0.23881962895393372, + "learning_rate": 2.9289321344993224e-05, + "loss": 0.1434, + "step": 6935 + }, + { + "epoch": 2.5612998522895127, + "grad_norm": 0.24586613476276398, + "learning_rate": 2.9264687769429732e-05, + "loss": 0.1558, + "step": 6936 + }, + { + "epoch": 2.5616691285081243, + "grad_norm": 0.19620896875858307, + "learning_rate": 2.924005419386624e-05, + "loss": 0.1649, + "step": 6937 + }, + { + "epoch": 2.5620384047267355, + "grad_norm": 0.30448541045188904, + "learning_rate": 2.9215420618302748e-05, + "loss": 0.1771, + "step": 6938 + }, + { + "epoch": 2.562407680945347, + "grad_norm": 0.28403109312057495, + "learning_rate": 2.9190787042739253e-05, + "loss": 0.1546, + "step": 6939 + }, + { + "epoch": 2.5627769571639587, + "grad_norm": 0.22645065188407898, + "learning_rate": 2.916615346717576e-05, + "loss": 0.1651, + "step": 6940 + }, + { + "epoch": 2.5631462333825703, + "grad_norm": 0.24472738802433014, + "learning_rate": 2.9141519891612268e-05, + "loss": 0.1667, + "step": 6941 + }, + { + "epoch": 2.5635155096011815, + "grad_norm": 0.24024857580661774, + "learning_rate": 2.9116886316048776e-05, + "loss": 0.1459, + "step": 6942 + }, + { + "epoch": 2.563884785819793, + "grad_norm": 0.23248101770877838, + "learning_rate": 2.909225274048528e-05, + "loss": 0.1453, + "step": 6943 + }, + { + "epoch": 2.5642540620384047, + "grad_norm": 0.2680393159389496, + "learning_rate": 2.906761916492179e-05, + "loss": 0.162, + "step": 6944 + }, + { + "epoch": 2.5646233382570163, + "grad_norm": 0.24683666229248047, + "learning_rate": 2.9042985589358296e-05, + "loss": 0.1601, + "step": 6945 + }, + { + "epoch": 2.564992614475628, + "grad_norm": 0.22906315326690674, + "learning_rate": 2.9018352013794804e-05, + "loss": 0.1663, + "step": 6946 + }, + { + "epoch": 2.5653618906942395, + "grad_norm": 0.259025901556015, + "learning_rate": 2.899371843823131e-05, + "loss": 0.1525, + "step": 6947 + }, + { + "epoch": 2.565731166912851, + "grad_norm": 0.2644229829311371, + "learning_rate": 2.8969084862667817e-05, + "loss": 0.1557, + "step": 6948 + }, + { + "epoch": 2.5661004431314622, + "grad_norm": 0.2690475881099701, + "learning_rate": 2.8944451287104324e-05, + "loss": 0.1594, + "step": 6949 + }, + { + "epoch": 2.566469719350074, + "grad_norm": 0.31890222430229187, + "learning_rate": 2.8919817711540832e-05, + "loss": 0.1708, + "step": 6950 + }, + { + "epoch": 2.566469719350074, + "eval_loss": 8.846841812133789, + "eval_runtime": 6.9187, + "eval_samples_per_second": 7.227, + "eval_steps_per_second": 1.012, + "step": 6950 + }, + { + "epoch": 2.5668389955686854, + "grad_norm": 0.2752573490142822, + "learning_rate": 2.8895184135977337e-05, + "loss": 0.1579, + "step": 6951 + }, + { + "epoch": 2.567208271787297, + "grad_norm": 0.2522841691970825, + "learning_rate": 2.8870550560413845e-05, + "loss": 0.1566, + "step": 6952 + }, + { + "epoch": 2.567577548005908, + "grad_norm": 0.3026491105556488, + "learning_rate": 2.8845916984850352e-05, + "loss": 0.1775, + "step": 6953 + }, + { + "epoch": 2.56794682422452, + "grad_norm": 0.23503316938877106, + "learning_rate": 2.882128340928686e-05, + "loss": 0.1477, + "step": 6954 + }, + { + "epoch": 2.5683161004431314, + "grad_norm": 0.260810911655426, + "learning_rate": 2.8796649833723365e-05, + "loss": 0.1575, + "step": 6955 + }, + { + "epoch": 2.568685376661743, + "grad_norm": 0.26684099435806274, + "learning_rate": 2.8772016258159873e-05, + "loss": 0.144, + "step": 6956 + }, + { + "epoch": 2.5690546528803546, + "grad_norm": 0.46218356490135193, + "learning_rate": 2.874738268259638e-05, + "loss": 0.1534, + "step": 6957 + }, + { + "epoch": 2.569423929098966, + "grad_norm": 0.2712130844593048, + "learning_rate": 2.872274910703289e-05, + "loss": 0.15, + "step": 6958 + }, + { + "epoch": 2.569793205317578, + "grad_norm": 0.316108763217926, + "learning_rate": 2.8698115531469393e-05, + "loss": 0.2115, + "step": 6959 + }, + { + "epoch": 2.570162481536189, + "grad_norm": 0.2536649703979492, + "learning_rate": 2.86734819559059e-05, + "loss": 0.1709, + "step": 6960 + }, + { + "epoch": 2.5705317577548006, + "grad_norm": 0.24049994349479675, + "learning_rate": 2.864884838034241e-05, + "loss": 0.1471, + "step": 6961 + }, + { + "epoch": 2.570901033973412, + "grad_norm": 0.25325360894203186, + "learning_rate": 2.8624214804778916e-05, + "loss": 0.1407, + "step": 6962 + }, + { + "epoch": 2.571270310192024, + "grad_norm": 0.2575276792049408, + "learning_rate": 2.859958122921542e-05, + "loss": 0.1503, + "step": 6963 + }, + { + "epoch": 2.571639586410635, + "grad_norm": 0.2881489396095276, + "learning_rate": 2.857494765365193e-05, + "loss": 0.1572, + "step": 6964 + }, + { + "epoch": 2.5720088626292466, + "grad_norm": 0.24955743551254272, + "learning_rate": 2.8550314078088437e-05, + "loss": 0.1496, + "step": 6965 + }, + { + "epoch": 2.572378138847858, + "grad_norm": 0.25923117995262146, + "learning_rate": 2.8525680502524945e-05, + "loss": 0.1689, + "step": 6966 + }, + { + "epoch": 2.5727474150664698, + "grad_norm": 0.3004510700702667, + "learning_rate": 2.850104692696145e-05, + "loss": 0.1689, + "step": 6967 + }, + { + "epoch": 2.5731166912850814, + "grad_norm": 0.2406233698129654, + "learning_rate": 2.8476413351397957e-05, + "loss": 0.1357, + "step": 6968 + }, + { + "epoch": 2.573485967503693, + "grad_norm": 0.2545103132724762, + "learning_rate": 2.8451779775834465e-05, + "loss": 0.1723, + "step": 6969 + }, + { + "epoch": 2.573855243722304, + "grad_norm": 0.2670483887195587, + "learning_rate": 2.8427146200270973e-05, + "loss": 0.1625, + "step": 6970 + }, + { + "epoch": 2.5742245199409157, + "grad_norm": 0.2389604151248932, + "learning_rate": 2.8402512624707477e-05, + "loss": 0.1563, + "step": 6971 + }, + { + "epoch": 2.5745937961595273, + "grad_norm": 0.2561827600002289, + "learning_rate": 2.8377879049143985e-05, + "loss": 0.161, + "step": 6972 + }, + { + "epoch": 2.574963072378139, + "grad_norm": 0.24164538085460663, + "learning_rate": 2.8353245473580493e-05, + "loss": 0.1529, + "step": 6973 + }, + { + "epoch": 2.5753323485967505, + "grad_norm": 0.24538809061050415, + "learning_rate": 2.8328611898017e-05, + "loss": 0.151, + "step": 6974 + }, + { + "epoch": 2.5757016248153617, + "grad_norm": 0.21176253259181976, + "learning_rate": 2.8303978322453505e-05, + "loss": 0.1523, + "step": 6975 + }, + { + "epoch": 2.5760709010339733, + "grad_norm": 0.23087479174137115, + "learning_rate": 2.8279344746890013e-05, + "loss": 0.1602, + "step": 6976 + }, + { + "epoch": 2.576440177252585, + "grad_norm": 0.26039305329322815, + "learning_rate": 2.825471117132652e-05, + "loss": 0.1555, + "step": 6977 + }, + { + "epoch": 2.5768094534711965, + "grad_norm": 0.25466832518577576, + "learning_rate": 2.8230077595763025e-05, + "loss": 0.1416, + "step": 6978 + }, + { + "epoch": 2.577178729689808, + "grad_norm": 0.2453896552324295, + "learning_rate": 2.8205444020199533e-05, + "loss": 0.1472, + "step": 6979 + }, + { + "epoch": 2.5775480059084197, + "grad_norm": 0.280854731798172, + "learning_rate": 2.818081044463604e-05, + "loss": 0.1637, + "step": 6980 + }, + { + "epoch": 2.577917282127031, + "grad_norm": 0.30829083919525146, + "learning_rate": 2.815617686907255e-05, + "loss": 0.1618, + "step": 6981 + }, + { + "epoch": 2.5782865583456425, + "grad_norm": 0.2752147614955902, + "learning_rate": 2.8131543293509053e-05, + "loss": 0.1526, + "step": 6982 + }, + { + "epoch": 2.578655834564254, + "grad_norm": 0.28210029006004333, + "learning_rate": 2.810690971794556e-05, + "loss": 0.1459, + "step": 6983 + }, + { + "epoch": 2.5790251107828657, + "grad_norm": 0.2649667263031006, + "learning_rate": 2.808227614238207e-05, + "loss": 0.1544, + "step": 6984 + }, + { + "epoch": 2.579394387001477, + "grad_norm": 0.279638409614563, + "learning_rate": 2.8057642566818577e-05, + "loss": 0.144, + "step": 6985 + }, + { + "epoch": 2.5797636632200884, + "grad_norm": 0.24743255972862244, + "learning_rate": 2.803300899125508e-05, + "loss": 0.1504, + "step": 6986 + }, + { + "epoch": 2.5801329394387, + "grad_norm": 0.2615189850330353, + "learning_rate": 2.800837541569159e-05, + "loss": 0.1598, + "step": 6987 + }, + { + "epoch": 2.5805022156573116, + "grad_norm": 0.2658248543739319, + "learning_rate": 2.7983741840128097e-05, + "loss": 0.1512, + "step": 6988 + }, + { + "epoch": 2.5808714918759232, + "grad_norm": 0.29861173033714294, + "learning_rate": 2.7959108264564605e-05, + "loss": 0.1926, + "step": 6989 + }, + { + "epoch": 2.581240768094535, + "grad_norm": 0.2611427307128906, + "learning_rate": 2.793447468900111e-05, + "loss": 0.1637, + "step": 6990 + }, + { + "epoch": 2.5816100443131464, + "grad_norm": 0.23956765234470367, + "learning_rate": 2.7909841113437617e-05, + "loss": 0.1535, + "step": 6991 + }, + { + "epoch": 2.5819793205317576, + "grad_norm": 0.2620432674884796, + "learning_rate": 2.7885207537874125e-05, + "loss": 0.1393, + "step": 6992 + }, + { + "epoch": 2.582348596750369, + "grad_norm": 0.2682763636112213, + "learning_rate": 2.7860573962310633e-05, + "loss": 0.1883, + "step": 6993 + }, + { + "epoch": 2.582717872968981, + "grad_norm": 0.24090777337551117, + "learning_rate": 2.7835940386747138e-05, + "loss": 0.165, + "step": 6994 + }, + { + "epoch": 2.5830871491875924, + "grad_norm": 0.27259185910224915, + "learning_rate": 2.7811306811183645e-05, + "loss": 0.1507, + "step": 6995 + }, + { + "epoch": 2.5834564254062036, + "grad_norm": 0.2761384844779968, + "learning_rate": 2.7786673235620153e-05, + "loss": 0.1648, + "step": 6996 + }, + { + "epoch": 2.583825701624815, + "grad_norm": 0.2621285021305084, + "learning_rate": 2.776203966005666e-05, + "loss": 0.1475, + "step": 6997 + }, + { + "epoch": 2.5841949778434268, + "grad_norm": 0.27943727374076843, + "learning_rate": 2.7737406084493166e-05, + "loss": 0.1473, + "step": 6998 + }, + { + "epoch": 2.5845642540620384, + "grad_norm": 0.27507519721984863, + "learning_rate": 2.7712772508929674e-05, + "loss": 0.1659, + "step": 6999 + }, + { + "epoch": 2.58493353028065, + "grad_norm": 0.22033503651618958, + "learning_rate": 2.768813893336618e-05, + "loss": 0.1405, + "step": 7000 + }, + { + "epoch": 2.58493353028065, + "eval_loss": 8.878762245178223, + "eval_runtime": 6.9137, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 1.012, + "step": 7000 + }, + { + "epoch": 2.5853028064992616, + "grad_norm": 0.21761852502822876, + "learning_rate": 2.766350535780269e-05, + "loss": 0.1248, + "step": 7001 + }, + { + "epoch": 2.585672082717873, + "grad_norm": 0.24824120104312897, + "learning_rate": 2.7638871782239194e-05, + "loss": 0.1425, + "step": 7002 + }, + { + "epoch": 2.5860413589364843, + "grad_norm": 0.23599402606487274, + "learning_rate": 2.76142382066757e-05, + "loss": 0.1621, + "step": 7003 + }, + { + "epoch": 2.586410635155096, + "grad_norm": 0.29255110025405884, + "learning_rate": 2.758960463111221e-05, + "loss": 0.1708, + "step": 7004 + }, + { + "epoch": 2.5867799113737076, + "grad_norm": 0.27694687247276306, + "learning_rate": 2.7564971055548717e-05, + "loss": 0.1548, + "step": 7005 + }, + { + "epoch": 2.587149187592319, + "grad_norm": 0.27357783913612366, + "learning_rate": 2.7540337479985222e-05, + "loss": 0.1693, + "step": 7006 + }, + { + "epoch": 2.5875184638109303, + "grad_norm": 0.24329516291618347, + "learning_rate": 2.751570390442173e-05, + "loss": 0.1547, + "step": 7007 + }, + { + "epoch": 2.587887740029542, + "grad_norm": 0.2939443588256836, + "learning_rate": 2.7491070328858238e-05, + "loss": 0.1659, + "step": 7008 + }, + { + "epoch": 2.5882570162481535, + "grad_norm": 0.2561970055103302, + "learning_rate": 2.7466436753294745e-05, + "loss": 0.1476, + "step": 7009 + }, + { + "epoch": 2.588626292466765, + "grad_norm": 0.24797339737415314, + "learning_rate": 2.744180317773125e-05, + "loss": 0.1978, + "step": 7010 + }, + { + "epoch": 2.5889955686853767, + "grad_norm": 0.23422382771968842, + "learning_rate": 2.7417169602167758e-05, + "loss": 0.1448, + "step": 7011 + }, + { + "epoch": 2.5893648449039883, + "grad_norm": 0.29002249240875244, + "learning_rate": 2.7392536026604266e-05, + "loss": 0.1505, + "step": 7012 + }, + { + "epoch": 2.5897341211226, + "grad_norm": 0.2879321277141571, + "learning_rate": 2.7367902451040773e-05, + "loss": 0.1574, + "step": 7013 + }, + { + "epoch": 2.590103397341211, + "grad_norm": 0.3601919412612915, + "learning_rate": 2.7343268875477278e-05, + "loss": 0.1531, + "step": 7014 + }, + { + "epoch": 2.5904726735598227, + "grad_norm": 0.26962071657180786, + "learning_rate": 2.7318635299913786e-05, + "loss": 0.163, + "step": 7015 + }, + { + "epoch": 2.5908419497784343, + "grad_norm": 0.2307303249835968, + "learning_rate": 2.7294001724350294e-05, + "loss": 0.1507, + "step": 7016 + }, + { + "epoch": 2.591211225997046, + "grad_norm": 0.3000551164150238, + "learning_rate": 2.72693681487868e-05, + "loss": 0.1823, + "step": 7017 + }, + { + "epoch": 2.591580502215657, + "grad_norm": 0.27390074729919434, + "learning_rate": 2.7244734573223306e-05, + "loss": 0.1575, + "step": 7018 + }, + { + "epoch": 2.5919497784342687, + "grad_norm": 0.29743772745132446, + "learning_rate": 2.7220100997659814e-05, + "loss": 0.1874, + "step": 7019 + }, + { + "epoch": 2.5923190546528803, + "grad_norm": 0.2229994684457779, + "learning_rate": 2.7195467422096322e-05, + "loss": 0.1429, + "step": 7020 + }, + { + "epoch": 2.592688330871492, + "grad_norm": 0.2708507180213928, + "learning_rate": 2.717083384653283e-05, + "loss": 0.1621, + "step": 7021 + }, + { + "epoch": 2.5930576070901035, + "grad_norm": 0.29748401045799255, + "learning_rate": 2.7146200270969334e-05, + "loss": 0.1572, + "step": 7022 + }, + { + "epoch": 2.593426883308715, + "grad_norm": 0.24034476280212402, + "learning_rate": 2.7121566695405835e-05, + "loss": 0.1545, + "step": 7023 + }, + { + "epoch": 2.5937961595273267, + "grad_norm": 0.24706235527992249, + "learning_rate": 2.7096933119842343e-05, + "loss": 0.1314, + "step": 7024 + }, + { + "epoch": 2.594165435745938, + "grad_norm": 0.258064329624176, + "learning_rate": 2.707229954427885e-05, + "loss": 0.1542, + "step": 7025 + }, + { + "epoch": 2.5945347119645494, + "grad_norm": 0.23822395503520966, + "learning_rate": 2.704766596871536e-05, + "loss": 0.1549, + "step": 7026 + }, + { + "epoch": 2.594903988183161, + "grad_norm": 0.23765696585178375, + "learning_rate": 2.7023032393151863e-05, + "loss": 0.1434, + "step": 7027 + }, + { + "epoch": 2.5952732644017726, + "grad_norm": 0.2565697133541107, + "learning_rate": 2.699839881758837e-05, + "loss": 0.1619, + "step": 7028 + }, + { + "epoch": 2.595642540620384, + "grad_norm": 0.3135586977005005, + "learning_rate": 2.697376524202488e-05, + "loss": 0.1463, + "step": 7029 + }, + { + "epoch": 2.5960118168389954, + "grad_norm": 0.27200618386268616, + "learning_rate": 2.6949131666461387e-05, + "loss": 0.1613, + "step": 7030 + }, + { + "epoch": 2.596381093057607, + "grad_norm": 0.30012962222099304, + "learning_rate": 2.692449809089789e-05, + "loss": 0.1732, + "step": 7031 + }, + { + "epoch": 2.5967503692762186, + "grad_norm": 0.27331098914146423, + "learning_rate": 2.68998645153344e-05, + "loss": 0.1394, + "step": 7032 + }, + { + "epoch": 2.59711964549483, + "grad_norm": 0.25347280502319336, + "learning_rate": 2.6875230939770907e-05, + "loss": 0.1466, + "step": 7033 + }, + { + "epoch": 2.597488921713442, + "grad_norm": 0.2999386191368103, + "learning_rate": 2.6850597364207415e-05, + "loss": 0.1649, + "step": 7034 + }, + { + "epoch": 2.5978581979320534, + "grad_norm": 0.241125226020813, + "learning_rate": 2.682596378864392e-05, + "loss": 0.1396, + "step": 7035 + }, + { + "epoch": 2.5982274741506646, + "grad_norm": 0.255249947309494, + "learning_rate": 2.6801330213080427e-05, + "loss": 0.1683, + "step": 7036 + }, + { + "epoch": 2.598596750369276, + "grad_norm": 0.35444656014442444, + "learning_rate": 2.6776696637516935e-05, + "loss": 0.1824, + "step": 7037 + }, + { + "epoch": 2.598966026587888, + "grad_norm": 0.2855187654495239, + "learning_rate": 2.6752063061953443e-05, + "loss": 0.1681, + "step": 7038 + }, + { + "epoch": 2.5993353028064994, + "grad_norm": 0.25417065620422363, + "learning_rate": 2.6727429486389947e-05, + "loss": 0.1285, + "step": 7039 + }, + { + "epoch": 2.5997045790251105, + "grad_norm": 0.27841001749038696, + "learning_rate": 2.6702795910826455e-05, + "loss": 0.162, + "step": 7040 + }, + { + "epoch": 2.600073855243722, + "grad_norm": 0.2910228967666626, + "learning_rate": 2.6678162335262963e-05, + "loss": 0.1833, + "step": 7041 + }, + { + "epoch": 2.6004431314623337, + "grad_norm": 0.2821456789970398, + "learning_rate": 2.665352875969947e-05, + "loss": 0.1614, + "step": 7042 + }, + { + "epoch": 2.6008124076809453, + "grad_norm": 0.2879747450351715, + "learning_rate": 2.6628895184135975e-05, + "loss": 0.1614, + "step": 7043 + }, + { + "epoch": 2.601181683899557, + "grad_norm": 0.25978267192840576, + "learning_rate": 2.6604261608572483e-05, + "loss": 0.144, + "step": 7044 + }, + { + "epoch": 2.6015509601181686, + "grad_norm": 0.281252384185791, + "learning_rate": 2.657962803300899e-05, + "loss": 0.136, + "step": 7045 + }, + { + "epoch": 2.60192023633678, + "grad_norm": 0.31845033168792725, + "learning_rate": 2.65549944574455e-05, + "loss": 0.1824, + "step": 7046 + }, + { + "epoch": 2.6022895125553913, + "grad_norm": 0.289420485496521, + "learning_rate": 2.6530360881882004e-05, + "loss": 0.1751, + "step": 7047 + }, + { + "epoch": 2.602658788774003, + "grad_norm": 0.2504969537258148, + "learning_rate": 2.650572730631851e-05, + "loss": 0.1597, + "step": 7048 + }, + { + "epoch": 2.6030280649926145, + "grad_norm": 0.2578841745853424, + "learning_rate": 2.648109373075502e-05, + "loss": 0.1472, + "step": 7049 + }, + { + "epoch": 2.603397341211226, + "grad_norm": 0.26248040795326233, + "learning_rate": 2.6456460155191527e-05, + "loss": 0.1678, + "step": 7050 + }, + { + "epoch": 2.603397341211226, + "eval_loss": 8.843896865844727, + "eval_runtime": 6.9127, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 1.013, + "step": 7050 + }, + { + "epoch": 2.6037666174298373, + "grad_norm": 0.2729944884777069, + "learning_rate": 2.643182657962803e-05, + "loss": 0.1529, + "step": 7051 + }, + { + "epoch": 2.604135893648449, + "grad_norm": 0.24332155287265778, + "learning_rate": 2.640719300406454e-05, + "loss": 0.1617, + "step": 7052 + }, + { + "epoch": 2.6045051698670605, + "grad_norm": 0.2626761198043823, + "learning_rate": 2.6382559428501047e-05, + "loss": 0.1411, + "step": 7053 + }, + { + "epoch": 2.604874446085672, + "grad_norm": 0.29778847098350525, + "learning_rate": 2.6357925852937555e-05, + "loss": 0.1944, + "step": 7054 + }, + { + "epoch": 2.6052437223042837, + "grad_norm": 0.274544894695282, + "learning_rate": 2.633329227737406e-05, + "loss": 0.1636, + "step": 7055 + }, + { + "epoch": 2.6056129985228953, + "grad_norm": 0.3099652826786041, + "learning_rate": 2.6308658701810568e-05, + "loss": 0.1569, + "step": 7056 + }, + { + "epoch": 2.605982274741507, + "grad_norm": 0.258964866399765, + "learning_rate": 2.6284025126247075e-05, + "loss": 0.1667, + "step": 7057 + }, + { + "epoch": 2.606351550960118, + "grad_norm": 0.25549083948135376, + "learning_rate": 2.6259391550683583e-05, + "loss": 0.159, + "step": 7058 + }, + { + "epoch": 2.6067208271787297, + "grad_norm": 0.24773307144641876, + "learning_rate": 2.6234757975120088e-05, + "loss": 0.1712, + "step": 7059 + }, + { + "epoch": 2.6070901033973413, + "grad_norm": 0.26214468479156494, + "learning_rate": 2.6210124399556596e-05, + "loss": 0.1482, + "step": 7060 + }, + { + "epoch": 2.607459379615953, + "grad_norm": 0.2931172847747803, + "learning_rate": 2.6185490823993103e-05, + "loss": 0.1374, + "step": 7061 + }, + { + "epoch": 2.607828655834564, + "grad_norm": 0.2331322580575943, + "learning_rate": 2.616085724842961e-05, + "loss": 0.1626, + "step": 7062 + }, + { + "epoch": 2.6081979320531756, + "grad_norm": 0.2185620218515396, + "learning_rate": 2.6136223672866116e-05, + "loss": 0.1268, + "step": 7063 + }, + { + "epoch": 2.6085672082717872, + "grad_norm": 0.27303072810173035, + "learning_rate": 2.6111590097302624e-05, + "loss": 0.1565, + "step": 7064 + }, + { + "epoch": 2.608936484490399, + "grad_norm": 0.24687299132347107, + "learning_rate": 2.608695652173913e-05, + "loss": 0.1643, + "step": 7065 + }, + { + "epoch": 2.6093057607090104, + "grad_norm": 0.26658719778060913, + "learning_rate": 2.606232294617564e-05, + "loss": 0.1614, + "step": 7066 + }, + { + "epoch": 2.609675036927622, + "grad_norm": 0.23379723727703094, + "learning_rate": 2.6037689370612144e-05, + "loss": 0.1667, + "step": 7067 + }, + { + "epoch": 2.6100443131462336, + "grad_norm": 0.36649322509765625, + "learning_rate": 2.6013055795048652e-05, + "loss": 0.2077, + "step": 7068 + }, + { + "epoch": 2.610413589364845, + "grad_norm": 0.2951618432998657, + "learning_rate": 2.598842221948516e-05, + "loss": 0.1672, + "step": 7069 + }, + { + "epoch": 2.6107828655834564, + "grad_norm": 0.3540022671222687, + "learning_rate": 2.5963788643921667e-05, + "loss": 0.1473, + "step": 7070 + }, + { + "epoch": 2.611152141802068, + "grad_norm": 0.3307909369468689, + "learning_rate": 2.5939155068358172e-05, + "loss": 0.2021, + "step": 7071 + }, + { + "epoch": 2.6115214180206796, + "grad_norm": 0.29840707778930664, + "learning_rate": 2.591452149279468e-05, + "loss": 0.1798, + "step": 7072 + }, + { + "epoch": 2.6118906942392908, + "grad_norm": 0.27899643778800964, + "learning_rate": 2.5889887917231188e-05, + "loss": 0.1704, + "step": 7073 + }, + { + "epoch": 2.6122599704579024, + "grad_norm": 0.261625736951828, + "learning_rate": 2.5865254341667696e-05, + "loss": 0.1439, + "step": 7074 + }, + { + "epoch": 2.612629246676514, + "grad_norm": 0.24923984706401825, + "learning_rate": 2.58406207661042e-05, + "loss": 0.1737, + "step": 7075 + }, + { + "epoch": 2.6129985228951256, + "grad_norm": 0.22461603581905365, + "learning_rate": 2.5815987190540708e-05, + "loss": 0.1577, + "step": 7076 + }, + { + "epoch": 2.613367799113737, + "grad_norm": 0.26391854882240295, + "learning_rate": 2.5791353614977216e-05, + "loss": 0.1498, + "step": 7077 + }, + { + "epoch": 2.613737075332349, + "grad_norm": 0.28709402680397034, + "learning_rate": 2.5766720039413724e-05, + "loss": 0.1854, + "step": 7078 + }, + { + "epoch": 2.6141063515509604, + "grad_norm": 0.27425602078437805, + "learning_rate": 2.5742086463850228e-05, + "loss": 0.1523, + "step": 7079 + }, + { + "epoch": 2.6144756277695715, + "grad_norm": 0.2423887997865677, + "learning_rate": 2.5717452888286736e-05, + "loss": 0.1503, + "step": 7080 + }, + { + "epoch": 2.614844903988183, + "grad_norm": 0.2716057300567627, + "learning_rate": 2.5692819312723244e-05, + "loss": 0.1655, + "step": 7081 + }, + { + "epoch": 2.6152141802067947, + "grad_norm": 0.2524002194404602, + "learning_rate": 2.566818573715975e-05, + "loss": 0.17, + "step": 7082 + }, + { + "epoch": 2.6155834564254064, + "grad_norm": 0.26297226548194885, + "learning_rate": 2.5643552161596256e-05, + "loss": 0.1456, + "step": 7083 + }, + { + "epoch": 2.6159527326440175, + "grad_norm": 0.21919339895248413, + "learning_rate": 2.5618918586032764e-05, + "loss": 0.1347, + "step": 7084 + }, + { + "epoch": 2.616322008862629, + "grad_norm": 0.29301899671554565, + "learning_rate": 2.5594285010469272e-05, + "loss": 0.1924, + "step": 7085 + }, + { + "epoch": 2.6166912850812407, + "grad_norm": 0.2823084592819214, + "learning_rate": 2.556965143490578e-05, + "loss": 0.1721, + "step": 7086 + }, + { + "epoch": 2.6170605612998523, + "grad_norm": 0.31265807151794434, + "learning_rate": 2.5545017859342284e-05, + "loss": 0.1472, + "step": 7087 + }, + { + "epoch": 2.617429837518464, + "grad_norm": 0.29546868801116943, + "learning_rate": 2.5520384283778792e-05, + "loss": 0.172, + "step": 7088 + }, + { + "epoch": 2.6177991137370755, + "grad_norm": 0.27220213413238525, + "learning_rate": 2.54957507082153e-05, + "loss": 0.1562, + "step": 7089 + }, + { + "epoch": 2.618168389955687, + "grad_norm": 0.21157190203666687, + "learning_rate": 2.5471117132651808e-05, + "loss": 0.1271, + "step": 7090 + }, + { + "epoch": 2.6185376661742983, + "grad_norm": 0.2235511988401413, + "learning_rate": 2.5446483557088312e-05, + "loss": 0.1275, + "step": 7091 + }, + { + "epoch": 2.61890694239291, + "grad_norm": 0.2530994415283203, + "learning_rate": 2.542184998152482e-05, + "loss": 0.1659, + "step": 7092 + }, + { + "epoch": 2.6192762186115215, + "grad_norm": 0.24295057356357574, + "learning_rate": 2.5397216405961328e-05, + "loss": 0.1589, + "step": 7093 + }, + { + "epoch": 2.619645494830133, + "grad_norm": 0.27022114396095276, + "learning_rate": 2.5372582830397832e-05, + "loss": 0.1811, + "step": 7094 + }, + { + "epoch": 2.6200147710487443, + "grad_norm": 0.2586112916469574, + "learning_rate": 2.534794925483434e-05, + "loss": 0.148, + "step": 7095 + }, + { + "epoch": 2.620384047267356, + "grad_norm": 0.33487239480018616, + "learning_rate": 2.5323315679270848e-05, + "loss": 0.1896, + "step": 7096 + }, + { + "epoch": 2.6207533234859675, + "grad_norm": 0.22621877491474152, + "learning_rate": 2.5298682103707356e-05, + "loss": 0.1282, + "step": 7097 + }, + { + "epoch": 2.621122599704579, + "grad_norm": 0.2808326184749603, + "learning_rate": 2.527404852814386e-05, + "loss": 0.148, + "step": 7098 + }, + { + "epoch": 2.6214918759231907, + "grad_norm": 0.2556298077106476, + "learning_rate": 2.524941495258037e-05, + "loss": 0.1717, + "step": 7099 + }, + { + "epoch": 2.6218611521418023, + "grad_norm": 0.22629734873771667, + "learning_rate": 2.5224781377016876e-05, + "loss": 0.1552, + "step": 7100 + }, + { + "epoch": 2.6218611521418023, + "eval_loss": 8.866128921508789, + "eval_runtime": 6.9116, + "eval_samples_per_second": 7.234, + "eval_steps_per_second": 1.013, + "step": 7100 + }, + { + "epoch": 2.6222304283604134, + "grad_norm": 0.2279207408428192, + "learning_rate": 2.5200147801453384e-05, + "loss": 0.15, + "step": 7101 + }, + { + "epoch": 2.622599704579025, + "grad_norm": 0.2539941668510437, + "learning_rate": 2.517551422588989e-05, + "loss": 0.1553, + "step": 7102 + }, + { + "epoch": 2.6229689807976366, + "grad_norm": 0.247309610247612, + "learning_rate": 2.5150880650326396e-05, + "loss": 0.1497, + "step": 7103 + }, + { + "epoch": 2.6233382570162482, + "grad_norm": 0.2931831181049347, + "learning_rate": 2.5126247074762904e-05, + "loss": 0.1544, + "step": 7104 + }, + { + "epoch": 2.62370753323486, + "grad_norm": 0.26418307423591614, + "learning_rate": 2.5101613499199412e-05, + "loss": 0.1638, + "step": 7105 + }, + { + "epoch": 2.624076809453471, + "grad_norm": 0.2591760754585266, + "learning_rate": 2.5076979923635917e-05, + "loss": 0.1646, + "step": 7106 + }, + { + "epoch": 2.6244460856720826, + "grad_norm": 0.27333158254623413, + "learning_rate": 2.5052346348072425e-05, + "loss": 0.1377, + "step": 7107 + }, + { + "epoch": 2.624815361890694, + "grad_norm": 0.2400055080652237, + "learning_rate": 2.5027712772508932e-05, + "loss": 0.1692, + "step": 7108 + }, + { + "epoch": 2.625184638109306, + "grad_norm": 0.20605552196502686, + "learning_rate": 2.500307919694544e-05, + "loss": 0.1543, + "step": 7109 + }, + { + "epoch": 2.6255539143279174, + "grad_norm": 0.24791140854358673, + "learning_rate": 2.4978445621381945e-05, + "loss": 0.1374, + "step": 7110 + }, + { + "epoch": 2.625923190546529, + "grad_norm": 0.3039201498031616, + "learning_rate": 2.495381204581845e-05, + "loss": 0.1759, + "step": 7111 + }, + { + "epoch": 2.62629246676514, + "grad_norm": 0.25907498598098755, + "learning_rate": 2.4929178470254957e-05, + "loss": 0.1413, + "step": 7112 + }, + { + "epoch": 2.6266617429837518, + "grad_norm": 0.2713993787765503, + "learning_rate": 2.4904544894691465e-05, + "loss": 0.1779, + "step": 7113 + }, + { + "epoch": 2.6270310192023634, + "grad_norm": 0.2473960667848587, + "learning_rate": 2.4879911319127973e-05, + "loss": 0.164, + "step": 7114 + }, + { + "epoch": 2.627400295420975, + "grad_norm": 0.32699617743492126, + "learning_rate": 2.4855277743564477e-05, + "loss": 0.2088, + "step": 7115 + }, + { + "epoch": 2.6277695716395866, + "grad_norm": 0.24713794887065887, + "learning_rate": 2.4830644168000985e-05, + "loss": 0.163, + "step": 7116 + }, + { + "epoch": 2.6281388478581977, + "grad_norm": 0.25837570428848267, + "learning_rate": 2.4806010592437493e-05, + "loss": 0.1506, + "step": 7117 + }, + { + "epoch": 2.6285081240768093, + "grad_norm": 0.27124252915382385, + "learning_rate": 2.4781377016874e-05, + "loss": 0.1441, + "step": 7118 + }, + { + "epoch": 2.628877400295421, + "grad_norm": 0.2625406086444855, + "learning_rate": 2.4756743441310505e-05, + "loss": 0.1465, + "step": 7119 + }, + { + "epoch": 2.6292466765140325, + "grad_norm": 0.24405086040496826, + "learning_rate": 2.4732109865747013e-05, + "loss": 0.1784, + "step": 7120 + }, + { + "epoch": 2.629615952732644, + "grad_norm": 0.2646090090274811, + "learning_rate": 2.470747629018352e-05, + "loss": 0.1582, + "step": 7121 + }, + { + "epoch": 2.6299852289512557, + "grad_norm": 0.26583191752433777, + "learning_rate": 2.468284271462003e-05, + "loss": 0.1593, + "step": 7122 + }, + { + "epoch": 2.630354505169867, + "grad_norm": 0.285787969827652, + "learning_rate": 2.4658209139056533e-05, + "loss": 0.1446, + "step": 7123 + }, + { + "epoch": 2.6307237813884785, + "grad_norm": 0.22197078168392181, + "learning_rate": 2.463357556349304e-05, + "loss": 0.1374, + "step": 7124 + }, + { + "epoch": 2.63109305760709, + "grad_norm": 0.23562760651111603, + "learning_rate": 2.460894198792955e-05, + "loss": 0.1294, + "step": 7125 + }, + { + "epoch": 2.6314623338257017, + "grad_norm": 0.29790398478507996, + "learning_rate": 2.4584308412366057e-05, + "loss": 0.1454, + "step": 7126 + }, + { + "epoch": 2.631831610044313, + "grad_norm": 0.28301894664764404, + "learning_rate": 2.455967483680256e-05, + "loss": 0.1962, + "step": 7127 + }, + { + "epoch": 2.6322008862629245, + "grad_norm": 0.2764492332935333, + "learning_rate": 2.453504126123907e-05, + "loss": 0.1461, + "step": 7128 + }, + { + "epoch": 2.632570162481536, + "grad_norm": 0.3701415956020355, + "learning_rate": 2.4510407685675577e-05, + "loss": 0.1704, + "step": 7129 + }, + { + "epoch": 2.6329394387001477, + "grad_norm": 0.28814417123794556, + "learning_rate": 2.4485774110112085e-05, + "loss": 0.1846, + "step": 7130 + }, + { + "epoch": 2.6333087149187593, + "grad_norm": 0.22481834888458252, + "learning_rate": 2.446114053454859e-05, + "loss": 0.1449, + "step": 7131 + }, + { + "epoch": 2.633677991137371, + "grad_norm": 0.3016885221004486, + "learning_rate": 2.4436506958985097e-05, + "loss": 0.1626, + "step": 7132 + }, + { + "epoch": 2.6340472673559825, + "grad_norm": 0.25806280970573425, + "learning_rate": 2.4411873383421605e-05, + "loss": 0.1635, + "step": 7133 + }, + { + "epoch": 2.6344165435745936, + "grad_norm": 0.2477843016386032, + "learning_rate": 2.4387239807858113e-05, + "loss": 0.1745, + "step": 7134 + }, + { + "epoch": 2.6347858197932053, + "grad_norm": 0.25511592626571655, + "learning_rate": 2.4362606232294618e-05, + "loss": 0.1608, + "step": 7135 + }, + { + "epoch": 2.635155096011817, + "grad_norm": 0.2669403851032257, + "learning_rate": 2.4337972656731125e-05, + "loss": 0.1534, + "step": 7136 + }, + { + "epoch": 2.6355243722304285, + "grad_norm": 0.2959480583667755, + "learning_rate": 2.4313339081167633e-05, + "loss": 0.1669, + "step": 7137 + }, + { + "epoch": 2.6358936484490396, + "grad_norm": 0.23895889520645142, + "learning_rate": 2.428870550560414e-05, + "loss": 0.1586, + "step": 7138 + }, + { + "epoch": 2.636262924667651, + "grad_norm": 0.26087817549705505, + "learning_rate": 2.4264071930040646e-05, + "loss": 0.1618, + "step": 7139 + }, + { + "epoch": 2.636632200886263, + "grad_norm": 0.24142244458198547, + "learning_rate": 2.4239438354477154e-05, + "loss": 0.1483, + "step": 7140 + }, + { + "epoch": 2.6370014771048744, + "grad_norm": 0.23019300401210785, + "learning_rate": 2.421480477891366e-05, + "loss": 0.1699, + "step": 7141 + }, + { + "epoch": 2.637370753323486, + "grad_norm": 0.24477308988571167, + "learning_rate": 2.419017120335017e-05, + "loss": 0.1689, + "step": 7142 + }, + { + "epoch": 2.6377400295420976, + "grad_norm": 0.2647544741630554, + "learning_rate": 2.4165537627786674e-05, + "loss": 0.187, + "step": 7143 + }, + { + "epoch": 2.6381093057607092, + "grad_norm": 0.2613615095615387, + "learning_rate": 2.414090405222318e-05, + "loss": 0.1613, + "step": 7144 + }, + { + "epoch": 2.6384785819793204, + "grad_norm": 0.2559479773044586, + "learning_rate": 2.411627047665969e-05, + "loss": 0.1789, + "step": 7145 + }, + { + "epoch": 2.638847858197932, + "grad_norm": 0.28694236278533936, + "learning_rate": 2.4091636901096197e-05, + "loss": 0.1591, + "step": 7146 + }, + { + "epoch": 2.6392171344165436, + "grad_norm": 0.2477453202009201, + "learning_rate": 2.4067003325532702e-05, + "loss": 0.1484, + "step": 7147 + }, + { + "epoch": 2.639586410635155, + "grad_norm": 0.24981501698493958, + "learning_rate": 2.404236974996921e-05, + "loss": 0.1633, + "step": 7148 + }, + { + "epoch": 2.6399556868537664, + "grad_norm": 0.20641064643859863, + "learning_rate": 2.4017736174405718e-05, + "loss": 0.1356, + "step": 7149 + }, + { + "epoch": 2.640324963072378, + "grad_norm": 0.2581598162651062, + "learning_rate": 2.3993102598842222e-05, + "loss": 0.179, + "step": 7150 + }, + { + "epoch": 2.640324963072378, + "eval_loss": 8.853588104248047, + "eval_runtime": 6.9056, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 1.014, + "step": 7150 + }, + { + "epoch": 2.6406942392909896, + "grad_norm": 0.2390914410352707, + "learning_rate": 2.396846902327873e-05, + "loss": 0.1574, + "step": 7151 + }, + { + "epoch": 2.641063515509601, + "grad_norm": 0.25248903036117554, + "learning_rate": 2.3943835447715238e-05, + "loss": 0.1798, + "step": 7152 + }, + { + "epoch": 2.6414327917282128, + "grad_norm": 0.2654764652252197, + "learning_rate": 2.3919201872151746e-05, + "loss": 0.1272, + "step": 7153 + }, + { + "epoch": 2.6418020679468244, + "grad_norm": 0.2501949965953827, + "learning_rate": 2.389456829658825e-05, + "loss": 0.1494, + "step": 7154 + }, + { + "epoch": 2.642171344165436, + "grad_norm": 0.28131425380706787, + "learning_rate": 2.3869934721024758e-05, + "loss": 0.1663, + "step": 7155 + }, + { + "epoch": 2.642540620384047, + "grad_norm": 0.27217432856559753, + "learning_rate": 2.3845301145461266e-05, + "loss": 0.1685, + "step": 7156 + }, + { + "epoch": 2.6429098966026587, + "grad_norm": 0.24578003585338593, + "learning_rate": 2.3820667569897774e-05, + "loss": 0.1593, + "step": 7157 + }, + { + "epoch": 2.6432791728212703, + "grad_norm": 0.21168336272239685, + "learning_rate": 2.3796033994334278e-05, + "loss": 0.1343, + "step": 7158 + }, + { + "epoch": 2.643648449039882, + "grad_norm": 0.33981403708457947, + "learning_rate": 2.3771400418770786e-05, + "loss": 0.1914, + "step": 7159 + }, + { + "epoch": 2.644017725258493, + "grad_norm": 0.22811926901340485, + "learning_rate": 2.3746766843207294e-05, + "loss": 0.156, + "step": 7160 + }, + { + "epoch": 2.6443870014771047, + "grad_norm": 0.3153007924556732, + "learning_rate": 2.3722133267643802e-05, + "loss": 0.1993, + "step": 7161 + }, + { + "epoch": 2.6447562776957163, + "grad_norm": 0.24859854578971863, + "learning_rate": 2.3697499692080306e-05, + "loss": 0.1747, + "step": 7162 + }, + { + "epoch": 2.645125553914328, + "grad_norm": 0.20079587399959564, + "learning_rate": 2.3672866116516814e-05, + "loss": 0.1219, + "step": 7163 + }, + { + "epoch": 2.6454948301329395, + "grad_norm": 0.2523827850818634, + "learning_rate": 2.3648232540953322e-05, + "loss": 0.1712, + "step": 7164 + }, + { + "epoch": 2.645864106351551, + "grad_norm": 0.3068341612815857, + "learning_rate": 2.362359896538983e-05, + "loss": 0.1839, + "step": 7165 + }, + { + "epoch": 2.6462333825701627, + "grad_norm": 0.2505051791667938, + "learning_rate": 2.3598965389826334e-05, + "loss": 0.1442, + "step": 7166 + }, + { + "epoch": 2.646602658788774, + "grad_norm": 0.24568988382816315, + "learning_rate": 2.3574331814262842e-05, + "loss": 0.1603, + "step": 7167 + }, + { + "epoch": 2.6469719350073855, + "grad_norm": 0.27248865365982056, + "learning_rate": 2.3549698238699347e-05, + "loss": 0.1556, + "step": 7168 + }, + { + "epoch": 2.647341211225997, + "grad_norm": 0.260239839553833, + "learning_rate": 2.3525064663135854e-05, + "loss": 0.1504, + "step": 7169 + }, + { + "epoch": 2.6477104874446087, + "grad_norm": 0.26263317465782166, + "learning_rate": 2.3500431087572362e-05, + "loss": 0.1637, + "step": 7170 + }, + { + "epoch": 2.64807976366322, + "grad_norm": 0.24393810331821442, + "learning_rate": 2.3475797512008867e-05, + "loss": 0.1572, + "step": 7171 + }, + { + "epoch": 2.6484490398818314, + "grad_norm": 0.35813090205192566, + "learning_rate": 2.3451163936445375e-05, + "loss": 0.1804, + "step": 7172 + }, + { + "epoch": 2.648818316100443, + "grad_norm": 0.2368893325328827, + "learning_rate": 2.3426530360881883e-05, + "loss": 0.1686, + "step": 7173 + }, + { + "epoch": 2.6491875923190547, + "grad_norm": 0.21937641501426697, + "learning_rate": 2.340189678531839e-05, + "loss": 0.1233, + "step": 7174 + }, + { + "epoch": 2.6495568685376663, + "grad_norm": 0.2677416205406189, + "learning_rate": 2.3377263209754895e-05, + "loss": 0.1756, + "step": 7175 + }, + { + "epoch": 2.649926144756278, + "grad_norm": 0.25949594378471375, + "learning_rate": 2.3352629634191403e-05, + "loss": 0.1407, + "step": 7176 + }, + { + "epoch": 2.6502954209748895, + "grad_norm": 0.2761537730693817, + "learning_rate": 2.332799605862791e-05, + "loss": 0.1556, + "step": 7177 + }, + { + "epoch": 2.6506646971935006, + "grad_norm": 0.2283579260110855, + "learning_rate": 2.330336248306442e-05, + "loss": 0.1575, + "step": 7178 + }, + { + "epoch": 2.651033973412112, + "grad_norm": 0.22830431163311005, + "learning_rate": 2.3278728907500923e-05, + "loss": 0.1521, + "step": 7179 + }, + { + "epoch": 2.651403249630724, + "grad_norm": 0.30982473492622375, + "learning_rate": 2.325409533193743e-05, + "loss": 0.1475, + "step": 7180 + }, + { + "epoch": 2.6517725258493354, + "grad_norm": 0.2520334720611572, + "learning_rate": 2.322946175637394e-05, + "loss": 0.1598, + "step": 7181 + }, + { + "epoch": 2.6521418020679466, + "grad_norm": 0.25671178102493286, + "learning_rate": 2.3204828180810447e-05, + "loss": 0.1618, + "step": 7182 + }, + { + "epoch": 2.652511078286558, + "grad_norm": 0.2838020324707031, + "learning_rate": 2.318019460524695e-05, + "loss": 0.1462, + "step": 7183 + }, + { + "epoch": 2.65288035450517, + "grad_norm": 0.264899879693985, + "learning_rate": 2.315556102968346e-05, + "loss": 0.1331, + "step": 7184 + }, + { + "epoch": 2.6532496307237814, + "grad_norm": 0.28281593322753906, + "learning_rate": 2.3130927454119967e-05, + "loss": 0.1742, + "step": 7185 + }, + { + "epoch": 2.653618906942393, + "grad_norm": 0.21121738851070404, + "learning_rate": 2.3106293878556475e-05, + "loss": 0.1293, + "step": 7186 + }, + { + "epoch": 2.6539881831610046, + "grad_norm": 0.25332462787628174, + "learning_rate": 2.308166030299298e-05, + "loss": 0.1616, + "step": 7187 + }, + { + "epoch": 2.654357459379616, + "grad_norm": 0.2448319047689438, + "learning_rate": 2.3057026727429487e-05, + "loss": 0.1614, + "step": 7188 + }, + { + "epoch": 2.6547267355982274, + "grad_norm": 0.30122774839401245, + "learning_rate": 2.3032393151865995e-05, + "loss": 0.193, + "step": 7189 + }, + { + "epoch": 2.655096011816839, + "grad_norm": 0.27029240131378174, + "learning_rate": 2.3007759576302503e-05, + "loss": 0.1854, + "step": 7190 + }, + { + "epoch": 2.6554652880354506, + "grad_norm": 0.22305288910865784, + "learning_rate": 2.2983126000739007e-05, + "loss": 0.1415, + "step": 7191 + }, + { + "epoch": 2.655834564254062, + "grad_norm": 0.29960229992866516, + "learning_rate": 2.2958492425175515e-05, + "loss": 0.1419, + "step": 7192 + }, + { + "epoch": 2.6562038404726733, + "grad_norm": 0.2728195786476135, + "learning_rate": 2.2933858849612023e-05, + "loss": 0.1799, + "step": 7193 + }, + { + "epoch": 2.656573116691285, + "grad_norm": 0.2856903076171875, + "learning_rate": 2.290922527404853e-05, + "loss": 0.1691, + "step": 7194 + }, + { + "epoch": 2.6569423929098965, + "grad_norm": 0.2481575608253479, + "learning_rate": 2.2884591698485035e-05, + "loss": 0.1525, + "step": 7195 + }, + { + "epoch": 2.657311669128508, + "grad_norm": 0.28450366854667664, + "learning_rate": 2.2859958122921543e-05, + "loss": 0.1456, + "step": 7196 + }, + { + "epoch": 2.6576809453471197, + "grad_norm": 0.22370845079421997, + "learning_rate": 2.283532454735805e-05, + "loss": 0.1439, + "step": 7197 + }, + { + "epoch": 2.6580502215657313, + "grad_norm": 0.2714557349681854, + "learning_rate": 2.281069097179456e-05, + "loss": 0.1558, + "step": 7198 + }, + { + "epoch": 2.658419497784343, + "grad_norm": 0.2757282853126526, + "learning_rate": 2.2786057396231063e-05, + "loss": 0.1638, + "step": 7199 + }, + { + "epoch": 2.658788774002954, + "grad_norm": 0.2563628852367401, + "learning_rate": 2.276142382066757e-05, + "loss": 0.144, + "step": 7200 + }, + { + "epoch": 2.658788774002954, + "eval_loss": 8.860787391662598, + "eval_runtime": 6.9065, + "eval_samples_per_second": 7.24, + "eval_steps_per_second": 1.014, + "step": 7200 + }, + { + "epoch": 2.6591580502215657, + "grad_norm": 0.31413426995277405, + "learning_rate": 2.273679024510408e-05, + "loss": 0.1468, + "step": 7201 + }, + { + "epoch": 2.6595273264401773, + "grad_norm": 0.25500231981277466, + "learning_rate": 2.2712156669540587e-05, + "loss": 0.1444, + "step": 7202 + }, + { + "epoch": 2.659896602658789, + "grad_norm": 0.2638005316257477, + "learning_rate": 2.268752309397709e-05, + "loss": 0.1585, + "step": 7203 + }, + { + "epoch": 2.6602658788774, + "grad_norm": 0.3434399366378784, + "learning_rate": 2.26628895184136e-05, + "loss": 0.1647, + "step": 7204 + }, + { + "epoch": 2.6606351550960117, + "grad_norm": 0.2380715310573578, + "learning_rate": 2.2638255942850107e-05, + "loss": 0.1531, + "step": 7205 + }, + { + "epoch": 2.6610044313146233, + "grad_norm": 0.24765819311141968, + "learning_rate": 2.261362236728661e-05, + "loss": 0.1495, + "step": 7206 + }, + { + "epoch": 2.661373707533235, + "grad_norm": 0.2835356593132019, + "learning_rate": 2.258898879172312e-05, + "loss": 0.1595, + "step": 7207 + }, + { + "epoch": 2.6617429837518465, + "grad_norm": 0.24970851838588715, + "learning_rate": 2.2564355216159627e-05, + "loss": 0.1555, + "step": 7208 + }, + { + "epoch": 2.662112259970458, + "grad_norm": 0.26853424310684204, + "learning_rate": 2.2539721640596135e-05, + "loss": 0.1617, + "step": 7209 + }, + { + "epoch": 2.6624815361890697, + "grad_norm": 0.2536202669143677, + "learning_rate": 2.251508806503264e-05, + "loss": 0.1506, + "step": 7210 + }, + { + "epoch": 2.662850812407681, + "grad_norm": 0.28407227993011475, + "learning_rate": 2.2490454489469147e-05, + "loss": 0.18, + "step": 7211 + }, + { + "epoch": 2.6632200886262924, + "grad_norm": 0.2836498022079468, + "learning_rate": 2.2465820913905655e-05, + "loss": 0.1521, + "step": 7212 + }, + { + "epoch": 2.663589364844904, + "grad_norm": 0.22375798225402832, + "learning_rate": 2.2441187338342163e-05, + "loss": 0.1322, + "step": 7213 + }, + { + "epoch": 2.6639586410635157, + "grad_norm": 0.2822200655937195, + "learning_rate": 2.2416553762778668e-05, + "loss": 0.173, + "step": 7214 + }, + { + "epoch": 2.664327917282127, + "grad_norm": 0.26703476905822754, + "learning_rate": 2.2391920187215176e-05, + "loss": 0.1618, + "step": 7215 + }, + { + "epoch": 2.6646971935007384, + "grad_norm": 0.25818583369255066, + "learning_rate": 2.2367286611651683e-05, + "loss": 0.1321, + "step": 7216 + }, + { + "epoch": 2.66506646971935, + "grad_norm": 0.2904724180698395, + "learning_rate": 2.234265303608819e-05, + "loss": 0.179, + "step": 7217 + }, + { + "epoch": 2.6654357459379616, + "grad_norm": 0.31186190247535706, + "learning_rate": 2.2318019460524696e-05, + "loss": 0.1794, + "step": 7218 + }, + { + "epoch": 2.6658050221565732, + "grad_norm": 0.2763841450214386, + "learning_rate": 2.2293385884961204e-05, + "loss": 0.1765, + "step": 7219 + }, + { + "epoch": 2.666174298375185, + "grad_norm": 0.27837663888931274, + "learning_rate": 2.226875230939771e-05, + "loss": 0.1523, + "step": 7220 + }, + { + "epoch": 2.6665435745937964, + "grad_norm": 0.28560489416122437, + "learning_rate": 2.224411873383422e-05, + "loss": 0.1601, + "step": 7221 + }, + { + "epoch": 2.6669128508124076, + "grad_norm": 0.2666325867176056, + "learning_rate": 2.2219485158270724e-05, + "loss": 0.1607, + "step": 7222 + }, + { + "epoch": 2.667282127031019, + "grad_norm": 0.2722642421722412, + "learning_rate": 2.219485158270723e-05, + "loss": 0.1761, + "step": 7223 + }, + { + "epoch": 2.667651403249631, + "grad_norm": 0.26727864146232605, + "learning_rate": 2.217021800714374e-05, + "loss": 0.1506, + "step": 7224 + }, + { + "epoch": 2.6680206794682424, + "grad_norm": 0.28678226470947266, + "learning_rate": 2.2145584431580247e-05, + "loss": 0.1727, + "step": 7225 + }, + { + "epoch": 2.6683899556868536, + "grad_norm": 0.27051815390586853, + "learning_rate": 2.2120950856016752e-05, + "loss": 0.1655, + "step": 7226 + }, + { + "epoch": 2.668759231905465, + "grad_norm": 0.24237746000289917, + "learning_rate": 2.2096317280453256e-05, + "loss": 0.1701, + "step": 7227 + }, + { + "epoch": 2.6691285081240768, + "grad_norm": 0.22495131194591522, + "learning_rate": 2.2071683704889764e-05, + "loss": 0.1514, + "step": 7228 + }, + { + "epoch": 2.6694977843426884, + "grad_norm": 0.264757364988327, + "learning_rate": 2.2047050129326272e-05, + "loss": 0.1711, + "step": 7229 + }, + { + "epoch": 2.6698670605613, + "grad_norm": 0.2586033344268799, + "learning_rate": 2.202241655376278e-05, + "loss": 0.1482, + "step": 7230 + }, + { + "epoch": 2.6702363367799116, + "grad_norm": 0.28463292121887207, + "learning_rate": 2.1997782978199284e-05, + "loss": 0.1616, + "step": 7231 + }, + { + "epoch": 2.670605612998523, + "grad_norm": 0.2854052186012268, + "learning_rate": 2.1973149402635792e-05, + "loss": 0.213, + "step": 7232 + }, + { + "epoch": 2.6709748892171343, + "grad_norm": 0.22594064474105835, + "learning_rate": 2.19485158270723e-05, + "loss": 0.1552, + "step": 7233 + }, + { + "epoch": 2.671344165435746, + "grad_norm": 0.3046702742576599, + "learning_rate": 2.1923882251508808e-05, + "loss": 0.186, + "step": 7234 + }, + { + "epoch": 2.6717134416543575, + "grad_norm": 0.2519453465938568, + "learning_rate": 2.1899248675945312e-05, + "loss": 0.1642, + "step": 7235 + }, + { + "epoch": 2.672082717872969, + "grad_norm": 0.2688485085964203, + "learning_rate": 2.187461510038182e-05, + "loss": 0.1478, + "step": 7236 + }, + { + "epoch": 2.6724519940915803, + "grad_norm": 0.2981970012187958, + "learning_rate": 2.1849981524818328e-05, + "loss": 0.155, + "step": 7237 + }, + { + "epoch": 2.672821270310192, + "grad_norm": 0.23487290740013123, + "learning_rate": 2.1825347949254836e-05, + "loss": 0.1565, + "step": 7238 + }, + { + "epoch": 2.6731905465288035, + "grad_norm": 0.29502034187316895, + "learning_rate": 2.180071437369134e-05, + "loss": 0.1959, + "step": 7239 + }, + { + "epoch": 2.673559822747415, + "grad_norm": 0.2634850740432739, + "learning_rate": 2.177608079812785e-05, + "loss": 0.1966, + "step": 7240 + }, + { + "epoch": 2.6739290989660267, + "grad_norm": 0.25058913230895996, + "learning_rate": 2.1751447222564356e-05, + "loss": 0.1462, + "step": 7241 + }, + { + "epoch": 2.6742983751846383, + "grad_norm": 0.2272311896085739, + "learning_rate": 2.1726813647000864e-05, + "loss": 0.1441, + "step": 7242 + }, + { + "epoch": 2.6746676514032495, + "grad_norm": 0.24805359542369843, + "learning_rate": 2.170218007143737e-05, + "loss": 0.1545, + "step": 7243 + }, + { + "epoch": 2.675036927621861, + "grad_norm": 0.22720925509929657, + "learning_rate": 2.1677546495873876e-05, + "loss": 0.141, + "step": 7244 + }, + { + "epoch": 2.6754062038404727, + "grad_norm": 0.2286713719367981, + "learning_rate": 2.1652912920310384e-05, + "loss": 0.1324, + "step": 7245 + }, + { + "epoch": 2.6757754800590843, + "grad_norm": 0.29961085319519043, + "learning_rate": 2.1628279344746892e-05, + "loss": 0.16, + "step": 7246 + }, + { + "epoch": 2.676144756277696, + "grad_norm": 0.3165718615055084, + "learning_rate": 2.1603645769183397e-05, + "loss": 0.1812, + "step": 7247 + }, + { + "epoch": 2.676514032496307, + "grad_norm": 0.28724023699760437, + "learning_rate": 2.1579012193619905e-05, + "loss": 0.1609, + "step": 7248 + }, + { + "epoch": 2.6768833087149186, + "grad_norm": 0.3201262950897217, + "learning_rate": 2.1554378618056412e-05, + "loss": 0.1663, + "step": 7249 + }, + { + "epoch": 2.6772525849335302, + "grad_norm": 0.2807766795158386, + "learning_rate": 2.152974504249292e-05, + "loss": 0.1599, + "step": 7250 + }, + { + "epoch": 2.6772525849335302, + "eval_loss": 8.880995750427246, + "eval_runtime": 6.8994, + "eval_samples_per_second": 7.247, + "eval_steps_per_second": 1.015, + "step": 7250 + }, + { + "epoch": 2.677621861152142, + "grad_norm": 0.26888784766197205, + "learning_rate": 2.1505111466929425e-05, + "loss": 0.2008, + "step": 7251 + }, + { + "epoch": 2.6779911373707534, + "grad_norm": 0.21790331602096558, + "learning_rate": 2.1480477891365933e-05, + "loss": 0.1659, + "step": 7252 + }, + { + "epoch": 2.678360413589365, + "grad_norm": 0.3012241721153259, + "learning_rate": 2.145584431580244e-05, + "loss": 0.1601, + "step": 7253 + }, + { + "epoch": 2.678729689807976, + "grad_norm": 0.2403479665517807, + "learning_rate": 2.143121074023895e-05, + "loss": 0.1335, + "step": 7254 + }, + { + "epoch": 2.679098966026588, + "grad_norm": 0.21214225888252258, + "learning_rate": 2.1406577164675453e-05, + "loss": 0.1468, + "step": 7255 + }, + { + "epoch": 2.6794682422451994, + "grad_norm": 0.329088032245636, + "learning_rate": 2.138194358911196e-05, + "loss": 0.1865, + "step": 7256 + }, + { + "epoch": 2.679837518463811, + "grad_norm": 0.2386673241853714, + "learning_rate": 2.135731001354847e-05, + "loss": 0.1691, + "step": 7257 + }, + { + "epoch": 2.680206794682422, + "grad_norm": 0.3052637279033661, + "learning_rate": 2.1332676437984976e-05, + "loss": 0.1617, + "step": 7258 + }, + { + "epoch": 2.680576070901034, + "grad_norm": 0.272629052400589, + "learning_rate": 2.130804286242148e-05, + "loss": 0.1665, + "step": 7259 + }, + { + "epoch": 2.6809453471196454, + "grad_norm": 0.23165617883205414, + "learning_rate": 2.128340928685799e-05, + "loss": 0.1484, + "step": 7260 + }, + { + "epoch": 2.681314623338257, + "grad_norm": 0.27847227454185486, + "learning_rate": 2.1258775711294497e-05, + "loss": 0.1636, + "step": 7261 + }, + { + "epoch": 2.6816838995568686, + "grad_norm": 0.24064858257770538, + "learning_rate": 2.1234142135731004e-05, + "loss": 0.1515, + "step": 7262 + }, + { + "epoch": 2.68205317577548, + "grad_norm": 0.30608758330345154, + "learning_rate": 2.120950856016751e-05, + "loss": 0.1614, + "step": 7263 + }, + { + "epoch": 2.682422451994092, + "grad_norm": 0.32515254616737366, + "learning_rate": 2.1184874984604017e-05, + "loss": 0.1997, + "step": 7264 + }, + { + "epoch": 2.682791728212703, + "grad_norm": 0.30345281958580017, + "learning_rate": 2.1160241409040525e-05, + "loss": 0.1929, + "step": 7265 + }, + { + "epoch": 2.6831610044313146, + "grad_norm": 0.24107767641544342, + "learning_rate": 2.113560783347703e-05, + "loss": 0.1493, + "step": 7266 + }, + { + "epoch": 2.683530280649926, + "grad_norm": 0.25408318638801575, + "learning_rate": 2.1110974257913537e-05, + "loss": 0.1635, + "step": 7267 + }, + { + "epoch": 2.6838995568685378, + "grad_norm": 0.24572612345218658, + "learning_rate": 2.1086340682350045e-05, + "loss": 0.1533, + "step": 7268 + }, + { + "epoch": 2.684268833087149, + "grad_norm": 0.28045687079429626, + "learning_rate": 2.1061707106786553e-05, + "loss": 0.1895, + "step": 7269 + }, + { + "epoch": 2.6846381093057605, + "grad_norm": 0.25190210342407227, + "learning_rate": 2.1037073531223057e-05, + "loss": 0.1746, + "step": 7270 + }, + { + "epoch": 2.685007385524372, + "grad_norm": 0.2416532188653946, + "learning_rate": 2.1012439955659565e-05, + "loss": 0.1624, + "step": 7271 + }, + { + "epoch": 2.6853766617429837, + "grad_norm": 0.24440214037895203, + "learning_rate": 2.0987806380096073e-05, + "loss": 0.1496, + "step": 7272 + }, + { + "epoch": 2.6857459379615953, + "grad_norm": 0.29208752512931824, + "learning_rate": 2.096317280453258e-05, + "loss": 0.173, + "step": 7273 + }, + { + "epoch": 2.686115214180207, + "grad_norm": 0.33578214049339294, + "learning_rate": 2.0938539228969085e-05, + "loss": 0.1596, + "step": 7274 + }, + { + "epoch": 2.6864844903988185, + "grad_norm": 0.23982414603233337, + "learning_rate": 2.0913905653405593e-05, + "loss": 0.162, + "step": 7275 + }, + { + "epoch": 2.6868537666174297, + "grad_norm": 0.2528478503227234, + "learning_rate": 2.08892720778421e-05, + "loss": 0.1612, + "step": 7276 + }, + { + "epoch": 2.6872230428360413, + "grad_norm": 0.24012508988380432, + "learning_rate": 2.086463850227861e-05, + "loss": 0.1512, + "step": 7277 + }, + { + "epoch": 2.687592319054653, + "grad_norm": 0.3770832419395447, + "learning_rate": 2.0840004926715113e-05, + "loss": 0.1784, + "step": 7278 + }, + { + "epoch": 2.6879615952732645, + "grad_norm": 0.23678553104400635, + "learning_rate": 2.081537135115162e-05, + "loss": 0.159, + "step": 7279 + }, + { + "epoch": 2.6883308714918757, + "grad_norm": 0.28467488288879395, + "learning_rate": 2.079073777558813e-05, + "loss": 0.1834, + "step": 7280 + }, + { + "epoch": 2.6887001477104873, + "grad_norm": 0.22828525304794312, + "learning_rate": 2.0766104200024637e-05, + "loss": 0.1371, + "step": 7281 + }, + { + "epoch": 2.689069423929099, + "grad_norm": 0.2698400914669037, + "learning_rate": 2.074147062446114e-05, + "loss": 0.1554, + "step": 7282 + }, + { + "epoch": 2.6894387001477105, + "grad_norm": 0.24962279200553894, + "learning_rate": 2.071683704889765e-05, + "loss": 0.1558, + "step": 7283 + }, + { + "epoch": 2.689807976366322, + "grad_norm": 0.30898740887641907, + "learning_rate": 2.0692203473334154e-05, + "loss": 0.1553, + "step": 7284 + }, + { + "epoch": 2.6901772525849337, + "grad_norm": 0.22581657767295837, + "learning_rate": 2.066756989777066e-05, + "loss": 0.1434, + "step": 7285 + }, + { + "epoch": 2.6905465288035453, + "grad_norm": 0.24127379059791565, + "learning_rate": 2.064293632220717e-05, + "loss": 0.1402, + "step": 7286 + }, + { + "epoch": 2.6909158050221564, + "grad_norm": 0.28008565306663513, + "learning_rate": 2.0618302746643674e-05, + "loss": 0.1619, + "step": 7287 + }, + { + "epoch": 2.691285081240768, + "grad_norm": 0.2592097818851471, + "learning_rate": 2.0593669171080182e-05, + "loss": 0.1502, + "step": 7288 + }, + { + "epoch": 2.6916543574593796, + "grad_norm": 0.22759641706943512, + "learning_rate": 2.056903559551669e-05, + "loss": 0.1471, + "step": 7289 + }, + { + "epoch": 2.6920236336779912, + "grad_norm": 0.23374295234680176, + "learning_rate": 2.0544402019953198e-05, + "loss": 0.1408, + "step": 7290 + }, + { + "epoch": 2.6923929098966024, + "grad_norm": 0.2670370936393738, + "learning_rate": 2.0519768444389702e-05, + "loss": 0.167, + "step": 7291 + }, + { + "epoch": 2.692762186115214, + "grad_norm": 0.24448539316654205, + "learning_rate": 2.049513486882621e-05, + "loss": 0.1552, + "step": 7292 + }, + { + "epoch": 2.6931314623338256, + "grad_norm": 0.24040241539478302, + "learning_rate": 2.0470501293262718e-05, + "loss": 0.1478, + "step": 7293 + }, + { + "epoch": 2.693500738552437, + "grad_norm": 0.20360209047794342, + "learning_rate": 2.0445867717699226e-05, + "loss": 0.1374, + "step": 7294 + }, + { + "epoch": 2.693870014771049, + "grad_norm": 0.2686956226825714, + "learning_rate": 2.042123414213573e-05, + "loss": 0.1849, + "step": 7295 + }, + { + "epoch": 2.6942392909896604, + "grad_norm": 0.23635953664779663, + "learning_rate": 2.0396600566572238e-05, + "loss": 0.1513, + "step": 7296 + }, + { + "epoch": 2.694608567208272, + "grad_norm": 0.24696846306324005, + "learning_rate": 2.0371966991008746e-05, + "loss": 0.1604, + "step": 7297 + }, + { + "epoch": 2.694977843426883, + "grad_norm": 0.27391448616981506, + "learning_rate": 2.0347333415445254e-05, + "loss": 0.1717, + "step": 7298 + }, + { + "epoch": 2.695347119645495, + "grad_norm": 0.3222452402114868, + "learning_rate": 2.0322699839881758e-05, + "loss": 0.1681, + "step": 7299 + }, + { + "epoch": 2.6957163958641064, + "grad_norm": 0.19614270329475403, + "learning_rate": 2.0298066264318266e-05, + "loss": 0.1295, + "step": 7300 + }, + { + "epoch": 2.6957163958641064, + "eval_loss": 8.893660545349121, + "eval_runtime": 6.9049, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 1.014, + "step": 7300 + }, + { + "epoch": 2.696085672082718, + "grad_norm": 0.24722620844841003, + "learning_rate": 2.0273432688754774e-05, + "loss": 0.1576, + "step": 7301 + }, + { + "epoch": 2.696454948301329, + "grad_norm": 0.25698596239089966, + "learning_rate": 2.0248799113191282e-05, + "loss": 0.1406, + "step": 7302 + }, + { + "epoch": 2.6968242245199407, + "grad_norm": 0.21790169179439545, + "learning_rate": 2.0224165537627786e-05, + "loss": 0.1388, + "step": 7303 + }, + { + "epoch": 2.6971935007385524, + "grad_norm": 0.2299220860004425, + "learning_rate": 2.0199531962064294e-05, + "loss": 0.1667, + "step": 7304 + }, + { + "epoch": 2.697562776957164, + "grad_norm": 0.25858837366104126, + "learning_rate": 2.0174898386500802e-05, + "loss": 0.1722, + "step": 7305 + }, + { + "epoch": 2.6979320531757756, + "grad_norm": 0.2272341400384903, + "learning_rate": 2.015026481093731e-05, + "loss": 0.1676, + "step": 7306 + }, + { + "epoch": 2.698301329394387, + "grad_norm": 0.26498937606811523, + "learning_rate": 2.0125631235373814e-05, + "loss": 0.1664, + "step": 7307 + }, + { + "epoch": 2.6986706056129988, + "grad_norm": 0.23973999917507172, + "learning_rate": 2.0100997659810322e-05, + "loss": 0.1665, + "step": 7308 + }, + { + "epoch": 2.69903988183161, + "grad_norm": 0.22399213910102844, + "learning_rate": 2.007636408424683e-05, + "loss": 0.1532, + "step": 7309 + }, + { + "epoch": 2.6994091580502215, + "grad_norm": 0.22601471841335297, + "learning_rate": 2.0051730508683338e-05, + "loss": 0.147, + "step": 7310 + }, + { + "epoch": 2.699778434268833, + "grad_norm": 0.2528609335422516, + "learning_rate": 2.0027096933119842e-05, + "loss": 0.1609, + "step": 7311 + }, + { + "epoch": 2.7001477104874447, + "grad_norm": 0.2894052565097809, + "learning_rate": 2.000246335755635e-05, + "loss": 0.1572, + "step": 7312 + }, + { + "epoch": 2.700516986706056, + "grad_norm": 0.24308443069458008, + "learning_rate": 1.9977829781992858e-05, + "loss": 0.1419, + "step": 7313 + }, + { + "epoch": 2.7008862629246675, + "grad_norm": 0.24548430740833282, + "learning_rate": 1.9953196206429366e-05, + "loss": 0.1592, + "step": 7314 + }, + { + "epoch": 2.701255539143279, + "grad_norm": 0.26395776867866516, + "learning_rate": 1.992856263086587e-05, + "loss": 0.1538, + "step": 7315 + }, + { + "epoch": 2.7016248153618907, + "grad_norm": 0.3058598041534424, + "learning_rate": 1.9903929055302378e-05, + "loss": 0.1764, + "step": 7316 + }, + { + "epoch": 2.7019940915805023, + "grad_norm": 0.3347432315349579, + "learning_rate": 1.9879295479738886e-05, + "loss": 0.1734, + "step": 7317 + }, + { + "epoch": 2.702363367799114, + "grad_norm": 0.26009097695350647, + "learning_rate": 1.9854661904175394e-05, + "loss": 0.1765, + "step": 7318 + }, + { + "epoch": 2.7027326440177255, + "grad_norm": 0.18659621477127075, + "learning_rate": 1.98300283286119e-05, + "loss": 0.133, + "step": 7319 + }, + { + "epoch": 2.7031019202363367, + "grad_norm": 0.21674500405788422, + "learning_rate": 1.9805394753048406e-05, + "loss": 0.1404, + "step": 7320 + }, + { + "epoch": 2.7034711964549483, + "grad_norm": 0.23804491758346558, + "learning_rate": 1.9780761177484914e-05, + "loss": 0.1423, + "step": 7321 + }, + { + "epoch": 2.70384047267356, + "grad_norm": 0.24353374540805817, + "learning_rate": 1.975612760192142e-05, + "loss": 0.1564, + "step": 7322 + }, + { + "epoch": 2.7042097488921715, + "grad_norm": 0.2639636993408203, + "learning_rate": 1.9731494026357927e-05, + "loss": 0.1866, + "step": 7323 + }, + { + "epoch": 2.7045790251107826, + "grad_norm": 0.2024192065000534, + "learning_rate": 1.9706860450794434e-05, + "loss": 0.1461, + "step": 7324 + }, + { + "epoch": 2.7049483013293942, + "grad_norm": 0.27622878551483154, + "learning_rate": 1.9682226875230942e-05, + "loss": 0.1795, + "step": 7325 + }, + { + "epoch": 2.705317577548006, + "grad_norm": 0.2478237748146057, + "learning_rate": 1.9657593299667447e-05, + "loss": 0.148, + "step": 7326 + }, + { + "epoch": 2.7056868537666174, + "grad_norm": 0.2895376682281494, + "learning_rate": 1.9632959724103955e-05, + "loss": 0.1642, + "step": 7327 + }, + { + "epoch": 2.706056129985229, + "grad_norm": 0.272233247756958, + "learning_rate": 1.9608326148540462e-05, + "loss": 0.1502, + "step": 7328 + }, + { + "epoch": 2.7064254062038406, + "grad_norm": 0.24305157363414764, + "learning_rate": 1.958369257297697e-05, + "loss": 0.1598, + "step": 7329 + }, + { + "epoch": 2.7067946824224522, + "grad_norm": 0.28086456656455994, + "learning_rate": 1.9559058997413475e-05, + "loss": 0.1799, + "step": 7330 + }, + { + "epoch": 2.7071639586410634, + "grad_norm": 0.3017653524875641, + "learning_rate": 1.9534425421849983e-05, + "loss": 0.1768, + "step": 7331 + }, + { + "epoch": 2.707533234859675, + "grad_norm": 0.24587690830230713, + "learning_rate": 1.950979184628649e-05, + "loss": 0.1578, + "step": 7332 + }, + { + "epoch": 2.7079025110782866, + "grad_norm": 0.19745522737503052, + "learning_rate": 1.9485158270723e-05, + "loss": 0.1254, + "step": 7333 + }, + { + "epoch": 2.708271787296898, + "grad_norm": 0.22620220482349396, + "learning_rate": 1.9460524695159503e-05, + "loss": 0.1588, + "step": 7334 + }, + { + "epoch": 2.7086410635155094, + "grad_norm": 0.24788720905780792, + "learning_rate": 1.943589111959601e-05, + "loss": 0.1733, + "step": 7335 + }, + { + "epoch": 2.709010339734121, + "grad_norm": 0.2391194999217987, + "learning_rate": 1.941125754403252e-05, + "loss": 0.1635, + "step": 7336 + }, + { + "epoch": 2.7093796159527326, + "grad_norm": 0.2939334213733673, + "learning_rate": 1.9386623968469026e-05, + "loss": 0.1819, + "step": 7337 + }, + { + "epoch": 2.709748892171344, + "grad_norm": 0.2398591786623001, + "learning_rate": 1.936199039290553e-05, + "loss": 0.1633, + "step": 7338 + }, + { + "epoch": 2.710118168389956, + "grad_norm": 0.23743073642253876, + "learning_rate": 1.933735681734204e-05, + "loss": 0.1367, + "step": 7339 + }, + { + "epoch": 2.7104874446085674, + "grad_norm": 0.21814915537834167, + "learning_rate": 1.9312723241778547e-05, + "loss": 0.1408, + "step": 7340 + }, + { + "epoch": 2.710856720827179, + "grad_norm": 0.2480717897415161, + "learning_rate": 1.9288089666215054e-05, + "loss": 0.1425, + "step": 7341 + }, + { + "epoch": 2.71122599704579, + "grad_norm": 0.2471914291381836, + "learning_rate": 1.926345609065156e-05, + "loss": 0.1655, + "step": 7342 + }, + { + "epoch": 2.7115952732644018, + "grad_norm": 0.21049658954143524, + "learning_rate": 1.9238822515088063e-05, + "loss": 0.1265, + "step": 7343 + }, + { + "epoch": 2.7119645494830134, + "grad_norm": 0.3001233637332916, + "learning_rate": 1.921418893952457e-05, + "loss": 0.17, + "step": 7344 + }, + { + "epoch": 2.712333825701625, + "grad_norm": 0.2560826241970062, + "learning_rate": 1.918955536396108e-05, + "loss": 0.1665, + "step": 7345 + }, + { + "epoch": 2.712703101920236, + "grad_norm": 0.28198087215423584, + "learning_rate": 1.9164921788397587e-05, + "loss": 0.158, + "step": 7346 + }, + { + "epoch": 2.7130723781388477, + "grad_norm": 0.22184699773788452, + "learning_rate": 1.914028821283409e-05, + "loss": 0.1539, + "step": 7347 + }, + { + "epoch": 2.7134416543574593, + "grad_norm": 0.27782949805259705, + "learning_rate": 1.91156546372706e-05, + "loss": 0.1625, + "step": 7348 + }, + { + "epoch": 2.713810930576071, + "grad_norm": 0.2673696279525757, + "learning_rate": 1.9091021061707107e-05, + "loss": 0.1602, + "step": 7349 + }, + { + "epoch": 2.7141802067946825, + "grad_norm": 0.2995664179325104, + "learning_rate": 1.9066387486143615e-05, + "loss": 0.1916, + "step": 7350 + }, + { + "epoch": 2.7141802067946825, + "eval_loss": 8.952940940856934, + "eval_runtime": 6.9043, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 1.014, + "step": 7350 + }, + { + "epoch": 2.714549483013294, + "grad_norm": 0.3239915668964386, + "learning_rate": 1.904175391058012e-05, + "loss": 0.1465, + "step": 7351 + }, + { + "epoch": 2.7149187592319057, + "grad_norm": 0.2529049217700958, + "learning_rate": 1.9017120335016627e-05, + "loss": 0.1495, + "step": 7352 + }, + { + "epoch": 2.715288035450517, + "grad_norm": 0.2395060509443283, + "learning_rate": 1.8992486759453135e-05, + "loss": 0.1918, + "step": 7353 + }, + { + "epoch": 2.7156573116691285, + "grad_norm": 0.24270415306091309, + "learning_rate": 1.8967853183889643e-05, + "loss": 0.1523, + "step": 7354 + }, + { + "epoch": 2.71602658788774, + "grad_norm": 0.23609092831611633, + "learning_rate": 1.8943219608326148e-05, + "loss": 0.1388, + "step": 7355 + }, + { + "epoch": 2.7163958641063517, + "grad_norm": 0.2770302891731262, + "learning_rate": 1.8918586032762656e-05, + "loss": 0.1518, + "step": 7356 + }, + { + "epoch": 2.716765140324963, + "grad_norm": 0.2779446840286255, + "learning_rate": 1.8893952457199163e-05, + "loss": 0.1592, + "step": 7357 + }, + { + "epoch": 2.7171344165435745, + "grad_norm": 0.4106813371181488, + "learning_rate": 1.886931888163567e-05, + "loss": 0.2363, + "step": 7358 + }, + { + "epoch": 2.717503692762186, + "grad_norm": 0.2778377830982208, + "learning_rate": 1.8844685306072176e-05, + "loss": 0.1788, + "step": 7359 + }, + { + "epoch": 2.7178729689807977, + "grad_norm": 0.23983317613601685, + "learning_rate": 1.8820051730508684e-05, + "loss": 0.149, + "step": 7360 + }, + { + "epoch": 2.7182422451994093, + "grad_norm": 0.3039620816707611, + "learning_rate": 1.879541815494519e-05, + "loss": 0.1531, + "step": 7361 + }, + { + "epoch": 2.718611521418021, + "grad_norm": 0.23476289212703705, + "learning_rate": 1.87707845793817e-05, + "loss": 0.1669, + "step": 7362 + }, + { + "epoch": 2.7189807976366325, + "grad_norm": 0.28840407729148865, + "learning_rate": 1.8746151003818204e-05, + "loss": 0.1455, + "step": 7363 + }, + { + "epoch": 2.7193500738552436, + "grad_norm": 0.2577774226665497, + "learning_rate": 1.872151742825471e-05, + "loss": 0.1672, + "step": 7364 + }, + { + "epoch": 2.7197193500738552, + "grad_norm": 0.23904825747013092, + "learning_rate": 1.869688385269122e-05, + "loss": 0.1652, + "step": 7365 + }, + { + "epoch": 2.720088626292467, + "grad_norm": 0.22826147079467773, + "learning_rate": 1.8672250277127727e-05, + "loss": 0.1562, + "step": 7366 + }, + { + "epoch": 2.7204579025110784, + "grad_norm": 0.2585035562515259, + "learning_rate": 1.8647616701564232e-05, + "loss": 0.1453, + "step": 7367 + }, + { + "epoch": 2.7208271787296896, + "grad_norm": 0.29808974266052246, + "learning_rate": 1.862298312600074e-05, + "loss": 0.1697, + "step": 7368 + }, + { + "epoch": 2.721196454948301, + "grad_norm": 0.2825849652290344, + "learning_rate": 1.8598349550437248e-05, + "loss": 0.1647, + "step": 7369 + }, + { + "epoch": 2.721565731166913, + "grad_norm": 0.2700226902961731, + "learning_rate": 1.8573715974873755e-05, + "loss": 0.1421, + "step": 7370 + }, + { + "epoch": 2.7219350073855244, + "grad_norm": 0.23040692508220673, + "learning_rate": 1.854908239931026e-05, + "loss": 0.1552, + "step": 7371 + }, + { + "epoch": 2.722304283604136, + "grad_norm": 0.2740280032157898, + "learning_rate": 1.8524448823746768e-05, + "loss": 0.1716, + "step": 7372 + }, + { + "epoch": 2.7226735598227476, + "grad_norm": 0.24746659398078918, + "learning_rate": 1.8499815248183276e-05, + "loss": 0.1348, + "step": 7373 + }, + { + "epoch": 2.7230428360413588, + "grad_norm": 0.22308820486068726, + "learning_rate": 1.8475181672619783e-05, + "loss": 0.1402, + "step": 7374 + }, + { + "epoch": 2.7234121122599704, + "grad_norm": 0.26153984665870667, + "learning_rate": 1.8450548097056288e-05, + "loss": 0.15, + "step": 7375 + }, + { + "epoch": 2.723781388478582, + "grad_norm": 0.25719717144966125, + "learning_rate": 1.8425914521492796e-05, + "loss": 0.1672, + "step": 7376 + }, + { + "epoch": 2.7241506646971936, + "grad_norm": 0.2778204679489136, + "learning_rate": 1.8401280945929304e-05, + "loss": 0.1465, + "step": 7377 + }, + { + "epoch": 2.724519940915805, + "grad_norm": 0.2639988660812378, + "learning_rate": 1.8376647370365808e-05, + "loss": 0.1432, + "step": 7378 + }, + { + "epoch": 2.7248892171344163, + "grad_norm": 0.24638685584068298, + "learning_rate": 1.8352013794802316e-05, + "loss": 0.1783, + "step": 7379 + }, + { + "epoch": 2.725258493353028, + "grad_norm": 0.257068932056427, + "learning_rate": 1.8327380219238824e-05, + "loss": 0.1617, + "step": 7380 + }, + { + "epoch": 2.7256277695716395, + "grad_norm": 0.25621286034584045, + "learning_rate": 1.8302746643675332e-05, + "loss": 0.2045, + "step": 7381 + }, + { + "epoch": 2.725997045790251, + "grad_norm": 0.2604859173297882, + "learning_rate": 1.8278113068111836e-05, + "loss": 0.1595, + "step": 7382 + }, + { + "epoch": 2.7263663220088628, + "grad_norm": 0.22385691106319427, + "learning_rate": 1.8253479492548344e-05, + "loss": 0.1759, + "step": 7383 + }, + { + "epoch": 2.7267355982274744, + "grad_norm": 0.28562042117118835, + "learning_rate": 1.8228845916984852e-05, + "loss": 0.1714, + "step": 7384 + }, + { + "epoch": 2.7271048744460855, + "grad_norm": 0.2191428393125534, + "learning_rate": 1.820421234142136e-05, + "loss": 0.1225, + "step": 7385 + }, + { + "epoch": 2.727474150664697, + "grad_norm": 0.27994871139526367, + "learning_rate": 1.8179578765857864e-05, + "loss": 0.1675, + "step": 7386 + }, + { + "epoch": 2.7278434268833087, + "grad_norm": 0.2531251311302185, + "learning_rate": 1.8154945190294372e-05, + "loss": 0.1668, + "step": 7387 + }, + { + "epoch": 2.7282127031019203, + "grad_norm": 0.2614520788192749, + "learning_rate": 1.813031161473088e-05, + "loss": 0.1371, + "step": 7388 + }, + { + "epoch": 2.7285819793205315, + "grad_norm": 0.2514965534210205, + "learning_rate": 1.8105678039167388e-05, + "loss": 0.167, + "step": 7389 + }, + { + "epoch": 2.728951255539143, + "grad_norm": 0.24289195239543915, + "learning_rate": 1.8081044463603892e-05, + "loss": 0.1547, + "step": 7390 + }, + { + "epoch": 2.7293205317577547, + "grad_norm": 0.2943010926246643, + "learning_rate": 1.80564108880404e-05, + "loss": 0.1666, + "step": 7391 + }, + { + "epoch": 2.7296898079763663, + "grad_norm": 0.27223441004753113, + "learning_rate": 1.8031777312476908e-05, + "loss": 0.1604, + "step": 7392 + }, + { + "epoch": 2.730059084194978, + "grad_norm": 0.2430969625711441, + "learning_rate": 1.8007143736913416e-05, + "loss": 0.1414, + "step": 7393 + }, + { + "epoch": 2.7304283604135895, + "grad_norm": 0.2918786406517029, + "learning_rate": 1.798251016134992e-05, + "loss": 0.1744, + "step": 7394 + }, + { + "epoch": 2.730797636632201, + "grad_norm": 0.2865627706050873, + "learning_rate": 1.7957876585786428e-05, + "loss": 0.1536, + "step": 7395 + }, + { + "epoch": 2.7311669128508123, + "grad_norm": 0.24253606796264648, + "learning_rate": 1.7933243010222936e-05, + "loss": 0.1402, + "step": 7396 + }, + { + "epoch": 2.731536189069424, + "grad_norm": 0.2807876169681549, + "learning_rate": 1.7908609434659444e-05, + "loss": 0.1681, + "step": 7397 + }, + { + "epoch": 2.7319054652880355, + "grad_norm": 0.3328063488006592, + "learning_rate": 1.788397585909595e-05, + "loss": 0.1852, + "step": 7398 + }, + { + "epoch": 2.732274741506647, + "grad_norm": 0.2316242754459381, + "learning_rate": 1.7859342283532456e-05, + "loss": 0.1413, + "step": 7399 + }, + { + "epoch": 2.7326440177252582, + "grad_norm": 0.2862212061882019, + "learning_rate": 1.783470870796896e-05, + "loss": 0.1696, + "step": 7400 + }, + { + "epoch": 2.7326440177252582, + "eval_loss": 8.981195449829102, + "eval_runtime": 6.8991, + "eval_samples_per_second": 7.247, + "eval_steps_per_second": 1.015, + "step": 7400 + }, + { + "epoch": 2.73301329394387, + "grad_norm": 0.22983554005622864, + "learning_rate": 1.781007513240547e-05, + "loss": 0.1432, + "step": 7401 + }, + { + "epoch": 2.7333825701624814, + "grad_norm": 0.24668338894844055, + "learning_rate": 1.7785441556841977e-05, + "loss": 0.1445, + "step": 7402 + }, + { + "epoch": 2.733751846381093, + "grad_norm": 0.24147279560565948, + "learning_rate": 1.776080798127848e-05, + "loss": 0.155, + "step": 7403 + }, + { + "epoch": 2.7341211225997046, + "grad_norm": 0.2700996696949005, + "learning_rate": 1.773617440571499e-05, + "loss": 0.1694, + "step": 7404 + }, + { + "epoch": 2.7344903988183162, + "grad_norm": 0.26680999994277954, + "learning_rate": 1.7711540830151497e-05, + "loss": 0.1452, + "step": 7405 + }, + { + "epoch": 2.734859675036928, + "grad_norm": 0.24472986161708832, + "learning_rate": 1.7686907254588005e-05, + "loss": 0.1589, + "step": 7406 + }, + { + "epoch": 2.735228951255539, + "grad_norm": 0.3255447447299957, + "learning_rate": 1.766227367902451e-05, + "loss": 0.1814, + "step": 7407 + }, + { + "epoch": 2.7355982274741506, + "grad_norm": 0.525245726108551, + "learning_rate": 1.7637640103461017e-05, + "loss": 0.1711, + "step": 7408 + }, + { + "epoch": 2.735967503692762, + "grad_norm": 0.2769116461277008, + "learning_rate": 1.7613006527897525e-05, + "loss": 0.1713, + "step": 7409 + }, + { + "epoch": 2.736336779911374, + "grad_norm": 0.23150792717933655, + "learning_rate": 1.7588372952334033e-05, + "loss": 0.1545, + "step": 7410 + }, + { + "epoch": 2.736706056129985, + "grad_norm": 0.25552213191986084, + "learning_rate": 1.7563739376770537e-05, + "loss": 0.1379, + "step": 7411 + }, + { + "epoch": 2.7370753323485966, + "grad_norm": 0.2724624276161194, + "learning_rate": 1.7539105801207045e-05, + "loss": 0.1479, + "step": 7412 + }, + { + "epoch": 2.737444608567208, + "grad_norm": 0.28824561834335327, + "learning_rate": 1.7514472225643553e-05, + "loss": 0.1618, + "step": 7413 + }, + { + "epoch": 2.7378138847858198, + "grad_norm": 0.2064255028963089, + "learning_rate": 1.748983865008006e-05, + "loss": 0.1593, + "step": 7414 + }, + { + "epoch": 2.7381831610044314, + "grad_norm": 0.235234335064888, + "learning_rate": 1.7465205074516565e-05, + "loss": 0.1491, + "step": 7415 + }, + { + "epoch": 2.738552437223043, + "grad_norm": 0.31407734751701355, + "learning_rate": 1.7440571498953073e-05, + "loss": 0.1556, + "step": 7416 + }, + { + "epoch": 2.7389217134416546, + "grad_norm": 0.21700848639011383, + "learning_rate": 1.741593792338958e-05, + "loss": 0.1522, + "step": 7417 + }, + { + "epoch": 2.7392909896602657, + "grad_norm": 0.28381839394569397, + "learning_rate": 1.739130434782609e-05, + "loss": 0.1596, + "step": 7418 + }, + { + "epoch": 2.7396602658788773, + "grad_norm": 0.2443346232175827, + "learning_rate": 1.7366670772262593e-05, + "loss": 0.144, + "step": 7419 + }, + { + "epoch": 2.740029542097489, + "grad_norm": 0.25192248821258545, + "learning_rate": 1.73420371966991e-05, + "loss": 0.153, + "step": 7420 + }, + { + "epoch": 2.7403988183161005, + "grad_norm": 0.23653534054756165, + "learning_rate": 1.731740362113561e-05, + "loss": 0.1378, + "step": 7421 + }, + { + "epoch": 2.7407680945347117, + "grad_norm": 0.26313599944114685, + "learning_rate": 1.7292770045572117e-05, + "loss": 0.1459, + "step": 7422 + }, + { + "epoch": 2.7411373707533233, + "grad_norm": 0.2989385426044464, + "learning_rate": 1.726813647000862e-05, + "loss": 0.1903, + "step": 7423 + }, + { + "epoch": 2.741506646971935, + "grad_norm": 0.27745380997657776, + "learning_rate": 1.724350289444513e-05, + "loss": 0.1666, + "step": 7424 + }, + { + "epoch": 2.7418759231905465, + "grad_norm": 0.2680248022079468, + "learning_rate": 1.7218869318881637e-05, + "loss": 0.1617, + "step": 7425 + }, + { + "epoch": 2.742245199409158, + "grad_norm": 0.2668180465698242, + "learning_rate": 1.7194235743318145e-05, + "loss": 0.1541, + "step": 7426 + }, + { + "epoch": 2.7426144756277697, + "grad_norm": 0.2783244848251343, + "learning_rate": 1.716960216775465e-05, + "loss": 0.1717, + "step": 7427 + }, + { + "epoch": 2.7429837518463813, + "grad_norm": 0.22534789144992828, + "learning_rate": 1.7144968592191157e-05, + "loss": 0.1495, + "step": 7428 + }, + { + "epoch": 2.7433530280649925, + "grad_norm": 0.30373477935791016, + "learning_rate": 1.7120335016627665e-05, + "loss": 0.1761, + "step": 7429 + }, + { + "epoch": 2.743722304283604, + "grad_norm": 0.2695143520832062, + "learning_rate": 1.7095701441064173e-05, + "loss": 0.1701, + "step": 7430 + }, + { + "epoch": 2.7440915805022157, + "grad_norm": 0.3140215575695038, + "learning_rate": 1.7071067865500677e-05, + "loss": 0.1786, + "step": 7431 + }, + { + "epoch": 2.7444608567208273, + "grad_norm": 0.32914185523986816, + "learning_rate": 1.7046434289937185e-05, + "loss": 0.1916, + "step": 7432 + }, + { + "epoch": 2.7448301329394384, + "grad_norm": 0.2648669481277466, + "learning_rate": 1.7021800714373693e-05, + "loss": 0.155, + "step": 7433 + }, + { + "epoch": 2.74519940915805, + "grad_norm": 0.24941708147525787, + "learning_rate": 1.69971671388102e-05, + "loss": 0.1535, + "step": 7434 + }, + { + "epoch": 2.7455686853766617, + "grad_norm": 0.2519848644733429, + "learning_rate": 1.6972533563246706e-05, + "loss": 0.1609, + "step": 7435 + }, + { + "epoch": 2.7459379615952733, + "grad_norm": 0.22162123024463654, + "learning_rate": 1.6947899987683213e-05, + "loss": 0.1385, + "step": 7436 + }, + { + "epoch": 2.746307237813885, + "grad_norm": 0.2708025574684143, + "learning_rate": 1.692326641211972e-05, + "loss": 0.1669, + "step": 7437 + }, + { + "epoch": 2.7466765140324965, + "grad_norm": 0.3346139192581177, + "learning_rate": 1.6898632836556226e-05, + "loss": 0.1871, + "step": 7438 + }, + { + "epoch": 2.747045790251108, + "grad_norm": 0.23187866806983948, + "learning_rate": 1.6873999260992734e-05, + "loss": 0.1506, + "step": 7439 + }, + { + "epoch": 2.7474150664697192, + "grad_norm": 0.27713683247566223, + "learning_rate": 1.684936568542924e-05, + "loss": 0.1745, + "step": 7440 + }, + { + "epoch": 2.747784342688331, + "grad_norm": 0.3212098479270935, + "learning_rate": 1.682473210986575e-05, + "loss": 0.1683, + "step": 7441 + }, + { + "epoch": 2.7481536189069424, + "grad_norm": 0.21507683396339417, + "learning_rate": 1.6800098534302254e-05, + "loss": 0.1427, + "step": 7442 + }, + { + "epoch": 2.748522895125554, + "grad_norm": 0.22883003950119019, + "learning_rate": 1.677546495873876e-05, + "loss": 0.1553, + "step": 7443 + }, + { + "epoch": 2.748892171344165, + "grad_norm": 0.28358742594718933, + "learning_rate": 1.675083138317527e-05, + "loss": 0.1686, + "step": 7444 + }, + { + "epoch": 2.749261447562777, + "grad_norm": 0.2905455231666565, + "learning_rate": 1.6726197807611777e-05, + "loss": 0.1597, + "step": 7445 + }, + { + "epoch": 2.7496307237813884, + "grad_norm": 0.29259276390075684, + "learning_rate": 1.6701564232048282e-05, + "loss": 0.1789, + "step": 7446 + }, + { + "epoch": 2.75, + "grad_norm": 0.23894624412059784, + "learning_rate": 1.667693065648479e-05, + "loss": 0.1802, + "step": 7447 + }, + { + "epoch": 2.7503692762186116, + "grad_norm": 0.2541247010231018, + "learning_rate": 1.6652297080921298e-05, + "loss": 0.1702, + "step": 7448 + }, + { + "epoch": 2.750738552437223, + "grad_norm": 0.27310457825660706, + "learning_rate": 1.6627663505357805e-05, + "loss": 0.1586, + "step": 7449 + }, + { + "epoch": 2.751107828655835, + "grad_norm": 0.4058380722999573, + "learning_rate": 1.660302992979431e-05, + "loss": 0.1712, + "step": 7450 + }, + { + "epoch": 2.751107828655835, + "eval_loss": 8.90359878540039, + "eval_runtime": 6.9024, + "eval_samples_per_second": 7.244, + "eval_steps_per_second": 1.014, + "step": 7450 + }, + { + "epoch": 2.751477104874446, + "grad_norm": 0.2598055899143219, + "learning_rate": 1.6578396354230818e-05, + "loss": 0.1465, + "step": 7451 + }, + { + "epoch": 2.7518463810930576, + "grad_norm": 0.28927722573280334, + "learning_rate": 1.6553762778667326e-05, + "loss": 0.1708, + "step": 7452 + }, + { + "epoch": 2.752215657311669, + "grad_norm": 0.28600090742111206, + "learning_rate": 1.6529129203103834e-05, + "loss": 0.1738, + "step": 7453 + }, + { + "epoch": 2.7525849335302808, + "grad_norm": 0.24343499541282654, + "learning_rate": 1.6504495627540338e-05, + "loss": 0.1507, + "step": 7454 + }, + { + "epoch": 2.752954209748892, + "grad_norm": 0.24369241297245026, + "learning_rate": 1.6479862051976846e-05, + "loss": 0.14, + "step": 7455 + }, + { + "epoch": 2.7533234859675035, + "grad_norm": 0.24829861521720886, + "learning_rate": 1.6455228476413354e-05, + "loss": 0.1663, + "step": 7456 + }, + { + "epoch": 2.753692762186115, + "grad_norm": 0.22016984224319458, + "learning_rate": 1.643059490084986e-05, + "loss": 0.1355, + "step": 7457 + }, + { + "epoch": 2.7540620384047267, + "grad_norm": 0.275783896446228, + "learning_rate": 1.6405961325286366e-05, + "loss": 0.1657, + "step": 7458 + }, + { + "epoch": 2.7544313146233383, + "grad_norm": 0.21881066262722015, + "learning_rate": 1.638132774972287e-05, + "loss": 0.1456, + "step": 7459 + }, + { + "epoch": 2.75480059084195, + "grad_norm": 0.29382163286209106, + "learning_rate": 1.635669417415938e-05, + "loss": 0.1425, + "step": 7460 + }, + { + "epoch": 2.7551698670605616, + "grad_norm": 0.2665386199951172, + "learning_rate": 1.6332060598595886e-05, + "loss": 0.1564, + "step": 7461 + }, + { + "epoch": 2.7555391432791727, + "grad_norm": 0.21323393285274506, + "learning_rate": 1.6307427023032394e-05, + "loss": 0.1558, + "step": 7462 + }, + { + "epoch": 2.7559084194977843, + "grad_norm": 0.279378205537796, + "learning_rate": 1.62827934474689e-05, + "loss": 0.1496, + "step": 7463 + }, + { + "epoch": 2.756277695716396, + "grad_norm": 0.2696584165096283, + "learning_rate": 1.6258159871905406e-05, + "loss": 0.1584, + "step": 7464 + }, + { + "epoch": 2.7566469719350075, + "grad_norm": 0.28755778074264526, + "learning_rate": 1.6233526296341914e-05, + "loss": 0.1703, + "step": 7465 + }, + { + "epoch": 2.7570162481536187, + "grad_norm": 0.2940944731235504, + "learning_rate": 1.6208892720778422e-05, + "loss": 0.1648, + "step": 7466 + }, + { + "epoch": 2.7573855243722303, + "grad_norm": 0.3109150826931, + "learning_rate": 1.6184259145214927e-05, + "loss": 0.1808, + "step": 7467 + }, + { + "epoch": 2.757754800590842, + "grad_norm": 0.23995424807071686, + "learning_rate": 1.6159625569651435e-05, + "loss": 0.1447, + "step": 7468 + }, + { + "epoch": 2.7581240768094535, + "grad_norm": 0.2656186521053314, + "learning_rate": 1.6134991994087942e-05, + "loss": 0.1734, + "step": 7469 + }, + { + "epoch": 2.758493353028065, + "grad_norm": 0.20248644053936005, + "learning_rate": 1.611035841852445e-05, + "loss": 0.1475, + "step": 7470 + }, + { + "epoch": 2.7588626292466767, + "grad_norm": 0.2579127252101898, + "learning_rate": 1.6085724842960955e-05, + "loss": 0.1571, + "step": 7471 + }, + { + "epoch": 2.7592319054652883, + "grad_norm": 0.2635478973388672, + "learning_rate": 1.6061091267397463e-05, + "loss": 0.1651, + "step": 7472 + }, + { + "epoch": 2.7596011816838995, + "grad_norm": 0.22819086909294128, + "learning_rate": 1.603645769183397e-05, + "loss": 0.1486, + "step": 7473 + }, + { + "epoch": 2.759970457902511, + "grad_norm": 0.24105340242385864, + "learning_rate": 1.601182411627048e-05, + "loss": 0.1632, + "step": 7474 + }, + { + "epoch": 2.7603397341211227, + "grad_norm": 0.27956318855285645, + "learning_rate": 1.5987190540706983e-05, + "loss": 0.1738, + "step": 7475 + }, + { + "epoch": 2.7607090103397343, + "grad_norm": 0.24938826262950897, + "learning_rate": 1.596255696514349e-05, + "loss": 0.1722, + "step": 7476 + }, + { + "epoch": 2.7610782865583454, + "grad_norm": 0.28471922874450684, + "learning_rate": 1.593792338958e-05, + "loss": 0.145, + "step": 7477 + }, + { + "epoch": 2.761447562776957, + "grad_norm": 0.19068174064159393, + "learning_rate": 1.5913289814016506e-05, + "loss": 0.1376, + "step": 7478 + }, + { + "epoch": 2.7618168389955686, + "grad_norm": 0.281215637922287, + "learning_rate": 1.588865623845301e-05, + "loss": 0.1602, + "step": 7479 + }, + { + "epoch": 2.7621861152141802, + "grad_norm": 0.2299923449754715, + "learning_rate": 1.586402266288952e-05, + "loss": 0.1442, + "step": 7480 + }, + { + "epoch": 2.762555391432792, + "grad_norm": 0.22439081966876984, + "learning_rate": 1.5839389087326027e-05, + "loss": 0.1473, + "step": 7481 + }, + { + "epoch": 2.7629246676514034, + "grad_norm": 0.30542123317718506, + "learning_rate": 1.5814755511762534e-05, + "loss": 0.1651, + "step": 7482 + }, + { + "epoch": 2.763293943870015, + "grad_norm": 0.27152472734451294, + "learning_rate": 1.579012193619904e-05, + "loss": 0.1531, + "step": 7483 + }, + { + "epoch": 2.763663220088626, + "grad_norm": 0.22861289978027344, + "learning_rate": 1.5765488360635547e-05, + "loss": 0.1334, + "step": 7484 + }, + { + "epoch": 2.764032496307238, + "grad_norm": 0.2674832344055176, + "learning_rate": 1.5740854785072055e-05, + "loss": 0.1783, + "step": 7485 + }, + { + "epoch": 2.7644017725258494, + "grad_norm": 0.2581583559513092, + "learning_rate": 1.5716221209508563e-05, + "loss": 0.1324, + "step": 7486 + }, + { + "epoch": 2.764771048744461, + "grad_norm": 0.24515534937381744, + "learning_rate": 1.5691587633945067e-05, + "loss": 0.1614, + "step": 7487 + }, + { + "epoch": 2.765140324963072, + "grad_norm": 0.2793174982070923, + "learning_rate": 1.5666954058381575e-05, + "loss": 0.1491, + "step": 7488 + }, + { + "epoch": 2.7655096011816838, + "grad_norm": 0.3175552785396576, + "learning_rate": 1.5642320482818083e-05, + "loss": 0.1667, + "step": 7489 + }, + { + "epoch": 2.7658788774002954, + "grad_norm": 0.22482416033744812, + "learning_rate": 1.561768690725459e-05, + "loss": 0.1472, + "step": 7490 + }, + { + "epoch": 2.766248153618907, + "grad_norm": 0.24928809702396393, + "learning_rate": 1.5593053331691095e-05, + "loss": 0.1657, + "step": 7491 + }, + { + "epoch": 2.7666174298375186, + "grad_norm": 0.21376118063926697, + "learning_rate": 1.5568419756127603e-05, + "loss": 0.1309, + "step": 7492 + }, + { + "epoch": 2.76698670605613, + "grad_norm": 0.26423296332359314, + "learning_rate": 1.554378618056411e-05, + "loss": 0.1543, + "step": 7493 + }, + { + "epoch": 2.7673559822747418, + "grad_norm": 0.2402375340461731, + "learning_rate": 1.5519152605000615e-05, + "loss": 0.1597, + "step": 7494 + }, + { + "epoch": 2.767725258493353, + "grad_norm": 0.24353481829166412, + "learning_rate": 1.5494519029437123e-05, + "loss": 0.1468, + "step": 7495 + }, + { + "epoch": 2.7680945347119645, + "grad_norm": 0.24683569371700287, + "learning_rate": 1.546988545387363e-05, + "loss": 0.1388, + "step": 7496 + }, + { + "epoch": 2.768463810930576, + "grad_norm": 0.25189974904060364, + "learning_rate": 1.544525187831014e-05, + "loss": 0.1477, + "step": 7497 + }, + { + "epoch": 2.7688330871491877, + "grad_norm": 0.26379960775375366, + "learning_rate": 1.5420618302746643e-05, + "loss": 0.1533, + "step": 7498 + }, + { + "epoch": 2.769202363367799, + "grad_norm": 0.3107115924358368, + "learning_rate": 1.539598472718315e-05, + "loss": 0.1812, + "step": 7499 + }, + { + "epoch": 2.7695716395864105, + "grad_norm": 0.3180251121520996, + "learning_rate": 1.537135115161966e-05, + "loss": 0.1635, + "step": 7500 + }, + { + "epoch": 2.7695716395864105, + "eval_loss": 8.895413398742676, + "eval_runtime": 6.9682, + "eval_samples_per_second": 7.175, + "eval_steps_per_second": 1.005, + "step": 7500 + }, + { + "epoch": 2.769940915805022, + "grad_norm": 0.29091599583625793, + "learning_rate": 1.5346717576056167e-05, + "loss": 0.1722, + "step": 7501 + }, + { + "epoch": 2.7703101920236337, + "grad_norm": 0.3242517411708832, + "learning_rate": 1.532208400049267e-05, + "loss": 0.1701, + "step": 7502 + }, + { + "epoch": 2.7706794682422453, + "grad_norm": 0.24117054045200348, + "learning_rate": 1.529745042492918e-05, + "loss": 0.1482, + "step": 7503 + }, + { + "epoch": 2.771048744460857, + "grad_norm": 0.24702662229537964, + "learning_rate": 1.5272816849365687e-05, + "loss": 0.1685, + "step": 7504 + }, + { + "epoch": 2.7714180206794685, + "grad_norm": 0.3111673891544342, + "learning_rate": 1.5248183273802193e-05, + "loss": 0.1769, + "step": 7505 + }, + { + "epoch": 2.7717872968980797, + "grad_norm": 0.2924559712409973, + "learning_rate": 1.5223549698238701e-05, + "loss": 0.1576, + "step": 7506 + }, + { + "epoch": 2.7721565731166913, + "grad_norm": 0.2647545039653778, + "learning_rate": 1.5198916122675207e-05, + "loss": 0.1661, + "step": 7507 + }, + { + "epoch": 2.772525849335303, + "grad_norm": 0.2691461741924286, + "learning_rate": 1.5174282547111715e-05, + "loss": 0.1688, + "step": 7508 + }, + { + "epoch": 2.7728951255539145, + "grad_norm": 0.29338595271110535, + "learning_rate": 1.5149648971548221e-05, + "loss": 0.186, + "step": 7509 + }, + { + "epoch": 2.7732644017725256, + "grad_norm": 0.2630934417247772, + "learning_rate": 1.512501539598473e-05, + "loss": 0.1539, + "step": 7510 + }, + { + "epoch": 2.7736336779911372, + "grad_norm": 0.22969433665275574, + "learning_rate": 1.5100381820421235e-05, + "loss": 0.1365, + "step": 7511 + }, + { + "epoch": 2.774002954209749, + "grad_norm": 0.2502608895301819, + "learning_rate": 1.5075748244857743e-05, + "loss": 0.1395, + "step": 7512 + }, + { + "epoch": 2.7743722304283605, + "grad_norm": 0.24753591418266296, + "learning_rate": 1.505111466929425e-05, + "loss": 0.143, + "step": 7513 + }, + { + "epoch": 2.774741506646972, + "grad_norm": 0.28611332178115845, + "learning_rate": 1.5026481093730757e-05, + "loss": 0.1826, + "step": 7514 + }, + { + "epoch": 2.7751107828655837, + "grad_norm": 0.267416387796402, + "learning_rate": 1.5001847518167263e-05, + "loss": 0.1699, + "step": 7515 + }, + { + "epoch": 2.775480059084195, + "grad_norm": 0.2549743056297302, + "learning_rate": 1.4977213942603768e-05, + "loss": 0.1614, + "step": 7516 + }, + { + "epoch": 2.7758493353028064, + "grad_norm": 0.2738191485404968, + "learning_rate": 1.4952580367040276e-05, + "loss": 0.1535, + "step": 7517 + }, + { + "epoch": 2.776218611521418, + "grad_norm": 0.28258010745048523, + "learning_rate": 1.4927946791476782e-05, + "loss": 0.1584, + "step": 7518 + }, + { + "epoch": 2.7765878877400296, + "grad_norm": 0.23524357378482819, + "learning_rate": 1.490331321591329e-05, + "loss": 0.1421, + "step": 7519 + }, + { + "epoch": 2.7769571639586412, + "grad_norm": 0.24090488255023956, + "learning_rate": 1.4878679640349796e-05, + "loss": 0.1556, + "step": 7520 + }, + { + "epoch": 2.7773264401772524, + "grad_norm": 0.26074302196502686, + "learning_rate": 1.4854046064786304e-05, + "loss": 0.1907, + "step": 7521 + }, + { + "epoch": 2.777695716395864, + "grad_norm": 0.3227173089981079, + "learning_rate": 1.482941248922281e-05, + "loss": 0.1656, + "step": 7522 + }, + { + "epoch": 2.7780649926144756, + "grad_norm": 0.23194463551044464, + "learning_rate": 1.4804778913659318e-05, + "loss": 0.1542, + "step": 7523 + }, + { + "epoch": 2.778434268833087, + "grad_norm": 0.1940666288137436, + "learning_rate": 1.4780145338095824e-05, + "loss": 0.1226, + "step": 7524 + }, + { + "epoch": 2.778803545051699, + "grad_norm": 0.22734761238098145, + "learning_rate": 1.4755511762532332e-05, + "loss": 0.1373, + "step": 7525 + }, + { + "epoch": 2.7791728212703104, + "grad_norm": 0.2619820833206177, + "learning_rate": 1.4730878186968838e-05, + "loss": 0.1389, + "step": 7526 + }, + { + "epoch": 2.7795420974889216, + "grad_norm": 0.27534186840057373, + "learning_rate": 1.4706244611405346e-05, + "loss": 0.176, + "step": 7527 + }, + { + "epoch": 2.779911373707533, + "grad_norm": 0.21473518013954163, + "learning_rate": 1.4681611035841852e-05, + "loss": 0.1333, + "step": 7528 + }, + { + "epoch": 2.7802806499261448, + "grad_norm": 0.2728576064109802, + "learning_rate": 1.465697746027836e-05, + "loss": 0.1648, + "step": 7529 + }, + { + "epoch": 2.7806499261447564, + "grad_norm": 0.22558070719242096, + "learning_rate": 1.4632343884714866e-05, + "loss": 0.1514, + "step": 7530 + }, + { + "epoch": 2.7810192023633675, + "grad_norm": 0.27340826392173767, + "learning_rate": 1.4607710309151374e-05, + "loss": 0.1719, + "step": 7531 + }, + { + "epoch": 2.781388478581979, + "grad_norm": 0.2483701854944229, + "learning_rate": 1.458307673358788e-05, + "loss": 0.1603, + "step": 7532 + }, + { + "epoch": 2.7817577548005907, + "grad_norm": 0.2842472195625305, + "learning_rate": 1.4558443158024388e-05, + "loss": 0.1632, + "step": 7533 + }, + { + "epoch": 2.7821270310192023, + "grad_norm": 0.2855534851551056, + "learning_rate": 1.4533809582460894e-05, + "loss": 0.1881, + "step": 7534 + }, + { + "epoch": 2.782496307237814, + "grad_norm": 0.27395233511924744, + "learning_rate": 1.4509176006897402e-05, + "loss": 0.1422, + "step": 7535 + }, + { + "epoch": 2.7828655834564255, + "grad_norm": 0.2964867949485779, + "learning_rate": 1.4484542431333908e-05, + "loss": 0.191, + "step": 7536 + }, + { + "epoch": 2.783234859675037, + "grad_norm": 0.19931849837303162, + "learning_rate": 1.4459908855770416e-05, + "loss": 0.1231, + "step": 7537 + }, + { + "epoch": 2.7836041358936483, + "grad_norm": 0.273821622133255, + "learning_rate": 1.4435275280206922e-05, + "loss": 0.1589, + "step": 7538 + }, + { + "epoch": 2.78397341211226, + "grad_norm": 0.250991553068161, + "learning_rate": 1.441064170464343e-05, + "loss": 0.1543, + "step": 7539 + }, + { + "epoch": 2.7843426883308715, + "grad_norm": 0.20151562988758087, + "learning_rate": 1.4386008129079936e-05, + "loss": 0.1347, + "step": 7540 + }, + { + "epoch": 2.784711964549483, + "grad_norm": 0.24669502675533295, + "learning_rate": 1.4361374553516444e-05, + "loss": 0.1578, + "step": 7541 + }, + { + "epoch": 2.7850812407680943, + "grad_norm": 0.33126693964004517, + "learning_rate": 1.433674097795295e-05, + "loss": 0.1631, + "step": 7542 + }, + { + "epoch": 2.785450516986706, + "grad_norm": 0.24954812228679657, + "learning_rate": 1.4312107402389458e-05, + "loss": 0.1578, + "step": 7543 + }, + { + "epoch": 2.7858197932053175, + "grad_norm": 0.27492791414260864, + "learning_rate": 1.4287473826825964e-05, + "loss": 0.1667, + "step": 7544 + }, + { + "epoch": 2.786189069423929, + "grad_norm": 0.28509747982025146, + "learning_rate": 1.4262840251262472e-05, + "loss": 0.1542, + "step": 7545 + }, + { + "epoch": 2.7865583456425407, + "grad_norm": 0.28552940487861633, + "learning_rate": 1.4238206675698978e-05, + "loss": 0.156, + "step": 7546 + }, + { + "epoch": 2.7869276218611523, + "grad_norm": 0.2524304986000061, + "learning_rate": 1.4213573100135486e-05, + "loss": 0.1674, + "step": 7547 + }, + { + "epoch": 2.787296898079764, + "grad_norm": 0.22559098899364471, + "learning_rate": 1.4188939524571992e-05, + "loss": 0.1383, + "step": 7548 + }, + { + "epoch": 2.787666174298375, + "grad_norm": 0.23759859800338745, + "learning_rate": 1.41643059490085e-05, + "loss": 0.1712, + "step": 7549 + }, + { + "epoch": 2.7880354505169866, + "grad_norm": 0.25347110629081726, + "learning_rate": 1.4139672373445007e-05, + "loss": 0.1598, + "step": 7550 + }, + { + "epoch": 2.7880354505169866, + "eval_loss": 8.959206581115723, + "eval_runtime": 6.9082, + "eval_samples_per_second": 7.238, + "eval_steps_per_second": 1.013, + "step": 7550 + }, + { + "epoch": 2.7884047267355982, + "grad_norm": 0.19091956317424774, + "learning_rate": 1.4115038797881513e-05, + "loss": 0.1324, + "step": 7551 + }, + { + "epoch": 2.78877400295421, + "grad_norm": 0.28404998779296875, + "learning_rate": 1.409040522231802e-05, + "loss": 0.1643, + "step": 7552 + }, + { + "epoch": 2.789143279172821, + "grad_norm": 0.23156441748142242, + "learning_rate": 1.4065771646754527e-05, + "loss": 0.1325, + "step": 7553 + }, + { + "epoch": 2.7895125553914326, + "grad_norm": 0.27524295449256897, + "learning_rate": 1.4041138071191035e-05, + "loss": 0.1605, + "step": 7554 + }, + { + "epoch": 2.789881831610044, + "grad_norm": 0.2332834154367447, + "learning_rate": 1.401650449562754e-05, + "loss": 0.141, + "step": 7555 + }, + { + "epoch": 2.790251107828656, + "grad_norm": 0.2644990086555481, + "learning_rate": 1.3991870920064049e-05, + "loss": 0.1847, + "step": 7556 + }, + { + "epoch": 2.7906203840472674, + "grad_norm": 0.2668643593788147, + "learning_rate": 1.3967237344500555e-05, + "loss": 0.1522, + "step": 7557 + }, + { + "epoch": 2.790989660265879, + "grad_norm": 0.27474069595336914, + "learning_rate": 1.3942603768937063e-05, + "loss": 0.1629, + "step": 7558 + }, + { + "epoch": 2.7913589364844906, + "grad_norm": 0.2673705220222473, + "learning_rate": 1.3917970193373569e-05, + "loss": 0.2066, + "step": 7559 + }, + { + "epoch": 2.791728212703102, + "grad_norm": 0.25657036900520325, + "learning_rate": 1.3893336617810077e-05, + "loss": 0.1429, + "step": 7560 + }, + { + "epoch": 2.7920974889217134, + "grad_norm": 0.24279601871967316, + "learning_rate": 1.3868703042246583e-05, + "loss": 0.1436, + "step": 7561 + }, + { + "epoch": 2.792466765140325, + "grad_norm": 0.28260916471481323, + "learning_rate": 1.384406946668309e-05, + "loss": 0.1696, + "step": 7562 + }, + { + "epoch": 2.7928360413589366, + "grad_norm": 0.25003188848495483, + "learning_rate": 1.3819435891119597e-05, + "loss": 0.1603, + "step": 7563 + }, + { + "epoch": 2.7932053175775478, + "grad_norm": 0.2936260402202606, + "learning_rate": 1.3794802315556105e-05, + "loss": 0.1833, + "step": 7564 + }, + { + "epoch": 2.7935745937961594, + "grad_norm": 0.24306906759738922, + "learning_rate": 1.3770168739992611e-05, + "loss": 0.1435, + "step": 7565 + }, + { + "epoch": 2.793943870014771, + "grad_norm": 0.22188471257686615, + "learning_rate": 1.3745535164429119e-05, + "loss": 0.1268, + "step": 7566 + }, + { + "epoch": 2.7943131462333826, + "grad_norm": 0.24690088629722595, + "learning_rate": 1.3720901588865625e-05, + "loss": 0.1574, + "step": 7567 + }, + { + "epoch": 2.794682422451994, + "grad_norm": 0.2431536763906479, + "learning_rate": 1.3696268013302133e-05, + "loss": 0.1419, + "step": 7568 + }, + { + "epoch": 2.7950516986706058, + "grad_norm": 0.32719650864601135, + "learning_rate": 1.3671634437738639e-05, + "loss": 0.177, + "step": 7569 + }, + { + "epoch": 2.7954209748892174, + "grad_norm": 0.22930581867694855, + "learning_rate": 1.3647000862175147e-05, + "loss": 0.1479, + "step": 7570 + }, + { + "epoch": 2.7957902511078285, + "grad_norm": 0.24811190366744995, + "learning_rate": 1.3622367286611653e-05, + "loss": 0.1412, + "step": 7571 + }, + { + "epoch": 2.79615952732644, + "grad_norm": 0.2541557848453522, + "learning_rate": 1.3597733711048161e-05, + "loss": 0.1527, + "step": 7572 + }, + { + "epoch": 2.7965288035450517, + "grad_norm": 0.3019953668117523, + "learning_rate": 1.3573100135484667e-05, + "loss": 0.1783, + "step": 7573 + }, + { + "epoch": 2.7968980797636633, + "grad_norm": 0.22698678076267242, + "learning_rate": 1.3548466559921172e-05, + "loss": 0.1481, + "step": 7574 + }, + { + "epoch": 2.7972673559822745, + "grad_norm": 0.2314881980419159, + "learning_rate": 1.352383298435768e-05, + "loss": 0.1365, + "step": 7575 + }, + { + "epoch": 2.797636632200886, + "grad_norm": 0.26017892360687256, + "learning_rate": 1.3499199408794186e-05, + "loss": 0.1659, + "step": 7576 + }, + { + "epoch": 2.7980059084194977, + "grad_norm": 0.28461870551109314, + "learning_rate": 1.3474565833230693e-05, + "loss": 0.1666, + "step": 7577 + }, + { + "epoch": 2.7983751846381093, + "grad_norm": 0.2899555563926697, + "learning_rate": 1.34499322576672e-05, + "loss": 0.1749, + "step": 7578 + }, + { + "epoch": 2.798744460856721, + "grad_norm": 0.25253763794898987, + "learning_rate": 1.3425298682103707e-05, + "loss": 0.1325, + "step": 7579 + }, + { + "epoch": 2.7991137370753325, + "grad_norm": 0.3199283182621002, + "learning_rate": 1.3400665106540214e-05, + "loss": 0.1669, + "step": 7580 + }, + { + "epoch": 2.799483013293944, + "grad_norm": 0.2520403265953064, + "learning_rate": 1.3376031530976721e-05, + "loss": 0.1596, + "step": 7581 + }, + { + "epoch": 2.7998522895125553, + "grad_norm": 0.23610898852348328, + "learning_rate": 1.3351397955413228e-05, + "loss": 0.1573, + "step": 7582 + }, + { + "epoch": 2.800221565731167, + "grad_norm": 0.30460333824157715, + "learning_rate": 1.3326764379849736e-05, + "loss": 0.1375, + "step": 7583 + }, + { + "epoch": 2.8005908419497785, + "grad_norm": 0.29375678300857544, + "learning_rate": 1.3302130804286242e-05, + "loss": 0.1702, + "step": 7584 + }, + { + "epoch": 2.80096011816839, + "grad_norm": 0.289198100566864, + "learning_rate": 1.327749722872275e-05, + "loss": 0.1843, + "step": 7585 + }, + { + "epoch": 2.8013293943870012, + "grad_norm": 0.26423829793930054, + "learning_rate": 1.3252863653159256e-05, + "loss": 0.1619, + "step": 7586 + }, + { + "epoch": 2.801698670605613, + "grad_norm": 0.2893339693546295, + "learning_rate": 1.3228230077595764e-05, + "loss": 0.1648, + "step": 7587 + }, + { + "epoch": 2.8020679468242244, + "grad_norm": 0.2697225511074066, + "learning_rate": 1.320359650203227e-05, + "loss": 0.152, + "step": 7588 + }, + { + "epoch": 2.802437223042836, + "grad_norm": 0.21412163972854614, + "learning_rate": 1.3178962926468778e-05, + "loss": 0.1298, + "step": 7589 + }, + { + "epoch": 2.8028064992614476, + "grad_norm": 0.24301163852214813, + "learning_rate": 1.3154329350905284e-05, + "loss": 0.1562, + "step": 7590 + }, + { + "epoch": 2.8031757754800593, + "grad_norm": 0.24034807085990906, + "learning_rate": 1.3129695775341792e-05, + "loss": 0.1321, + "step": 7591 + }, + { + "epoch": 2.803545051698671, + "grad_norm": 0.2885459065437317, + "learning_rate": 1.3105062199778298e-05, + "loss": 0.1941, + "step": 7592 + }, + { + "epoch": 2.803914327917282, + "grad_norm": 0.27063921093940735, + "learning_rate": 1.3080428624214806e-05, + "loss": 0.1365, + "step": 7593 + }, + { + "epoch": 2.8042836041358936, + "grad_norm": 0.25682809948921204, + "learning_rate": 1.3055795048651312e-05, + "loss": 0.168, + "step": 7594 + }, + { + "epoch": 2.804652880354505, + "grad_norm": 0.27599695324897766, + "learning_rate": 1.303116147308782e-05, + "loss": 0.1659, + "step": 7595 + }, + { + "epoch": 2.805022156573117, + "grad_norm": 0.27911749482154846, + "learning_rate": 1.3006527897524326e-05, + "loss": 0.1585, + "step": 7596 + }, + { + "epoch": 2.805391432791728, + "grad_norm": 0.22856685519218445, + "learning_rate": 1.2981894321960834e-05, + "loss": 0.1528, + "step": 7597 + }, + { + "epoch": 2.8057607090103396, + "grad_norm": 0.2999403178691864, + "learning_rate": 1.295726074639734e-05, + "loss": 0.1622, + "step": 7598 + }, + { + "epoch": 2.806129985228951, + "grad_norm": 0.22871984541416168, + "learning_rate": 1.2932627170833848e-05, + "loss": 0.134, + "step": 7599 + }, + { + "epoch": 2.806499261447563, + "grad_norm": 0.24820372462272644, + "learning_rate": 1.2907993595270354e-05, + "loss": 0.1537, + "step": 7600 + }, + { + "epoch": 2.806499261447563, + "eval_loss": 8.949795722961426, + "eval_runtime": 6.8982, + "eval_samples_per_second": 7.248, + "eval_steps_per_second": 1.015, + "step": 7600 + }, + { + "epoch": 2.8068685376661744, + "grad_norm": 0.2312173694372177, + "learning_rate": 1.2883360019706862e-05, + "loss": 0.1406, + "step": 7601 + }, + { + "epoch": 2.807237813884786, + "grad_norm": 0.2928660213947296, + "learning_rate": 1.2858726444143368e-05, + "loss": 0.1516, + "step": 7602 + }, + { + "epoch": 2.8076070901033976, + "grad_norm": 0.23433832824230194, + "learning_rate": 1.2834092868579876e-05, + "loss": 0.1393, + "step": 7603 + }, + { + "epoch": 2.8079763663220088, + "grad_norm": 0.31710928678512573, + "learning_rate": 1.2809459293016382e-05, + "loss": 0.1846, + "step": 7604 + }, + { + "epoch": 2.8083456425406204, + "grad_norm": 0.23491697013378143, + "learning_rate": 1.278482571745289e-05, + "loss": 0.1429, + "step": 7605 + }, + { + "epoch": 2.808714918759232, + "grad_norm": 0.3791598081588745, + "learning_rate": 1.2760192141889396e-05, + "loss": 0.1685, + "step": 7606 + }, + { + "epoch": 2.8090841949778436, + "grad_norm": 0.29887065291404724, + "learning_rate": 1.2735558566325904e-05, + "loss": 0.1727, + "step": 7607 + }, + { + "epoch": 2.8094534711964547, + "grad_norm": 0.2430477738380432, + "learning_rate": 1.271092499076241e-05, + "loss": 0.1747, + "step": 7608 + }, + { + "epoch": 2.8098227474150663, + "grad_norm": 0.2095842957496643, + "learning_rate": 1.2686291415198916e-05, + "loss": 0.142, + "step": 7609 + }, + { + "epoch": 2.810192023633678, + "grad_norm": 0.3249691426753998, + "learning_rate": 1.2661657839635424e-05, + "loss": 0.1683, + "step": 7610 + }, + { + "epoch": 2.8105612998522895, + "grad_norm": 0.27750352025032043, + "learning_rate": 1.263702426407193e-05, + "loss": 0.1596, + "step": 7611 + }, + { + "epoch": 2.810930576070901, + "grad_norm": 0.30263251066207886, + "learning_rate": 1.2612390688508438e-05, + "loss": 0.1641, + "step": 7612 + }, + { + "epoch": 2.8112998522895127, + "grad_norm": 0.3112315833568573, + "learning_rate": 1.2587757112944944e-05, + "loss": 0.2159, + "step": 7613 + }, + { + "epoch": 2.8116691285081243, + "grad_norm": 0.25650346279144287, + "learning_rate": 1.2563123537381452e-05, + "loss": 0.1707, + "step": 7614 + }, + { + "epoch": 2.8120384047267355, + "grad_norm": 0.2324189394712448, + "learning_rate": 1.2538489961817958e-05, + "loss": 0.1448, + "step": 7615 + }, + { + "epoch": 2.812407680945347, + "grad_norm": 0.27498534321784973, + "learning_rate": 1.2513856386254466e-05, + "loss": 0.1496, + "step": 7616 + }, + { + "epoch": 2.8127769571639587, + "grad_norm": 0.3623496890068054, + "learning_rate": 1.2489222810690972e-05, + "loss": 0.1721, + "step": 7617 + }, + { + "epoch": 2.8131462333825703, + "grad_norm": 0.2953372597694397, + "learning_rate": 1.2464589235127479e-05, + "loss": 0.1975, + "step": 7618 + }, + { + "epoch": 2.8135155096011815, + "grad_norm": 0.21967244148254395, + "learning_rate": 1.2439955659563986e-05, + "loss": 0.13, + "step": 7619 + }, + { + "epoch": 2.813884785819793, + "grad_norm": 0.28372862935066223, + "learning_rate": 1.2415322084000493e-05, + "loss": 0.157, + "step": 7620 + }, + { + "epoch": 2.8142540620384047, + "grad_norm": 0.234775111079216, + "learning_rate": 1.2390688508437e-05, + "loss": 0.1346, + "step": 7621 + }, + { + "epoch": 2.8146233382570163, + "grad_norm": 0.3067571818828583, + "learning_rate": 1.2366054932873507e-05, + "loss": 0.174, + "step": 7622 + }, + { + "epoch": 2.814992614475628, + "grad_norm": 0.21647082269191742, + "learning_rate": 1.2341421357310014e-05, + "loss": 0.1506, + "step": 7623 + }, + { + "epoch": 2.8153618906942395, + "grad_norm": 0.2957089841365814, + "learning_rate": 1.231678778174652e-05, + "loss": 0.1853, + "step": 7624 + }, + { + "epoch": 2.815731166912851, + "grad_norm": 0.23774920403957367, + "learning_rate": 1.2292154206183028e-05, + "loss": 0.1412, + "step": 7625 + }, + { + "epoch": 2.8161004431314622, + "grad_norm": 0.25511863827705383, + "learning_rate": 1.2267520630619535e-05, + "loss": 0.1455, + "step": 7626 + }, + { + "epoch": 2.816469719350074, + "grad_norm": 0.22340211272239685, + "learning_rate": 1.2242887055056043e-05, + "loss": 0.148, + "step": 7627 + }, + { + "epoch": 2.8168389955686854, + "grad_norm": 0.23858360946178436, + "learning_rate": 1.2218253479492549e-05, + "loss": 0.1323, + "step": 7628 + }, + { + "epoch": 2.817208271787297, + "grad_norm": 0.22605356574058533, + "learning_rate": 1.2193619903929057e-05, + "loss": 0.1595, + "step": 7629 + }, + { + "epoch": 2.817577548005908, + "grad_norm": 0.2945128381252289, + "learning_rate": 1.2168986328365563e-05, + "loss": 0.147, + "step": 7630 + }, + { + "epoch": 2.81794682422452, + "grad_norm": 0.24200816452503204, + "learning_rate": 1.214435275280207e-05, + "loss": 0.1503, + "step": 7631 + }, + { + "epoch": 2.8183161004431314, + "grad_norm": 0.2188728302717209, + "learning_rate": 1.2119719177238577e-05, + "loss": 0.133, + "step": 7632 + }, + { + "epoch": 2.818685376661743, + "grad_norm": 0.2998740077018738, + "learning_rate": 1.2095085601675085e-05, + "loss": 0.1774, + "step": 7633 + }, + { + "epoch": 2.8190546528803546, + "grad_norm": 0.2521224617958069, + "learning_rate": 1.207045202611159e-05, + "loss": 0.167, + "step": 7634 + }, + { + "epoch": 2.819423929098966, + "grad_norm": 0.23806288838386536, + "learning_rate": 1.2045818450548099e-05, + "loss": 0.1465, + "step": 7635 + }, + { + "epoch": 2.819793205317578, + "grad_norm": 0.2919353246688843, + "learning_rate": 1.2021184874984605e-05, + "loss": 0.1664, + "step": 7636 + }, + { + "epoch": 2.820162481536189, + "grad_norm": 0.2923643887042999, + "learning_rate": 1.1996551299421111e-05, + "loss": 0.1751, + "step": 7637 + }, + { + "epoch": 2.8205317577548006, + "grad_norm": 0.30557382106781006, + "learning_rate": 1.1971917723857619e-05, + "loss": 0.1847, + "step": 7638 + }, + { + "epoch": 2.820901033973412, + "grad_norm": 0.20320585370063782, + "learning_rate": 1.1947284148294125e-05, + "loss": 0.1428, + "step": 7639 + }, + { + "epoch": 2.821270310192024, + "grad_norm": 0.35118481516838074, + "learning_rate": 1.1922650572730633e-05, + "loss": 0.191, + "step": 7640 + }, + { + "epoch": 2.821639586410635, + "grad_norm": 0.23418359458446503, + "learning_rate": 1.1898016997167139e-05, + "loss": 0.1303, + "step": 7641 + }, + { + "epoch": 2.8220088626292466, + "grad_norm": 0.3023289442062378, + "learning_rate": 1.1873383421603647e-05, + "loss": 0.1618, + "step": 7642 + }, + { + "epoch": 2.822378138847858, + "grad_norm": 0.24761348962783813, + "learning_rate": 1.1848749846040153e-05, + "loss": 0.1569, + "step": 7643 + }, + { + "epoch": 2.8227474150664698, + "grad_norm": 0.2871807813644409, + "learning_rate": 1.1824116270476661e-05, + "loss": 0.1607, + "step": 7644 + }, + { + "epoch": 2.8231166912850814, + "grad_norm": 0.27608802914619446, + "learning_rate": 1.1799482694913167e-05, + "loss": 0.138, + "step": 7645 + }, + { + "epoch": 2.823485967503693, + "grad_norm": 0.28925570845603943, + "learning_rate": 1.1774849119349673e-05, + "loss": 0.1784, + "step": 7646 + }, + { + "epoch": 2.823855243722304, + "grad_norm": 0.25159335136413574, + "learning_rate": 1.1750215543786181e-05, + "loss": 0.1556, + "step": 7647 + }, + { + "epoch": 2.8242245199409157, + "grad_norm": 0.24042898416519165, + "learning_rate": 1.1725581968222687e-05, + "loss": 0.1539, + "step": 7648 + }, + { + "epoch": 2.8245937961595273, + "grad_norm": 0.26414334774017334, + "learning_rate": 1.1700948392659195e-05, + "loss": 0.1654, + "step": 7649 + }, + { + "epoch": 2.824963072378139, + "grad_norm": 0.24918536841869354, + "learning_rate": 1.1676314817095701e-05, + "loss": 0.1607, + "step": 7650 + }, + { + "epoch": 2.824963072378139, + "eval_loss": 8.934947967529297, + "eval_runtime": 6.9089, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 1.013, + "step": 7650 + }, + { + "epoch": 2.8253323485967505, + "grad_norm": 0.25667792558670044, + "learning_rate": 1.165168124153221e-05, + "loss": 0.1593, + "step": 7651 + }, + { + "epoch": 2.8257016248153617, + "grad_norm": 0.2609187066555023, + "learning_rate": 1.1627047665968715e-05, + "loss": 0.1469, + "step": 7652 + }, + { + "epoch": 2.8260709010339733, + "grad_norm": 0.21854522824287415, + "learning_rate": 1.1602414090405223e-05, + "loss": 0.1412, + "step": 7653 + }, + { + "epoch": 2.826440177252585, + "grad_norm": 0.25824499130249023, + "learning_rate": 1.157778051484173e-05, + "loss": 0.151, + "step": 7654 + }, + { + "epoch": 2.8268094534711965, + "grad_norm": 0.3034045994281769, + "learning_rate": 1.1553146939278237e-05, + "loss": 0.1771, + "step": 7655 + }, + { + "epoch": 2.827178729689808, + "grad_norm": 0.2368393987417221, + "learning_rate": 1.1528513363714743e-05, + "loss": 0.1284, + "step": 7656 + }, + { + "epoch": 2.8275480059084197, + "grad_norm": 0.2218867540359497, + "learning_rate": 1.1503879788151251e-05, + "loss": 0.1294, + "step": 7657 + }, + { + "epoch": 2.827917282127031, + "grad_norm": 0.2873745262622833, + "learning_rate": 1.1479246212587757e-05, + "loss": 0.1812, + "step": 7658 + }, + { + "epoch": 2.8282865583456425, + "grad_norm": 0.2978823184967041, + "learning_rate": 1.1454612637024265e-05, + "loss": 0.155, + "step": 7659 + }, + { + "epoch": 2.828655834564254, + "grad_norm": 0.2622617781162262, + "learning_rate": 1.1429979061460772e-05, + "loss": 0.1628, + "step": 7660 + }, + { + "epoch": 2.8290251107828657, + "grad_norm": 0.29369959235191345, + "learning_rate": 1.140534548589728e-05, + "loss": 0.1902, + "step": 7661 + }, + { + "epoch": 2.829394387001477, + "grad_norm": 0.2877872884273529, + "learning_rate": 1.1380711910333786e-05, + "loss": 0.1526, + "step": 7662 + }, + { + "epoch": 2.8297636632200884, + "grad_norm": 0.2491106539964676, + "learning_rate": 1.1356078334770293e-05, + "loss": 0.1604, + "step": 7663 + }, + { + "epoch": 2.8301329394387, + "grad_norm": 0.27732059359550476, + "learning_rate": 1.13314447592068e-05, + "loss": 0.1542, + "step": 7664 + }, + { + "epoch": 2.8305022156573116, + "grad_norm": 0.2297486513853073, + "learning_rate": 1.1306811183643306e-05, + "loss": 0.1549, + "step": 7665 + }, + { + "epoch": 2.8308714918759232, + "grad_norm": 0.23846516013145447, + "learning_rate": 1.1282177608079814e-05, + "loss": 0.1485, + "step": 7666 + }, + { + "epoch": 2.831240768094535, + "grad_norm": 0.23883701860904694, + "learning_rate": 1.125754403251632e-05, + "loss": 0.1483, + "step": 7667 + }, + { + "epoch": 2.8316100443131464, + "grad_norm": 0.22844117879867554, + "learning_rate": 1.1232910456952828e-05, + "loss": 0.1319, + "step": 7668 + }, + { + "epoch": 2.8319793205317576, + "grad_norm": 0.2871289551258087, + "learning_rate": 1.1208276881389334e-05, + "loss": 0.1616, + "step": 7669 + }, + { + "epoch": 2.832348596750369, + "grad_norm": 0.2913699448108673, + "learning_rate": 1.1183643305825842e-05, + "loss": 0.1683, + "step": 7670 + }, + { + "epoch": 2.832717872968981, + "grad_norm": 0.23114582896232605, + "learning_rate": 1.1159009730262348e-05, + "loss": 0.1444, + "step": 7671 + }, + { + "epoch": 2.8330871491875924, + "grad_norm": 0.3411503732204437, + "learning_rate": 1.1134376154698856e-05, + "loss": 0.1628, + "step": 7672 + }, + { + "epoch": 2.8334564254062036, + "grad_norm": 0.2649197280406952, + "learning_rate": 1.1109742579135362e-05, + "loss": 0.1827, + "step": 7673 + }, + { + "epoch": 2.833825701624815, + "grad_norm": 0.22040055692195892, + "learning_rate": 1.108510900357187e-05, + "loss": 0.1279, + "step": 7674 + }, + { + "epoch": 2.8341949778434268, + "grad_norm": 0.28520432114601135, + "learning_rate": 1.1060475428008376e-05, + "loss": 0.1699, + "step": 7675 + }, + { + "epoch": 2.8345642540620384, + "grad_norm": 0.24498812854290009, + "learning_rate": 1.1035841852444882e-05, + "loss": 0.1668, + "step": 7676 + }, + { + "epoch": 2.83493353028065, + "grad_norm": 0.24487565457820892, + "learning_rate": 1.101120827688139e-05, + "loss": 0.1465, + "step": 7677 + }, + { + "epoch": 2.8353028064992616, + "grad_norm": 0.22076864540576935, + "learning_rate": 1.0986574701317896e-05, + "loss": 0.1415, + "step": 7678 + }, + { + "epoch": 2.835672082717873, + "grad_norm": 0.31694111227989197, + "learning_rate": 1.0961941125754404e-05, + "loss": 0.1659, + "step": 7679 + }, + { + "epoch": 2.8360413589364843, + "grad_norm": 0.23941947519779205, + "learning_rate": 1.093730755019091e-05, + "loss": 0.1448, + "step": 7680 + }, + { + "epoch": 2.836410635155096, + "grad_norm": 0.2833503186702728, + "learning_rate": 1.0912673974627418e-05, + "loss": 0.1631, + "step": 7681 + }, + { + "epoch": 2.8367799113737076, + "grad_norm": 0.35158607363700867, + "learning_rate": 1.0888040399063924e-05, + "loss": 0.1753, + "step": 7682 + }, + { + "epoch": 2.837149187592319, + "grad_norm": 0.2929239869117737, + "learning_rate": 1.0863406823500432e-05, + "loss": 0.1766, + "step": 7683 + }, + { + "epoch": 2.8375184638109303, + "grad_norm": 0.2721927762031555, + "learning_rate": 1.0838773247936938e-05, + "loss": 0.1651, + "step": 7684 + }, + { + "epoch": 2.837887740029542, + "grad_norm": 0.30922213196754456, + "learning_rate": 1.0814139672373446e-05, + "loss": 0.1747, + "step": 7685 + }, + { + "epoch": 2.8382570162481535, + "grad_norm": 0.2551419138908386, + "learning_rate": 1.0789506096809952e-05, + "loss": 0.1551, + "step": 7686 + }, + { + "epoch": 2.838626292466765, + "grad_norm": 0.25759127736091614, + "learning_rate": 1.076487252124646e-05, + "loss": 0.1511, + "step": 7687 + }, + { + "epoch": 2.8389955686853767, + "grad_norm": 0.2053588330745697, + "learning_rate": 1.0740238945682966e-05, + "loss": 0.1467, + "step": 7688 + }, + { + "epoch": 2.8393648449039883, + "grad_norm": 0.2408028393983841, + "learning_rate": 1.0715605370119474e-05, + "loss": 0.1495, + "step": 7689 + }, + { + "epoch": 2.8397341211226, + "grad_norm": 0.2754501700401306, + "learning_rate": 1.069097179455598e-05, + "loss": 0.1752, + "step": 7690 + }, + { + "epoch": 2.840103397341211, + "grad_norm": 0.21606384217739105, + "learning_rate": 1.0666338218992488e-05, + "loss": 0.1336, + "step": 7691 + }, + { + "epoch": 2.8404726735598227, + "grad_norm": 0.26788634061813354, + "learning_rate": 1.0641704643428994e-05, + "loss": 0.171, + "step": 7692 + }, + { + "epoch": 2.8408419497784343, + "grad_norm": 0.3007977604866028, + "learning_rate": 1.0617071067865502e-05, + "loss": 0.1675, + "step": 7693 + }, + { + "epoch": 2.841211225997046, + "grad_norm": 0.32352912425994873, + "learning_rate": 1.0592437492302008e-05, + "loss": 0.1784, + "step": 7694 + }, + { + "epoch": 2.841580502215657, + "grad_norm": 0.28513962030410767, + "learning_rate": 1.0567803916738515e-05, + "loss": 0.1781, + "step": 7695 + }, + { + "epoch": 2.8419497784342687, + "grad_norm": 0.24499237537384033, + "learning_rate": 1.0543170341175022e-05, + "loss": 0.1415, + "step": 7696 + }, + { + "epoch": 2.8423190546528803, + "grad_norm": 0.258148193359375, + "learning_rate": 1.0518536765611529e-05, + "loss": 0.156, + "step": 7697 + }, + { + "epoch": 2.842688330871492, + "grad_norm": 0.28376054763793945, + "learning_rate": 1.0493903190048036e-05, + "loss": 0.1343, + "step": 7698 + }, + { + "epoch": 2.8430576070901035, + "grad_norm": 0.2879290282726288, + "learning_rate": 1.0469269614484543e-05, + "loss": 0.1976, + "step": 7699 + }, + { + "epoch": 2.843426883308715, + "grad_norm": 0.2800526022911072, + "learning_rate": 1.044463603892105e-05, + "loss": 0.1789, + "step": 7700 + }, + { + "epoch": 2.843426883308715, + "eval_loss": 8.98156452178955, + "eval_runtime": 6.9043, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 1.014, + "step": 7700 + }, + { + "epoch": 2.8437961595273267, + "grad_norm": 0.2176995575428009, + "learning_rate": 1.0420002463357557e-05, + "loss": 0.1392, + "step": 7701 + }, + { + "epoch": 2.844165435745938, + "grad_norm": 0.3147423565387726, + "learning_rate": 1.0395368887794065e-05, + "loss": 0.1869, + "step": 7702 + }, + { + "epoch": 2.8445347119645494, + "grad_norm": 0.21075312793254852, + "learning_rate": 1.037073531223057e-05, + "loss": 0.1281, + "step": 7703 + }, + { + "epoch": 2.844903988183161, + "grad_norm": 0.23713389039039612, + "learning_rate": 1.0346101736667077e-05, + "loss": 0.154, + "step": 7704 + }, + { + "epoch": 2.8452732644017726, + "grad_norm": 0.2358805239200592, + "learning_rate": 1.0321468161103585e-05, + "loss": 0.1611, + "step": 7705 + }, + { + "epoch": 2.845642540620384, + "grad_norm": 0.2420981526374817, + "learning_rate": 1.0296834585540091e-05, + "loss": 0.1571, + "step": 7706 + }, + { + "epoch": 2.8460118168389954, + "grad_norm": 0.2906803786754608, + "learning_rate": 1.0272201009976599e-05, + "loss": 0.1566, + "step": 7707 + }, + { + "epoch": 2.846381093057607, + "grad_norm": 0.1957794576883316, + "learning_rate": 1.0247567434413105e-05, + "loss": 0.1333, + "step": 7708 + }, + { + "epoch": 2.8467503692762186, + "grad_norm": 0.28155967593193054, + "learning_rate": 1.0222933858849613e-05, + "loss": 0.163, + "step": 7709 + }, + { + "epoch": 2.84711964549483, + "grad_norm": 0.31716328859329224, + "learning_rate": 1.0198300283286119e-05, + "loss": 0.1545, + "step": 7710 + }, + { + "epoch": 2.847488921713442, + "grad_norm": 0.34429728984832764, + "learning_rate": 1.0173666707722627e-05, + "loss": 0.1794, + "step": 7711 + }, + { + "epoch": 2.8478581979320534, + "grad_norm": 0.239198699593544, + "learning_rate": 1.0149033132159133e-05, + "loss": 0.1271, + "step": 7712 + }, + { + "epoch": 2.8482274741506646, + "grad_norm": 0.38698169589042664, + "learning_rate": 1.0124399556595641e-05, + "loss": 0.1997, + "step": 7713 + }, + { + "epoch": 2.848596750369276, + "grad_norm": 0.27299168705940247, + "learning_rate": 1.0099765981032147e-05, + "loss": 0.1513, + "step": 7714 + }, + { + "epoch": 2.848966026587888, + "grad_norm": 0.23073719441890717, + "learning_rate": 1.0075132405468655e-05, + "loss": 0.1545, + "step": 7715 + }, + { + "epoch": 2.8493353028064994, + "grad_norm": 0.3002285659313202, + "learning_rate": 1.0050498829905161e-05, + "loss": 0.1469, + "step": 7716 + }, + { + "epoch": 2.8497045790251105, + "grad_norm": 0.3697929084300995, + "learning_rate": 1.0025865254341669e-05, + "loss": 0.1578, + "step": 7717 + }, + { + "epoch": 2.850073855243722, + "grad_norm": 0.2252642661333084, + "learning_rate": 1.0001231678778175e-05, + "loss": 0.1608, + "step": 7718 + }, + { + "epoch": 2.8504431314623337, + "grad_norm": 0.2594517171382904, + "learning_rate": 9.976598103214683e-06, + "loss": 0.1528, + "step": 7719 + }, + { + "epoch": 2.8508124076809453, + "grad_norm": 0.25399044156074524, + "learning_rate": 9.951964527651189e-06, + "loss": 0.1541, + "step": 7720 + }, + { + "epoch": 2.851181683899557, + "grad_norm": 0.20220792293548584, + "learning_rate": 9.927330952087697e-06, + "loss": 0.1361, + "step": 7721 + }, + { + "epoch": 2.8515509601181686, + "grad_norm": 0.28783994913101196, + "learning_rate": 9.902697376524203e-06, + "loss": 0.1649, + "step": 7722 + }, + { + "epoch": 2.85192023633678, + "grad_norm": 0.22549551725387573, + "learning_rate": 9.87806380096071e-06, + "loss": 0.1426, + "step": 7723 + }, + { + "epoch": 2.8522895125553913, + "grad_norm": 0.2762086093425751, + "learning_rate": 9.853430225397217e-06, + "loss": 0.1612, + "step": 7724 + }, + { + "epoch": 2.852658788774003, + "grad_norm": 0.23178315162658691, + "learning_rate": 9.828796649833723e-06, + "loss": 0.1355, + "step": 7725 + }, + { + "epoch": 2.8530280649926145, + "grad_norm": 0.20642724633216858, + "learning_rate": 9.804163074270231e-06, + "loss": 0.136, + "step": 7726 + }, + { + "epoch": 2.853397341211226, + "grad_norm": 0.25018852949142456, + "learning_rate": 9.779529498706737e-06, + "loss": 0.1532, + "step": 7727 + }, + { + "epoch": 2.8537666174298373, + "grad_norm": 0.29667386412620544, + "learning_rate": 9.754895923143245e-06, + "loss": 0.1815, + "step": 7728 + }, + { + "epoch": 2.854135893648449, + "grad_norm": 0.21438167989253998, + "learning_rate": 9.730262347579751e-06, + "loss": 0.1423, + "step": 7729 + }, + { + "epoch": 2.8545051698670605, + "grad_norm": 0.31625455617904663, + "learning_rate": 9.70562877201626e-06, + "loss": 0.1871, + "step": 7730 + }, + { + "epoch": 2.854874446085672, + "grad_norm": 0.24175108969211578, + "learning_rate": 9.680995196452765e-06, + "loss": 0.1522, + "step": 7731 + }, + { + "epoch": 2.8552437223042837, + "grad_norm": 0.2430894672870636, + "learning_rate": 9.656361620889273e-06, + "loss": 0.1403, + "step": 7732 + }, + { + "epoch": 2.8556129985228953, + "grad_norm": 0.27614155411720276, + "learning_rate": 9.63172804532578e-06, + "loss": 0.1741, + "step": 7733 + }, + { + "epoch": 2.855982274741507, + "grad_norm": 0.2861122786998749, + "learning_rate": 9.607094469762286e-06, + "loss": 0.148, + "step": 7734 + }, + { + "epoch": 2.856351550960118, + "grad_norm": 0.3137962818145752, + "learning_rate": 9.582460894198794e-06, + "loss": 0.17, + "step": 7735 + }, + { + "epoch": 2.8567208271787297, + "grad_norm": 0.25127294659614563, + "learning_rate": 9.5578273186353e-06, + "loss": 0.1518, + "step": 7736 + }, + { + "epoch": 2.8570901033973413, + "grad_norm": 0.2233678698539734, + "learning_rate": 9.533193743071808e-06, + "loss": 0.1555, + "step": 7737 + }, + { + "epoch": 2.857459379615953, + "grad_norm": 0.24551436305046082, + "learning_rate": 9.508560167508314e-06, + "loss": 0.1656, + "step": 7738 + }, + { + "epoch": 2.857828655834564, + "grad_norm": 0.35238945484161377, + "learning_rate": 9.483926591944822e-06, + "loss": 0.1842, + "step": 7739 + }, + { + "epoch": 2.8581979320531756, + "grad_norm": 0.22350764274597168, + "learning_rate": 9.459293016381328e-06, + "loss": 0.1732, + "step": 7740 + }, + { + "epoch": 2.8585672082717872, + "grad_norm": 0.2366911619901657, + "learning_rate": 9.434659440817836e-06, + "loss": 0.1413, + "step": 7741 + }, + { + "epoch": 2.858936484490399, + "grad_norm": 0.3179534077644348, + "learning_rate": 9.410025865254342e-06, + "loss": 0.1908, + "step": 7742 + }, + { + "epoch": 2.8593057607090104, + "grad_norm": 0.3167871832847595, + "learning_rate": 9.38539228969085e-06, + "loss": 0.1605, + "step": 7743 + }, + { + "epoch": 2.859675036927622, + "grad_norm": 0.22424213588237762, + "learning_rate": 9.360758714127356e-06, + "loss": 0.1451, + "step": 7744 + }, + { + "epoch": 2.8600443131462336, + "grad_norm": 0.2675069570541382, + "learning_rate": 9.336125138563864e-06, + "loss": 0.1556, + "step": 7745 + }, + { + "epoch": 2.860413589364845, + "grad_norm": 0.28380632400512695, + "learning_rate": 9.31149156300037e-06, + "loss": 0.1655, + "step": 7746 + }, + { + "epoch": 2.8607828655834564, + "grad_norm": 0.24247777462005615, + "learning_rate": 9.286857987436878e-06, + "loss": 0.1664, + "step": 7747 + }, + { + "epoch": 2.861152141802068, + "grad_norm": 0.2621886730194092, + "learning_rate": 9.262224411873384e-06, + "loss": 0.1598, + "step": 7748 + }, + { + "epoch": 2.8615214180206796, + "grad_norm": 0.2718808352947235, + "learning_rate": 9.237590836309892e-06, + "loss": 0.1574, + "step": 7749 + }, + { + "epoch": 2.8618906942392908, + "grad_norm": 0.2578151226043701, + "learning_rate": 9.212957260746398e-06, + "loss": 0.1534, + "step": 7750 + }, + { + "epoch": 2.8618906942392908, + "eval_loss": 9.01315975189209, + "eval_runtime": 6.9013, + "eval_samples_per_second": 7.245, + "eval_steps_per_second": 1.014, + "step": 7750 + }, + { + "epoch": 2.8622599704579024, + "grad_norm": 0.24831654131412506, + "learning_rate": 9.188323685182904e-06, + "loss": 0.1718, + "step": 7751 + }, + { + "epoch": 2.862629246676514, + "grad_norm": 0.2706897556781769, + "learning_rate": 9.163690109619412e-06, + "loss": 0.1465, + "step": 7752 + }, + { + "epoch": 2.8629985228951256, + "grad_norm": 0.24103237688541412, + "learning_rate": 9.139056534055918e-06, + "loss": 0.1443, + "step": 7753 + }, + { + "epoch": 2.863367799113737, + "grad_norm": 0.2637235224246979, + "learning_rate": 9.114422958492426e-06, + "loss": 0.1674, + "step": 7754 + }, + { + "epoch": 2.863737075332349, + "grad_norm": 0.3259708881378174, + "learning_rate": 9.089789382928932e-06, + "loss": 0.1782, + "step": 7755 + }, + { + "epoch": 2.8641063515509604, + "grad_norm": 0.23410508036613464, + "learning_rate": 9.06515580736544e-06, + "loss": 0.1346, + "step": 7756 + }, + { + "epoch": 2.8644756277695715, + "grad_norm": 0.25629451870918274, + "learning_rate": 9.040522231801946e-06, + "loss": 0.1507, + "step": 7757 + }, + { + "epoch": 2.864844903988183, + "grad_norm": 0.2605557143688202, + "learning_rate": 9.015888656238454e-06, + "loss": 0.1677, + "step": 7758 + }, + { + "epoch": 2.8652141802067947, + "grad_norm": 0.23210284113883972, + "learning_rate": 8.99125508067496e-06, + "loss": 0.1564, + "step": 7759 + }, + { + "epoch": 2.8655834564254064, + "grad_norm": 0.2533877193927765, + "learning_rate": 8.966621505111468e-06, + "loss": 0.1455, + "step": 7760 + }, + { + "epoch": 2.8659527326440175, + "grad_norm": 0.27652597427368164, + "learning_rate": 8.941987929547974e-06, + "loss": 0.1673, + "step": 7761 + }, + { + "epoch": 2.866322008862629, + "grad_norm": 0.2527548670768738, + "learning_rate": 8.91735435398448e-06, + "loss": 0.1526, + "step": 7762 + }, + { + "epoch": 2.8666912850812407, + "grad_norm": 0.2469540685415268, + "learning_rate": 8.892720778420988e-06, + "loss": 0.1336, + "step": 7763 + }, + { + "epoch": 2.8670605612998523, + "grad_norm": 0.2724383771419525, + "learning_rate": 8.868087202857494e-06, + "loss": 0.1719, + "step": 7764 + }, + { + "epoch": 2.867429837518464, + "grad_norm": 0.284205824136734, + "learning_rate": 8.843453627294002e-06, + "loss": 0.1542, + "step": 7765 + }, + { + "epoch": 2.8677991137370755, + "grad_norm": 0.31241342425346375, + "learning_rate": 8.818820051730508e-06, + "loss": 0.1817, + "step": 7766 + }, + { + "epoch": 2.868168389955687, + "grad_norm": 0.2625638544559479, + "learning_rate": 8.794186476167016e-06, + "loss": 0.1644, + "step": 7767 + }, + { + "epoch": 2.8685376661742983, + "grad_norm": 0.24493885040283203, + "learning_rate": 8.769552900603523e-06, + "loss": 0.1502, + "step": 7768 + }, + { + "epoch": 2.86890694239291, + "grad_norm": 0.25389790534973145, + "learning_rate": 8.74491932504003e-06, + "loss": 0.1547, + "step": 7769 + }, + { + "epoch": 2.8692762186115215, + "grad_norm": 0.2772005498409271, + "learning_rate": 8.720285749476537e-06, + "loss": 0.1591, + "step": 7770 + }, + { + "epoch": 2.869645494830133, + "grad_norm": 0.28581178188323975, + "learning_rate": 8.695652173913044e-06, + "loss": 0.1731, + "step": 7771 + }, + { + "epoch": 2.8700147710487443, + "grad_norm": 0.24728703498840332, + "learning_rate": 8.67101859834955e-06, + "loss": 0.1266, + "step": 7772 + }, + { + "epoch": 2.870384047267356, + "grad_norm": 0.25105801224708557, + "learning_rate": 8.646385022786058e-06, + "loss": 0.1697, + "step": 7773 + }, + { + "epoch": 2.8707533234859675, + "grad_norm": 0.23554202914237976, + "learning_rate": 8.621751447222565e-06, + "loss": 0.1675, + "step": 7774 + }, + { + "epoch": 2.871122599704579, + "grad_norm": 0.22083799540996552, + "learning_rate": 8.597117871659072e-06, + "loss": 0.1533, + "step": 7775 + }, + { + "epoch": 2.8714918759231907, + "grad_norm": 0.24602487683296204, + "learning_rate": 8.572484296095579e-06, + "loss": 0.1641, + "step": 7776 + }, + { + "epoch": 2.8718611521418023, + "grad_norm": 0.2528549134731293, + "learning_rate": 8.547850720532087e-06, + "loss": 0.1552, + "step": 7777 + }, + { + "epoch": 2.8722304283604134, + "grad_norm": 0.25162604451179504, + "learning_rate": 8.523217144968593e-06, + "loss": 0.1704, + "step": 7778 + }, + { + "epoch": 2.872599704579025, + "grad_norm": 0.2593317925930023, + "learning_rate": 8.4985835694051e-06, + "loss": 0.1385, + "step": 7779 + }, + { + "epoch": 2.8729689807976366, + "grad_norm": 0.26376643776893616, + "learning_rate": 8.473949993841607e-06, + "loss": 0.1417, + "step": 7780 + }, + { + "epoch": 2.8733382570162482, + "grad_norm": 0.33480486273765564, + "learning_rate": 8.449316418278113e-06, + "loss": 0.2164, + "step": 7781 + }, + { + "epoch": 2.87370753323486, + "grad_norm": 0.22279219329357147, + "learning_rate": 8.42468284271462e-06, + "loss": 0.1288, + "step": 7782 + }, + { + "epoch": 2.874076809453471, + "grad_norm": 0.26911675930023193, + "learning_rate": 8.400049267151127e-06, + "loss": 0.173, + "step": 7783 + }, + { + "epoch": 2.8744460856720826, + "grad_norm": 0.2548901438713074, + "learning_rate": 8.375415691587635e-06, + "loss": 0.1552, + "step": 7784 + }, + { + "epoch": 2.874815361890694, + "grad_norm": 0.23104795813560486, + "learning_rate": 8.350782116024141e-06, + "loss": 0.1737, + "step": 7785 + }, + { + "epoch": 2.875184638109306, + "grad_norm": 0.23088377714157104, + "learning_rate": 8.326148540460649e-06, + "loss": 0.1413, + "step": 7786 + }, + { + "epoch": 2.8755539143279174, + "grad_norm": 0.2658538520336151, + "learning_rate": 8.301514964897155e-06, + "loss": 0.1438, + "step": 7787 + }, + { + "epoch": 2.875923190546529, + "grad_norm": 0.20461083948612213, + "learning_rate": 8.276881389333663e-06, + "loss": 0.1333, + "step": 7788 + }, + { + "epoch": 2.87629246676514, + "grad_norm": 0.2399008870124817, + "learning_rate": 8.252247813770169e-06, + "loss": 0.1562, + "step": 7789 + }, + { + "epoch": 2.8766617429837518, + "grad_norm": 0.24018298089504242, + "learning_rate": 8.227614238206677e-06, + "loss": 0.1355, + "step": 7790 + }, + { + "epoch": 2.8770310192023634, + "grad_norm": 0.34940052032470703, + "learning_rate": 8.202980662643183e-06, + "loss": 0.193, + "step": 7791 + }, + { + "epoch": 2.877400295420975, + "grad_norm": 0.2941071391105652, + "learning_rate": 8.17834708707969e-06, + "loss": 0.1606, + "step": 7792 + }, + { + "epoch": 2.8777695716395866, + "grad_norm": 0.26835066080093384, + "learning_rate": 8.153713511516197e-06, + "loss": 0.1594, + "step": 7793 + }, + { + "epoch": 2.8781388478581977, + "grad_norm": 0.25363367795944214, + "learning_rate": 8.129079935952703e-06, + "loss": 0.1619, + "step": 7794 + }, + { + "epoch": 2.8785081240768093, + "grad_norm": 0.23352386057376862, + "learning_rate": 8.104446360389211e-06, + "loss": 0.1469, + "step": 7795 + }, + { + "epoch": 2.878877400295421, + "grad_norm": 0.25929105281829834, + "learning_rate": 8.079812784825717e-06, + "loss": 0.1581, + "step": 7796 + }, + { + "epoch": 2.8792466765140325, + "grad_norm": 0.2699171006679535, + "learning_rate": 8.055179209262225e-06, + "loss": 0.1563, + "step": 7797 + }, + { + "epoch": 2.879615952732644, + "grad_norm": 0.24687309563159943, + "learning_rate": 8.030545633698731e-06, + "loss": 0.1359, + "step": 7798 + }, + { + "epoch": 2.8799852289512557, + "grad_norm": 0.2889746427536011, + "learning_rate": 8.00591205813524e-06, + "loss": 0.1476, + "step": 7799 + }, + { + "epoch": 2.880354505169867, + "grad_norm": 0.2349807769060135, + "learning_rate": 7.981278482571745e-06, + "loss": 0.1455, + "step": 7800 + }, + { + "epoch": 2.880354505169867, + "eval_loss": 8.991836547851562, + "eval_runtime": 6.9066, + "eval_samples_per_second": 7.239, + "eval_steps_per_second": 1.014, + "step": 7800 + }, + { + "epoch": 2.8807237813884785, + "grad_norm": 0.2404627948999405, + "learning_rate": 7.956644907008253e-06, + "loss": 0.1498, + "step": 7801 + }, + { + "epoch": 2.88109305760709, + "grad_norm": 0.24170491099357605, + "learning_rate": 7.93201133144476e-06, + "loss": 0.1218, + "step": 7802 + }, + { + "epoch": 2.8814623338257017, + "grad_norm": 0.2557377219200134, + "learning_rate": 7.907377755881267e-06, + "loss": 0.1517, + "step": 7803 + }, + { + "epoch": 2.881831610044313, + "grad_norm": 0.31372299790382385, + "learning_rate": 7.882744180317773e-06, + "loss": 0.1719, + "step": 7804 + }, + { + "epoch": 2.8822008862629245, + "grad_norm": 0.239765465259552, + "learning_rate": 7.858110604754281e-06, + "loss": 0.1671, + "step": 7805 + }, + { + "epoch": 2.882570162481536, + "grad_norm": 0.2766842544078827, + "learning_rate": 7.833477029190787e-06, + "loss": 0.1666, + "step": 7806 + }, + { + "epoch": 2.8829394387001477, + "grad_norm": 0.23289455473423004, + "learning_rate": 7.808843453627295e-06, + "loss": 0.1478, + "step": 7807 + }, + { + "epoch": 2.8833087149187593, + "grad_norm": 0.25973019003868103, + "learning_rate": 7.784209878063801e-06, + "loss": 0.1674, + "step": 7808 + }, + { + "epoch": 2.883677991137371, + "grad_norm": 0.2501669228076935, + "learning_rate": 7.759576302500308e-06, + "loss": 0.1615, + "step": 7809 + }, + { + "epoch": 2.8840472673559825, + "grad_norm": 0.23316755890846252, + "learning_rate": 7.734942726936816e-06, + "loss": 0.1647, + "step": 7810 + }, + { + "epoch": 2.8844165435745936, + "grad_norm": 0.20180602371692657, + "learning_rate": 7.710309151373322e-06, + "loss": 0.1355, + "step": 7811 + }, + { + "epoch": 2.8847858197932053, + "grad_norm": 0.2601860463619232, + "learning_rate": 7.68567557580983e-06, + "loss": 0.1693, + "step": 7812 + }, + { + "epoch": 2.885155096011817, + "grad_norm": 0.2914137840270996, + "learning_rate": 7.661042000246336e-06, + "loss": 0.1763, + "step": 7813 + }, + { + "epoch": 2.8855243722304285, + "grad_norm": 0.27411553263664246, + "learning_rate": 7.636408424682844e-06, + "loss": 0.1491, + "step": 7814 + }, + { + "epoch": 2.8858936484490396, + "grad_norm": 0.24918341636657715, + "learning_rate": 7.611774849119351e-06, + "loss": 0.1486, + "step": 7815 + }, + { + "epoch": 2.886262924667651, + "grad_norm": 0.26462891697883606, + "learning_rate": 7.587141273555858e-06, + "loss": 0.136, + "step": 7816 + }, + { + "epoch": 2.886632200886263, + "grad_norm": 0.25037795305252075, + "learning_rate": 7.562507697992365e-06, + "loss": 0.164, + "step": 7817 + }, + { + "epoch": 2.8870014771048744, + "grad_norm": 0.30708014965057373, + "learning_rate": 7.537874122428872e-06, + "loss": 0.1736, + "step": 7818 + }, + { + "epoch": 2.887370753323486, + "grad_norm": 0.2661638855934143, + "learning_rate": 7.513240546865379e-06, + "loss": 0.1764, + "step": 7819 + }, + { + "epoch": 2.8877400295420976, + "grad_norm": 0.2557961344718933, + "learning_rate": 7.488606971301884e-06, + "loss": 0.1716, + "step": 7820 + }, + { + "epoch": 2.8881093057607092, + "grad_norm": 0.26975947618484497, + "learning_rate": 7.463973395738391e-06, + "loss": 0.1811, + "step": 7821 + }, + { + "epoch": 2.8884785819793204, + "grad_norm": 0.28164228796958923, + "learning_rate": 7.439339820174898e-06, + "loss": 0.1594, + "step": 7822 + }, + { + "epoch": 2.888847858197932, + "grad_norm": 0.24386563897132874, + "learning_rate": 7.414706244611405e-06, + "loss": 0.1709, + "step": 7823 + }, + { + "epoch": 2.8892171344165436, + "grad_norm": 0.24668964743614197, + "learning_rate": 7.390072669047912e-06, + "loss": 0.1508, + "step": 7824 + }, + { + "epoch": 2.889586410635155, + "grad_norm": 0.25202590227127075, + "learning_rate": 7.365439093484419e-06, + "loss": 0.1491, + "step": 7825 + }, + { + "epoch": 2.8899556868537664, + "grad_norm": 0.26024529337882996, + "learning_rate": 7.340805517920926e-06, + "loss": 0.1415, + "step": 7826 + }, + { + "epoch": 2.890324963072378, + "grad_norm": 0.22780027985572815, + "learning_rate": 7.316171942357433e-06, + "loss": 0.1528, + "step": 7827 + }, + { + "epoch": 2.8906942392909896, + "grad_norm": 0.26981809735298157, + "learning_rate": 7.29153836679394e-06, + "loss": 0.1511, + "step": 7828 + }, + { + "epoch": 2.891063515509601, + "grad_norm": 0.24553334712982178, + "learning_rate": 7.266904791230447e-06, + "loss": 0.135, + "step": 7829 + }, + { + "epoch": 2.8914327917282128, + "grad_norm": 0.22489580512046814, + "learning_rate": 7.242271215666954e-06, + "loss": 0.1329, + "step": 7830 + }, + { + "epoch": 2.8918020679468244, + "grad_norm": 0.23057226836681366, + "learning_rate": 7.217637640103461e-06, + "loss": 0.1476, + "step": 7831 + }, + { + "epoch": 2.892171344165436, + "grad_norm": 0.23183265328407288, + "learning_rate": 7.193004064539968e-06, + "loss": 0.1877, + "step": 7832 + }, + { + "epoch": 2.892540620384047, + "grad_norm": 0.27713415026664734, + "learning_rate": 7.168370488976475e-06, + "loss": 0.169, + "step": 7833 + }, + { + "epoch": 2.8929098966026587, + "grad_norm": 0.2636701762676239, + "learning_rate": 7.143736913412982e-06, + "loss": 0.163, + "step": 7834 + }, + { + "epoch": 2.8932791728212703, + "grad_norm": 0.2538353502750397, + "learning_rate": 7.119103337849489e-06, + "loss": 0.1679, + "step": 7835 + }, + { + "epoch": 2.893648449039882, + "grad_norm": 0.2508026957511902, + "learning_rate": 7.094469762285996e-06, + "loss": 0.1597, + "step": 7836 + }, + { + "epoch": 2.894017725258493, + "grad_norm": 0.27143165469169617, + "learning_rate": 7.069836186722503e-06, + "loss": 0.1783, + "step": 7837 + }, + { + "epoch": 2.8943870014771047, + "grad_norm": 0.24314972758293152, + "learning_rate": 7.04520261115901e-06, + "loss": 0.1502, + "step": 7838 + }, + { + "epoch": 2.8947562776957163, + "grad_norm": 0.277226060628891, + "learning_rate": 7.020569035595517e-06, + "loss": 0.1686, + "step": 7839 + }, + { + "epoch": 2.895125553914328, + "grad_norm": 0.24152825772762299, + "learning_rate": 6.995935460032024e-06, + "loss": 0.1598, + "step": 7840 + }, + { + "epoch": 2.8954948301329395, + "grad_norm": 0.2329702079296112, + "learning_rate": 6.971301884468531e-06, + "loss": 0.1653, + "step": 7841 + }, + { + "epoch": 2.895864106351551, + "grad_norm": 0.23207807540893555, + "learning_rate": 6.946668308905038e-06, + "loss": 0.1489, + "step": 7842 + }, + { + "epoch": 2.8962333825701627, + "grad_norm": 0.304966539144516, + "learning_rate": 6.922034733341545e-06, + "loss": 0.1605, + "step": 7843 + }, + { + "epoch": 2.896602658788774, + "grad_norm": 0.24416109919548035, + "learning_rate": 6.897401157778052e-06, + "loss": 0.1561, + "step": 7844 + }, + { + "epoch": 2.8969719350073855, + "grad_norm": 0.19457095861434937, + "learning_rate": 6.872767582214559e-06, + "loss": 0.1244, + "step": 7845 + }, + { + "epoch": 2.897341211225997, + "grad_norm": 0.30616214871406555, + "learning_rate": 6.848134006651066e-06, + "loss": 0.1558, + "step": 7846 + }, + { + "epoch": 2.8977104874446087, + "grad_norm": 0.26054486632347107, + "learning_rate": 6.823500431087573e-06, + "loss": 0.1584, + "step": 7847 + }, + { + "epoch": 2.89807976366322, + "grad_norm": 0.29278042912483215, + "learning_rate": 6.7988668555240804e-06, + "loss": 0.1785, + "step": 7848 + }, + { + "epoch": 2.8984490398818314, + "grad_norm": 0.2801530361175537, + "learning_rate": 6.774233279960586e-06, + "loss": 0.1494, + "step": 7849 + }, + { + "epoch": 2.898818316100443, + "grad_norm": 0.3944324851036072, + "learning_rate": 6.749599704397093e-06, + "loss": 0.1767, + "step": 7850 + }, + { + "epoch": 2.898818316100443, + "eval_loss": 8.985445022583008, + "eval_runtime": 6.9043, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 1.014, + "step": 7850 + }, + { + "epoch": 2.8991875923190547, + "grad_norm": 0.3139789402484894, + "learning_rate": 6.7249661288336e-06, + "loss": 0.1779, + "step": 7851 + }, + { + "epoch": 2.8995568685376663, + "grad_norm": 0.2477511763572693, + "learning_rate": 6.700332553270107e-06, + "loss": 0.1374, + "step": 7852 + }, + { + "epoch": 2.899926144756278, + "grad_norm": 0.23102974891662598, + "learning_rate": 6.675698977706614e-06, + "loss": 0.1475, + "step": 7853 + }, + { + "epoch": 2.9002954209748895, + "grad_norm": 0.2750798761844635, + "learning_rate": 6.651065402143121e-06, + "loss": 0.1572, + "step": 7854 + }, + { + "epoch": 2.9006646971935006, + "grad_norm": 0.2737557291984558, + "learning_rate": 6.626431826579628e-06, + "loss": 0.1799, + "step": 7855 + }, + { + "epoch": 2.901033973412112, + "grad_norm": 0.2548477351665497, + "learning_rate": 6.601798251016135e-06, + "loss": 0.1489, + "step": 7856 + }, + { + "epoch": 2.901403249630724, + "grad_norm": 0.22344064712524414, + "learning_rate": 6.577164675452642e-06, + "loss": 0.1365, + "step": 7857 + }, + { + "epoch": 2.9017725258493354, + "grad_norm": 0.2377508133649826, + "learning_rate": 6.552531099889149e-06, + "loss": 0.1273, + "step": 7858 + }, + { + "epoch": 2.9021418020679466, + "grad_norm": 0.2163553088903427, + "learning_rate": 6.527897524325656e-06, + "loss": 0.1536, + "step": 7859 + }, + { + "epoch": 2.902511078286558, + "grad_norm": 0.2572222948074341, + "learning_rate": 6.503263948762163e-06, + "loss": 0.1681, + "step": 7860 + }, + { + "epoch": 2.90288035450517, + "grad_norm": 0.3034805357456207, + "learning_rate": 6.47863037319867e-06, + "loss": 0.1791, + "step": 7861 + }, + { + "epoch": 2.9032496307237814, + "grad_norm": 0.44993504881858826, + "learning_rate": 6.453996797635177e-06, + "loss": 0.1676, + "step": 7862 + }, + { + "epoch": 2.903618906942393, + "grad_norm": 0.3106197714805603, + "learning_rate": 6.429363222071684e-06, + "loss": 0.194, + "step": 7863 + }, + { + "epoch": 2.9039881831610046, + "grad_norm": 0.2459009289741516, + "learning_rate": 6.404729646508191e-06, + "loss": 0.17, + "step": 7864 + }, + { + "epoch": 2.904357459379616, + "grad_norm": 0.2741924226284027, + "learning_rate": 6.380096070944698e-06, + "loss": 0.1412, + "step": 7865 + }, + { + "epoch": 2.9047267355982274, + "grad_norm": 0.2306993156671524, + "learning_rate": 6.355462495381205e-06, + "loss": 0.1448, + "step": 7866 + }, + { + "epoch": 2.905096011816839, + "grad_norm": 0.2528946101665497, + "learning_rate": 6.330828919817712e-06, + "loss": 0.1514, + "step": 7867 + }, + { + "epoch": 2.9054652880354506, + "grad_norm": 0.2847950756549835, + "learning_rate": 6.306195344254219e-06, + "loss": 0.1615, + "step": 7868 + }, + { + "epoch": 2.905834564254062, + "grad_norm": 0.35707566142082214, + "learning_rate": 6.281561768690726e-06, + "loss": 0.1972, + "step": 7869 + }, + { + "epoch": 2.9062038404726733, + "grad_norm": 0.2563282251358032, + "learning_rate": 6.256928193127233e-06, + "loss": 0.1364, + "step": 7870 + }, + { + "epoch": 2.906573116691285, + "grad_norm": 0.26471999287605286, + "learning_rate": 6.232294617563739e-06, + "loss": 0.1631, + "step": 7871 + }, + { + "epoch": 2.9069423929098965, + "grad_norm": 0.23557411134243011, + "learning_rate": 6.207661042000246e-06, + "loss": 0.1437, + "step": 7872 + }, + { + "epoch": 2.907311669128508, + "grad_norm": 0.2836287021636963, + "learning_rate": 6.183027466436753e-06, + "loss": 0.1504, + "step": 7873 + }, + { + "epoch": 2.9076809453471197, + "grad_norm": 0.2945464551448822, + "learning_rate": 6.15839389087326e-06, + "loss": 0.1667, + "step": 7874 + }, + { + "epoch": 2.9080502215657313, + "grad_norm": 0.3006623089313507, + "learning_rate": 6.133760315309767e-06, + "loss": 0.1642, + "step": 7875 + }, + { + "epoch": 2.908419497784343, + "grad_norm": 0.23895134031772614, + "learning_rate": 6.109126739746274e-06, + "loss": 0.1602, + "step": 7876 + }, + { + "epoch": 2.908788774002954, + "grad_norm": 0.28039875626564026, + "learning_rate": 6.084493164182781e-06, + "loss": 0.1536, + "step": 7877 + }, + { + "epoch": 2.9091580502215657, + "grad_norm": 0.24777038395404816, + "learning_rate": 6.059859588619288e-06, + "loss": 0.1451, + "step": 7878 + }, + { + "epoch": 2.9095273264401773, + "grad_norm": 0.2491549253463745, + "learning_rate": 6.035226013055795e-06, + "loss": 0.1467, + "step": 7879 + }, + { + "epoch": 2.909896602658789, + "grad_norm": 0.22842395305633545, + "learning_rate": 6.010592437492302e-06, + "loss": 0.1425, + "step": 7880 + }, + { + "epoch": 2.9102658788774, + "grad_norm": 0.2613133192062378, + "learning_rate": 5.9859588619288094e-06, + "loss": 0.1488, + "step": 7881 + }, + { + "epoch": 2.9106351550960117, + "grad_norm": 0.25035813450813293, + "learning_rate": 5.9613252863653164e-06, + "loss": 0.161, + "step": 7882 + }, + { + "epoch": 2.9110044313146233, + "grad_norm": 0.19619257748126984, + "learning_rate": 5.9366917108018235e-06, + "loss": 0.1331, + "step": 7883 + }, + { + "epoch": 2.911373707533235, + "grad_norm": 0.24376262724399567, + "learning_rate": 5.9120581352383305e-06, + "loss": 0.1583, + "step": 7884 + }, + { + "epoch": 2.9117429837518465, + "grad_norm": 0.2612351179122925, + "learning_rate": 5.887424559674837e-06, + "loss": 0.1171, + "step": 7885 + }, + { + "epoch": 2.912112259970458, + "grad_norm": 0.25041598081588745, + "learning_rate": 5.862790984111344e-06, + "loss": 0.1556, + "step": 7886 + }, + { + "epoch": 2.9124815361890697, + "grad_norm": 0.24045979976654053, + "learning_rate": 5.838157408547851e-06, + "loss": 0.1476, + "step": 7887 + }, + { + "epoch": 2.912850812407681, + "grad_norm": 0.26647570729255676, + "learning_rate": 5.813523832984358e-06, + "loss": 0.1794, + "step": 7888 + }, + { + "epoch": 2.9132200886262924, + "grad_norm": 0.26317957043647766, + "learning_rate": 5.788890257420865e-06, + "loss": 0.1574, + "step": 7889 + }, + { + "epoch": 2.913589364844904, + "grad_norm": 0.24807961285114288, + "learning_rate": 5.764256681857372e-06, + "loss": 0.1566, + "step": 7890 + }, + { + "epoch": 2.9139586410635157, + "grad_norm": 0.2447405308485031, + "learning_rate": 5.739623106293879e-06, + "loss": 0.1484, + "step": 7891 + }, + { + "epoch": 2.914327917282127, + "grad_norm": 0.19944146275520325, + "learning_rate": 5.714989530730386e-06, + "loss": 0.1271, + "step": 7892 + }, + { + "epoch": 2.9146971935007384, + "grad_norm": 0.28306761384010315, + "learning_rate": 5.690355955166893e-06, + "loss": 0.1697, + "step": 7893 + }, + { + "epoch": 2.91506646971935, + "grad_norm": 0.23773032426834106, + "learning_rate": 5.6657223796034e-06, + "loss": 0.1436, + "step": 7894 + }, + { + "epoch": 2.9154357459379616, + "grad_norm": 0.34763750433921814, + "learning_rate": 5.641088804039907e-06, + "loss": 0.1664, + "step": 7895 + }, + { + "epoch": 2.9158050221565732, + "grad_norm": 0.2620336711406708, + "learning_rate": 5.616455228476414e-06, + "loss": 0.1812, + "step": 7896 + }, + { + "epoch": 2.916174298375185, + "grad_norm": 0.24942079186439514, + "learning_rate": 5.591821652912921e-06, + "loss": 0.1477, + "step": 7897 + }, + { + "epoch": 2.9165435745937964, + "grad_norm": 0.24600937962532043, + "learning_rate": 5.567188077349428e-06, + "loss": 0.1631, + "step": 7898 + }, + { + "epoch": 2.9169128508124076, + "grad_norm": 0.2510182857513428, + "learning_rate": 5.542554501785935e-06, + "loss": 0.1441, + "step": 7899 + }, + { + "epoch": 2.917282127031019, + "grad_norm": 0.3106641471385956, + "learning_rate": 5.517920926222441e-06, + "loss": 0.1722, + "step": 7900 + }, + { + "epoch": 2.917282127031019, + "eval_loss": 8.97296142578125, + "eval_runtime": 6.8967, + "eval_samples_per_second": 7.25, + "eval_steps_per_second": 1.015, + "step": 7900 + }, + { + "epoch": 2.917651403249631, + "grad_norm": 0.24881936609745026, + "learning_rate": 5.493287350658948e-06, + "loss": 0.1324, + "step": 7901 + }, + { + "epoch": 2.9180206794682424, + "grad_norm": 0.2843630313873291, + "learning_rate": 5.468653775095455e-06, + "loss": 0.1894, + "step": 7902 + }, + { + "epoch": 2.9183899556868536, + "grad_norm": 0.2514699399471283, + "learning_rate": 5.444020199531962e-06, + "loss": 0.1629, + "step": 7903 + }, + { + "epoch": 2.918759231905465, + "grad_norm": 0.24311618506908417, + "learning_rate": 5.419386623968469e-06, + "loss": 0.1419, + "step": 7904 + }, + { + "epoch": 2.9191285081240768, + "grad_norm": 0.2648943066596985, + "learning_rate": 5.394753048404976e-06, + "loss": 0.1547, + "step": 7905 + }, + { + "epoch": 2.9194977843426884, + "grad_norm": 0.22802942991256714, + "learning_rate": 5.370119472841483e-06, + "loss": 0.1141, + "step": 7906 + }, + { + "epoch": 2.9198670605613, + "grad_norm": 0.24953940510749817, + "learning_rate": 5.34548589727799e-06, + "loss": 0.1474, + "step": 7907 + }, + { + "epoch": 2.9202363367799116, + "grad_norm": 0.284559428691864, + "learning_rate": 5.320852321714497e-06, + "loss": 0.1512, + "step": 7908 + }, + { + "epoch": 2.920605612998523, + "grad_norm": 0.21309591829776764, + "learning_rate": 5.296218746151004e-06, + "loss": 0.1418, + "step": 7909 + }, + { + "epoch": 2.9209748892171343, + "grad_norm": 0.25759708881378174, + "learning_rate": 5.271585170587511e-06, + "loss": 0.1508, + "step": 7910 + }, + { + "epoch": 2.921344165435746, + "grad_norm": 0.27101629972457886, + "learning_rate": 5.246951595024018e-06, + "loss": 0.1852, + "step": 7911 + }, + { + "epoch": 2.9217134416543575, + "grad_norm": 0.22052115201950073, + "learning_rate": 5.222318019460525e-06, + "loss": 0.1414, + "step": 7912 + }, + { + "epoch": 2.922082717872969, + "grad_norm": 0.26660534739494324, + "learning_rate": 5.197684443897032e-06, + "loss": 0.162, + "step": 7913 + }, + { + "epoch": 2.9224519940915803, + "grad_norm": 0.2699013948440552, + "learning_rate": 5.1730508683335384e-06, + "loss": 0.153, + "step": 7914 + }, + { + "epoch": 2.922821270310192, + "grad_norm": 0.251467764377594, + "learning_rate": 5.1484172927700454e-06, + "loss": 0.1448, + "step": 7915 + }, + { + "epoch": 2.9231905465288035, + "grad_norm": 0.28349432349205017, + "learning_rate": 5.1237837172065525e-06, + "loss": 0.1541, + "step": 7916 + }, + { + "epoch": 2.923559822747415, + "grad_norm": 0.24508216977119446, + "learning_rate": 5.0991501416430595e-06, + "loss": 0.1679, + "step": 7917 + }, + { + "epoch": 2.9239290989660267, + "grad_norm": 0.23247027397155762, + "learning_rate": 5.0745165660795665e-06, + "loss": 0.1386, + "step": 7918 + }, + { + "epoch": 2.9242983751846383, + "grad_norm": 0.2580397129058838, + "learning_rate": 5.0498829905160735e-06, + "loss": 0.1525, + "step": 7919 + }, + { + "epoch": 2.9246676514032495, + "grad_norm": 0.2523912191390991, + "learning_rate": 5.0252494149525805e-06, + "loss": 0.157, + "step": 7920 + }, + { + "epoch": 2.925036927621861, + "grad_norm": 0.3013935983181, + "learning_rate": 5.0006158393890875e-06, + "loss": 0.1586, + "step": 7921 + }, + { + "epoch": 2.9254062038404727, + "grad_norm": 0.2614888846874237, + "learning_rate": 4.9759822638255946e-06, + "loss": 0.1564, + "step": 7922 + }, + { + "epoch": 2.9257754800590843, + "grad_norm": 0.277831494808197, + "learning_rate": 4.9513486882621016e-06, + "loss": 0.1689, + "step": 7923 + }, + { + "epoch": 2.926144756277696, + "grad_norm": 0.2410767823457718, + "learning_rate": 4.926715112698609e-06, + "loss": 0.1376, + "step": 7924 + }, + { + "epoch": 2.926514032496307, + "grad_norm": 0.27479082345962524, + "learning_rate": 4.902081537135116e-06, + "loss": 0.1638, + "step": 7925 + }, + { + "epoch": 2.9268833087149186, + "grad_norm": 0.274920791387558, + "learning_rate": 4.877447961571623e-06, + "loss": 0.1556, + "step": 7926 + }, + { + "epoch": 2.9272525849335302, + "grad_norm": 0.26660868525505066, + "learning_rate": 4.85281438600813e-06, + "loss": 0.1462, + "step": 7927 + }, + { + "epoch": 2.927621861152142, + "grad_norm": 0.27625221014022827, + "learning_rate": 4.828180810444637e-06, + "loss": 0.1496, + "step": 7928 + }, + { + "epoch": 2.9279911373707534, + "grad_norm": 0.3186943829059601, + "learning_rate": 4.803547234881143e-06, + "loss": 0.1651, + "step": 7929 + }, + { + "epoch": 2.928360413589365, + "grad_norm": 0.2378610223531723, + "learning_rate": 4.77891365931765e-06, + "loss": 0.1404, + "step": 7930 + }, + { + "epoch": 2.928729689807976, + "grad_norm": 0.250263512134552, + "learning_rate": 4.754280083754157e-06, + "loss": 0.1587, + "step": 7931 + }, + { + "epoch": 2.929098966026588, + "grad_norm": 0.2912275195121765, + "learning_rate": 4.729646508190664e-06, + "loss": 0.159, + "step": 7932 + }, + { + "epoch": 2.9294682422451994, + "grad_norm": 0.31668514013290405, + "learning_rate": 4.705012932627171e-06, + "loss": 0.197, + "step": 7933 + }, + { + "epoch": 2.929837518463811, + "grad_norm": 0.25703102350234985, + "learning_rate": 4.680379357063678e-06, + "loss": 0.1569, + "step": 7934 + }, + { + "epoch": 2.930206794682422, + "grad_norm": 0.27894508838653564, + "learning_rate": 4.655745781500185e-06, + "loss": 0.1714, + "step": 7935 + }, + { + "epoch": 2.930576070901034, + "grad_norm": 0.22952796518802643, + "learning_rate": 4.631112205936692e-06, + "loss": 0.1507, + "step": 7936 + }, + { + "epoch": 2.9309453471196454, + "grad_norm": 0.2388591319322586, + "learning_rate": 4.606478630373199e-06, + "loss": 0.1539, + "step": 7937 + }, + { + "epoch": 2.931314623338257, + "grad_norm": 0.2579047977924347, + "learning_rate": 4.581845054809706e-06, + "loss": 0.1709, + "step": 7938 + }, + { + "epoch": 2.9316838995568686, + "grad_norm": 0.21860139071941376, + "learning_rate": 4.557211479246213e-06, + "loss": 0.1454, + "step": 7939 + }, + { + "epoch": 2.93205317577548, + "grad_norm": 0.24705132842063904, + "learning_rate": 4.53257790368272e-06, + "loss": 0.1688, + "step": 7940 + }, + { + "epoch": 2.932422451994092, + "grad_norm": 0.23665402829647064, + "learning_rate": 4.507944328119227e-06, + "loss": 0.1384, + "step": 7941 + }, + { + "epoch": 2.932791728212703, + "grad_norm": 0.24169811606407166, + "learning_rate": 4.483310752555734e-06, + "loss": 0.1448, + "step": 7942 + }, + { + "epoch": 2.9331610044313146, + "grad_norm": 0.22645510733127594, + "learning_rate": 4.45867717699224e-06, + "loss": 0.1429, + "step": 7943 + }, + { + "epoch": 2.933530280649926, + "grad_norm": 0.25645139813423157, + "learning_rate": 4.434043601428747e-06, + "loss": 0.1714, + "step": 7944 + }, + { + "epoch": 2.9338995568685378, + "grad_norm": 0.33161213994026184, + "learning_rate": 4.409410025865254e-06, + "loss": 0.1799, + "step": 7945 + }, + { + "epoch": 2.934268833087149, + "grad_norm": 0.2736192047595978, + "learning_rate": 4.384776450301761e-06, + "loss": 0.1514, + "step": 7946 + }, + { + "epoch": 2.9346381093057605, + "grad_norm": 0.22764644026756287, + "learning_rate": 4.360142874738268e-06, + "loss": 0.1502, + "step": 7947 + }, + { + "epoch": 2.935007385524372, + "grad_norm": 0.2983296811580658, + "learning_rate": 4.335509299174775e-06, + "loss": 0.2016, + "step": 7948 + }, + { + "epoch": 2.9353766617429837, + "grad_norm": 0.25068601965904236, + "learning_rate": 4.310875723611282e-06, + "loss": 0.1562, + "step": 7949 + }, + { + "epoch": 2.9357459379615953, + "grad_norm": 0.27430424094200134, + "learning_rate": 4.286242148047789e-06, + "loss": 0.1558, + "step": 7950 + }, + { + "epoch": 2.9357459379615953, + "eval_loss": 8.990312576293945, + "eval_runtime": 6.9148, + "eval_samples_per_second": 7.231, + "eval_steps_per_second": 1.012, + "step": 7950 + }, + { + "epoch": 2.936115214180207, + "grad_norm": 0.2822214365005493, + "learning_rate": 4.261608572484296e-06, + "loss": 0.1567, + "step": 7951 + }, + { + "epoch": 2.9364844903988185, + "grad_norm": 0.24384671449661255, + "learning_rate": 4.236974996920803e-06, + "loss": 0.1421, + "step": 7952 + }, + { + "epoch": 2.9368537666174297, + "grad_norm": 0.24013996124267578, + "learning_rate": 4.21234142135731e-06, + "loss": 0.1638, + "step": 7953 + }, + { + "epoch": 2.9372230428360413, + "grad_norm": 0.2883111238479614, + "learning_rate": 4.187707845793817e-06, + "loss": 0.1533, + "step": 7954 + }, + { + "epoch": 2.937592319054653, + "grad_norm": 0.2396702915430069, + "learning_rate": 4.163074270230324e-06, + "loss": 0.1659, + "step": 7955 + }, + { + "epoch": 2.9379615952732645, + "grad_norm": 0.26180678606033325, + "learning_rate": 4.138440694666831e-06, + "loss": 0.1615, + "step": 7956 + }, + { + "epoch": 2.9383308714918757, + "grad_norm": 0.24537289142608643, + "learning_rate": 4.1138071191033384e-06, + "loss": 0.152, + "step": 7957 + }, + { + "epoch": 2.9387001477104873, + "grad_norm": 0.2571628987789154, + "learning_rate": 4.089173543539845e-06, + "loss": 0.1406, + "step": 7958 + }, + { + "epoch": 2.939069423929099, + "grad_norm": 0.22918717563152313, + "learning_rate": 4.064539967976352e-06, + "loss": 0.1304, + "step": 7959 + }, + { + "epoch": 2.9394387001477105, + "grad_norm": 0.24279744923114777, + "learning_rate": 4.039906392412859e-06, + "loss": 0.1458, + "step": 7960 + }, + { + "epoch": 2.939807976366322, + "grad_norm": 0.2192680537700653, + "learning_rate": 4.015272816849366e-06, + "loss": 0.1347, + "step": 7961 + }, + { + "epoch": 2.9401772525849337, + "grad_norm": 0.2706016004085541, + "learning_rate": 3.990639241285873e-06, + "loss": 0.1713, + "step": 7962 + }, + { + "epoch": 2.9405465288035453, + "grad_norm": 0.2554149627685547, + "learning_rate": 3.96600566572238e-06, + "loss": 0.1701, + "step": 7963 + }, + { + "epoch": 2.9409158050221564, + "grad_norm": 0.2816751003265381, + "learning_rate": 3.941372090158887e-06, + "loss": 0.1609, + "step": 7964 + }, + { + "epoch": 2.941285081240768, + "grad_norm": 0.27880871295928955, + "learning_rate": 3.916738514595394e-06, + "loss": 0.1774, + "step": 7965 + }, + { + "epoch": 2.9416543574593796, + "grad_norm": 0.2953261137008667, + "learning_rate": 3.892104939031901e-06, + "loss": 0.1549, + "step": 7966 + }, + { + "epoch": 2.9420236336779912, + "grad_norm": 0.24242518842220306, + "learning_rate": 3.867471363468408e-06, + "loss": 0.1626, + "step": 7967 + }, + { + "epoch": 2.9423929098966024, + "grad_norm": 0.2703699469566345, + "learning_rate": 3.842837787904915e-06, + "loss": 0.1405, + "step": 7968 + }, + { + "epoch": 2.942762186115214, + "grad_norm": 0.2523631751537323, + "learning_rate": 3.818204212341422e-06, + "loss": 0.1619, + "step": 7969 + }, + { + "epoch": 2.9431314623338256, + "grad_norm": 0.23782677948474884, + "learning_rate": 3.793570636777929e-06, + "loss": 0.1483, + "step": 7970 + }, + { + "epoch": 2.943500738552437, + "grad_norm": 0.24940797686576843, + "learning_rate": 3.768937061214436e-06, + "loss": 0.1475, + "step": 7971 + }, + { + "epoch": 2.943870014771049, + "grad_norm": 0.2472488433122635, + "learning_rate": 3.744303485650942e-06, + "loss": 0.1419, + "step": 7972 + }, + { + "epoch": 2.9442392909896604, + "grad_norm": 0.23008085787296295, + "learning_rate": 3.719669910087449e-06, + "loss": 0.1483, + "step": 7973 + }, + { + "epoch": 2.944608567208272, + "grad_norm": 0.2478325366973877, + "learning_rate": 3.695036334523956e-06, + "loss": 0.1396, + "step": 7974 + }, + { + "epoch": 2.944977843426883, + "grad_norm": 0.23091351985931396, + "learning_rate": 3.670402758960463e-06, + "loss": 0.1586, + "step": 7975 + }, + { + "epoch": 2.945347119645495, + "grad_norm": 0.29193294048309326, + "learning_rate": 3.64576918339697e-06, + "loss": 0.1902, + "step": 7976 + }, + { + "epoch": 2.9457163958641064, + "grad_norm": 0.2566969096660614, + "learning_rate": 3.621135607833477e-06, + "loss": 0.1665, + "step": 7977 + }, + { + "epoch": 2.946085672082718, + "grad_norm": 0.28985485434532166, + "learning_rate": 3.596502032269984e-06, + "loss": 0.1659, + "step": 7978 + }, + { + "epoch": 2.946454948301329, + "grad_norm": 0.2937055230140686, + "learning_rate": 3.571868456706491e-06, + "loss": 0.1791, + "step": 7979 + }, + { + "epoch": 2.9468242245199407, + "grad_norm": 0.2837137281894684, + "learning_rate": 3.547234881142998e-06, + "loss": 0.1464, + "step": 7980 + }, + { + "epoch": 2.9471935007385524, + "grad_norm": 0.23624473810195923, + "learning_rate": 3.522601305579505e-06, + "loss": 0.143, + "step": 7981 + }, + { + "epoch": 2.947562776957164, + "grad_norm": 0.27758723497390747, + "learning_rate": 3.497967730016012e-06, + "loss": 0.1749, + "step": 7982 + }, + { + "epoch": 2.9479320531757756, + "grad_norm": 0.25888416171073914, + "learning_rate": 3.473334154452519e-06, + "loss": 0.1527, + "step": 7983 + }, + { + "epoch": 2.948301329394387, + "grad_norm": 0.27565741539001465, + "learning_rate": 3.448700578889026e-06, + "loss": 0.1413, + "step": 7984 + }, + { + "epoch": 2.9486706056129988, + "grad_norm": 0.30611753463745117, + "learning_rate": 3.424067003325533e-06, + "loss": 0.1435, + "step": 7985 + }, + { + "epoch": 2.94903988183161, + "grad_norm": 0.23541276156902313, + "learning_rate": 3.3994334277620402e-06, + "loss": 0.15, + "step": 7986 + }, + { + "epoch": 2.9494091580502215, + "grad_norm": 0.2765224575996399, + "learning_rate": 3.3747998521985464e-06, + "loss": 0.149, + "step": 7987 + }, + { + "epoch": 2.949778434268833, + "grad_norm": 0.2600533962249756, + "learning_rate": 3.3501662766350534e-06, + "loss": 0.1518, + "step": 7988 + }, + { + "epoch": 2.9501477104874447, + "grad_norm": 0.2566416263580322, + "learning_rate": 3.3255327010715604e-06, + "loss": 0.1521, + "step": 7989 + }, + { + "epoch": 2.950516986706056, + "grad_norm": 0.21060623228549957, + "learning_rate": 3.3008991255080674e-06, + "loss": 0.134, + "step": 7990 + }, + { + "epoch": 2.9508862629246675, + "grad_norm": 0.3320417106151581, + "learning_rate": 3.2762655499445745e-06, + "loss": 0.1928, + "step": 7991 + }, + { + "epoch": 2.951255539143279, + "grad_norm": 0.2630045413970947, + "learning_rate": 3.2516319743810815e-06, + "loss": 0.1602, + "step": 7992 + }, + { + "epoch": 2.9516248153618907, + "grad_norm": 0.3391921818256378, + "learning_rate": 3.2269983988175885e-06, + "loss": 0.1729, + "step": 7993 + }, + { + "epoch": 2.9519940915805023, + "grad_norm": 0.2500394284725189, + "learning_rate": 3.2023648232540955e-06, + "loss": 0.1431, + "step": 7994 + }, + { + "epoch": 2.952363367799114, + "grad_norm": 0.24159610271453857, + "learning_rate": 3.1777312476906025e-06, + "loss": 0.133, + "step": 7995 + }, + { + "epoch": 2.9527326440177255, + "grad_norm": 0.24430900812149048, + "learning_rate": 3.1530976721271095e-06, + "loss": 0.1492, + "step": 7996 + }, + { + "epoch": 2.9531019202363367, + "grad_norm": 0.2748737335205078, + "learning_rate": 3.1284640965636165e-06, + "loss": 0.1603, + "step": 7997 + }, + { + "epoch": 2.9534711964549483, + "grad_norm": 0.3518620729446411, + "learning_rate": 3.103830521000123e-06, + "loss": 0.1837, + "step": 7998 + }, + { + "epoch": 2.95384047267356, + "grad_norm": 0.2794644832611084, + "learning_rate": 3.07919694543663e-06, + "loss": 0.1646, + "step": 7999 + }, + { + "epoch": 2.9542097488921715, + "grad_norm": 0.28425800800323486, + "learning_rate": 3.054563369873137e-06, + "loss": 0.1596, + "step": 8000 + }, + { + "epoch": 2.9542097488921715, + "eval_loss": 9.002720832824707, + "eval_runtime": 6.9202, + "eval_samples_per_second": 7.225, + "eval_steps_per_second": 1.012, + "step": 8000 + }, + { + "epoch": 2.9545790251107826, + "grad_norm": 0.25080424547195435, + "learning_rate": 3.029929794309644e-06, + "loss": 0.1407, + "step": 8001 + }, + { + "epoch": 2.9549483013293942, + "grad_norm": 0.23469460010528564, + "learning_rate": 3.005296218746151e-06, + "loss": 0.1412, + "step": 8002 + }, + { + "epoch": 2.955317577548006, + "grad_norm": 0.2845630347728729, + "learning_rate": 2.9806626431826582e-06, + "loss": 0.1565, + "step": 8003 + }, + { + "epoch": 2.9556868537666174, + "grad_norm": 0.24741552770137787, + "learning_rate": 2.9560290676191652e-06, + "loss": 0.1321, + "step": 8004 + }, + { + "epoch": 2.956056129985229, + "grad_norm": 0.2677769362926483, + "learning_rate": 2.931395492055672e-06, + "loss": 0.189, + "step": 8005 + }, + { + "epoch": 2.9564254062038406, + "grad_norm": 0.28130677342414856, + "learning_rate": 2.906761916492179e-06, + "loss": 0.1476, + "step": 8006 + }, + { + "epoch": 2.9567946824224522, + "grad_norm": 0.23087288439273834, + "learning_rate": 2.882128340928686e-06, + "loss": 0.1475, + "step": 8007 + }, + { + "epoch": 2.9571639586410634, + "grad_norm": 0.2836574912071228, + "learning_rate": 2.857494765365193e-06, + "loss": 0.1809, + "step": 8008 + }, + { + "epoch": 2.957533234859675, + "grad_norm": 0.2461574375629425, + "learning_rate": 2.8328611898017e-06, + "loss": 0.1543, + "step": 8009 + }, + { + "epoch": 2.9579025110782866, + "grad_norm": 0.22171956300735474, + "learning_rate": 2.808227614238207e-06, + "loss": 0.1475, + "step": 8010 + }, + { + "epoch": 2.958271787296898, + "grad_norm": 0.23095613718032837, + "learning_rate": 2.783594038674714e-06, + "loss": 0.1587, + "step": 8011 + }, + { + "epoch": 2.9586410635155094, + "grad_norm": 0.2358042150735855, + "learning_rate": 2.7589604631112205e-06, + "loss": 0.1519, + "step": 8012 + }, + { + "epoch": 2.959010339734121, + "grad_norm": 0.22689734399318695, + "learning_rate": 2.7343268875477275e-06, + "loss": 0.1457, + "step": 8013 + }, + { + "epoch": 2.9593796159527326, + "grad_norm": 0.26101285219192505, + "learning_rate": 2.7096933119842346e-06, + "loss": 0.1431, + "step": 8014 + }, + { + "epoch": 2.959748892171344, + "grad_norm": 0.3013013005256653, + "learning_rate": 2.6850597364207416e-06, + "loss": 0.1541, + "step": 8015 + }, + { + "epoch": 2.960118168389956, + "grad_norm": 0.3007984757423401, + "learning_rate": 2.6604261608572486e-06, + "loss": 0.1637, + "step": 8016 + }, + { + "epoch": 2.9604874446085674, + "grad_norm": 0.279112845659256, + "learning_rate": 2.6357925852937556e-06, + "loss": 0.1583, + "step": 8017 + }, + { + "epoch": 2.960856720827179, + "grad_norm": 0.3156653940677643, + "learning_rate": 2.6111590097302626e-06, + "loss": 0.1568, + "step": 8018 + }, + { + "epoch": 2.96122599704579, + "grad_norm": 0.29697200655937195, + "learning_rate": 2.5865254341667692e-06, + "loss": 0.1848, + "step": 8019 + }, + { + "epoch": 2.9615952732644018, + "grad_norm": 0.26294344663619995, + "learning_rate": 2.5618918586032762e-06, + "loss": 0.1484, + "step": 8020 + }, + { + "epoch": 2.9619645494830134, + "grad_norm": 0.2446885108947754, + "learning_rate": 2.5372582830397832e-06, + "loss": 0.1502, + "step": 8021 + }, + { + "epoch": 2.962333825701625, + "grad_norm": 0.24274413287639618, + "learning_rate": 2.5126247074762903e-06, + "loss": 0.1571, + "step": 8022 + }, + { + "epoch": 2.962703101920236, + "grad_norm": 0.24617062509059906, + "learning_rate": 2.4879911319127973e-06, + "loss": 0.1563, + "step": 8023 + }, + { + "epoch": 2.9630723781388477, + "grad_norm": 0.21746572852134705, + "learning_rate": 2.4633575563493043e-06, + "loss": 0.1313, + "step": 8024 + }, + { + "epoch": 2.9634416543574593, + "grad_norm": 0.2738996744155884, + "learning_rate": 2.4387239807858113e-06, + "loss": 0.1593, + "step": 8025 + }, + { + "epoch": 2.963810930576071, + "grad_norm": 0.2826036512851715, + "learning_rate": 2.4140904052223183e-06, + "loss": 0.1902, + "step": 8026 + }, + { + "epoch": 2.9641802067946825, + "grad_norm": 0.34158679842948914, + "learning_rate": 2.389456829658825e-06, + "loss": 0.1872, + "step": 8027 + }, + { + "epoch": 2.964549483013294, + "grad_norm": 0.2509957551956177, + "learning_rate": 2.364823254095332e-06, + "loss": 0.1546, + "step": 8028 + }, + { + "epoch": 2.9649187592319057, + "grad_norm": 0.23964618146419525, + "learning_rate": 2.340189678531839e-06, + "loss": 0.1452, + "step": 8029 + }, + { + "epoch": 2.965288035450517, + "grad_norm": 0.2564508020877838, + "learning_rate": 2.315556102968346e-06, + "loss": 0.1482, + "step": 8030 + }, + { + "epoch": 2.9656573116691285, + "grad_norm": 0.22284071147441864, + "learning_rate": 2.290922527404853e-06, + "loss": 0.1343, + "step": 8031 + }, + { + "epoch": 2.96602658788774, + "grad_norm": 0.2107546031475067, + "learning_rate": 2.26628895184136e-06, + "loss": 0.1464, + "step": 8032 + }, + { + "epoch": 2.9663958641063517, + "grad_norm": 0.2847454249858856, + "learning_rate": 2.241655376277867e-06, + "loss": 0.1578, + "step": 8033 + }, + { + "epoch": 2.966765140324963, + "grad_norm": 0.30018970370292664, + "learning_rate": 2.2170218007143736e-06, + "loss": 0.1716, + "step": 8034 + }, + { + "epoch": 2.9671344165435745, + "grad_norm": 0.26028838753700256, + "learning_rate": 2.1923882251508806e-06, + "loss": 0.1481, + "step": 8035 + }, + { + "epoch": 2.967503692762186, + "grad_norm": 0.2827204763889313, + "learning_rate": 2.1677546495873876e-06, + "loss": 0.1477, + "step": 8036 + }, + { + "epoch": 2.9678729689807977, + "grad_norm": 0.30093058943748474, + "learning_rate": 2.1431210740238947e-06, + "loss": 0.1523, + "step": 8037 + }, + { + "epoch": 2.9682422451994093, + "grad_norm": 0.24841101467609406, + "learning_rate": 2.1184874984604017e-06, + "loss": 0.1389, + "step": 8038 + }, + { + "epoch": 2.968611521418021, + "grad_norm": 0.27458131313323975, + "learning_rate": 2.0938539228969087e-06, + "loss": 0.1556, + "step": 8039 + }, + { + "epoch": 2.9689807976366325, + "grad_norm": 0.32116448879241943, + "learning_rate": 2.0692203473334157e-06, + "loss": 0.1752, + "step": 8040 + }, + { + "epoch": 2.9693500738552436, + "grad_norm": 0.22927770018577576, + "learning_rate": 2.0445867717699223e-06, + "loss": 0.1376, + "step": 8041 + }, + { + "epoch": 2.9697193500738552, + "grad_norm": 0.26973241567611694, + "learning_rate": 2.0199531962064293e-06, + "loss": 0.1688, + "step": 8042 + }, + { + "epoch": 2.970088626292467, + "grad_norm": 0.2092060148715973, + "learning_rate": 1.9953196206429363e-06, + "loss": 0.1291, + "step": 8043 + }, + { + "epoch": 2.9704579025110784, + "grad_norm": 0.3036794662475586, + "learning_rate": 1.9706860450794434e-06, + "loss": 0.1817, + "step": 8044 + }, + { + "epoch": 2.9708271787296896, + "grad_norm": 0.27744531631469727, + "learning_rate": 1.9460524695159504e-06, + "loss": 0.1739, + "step": 8045 + }, + { + "epoch": 2.971196454948301, + "grad_norm": 0.2619611620903015, + "learning_rate": 1.9214188939524574e-06, + "loss": 0.148, + "step": 8046 + }, + { + "epoch": 2.971565731166913, + "grad_norm": 0.2631448805332184, + "learning_rate": 1.8967853183889644e-06, + "loss": 0.1535, + "step": 8047 + }, + { + "epoch": 2.9719350073855244, + "grad_norm": 0.27350446581840515, + "learning_rate": 1.872151742825471e-06, + "loss": 0.1626, + "step": 8048 + }, + { + "epoch": 2.972304283604136, + "grad_norm": 0.2253258377313614, + "learning_rate": 1.847518167261978e-06, + "loss": 0.139, + "step": 8049 + }, + { + "epoch": 2.9726735598227476, + "grad_norm": 0.3634534776210785, + "learning_rate": 1.822884591698485e-06, + "loss": 0.166, + "step": 8050 + }, + { + "epoch": 2.9726735598227476, + "eval_loss": 8.982169151306152, + "eval_runtime": 6.9021, + "eval_samples_per_second": 7.244, + "eval_steps_per_second": 1.014, + "step": 8050 + }, + { + "epoch": 2.9730428360413588, + "grad_norm": 0.2547619342803955, + "learning_rate": 1.798251016134992e-06, + "loss": 0.1469, + "step": 8051 + }, + { + "epoch": 2.9734121122599704, + "grad_norm": 0.8772432804107666, + "learning_rate": 1.773617440571499e-06, + "loss": 0.1575, + "step": 8052 + }, + { + "epoch": 2.973781388478582, + "grad_norm": 0.2855498194694519, + "learning_rate": 1.748983865008006e-06, + "loss": 0.163, + "step": 8053 + }, + { + "epoch": 2.9741506646971936, + "grad_norm": 0.28774508833885193, + "learning_rate": 1.724350289444513e-06, + "loss": 0.1748, + "step": 8054 + }, + { + "epoch": 2.974519940915805, + "grad_norm": 0.2574138641357422, + "learning_rate": 1.6997167138810201e-06, + "loss": 0.1861, + "step": 8055 + }, + { + "epoch": 2.9748892171344163, + "grad_norm": 0.27626481652259827, + "learning_rate": 1.6750831383175267e-06, + "loss": 0.1567, + "step": 8056 + }, + { + "epoch": 2.975258493353028, + "grad_norm": 0.28363004326820374, + "learning_rate": 1.6504495627540337e-06, + "loss": 0.1512, + "step": 8057 + }, + { + "epoch": 2.9756277695716395, + "grad_norm": 0.274557501077652, + "learning_rate": 1.6258159871905407e-06, + "loss": 0.1748, + "step": 8058 + }, + { + "epoch": 2.975997045790251, + "grad_norm": 0.2636219263076782, + "learning_rate": 1.6011824116270478e-06, + "loss": 0.1581, + "step": 8059 + }, + { + "epoch": 2.9763663220088628, + "grad_norm": 0.21337440609931946, + "learning_rate": 1.5765488360635548e-06, + "loss": 0.1385, + "step": 8060 + }, + { + "epoch": 2.9767355982274744, + "grad_norm": 0.21846963465213776, + "learning_rate": 1.5519152605000616e-06, + "loss": 0.1595, + "step": 8061 + }, + { + "epoch": 2.9771048744460855, + "grad_norm": 0.2536872327327728, + "learning_rate": 1.5272816849365686e-06, + "loss": 0.1401, + "step": 8062 + }, + { + "epoch": 2.977474150664697, + "grad_norm": 0.266182541847229, + "learning_rate": 1.5026481093730756e-06, + "loss": 0.1574, + "step": 8063 + }, + { + "epoch": 2.9778434268833087, + "grad_norm": 0.21301548182964325, + "learning_rate": 1.4780145338095826e-06, + "loss": 0.1478, + "step": 8064 + }, + { + "epoch": 2.9782127031019203, + "grad_norm": 0.25051793456077576, + "learning_rate": 1.4533809582460894e-06, + "loss": 0.1533, + "step": 8065 + }, + { + "epoch": 2.9785819793205315, + "grad_norm": 0.23402763903141022, + "learning_rate": 1.4287473826825964e-06, + "loss": 0.1537, + "step": 8066 + }, + { + "epoch": 2.978951255539143, + "grad_norm": 0.3606404960155487, + "learning_rate": 1.4041138071191035e-06, + "loss": 0.1795, + "step": 8067 + }, + { + "epoch": 2.9793205317577547, + "grad_norm": 0.2494754195213318, + "learning_rate": 1.3794802315556103e-06, + "loss": 0.1563, + "step": 8068 + }, + { + "epoch": 2.9796898079763663, + "grad_norm": 0.258579283952713, + "learning_rate": 1.3548466559921173e-06, + "loss": 0.1508, + "step": 8069 + }, + { + "epoch": 2.980059084194978, + "grad_norm": 0.2852865755558014, + "learning_rate": 1.3302130804286243e-06, + "loss": 0.1621, + "step": 8070 + }, + { + "epoch": 2.9804283604135895, + "grad_norm": 0.3060619831085205, + "learning_rate": 1.3055795048651313e-06, + "loss": 0.147, + "step": 8071 + }, + { + "epoch": 2.980797636632201, + "grad_norm": 0.22039853036403656, + "learning_rate": 1.2809459293016381e-06, + "loss": 0.1667, + "step": 8072 + }, + { + "epoch": 2.9811669128508123, + "grad_norm": 0.3247520327568054, + "learning_rate": 1.2563123537381451e-06, + "loss": 0.1528, + "step": 8073 + }, + { + "epoch": 2.981536189069424, + "grad_norm": 0.28092482686042786, + "learning_rate": 1.2316787781746521e-06, + "loss": 0.1858, + "step": 8074 + }, + { + "epoch": 2.9819054652880355, + "grad_norm": 0.23419290781021118, + "learning_rate": 1.2070452026111592e-06, + "loss": 0.1331, + "step": 8075 + }, + { + "epoch": 2.982274741506647, + "grad_norm": 0.22580453753471375, + "learning_rate": 1.182411627047666e-06, + "loss": 0.1647, + "step": 8076 + }, + { + "epoch": 2.9826440177252582, + "grad_norm": 0.3158695697784424, + "learning_rate": 1.157778051484173e-06, + "loss": 0.176, + "step": 8077 + }, + { + "epoch": 2.98301329394387, + "grad_norm": 0.26688152551651, + "learning_rate": 1.13314447592068e-06, + "loss": 0.1752, + "step": 8078 + }, + { + "epoch": 2.9833825701624814, + "grad_norm": 0.25893792510032654, + "learning_rate": 1.1085109003571868e-06, + "loss": 0.1567, + "step": 8079 + }, + { + "epoch": 2.983751846381093, + "grad_norm": 0.30720555782318115, + "learning_rate": 1.0838773247936938e-06, + "loss": 0.195, + "step": 8080 + }, + { + "epoch": 2.9841211225997046, + "grad_norm": 0.22736316919326782, + "learning_rate": 1.0592437492302008e-06, + "loss": 0.1371, + "step": 8081 + }, + { + "epoch": 2.9844903988183162, + "grad_norm": 0.25446420907974243, + "learning_rate": 1.0346101736667079e-06, + "loss": 0.1512, + "step": 8082 + }, + { + "epoch": 2.984859675036928, + "grad_norm": 0.21786902844905853, + "learning_rate": 1.0099765981032147e-06, + "loss": 0.1159, + "step": 8083 + }, + { + "epoch": 2.985228951255539, + "grad_norm": 0.22750112414360046, + "learning_rate": 9.853430225397217e-07, + "loss": 0.1417, + "step": 8084 + }, + { + "epoch": 2.9855982274741506, + "grad_norm": 0.3298458755016327, + "learning_rate": 9.607094469762287e-07, + "loss": 0.1857, + "step": 8085 + }, + { + "epoch": 2.985967503692762, + "grad_norm": 0.24306781589984894, + "learning_rate": 9.360758714127355e-07, + "loss": 0.1542, + "step": 8086 + }, + { + "epoch": 2.986336779911374, + "grad_norm": 0.28355881571769714, + "learning_rate": 9.114422958492425e-07, + "loss": 0.1677, + "step": 8087 + }, + { + "epoch": 2.986706056129985, + "grad_norm": 0.31169894337654114, + "learning_rate": 8.868087202857495e-07, + "loss": 0.1748, + "step": 8088 + }, + { + "epoch": 2.9870753323485966, + "grad_norm": 0.2226739525794983, + "learning_rate": 8.621751447222565e-07, + "loss": 0.141, + "step": 8089 + }, + { + "epoch": 2.987444608567208, + "grad_norm": 0.29178452491760254, + "learning_rate": 8.375415691587634e-07, + "loss": 0.1447, + "step": 8090 + }, + { + "epoch": 2.9878138847858198, + "grad_norm": 0.2792452871799469, + "learning_rate": 8.129079935952704e-07, + "loss": 0.1584, + "step": 8091 + }, + { + "epoch": 2.9881831610044314, + "grad_norm": 0.259443461894989, + "learning_rate": 7.882744180317774e-07, + "loss": 0.1594, + "step": 8092 + }, + { + "epoch": 2.988552437223043, + "grad_norm": 0.27456268668174744, + "learning_rate": 7.636408424682843e-07, + "loss": 0.1702, + "step": 8093 + }, + { + "epoch": 2.9889217134416546, + "grad_norm": 0.22297781705856323, + "learning_rate": 7.390072669047913e-07, + "loss": 0.1277, + "step": 8094 + }, + { + "epoch": 2.9892909896602657, + "grad_norm": 0.2450307160615921, + "learning_rate": 7.143736913412982e-07, + "loss": 0.1525, + "step": 8095 + }, + { + "epoch": 2.9896602658788773, + "grad_norm": 0.27379122376441956, + "learning_rate": 6.897401157778051e-07, + "loss": 0.1532, + "step": 8096 + }, + { + "epoch": 2.990029542097489, + "grad_norm": 0.20734794437885284, + "learning_rate": 6.651065402143121e-07, + "loss": 0.125, + "step": 8097 + }, + { + "epoch": 2.9903988183161005, + "grad_norm": 0.2515438497066498, + "learning_rate": 6.404729646508191e-07, + "loss": 0.1499, + "step": 8098 + }, + { + "epoch": 2.9907680945347117, + "grad_norm": 0.24936993420124054, + "learning_rate": 6.158393890873261e-07, + "loss": 0.1447, + "step": 8099 + }, + { + "epoch": 2.9911373707533233, + "grad_norm": 0.2349616289138794, + "learning_rate": 5.91205813523833e-07, + "loss": 0.1363, + "step": 8100 + }, + { + "epoch": 2.9911373707533233, + "eval_loss": 9.003331184387207, + "eval_runtime": 6.905, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 1.014, + "step": 8100 + }, + { + "epoch": 2.991506646971935, + "grad_norm": 0.21882687509059906, + "learning_rate": 5.6657223796034e-07, + "loss": 0.146, + "step": 8101 + }, + { + "epoch": 2.9918759231905465, + "grad_norm": 0.22272057831287384, + "learning_rate": 5.419386623968469e-07, + "loss": 0.1202, + "step": 8102 + }, + { + "epoch": 2.992245199409158, + "grad_norm": 0.31489813327789307, + "learning_rate": 5.173050868333539e-07, + "loss": 0.1592, + "step": 8103 + }, + { + "epoch": 2.9926144756277697, + "grad_norm": 0.2614988684654236, + "learning_rate": 4.926715112698608e-07, + "loss": 0.1545, + "step": 8104 + }, + { + "epoch": 2.9929837518463813, + "grad_norm": 0.2447344958782196, + "learning_rate": 4.6803793570636775e-07, + "loss": 0.1888, + "step": 8105 + }, + { + "epoch": 2.9933530280649925, + "grad_norm": 0.2196211963891983, + "learning_rate": 4.4340436014287476e-07, + "loss": 0.1414, + "step": 8106 + }, + { + "epoch": 2.993722304283604, + "grad_norm": 0.28387728333473206, + "learning_rate": 4.187707845793817e-07, + "loss": 0.1899, + "step": 8107 + }, + { + "epoch": 2.9940915805022157, + "grad_norm": 0.2550733685493469, + "learning_rate": 3.941372090158887e-07, + "loss": 0.1655, + "step": 8108 + }, + { + "epoch": 2.9944608567208273, + "grad_norm": 0.32761019468307495, + "learning_rate": 3.6950363345239565e-07, + "loss": 0.1783, + "step": 8109 + }, + { + "epoch": 2.9948301329394384, + "grad_norm": 0.2537683844566345, + "learning_rate": 3.4487005788890257e-07, + "loss": 0.1673, + "step": 8110 + }, + { + "epoch": 2.99519940915805, + "grad_norm": 0.2546219825744629, + "learning_rate": 3.2023648232540953e-07, + "loss": 0.151, + "step": 8111 + }, + { + "epoch": 2.9955686853766617, + "grad_norm": 0.24820037186145782, + "learning_rate": 2.956029067619165e-07, + "loss": 0.1576, + "step": 8112 + }, + { + "epoch": 2.9959379615952733, + "grad_norm": 0.21701163053512573, + "learning_rate": 2.7096933119842346e-07, + "loss": 0.1239, + "step": 8113 + }, + { + "epoch": 2.996307237813885, + "grad_norm": 0.20344412326812744, + "learning_rate": 2.463357556349304e-07, + "loss": 0.1293, + "step": 8114 + }, + { + "epoch": 2.9966765140324965, + "grad_norm": 0.2563105821609497, + "learning_rate": 2.2170218007143738e-07, + "loss": 0.1415, + "step": 8115 + }, + { + "epoch": 2.997045790251108, + "grad_norm": 0.3039357662200928, + "learning_rate": 1.9706860450794435e-07, + "loss": 0.193, + "step": 8116 + }, + { + "epoch": 2.9974150664697192, + "grad_norm": 0.2374250590801239, + "learning_rate": 1.7243502894445128e-07, + "loss": 0.1498, + "step": 8117 + }, + { + "epoch": 2.997784342688331, + "grad_norm": 0.23057468235492706, + "learning_rate": 1.4780145338095825e-07, + "loss": 0.1291, + "step": 8118 + }, + { + "epoch": 2.9981536189069424, + "grad_norm": 0.2890825867652893, + "learning_rate": 1.231678778174652e-07, + "loss": 0.1688, + "step": 8119 + }, + { + "epoch": 2.998522895125554, + "grad_norm": 0.3223683834075928, + "learning_rate": 9.853430225397217e-08, + "loss": 0.1768, + "step": 8120 + }, + { + "epoch": 2.998892171344165, + "grad_norm": 0.24776941537857056, + "learning_rate": 7.390072669047912e-08, + "loss": 0.142, + "step": 8121 + }, + { + "epoch": 2.999261447562777, + "grad_norm": 0.2459111511707306, + "learning_rate": 4.9267151126986086e-08, + "loss": 0.1533, + "step": 8122 + }, + { + "epoch": 2.9996307237813884, + "grad_norm": 0.3260028064250946, + "learning_rate": 2.4633575563493043e-08, + "loss": 0.1501, + "step": 8123 + }, + { + "epoch": 3.0, + "grad_norm": 0.32445764541625977, + "learning_rate": 0.0, + "loss": 0.1581, + "step": 8124 + } + ], + "logging_steps": 1, + "max_steps": 8124, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.258457622197371e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}