{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0005134788189987,
"eval_steps": 244,
"global_step": 974,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010269576379974327,
"grad_norm": 3.957693576812744,
"learning_rate": 2e-05,
"loss": 3.6592,
"step": 1
},
{
"epoch": 0.0010269576379974327,
"eval_loss": 3.853421926498413,
"eval_runtime": 26.7657,
"eval_samples_per_second": 15.318,
"eval_steps_per_second": 7.659,
"step": 1
},
{
"epoch": 0.0020539152759948653,
"grad_norm": 4.336184501647949,
"learning_rate": 4e-05,
"loss": 4.1936,
"step": 2
},
{
"epoch": 0.0030808729139922978,
"grad_norm": 3.962000846862793,
"learning_rate": 6e-05,
"loss": 3.801,
"step": 3
},
{
"epoch": 0.004107830551989731,
"grad_norm": 4.063741207122803,
"learning_rate": 8e-05,
"loss": 3.8505,
"step": 4
},
{
"epoch": 0.005134788189987163,
"grad_norm": 3.923612594604492,
"learning_rate": 0.0001,
"loss": 3.5915,
"step": 5
},
{
"epoch": 0.0061617458279845955,
"grad_norm": 3.681149959564209,
"learning_rate": 0.00012,
"loss": 3.4027,
"step": 6
},
{
"epoch": 0.007188703465982028,
"grad_norm": 2.262376546859741,
"learning_rate": 0.00014,
"loss": 2.8853,
"step": 7
},
{
"epoch": 0.008215661103979461,
"grad_norm": 1.8211342096328735,
"learning_rate": 0.00016,
"loss": 2.8695,
"step": 8
},
{
"epoch": 0.009242618741976894,
"grad_norm": 3.154785633087158,
"learning_rate": 0.00018,
"loss": 2.9279,
"step": 9
},
{
"epoch": 0.010269576379974325,
"grad_norm": 2.458535671234131,
"learning_rate": 0.0002,
"loss": 2.9412,
"step": 10
},
{
"epoch": 0.011296534017971758,
"grad_norm": 3.2424135208129883,
"learning_rate": 0.0001999994689745966,
"loss": 2.979,
"step": 11
},
{
"epoch": 0.012323491655969191,
"grad_norm": 2.4848523139953613,
"learning_rate": 0.00019999787590402606,
"loss": 2.9422,
"step": 12
},
{
"epoch": 0.013350449293966624,
"grad_norm": 1.9753702878952026,
"learning_rate": 0.00019999522080520765,
"loss": 2.7611,
"step": 13
},
{
"epoch": 0.014377406931964057,
"grad_norm": 2.2294538021087646,
"learning_rate": 0.00019999150370633988,
"loss": 2.5417,
"step": 14
},
{
"epoch": 0.01540436456996149,
"grad_norm": 2.10558819770813,
"learning_rate": 0.00019998672464690022,
"loss": 2.6994,
"step": 15
},
{
"epoch": 0.016431322207958923,
"grad_norm": 1.6060516834259033,
"learning_rate": 0.00019998088367764467,
"loss": 2.6246,
"step": 16
},
{
"epoch": 0.017458279845956354,
"grad_norm": 1.5200241804122925,
"learning_rate": 0.00019997398086060735,
"loss": 2.4818,
"step": 17
},
{
"epoch": 0.01848523748395379,
"grad_norm": 1.8856867551803589,
"learning_rate": 0.00019996601626909964,
"loss": 2.895,
"step": 18
},
{
"epoch": 0.01951219512195122,
"grad_norm": 3.8515360355377197,
"learning_rate": 0.00019995698998770956,
"loss": 3.6369,
"step": 19
},
{
"epoch": 0.02053915275994865,
"grad_norm": 2.049919366836548,
"learning_rate": 0.00019994690211230082,
"loss": 2.6389,
"step": 20
},
{
"epoch": 0.021566110397946085,
"grad_norm": 1.6166138648986816,
"learning_rate": 0.00019993575275001175,
"loss": 2.4489,
"step": 21
},
{
"epoch": 0.022593068035943516,
"grad_norm": 2.4144058227539062,
"learning_rate": 0.00019992354201925428,
"loss": 2.6399,
"step": 22
},
{
"epoch": 0.02362002567394095,
"grad_norm": 1.8632601499557495,
"learning_rate": 0.00019991027004971255,
"loss": 2.4482,
"step": 23
},
{
"epoch": 0.024646983311938382,
"grad_norm": 1.7382563352584839,
"learning_rate": 0.00019989593698234163,
"loss": 2.4847,
"step": 24
},
{
"epoch": 0.025673940949935817,
"grad_norm": 1.5046484470367432,
"learning_rate": 0.000199880542969366,
"loss": 2.1829,
"step": 25
},
{
"epoch": 0.026700898587933248,
"grad_norm": 1.998173475265503,
"learning_rate": 0.0001998640881742778,
"loss": 2.5759,
"step": 26
},
{
"epoch": 0.02772785622593068,
"grad_norm": 1.9964630603790283,
"learning_rate": 0.00019984657277183544,
"loss": 2.1866,
"step": 27
},
{
"epoch": 0.028754813863928114,
"grad_norm": 1.5282529592514038,
"learning_rate": 0.00019982799694806135,
"loss": 2.4439,
"step": 28
},
{
"epoch": 0.029781771501925545,
"grad_norm": 1.4352314472198486,
"learning_rate": 0.0001998083609002402,
"loss": 2.3027,
"step": 29
},
{
"epoch": 0.03080872913992298,
"grad_norm": 1.9962196350097656,
"learning_rate": 0.00019978766483691676,
"loss": 2.696,
"step": 30
},
{
"epoch": 0.03183568677792041,
"grad_norm": 1.603895902633667,
"learning_rate": 0.00019976590897789382,
"loss": 2.5533,
"step": 31
},
{
"epoch": 0.032862644415917845,
"grad_norm": 1.7145624160766602,
"learning_rate": 0.00019974309355422963,
"loss": 2.4172,
"step": 32
},
{
"epoch": 0.03388960205391527,
"grad_norm": 1.640822172164917,
"learning_rate": 0.00019971921880823553,
"loss": 2.5851,
"step": 33
},
{
"epoch": 0.03491655969191271,
"grad_norm": 1.5972627401351929,
"learning_rate": 0.0001996942849934735,
"loss": 2.378,
"step": 34
},
{
"epoch": 0.03594351732991014,
"grad_norm": 1.2989180088043213,
"learning_rate": 0.0001996682923747533,
"loss": 2.2183,
"step": 35
},
{
"epoch": 0.03697047496790758,
"grad_norm": 1.5472369194030762,
"learning_rate": 0.00019964124122812975,
"loss": 2.4299,
"step": 36
},
{
"epoch": 0.037997432605905004,
"grad_norm": 1.8404388427734375,
"learning_rate": 0.0001996131318408998,
"loss": 2.6303,
"step": 37
},
{
"epoch": 0.03902439024390244,
"grad_norm": 1.7805739641189575,
"learning_rate": 0.00019958396451159936,
"loss": 2.5277,
"step": 38
},
{
"epoch": 0.040051347881899874,
"grad_norm": 1.4411464929580688,
"learning_rate": 0.0001995537395500004,
"loss": 2.5659,
"step": 39
},
{
"epoch": 0.0410783055198973,
"grad_norm": 2.1974802017211914,
"learning_rate": 0.00019952245727710723,
"loss": 2.4178,
"step": 40
},
{
"epoch": 0.042105263157894736,
"grad_norm": 1.7201310396194458,
"learning_rate": 0.00019949011802515356,
"loss": 2.3348,
"step": 41
},
{
"epoch": 0.04313222079589217,
"grad_norm": 1.501447319984436,
"learning_rate": 0.0001994567221375987,
"loss": 2.2731,
"step": 42
},
{
"epoch": 0.044159178433889605,
"grad_norm": 2.366774797439575,
"learning_rate": 0.00019942226996912384,
"loss": 2.4477,
"step": 43
},
{
"epoch": 0.04518613607188703,
"grad_norm": 1.6313987970352173,
"learning_rate": 0.00019938676188562863,
"loss": 2.6674,
"step": 44
},
{
"epoch": 0.04621309370988447,
"grad_norm": 1.5255392789840698,
"learning_rate": 0.00019935019826422692,
"loss": 2.428,
"step": 45
},
{
"epoch": 0.0472400513478819,
"grad_norm": 1.4218732118606567,
"learning_rate": 0.00019931257949324288,
"loss": 2.3119,
"step": 46
},
{
"epoch": 0.04826700898587933,
"grad_norm": 1.8226125240325928,
"learning_rate": 0.0001992739059722071,
"loss": 2.6293,
"step": 47
},
{
"epoch": 0.049293966623876764,
"grad_norm": 1.7285653352737427,
"learning_rate": 0.00019923417811185186,
"loss": 2.5319,
"step": 48
},
{
"epoch": 0.0503209242618742,
"grad_norm": 1.639489769935608,
"learning_rate": 0.00019919339633410737,
"loss": 2.5006,
"step": 49
},
{
"epoch": 0.051347881899871634,
"grad_norm": 1.6802177429199219,
"learning_rate": 0.00019915156107209675,
"loss": 2.4881,
"step": 50
},
{
"epoch": 0.05237483953786906,
"grad_norm": 1.5928065776824951,
"learning_rate": 0.0001991086727701317,
"loss": 2.3264,
"step": 51
},
{
"epoch": 0.053401797175866496,
"grad_norm": 1.5361305475234985,
"learning_rate": 0.0001990647318837079,
"loss": 2.3946,
"step": 52
},
{
"epoch": 0.05442875481386393,
"grad_norm": 1.5837557315826416,
"learning_rate": 0.0001990197388794998,
"loss": 2.4201,
"step": 53
},
{
"epoch": 0.05545571245186136,
"grad_norm": 1.4728038311004639,
"learning_rate": 0.000198973694235356,
"loss": 2.3488,
"step": 54
},
{
"epoch": 0.05648267008985879,
"grad_norm": 1.5199151039123535,
"learning_rate": 0.00019892659844029397,
"loss": 2.2515,
"step": 55
},
{
"epoch": 0.05750962772785623,
"grad_norm": 1.5088133811950684,
"learning_rate": 0.00019887845199449504,
"loss": 2.2909,
"step": 56
},
{
"epoch": 0.05853658536585366,
"grad_norm": 1.553422451019287,
"learning_rate": 0.00019882925540929888,
"loss": 2.6474,
"step": 57
},
{
"epoch": 0.05956354300385109,
"grad_norm": 1.7582780122756958,
"learning_rate": 0.00019877900920719827,
"loss": 2.4533,
"step": 58
},
{
"epoch": 0.060590500641848524,
"grad_norm": 1.7055028676986694,
"learning_rate": 0.00019872771392183332,
"loss": 2.4449,
"step": 59
},
{
"epoch": 0.06161745827984596,
"grad_norm": 1.4432878494262695,
"learning_rate": 0.0001986753700979861,
"loss": 2.2296,
"step": 60
},
{
"epoch": 0.0626444159178434,
"grad_norm": 1.9076869487762451,
"learning_rate": 0.00019862197829157457,
"loss": 2.5907,
"step": 61
},
{
"epoch": 0.06367137355584082,
"grad_norm": 1.348199725151062,
"learning_rate": 0.00019856753906964686,
"loss": 2.2372,
"step": 62
},
{
"epoch": 0.06469833119383825,
"grad_norm": 1.7192755937576294,
"learning_rate": 0.0001985120530103752,
"loss": 2.4893,
"step": 63
},
{
"epoch": 0.06572528883183569,
"grad_norm": 1.456470251083374,
"learning_rate": 0.00019845552070304966,
"loss": 2.3985,
"step": 64
},
{
"epoch": 0.06675224646983312,
"grad_norm": 1.9413789510726929,
"learning_rate": 0.00019839794274807213,
"loss": 2.3872,
"step": 65
},
{
"epoch": 0.06777920410783055,
"grad_norm": 1.5768566131591797,
"learning_rate": 0.0001983393197569497,
"loss": 2.1808,
"step": 66
},
{
"epoch": 0.06880616174582799,
"grad_norm": 1.5114476680755615,
"learning_rate": 0.00019827965235228834,
"loss": 2.0469,
"step": 67
},
{
"epoch": 0.06983311938382541,
"grad_norm": 2.0698795318603516,
"learning_rate": 0.00019821894116778615,
"loss": 2.3916,
"step": 68
},
{
"epoch": 0.07086007702182286,
"grad_norm": 1.5041264295578003,
"learning_rate": 0.00019815718684822688,
"loss": 2.4773,
"step": 69
},
{
"epoch": 0.07188703465982028,
"grad_norm": 1.5561423301696777,
"learning_rate": 0.00019809439004947268,
"loss": 2.2211,
"step": 70
},
{
"epoch": 0.07291399229781771,
"grad_norm": 1.5526721477508545,
"learning_rate": 0.00019803055143845745,
"loss": 2.2947,
"step": 71
},
{
"epoch": 0.07394094993581515,
"grad_norm": 2.027614116668701,
"learning_rate": 0.00019796567169317973,
"loss": 2.4629,
"step": 72
},
{
"epoch": 0.07496790757381258,
"grad_norm": 1.4370557069778442,
"learning_rate": 0.00019789975150269536,
"loss": 2.3772,
"step": 73
},
{
"epoch": 0.07599486521181001,
"grad_norm": 1.493373990058899,
"learning_rate": 0.00019783279156711022,
"loss": 2.169,
"step": 74
},
{
"epoch": 0.07702182284980745,
"grad_norm": 1.905671238899231,
"learning_rate": 0.00019776479259757287,
"loss": 2.3834,
"step": 75
},
{
"epoch": 0.07804878048780488,
"grad_norm": 1.5494719743728638,
"learning_rate": 0.00019769575531626695,
"loss": 2.1067,
"step": 76
},
{
"epoch": 0.0790757381258023,
"grad_norm": 1.686172366142273,
"learning_rate": 0.00019762568045640343,
"loss": 2.3663,
"step": 77
},
{
"epoch": 0.08010269576379975,
"grad_norm": 1.640641689300537,
"learning_rate": 0.0001975545687622129,
"loss": 2.4287,
"step": 78
},
{
"epoch": 0.08112965340179717,
"grad_norm": 1.7262331247329712,
"learning_rate": 0.0001974824209889377,
"loss": 2.5115,
"step": 79
},
{
"epoch": 0.0821566110397946,
"grad_norm": 1.6362113952636719,
"learning_rate": 0.00019740923790282389,
"loss": 2.2623,
"step": 80
},
{
"epoch": 0.08318356867779204,
"grad_norm": 1.9334293603897095,
"learning_rate": 0.00019733502028111295,
"loss": 2.3749,
"step": 81
},
{
"epoch": 0.08421052631578947,
"grad_norm": 1.7188832759857178,
"learning_rate": 0.00019725976891203376,
"loss": 2.3507,
"step": 82
},
{
"epoch": 0.0852374839537869,
"grad_norm": 1.7757607698440552,
"learning_rate": 0.0001971834845947941,
"loss": 2.3925,
"step": 83
},
{
"epoch": 0.08626444159178434,
"grad_norm": 1.2982068061828613,
"learning_rate": 0.00019710616813957218,
"loss": 2.0348,
"step": 84
},
{
"epoch": 0.08729139922978177,
"grad_norm": 1.8079991340637207,
"learning_rate": 0.000197027820367508,
"loss": 2.3944,
"step": 85
},
{
"epoch": 0.08831835686777921,
"grad_norm": 1.812174916267395,
"learning_rate": 0.00019694844211069477,
"loss": 2.2011,
"step": 86
},
{
"epoch": 0.08934531450577664,
"grad_norm": 1.7926369905471802,
"learning_rate": 0.00019686803421216985,
"loss": 2.5529,
"step": 87
},
{
"epoch": 0.09037227214377407,
"grad_norm": 1.5705771446228027,
"learning_rate": 0.00019678659752590602,
"loss": 2.2078,
"step": 88
},
{
"epoch": 0.09139922978177151,
"grad_norm": 1.5993576049804688,
"learning_rate": 0.00019670413291680223,
"loss": 2.4257,
"step": 89
},
{
"epoch": 0.09242618741976893,
"grad_norm": 1.531992793083191,
"learning_rate": 0.00019662064126067452,
"loss": 2.3995,
"step": 90
},
{
"epoch": 0.09345314505776636,
"grad_norm": 1.2814476490020752,
"learning_rate": 0.0001965361234442467,
"loss": 2.4486,
"step": 91
},
{
"epoch": 0.0944801026957638,
"grad_norm": 1.4734814167022705,
"learning_rate": 0.00019645058036514096,
"loss": 2.1195,
"step": 92
},
{
"epoch": 0.09550706033376123,
"grad_norm": 1.9309577941894531,
"learning_rate": 0.00019636401293186823,
"loss": 2.1903,
"step": 93
},
{
"epoch": 0.09653401797175866,
"grad_norm": 1.780761480331421,
"learning_rate": 0.00019627642206381863,
"loss": 2.1368,
"step": 94
},
{
"epoch": 0.0975609756097561,
"grad_norm": 1.5235934257507324,
"learning_rate": 0.00019618780869125172,
"loss": 2.2518,
"step": 95
},
{
"epoch": 0.09858793324775353,
"grad_norm": 2.0815932750701904,
"learning_rate": 0.0001960981737552865,
"loss": 2.2663,
"step": 96
},
{
"epoch": 0.09961489088575096,
"grad_norm": 1.7994256019592285,
"learning_rate": 0.00019600751820789152,
"loss": 2.2666,
"step": 97
},
{
"epoch": 0.1006418485237484,
"grad_norm": 1.9231226444244385,
"learning_rate": 0.00019591584301187478,
"loss": 2.1893,
"step": 98
},
{
"epoch": 0.10166880616174583,
"grad_norm": 1.6883149147033691,
"learning_rate": 0.00019582314914087342,
"loss": 2.3377,
"step": 99
},
{
"epoch": 0.10269576379974327,
"grad_norm": 1.4279382228851318,
"learning_rate": 0.00019572943757934348,
"loss": 2.3135,
"step": 100
},
{
"epoch": 0.1037227214377407,
"grad_norm": 1.853316307067871,
"learning_rate": 0.00019563470932254932,
"loss": 2.0813,
"step": 101
},
{
"epoch": 0.10474967907573812,
"grad_norm": 1.6416527032852173,
"learning_rate": 0.00019553896537655318,
"loss": 2.3543,
"step": 102
},
{
"epoch": 0.10577663671373556,
"grad_norm": 1.6184829473495483,
"learning_rate": 0.00019544220675820438,
"loss": 2.1147,
"step": 103
},
{
"epoch": 0.10680359435173299,
"grad_norm": 1.7264267206192017,
"learning_rate": 0.00019534443449512862,
"loss": 2.294,
"step": 104
},
{
"epoch": 0.10783055198973042,
"grad_norm": 2.0255539417266846,
"learning_rate": 0.00019524564962571702,
"loss": 2.3318,
"step": 105
},
{
"epoch": 0.10885750962772786,
"grad_norm": 2.9923765659332275,
"learning_rate": 0.0001951458531991151,
"loss": 2.3253,
"step": 106
},
{
"epoch": 0.10988446726572529,
"grad_norm": 2.349273443222046,
"learning_rate": 0.00019504504627521153,
"loss": 2.3938,
"step": 107
},
{
"epoch": 0.11091142490372272,
"grad_norm": 1.649088978767395,
"learning_rate": 0.00019494322992462716,
"loss": 2.3371,
"step": 108
},
{
"epoch": 0.11193838254172016,
"grad_norm": 2.8484344482421875,
"learning_rate": 0.00019484040522870332,
"loss": 2.0286,
"step": 109
},
{
"epoch": 0.11296534017971759,
"grad_norm": 1.859044075012207,
"learning_rate": 0.00019473657327949054,
"loss": 2.2637,
"step": 110
},
{
"epoch": 0.11399229781771501,
"grad_norm": 1.8111779689788818,
"learning_rate": 0.00019463173517973682,
"loss": 2.2017,
"step": 111
},
{
"epoch": 0.11501925545571245,
"grad_norm": 2.0917441844940186,
"learning_rate": 0.00019452589204287612,
"loss": 2.2183,
"step": 112
},
{
"epoch": 0.11604621309370988,
"grad_norm": 1.8554123640060425,
"learning_rate": 0.0001944190449930163,
"loss": 2.3178,
"step": 113
},
{
"epoch": 0.11707317073170732,
"grad_norm": 1.534638524055481,
"learning_rate": 0.00019431119516492726,
"loss": 2.4167,
"step": 114
},
{
"epoch": 0.11810012836970475,
"grad_norm": 1.856016755104065,
"learning_rate": 0.00019420234370402906,
"loss": 2.226,
"step": 115
},
{
"epoch": 0.11912708600770218,
"grad_norm": 2.0289316177368164,
"learning_rate": 0.00019409249176637945,
"loss": 2.4272,
"step": 116
},
{
"epoch": 0.12015404364569962,
"grad_norm": 1.7497624158859253,
"learning_rate": 0.00019398164051866184,
"loss": 2.5199,
"step": 117
},
{
"epoch": 0.12118100128369705,
"grad_norm": 1.3328431844711304,
"learning_rate": 0.00019386979113817282,
"loss": 2.1596,
"step": 118
},
{
"epoch": 0.12220795892169448,
"grad_norm": 1.2402849197387695,
"learning_rate": 0.00019375694481280965,
"loss": 2.138,
"step": 119
},
{
"epoch": 0.12323491655969192,
"grad_norm": 1.7122734785079956,
"learning_rate": 0.00019364310274105757,
"loss": 2.1925,
"step": 120
},
{
"epoch": 0.12426187419768935,
"grad_norm": 1.8203654289245605,
"learning_rate": 0.00019352826613197726,
"loss": 2.2711,
"step": 121
},
{
"epoch": 0.1252888318356868,
"grad_norm": 1.6196566820144653,
"learning_rate": 0.0001934124362051919,
"loss": 2.3819,
"step": 122
},
{
"epoch": 0.12631578947368421,
"grad_norm": 1.8960175514221191,
"learning_rate": 0.0001932956141908741,
"loss": 2.1812,
"step": 123
},
{
"epoch": 0.12734274711168164,
"grad_norm": 1.275739312171936,
"learning_rate": 0.00019317780132973303,
"loss": 2.2348,
"step": 124
},
{
"epoch": 0.12836970474967907,
"grad_norm": 1.3010280132293701,
"learning_rate": 0.00019305899887300112,
"loss": 2.2978,
"step": 125
},
{
"epoch": 0.1293966623876765,
"grad_norm": 1.5741634368896484,
"learning_rate": 0.00019293920808242083,
"loss": 2.5089,
"step": 126
},
{
"epoch": 0.13042362002567395,
"grad_norm": 1.253029227256775,
"learning_rate": 0.00019281843023023122,
"loss": 2.3474,
"step": 127
},
{
"epoch": 0.13145057766367138,
"grad_norm": 1.4684383869171143,
"learning_rate": 0.00019269666659915444,
"loss": 2.5293,
"step": 128
},
{
"epoch": 0.1324775353016688,
"grad_norm": 1.4969675540924072,
"learning_rate": 0.00019257391848238214,
"loss": 2.4311,
"step": 129
},
{
"epoch": 0.13350449293966624,
"grad_norm": 1.6982190608978271,
"learning_rate": 0.0001924501871835616,
"loss": 2.4293,
"step": 130
},
{
"epoch": 0.13453145057766366,
"grad_norm": 1.5805507898330688,
"learning_rate": 0.00019232547401678218,
"loss": 2.5934,
"step": 131
},
{
"epoch": 0.1355584082156611,
"grad_norm": 1.6854584217071533,
"learning_rate": 0.00019219978030656103,
"loss": 2.3307,
"step": 132
},
{
"epoch": 0.13658536585365855,
"grad_norm": 1.4633373022079468,
"learning_rate": 0.00019207310738782922,
"loss": 2.3324,
"step": 133
},
{
"epoch": 0.13761232349165597,
"grad_norm": 1.3780254125595093,
"learning_rate": 0.00019194545660591752,
"loss": 2.3986,
"step": 134
},
{
"epoch": 0.1386392811296534,
"grad_norm": 1.321736216545105,
"learning_rate": 0.00019181682931654202,
"loss": 2.5895,
"step": 135
},
{
"epoch": 0.13966623876765083,
"grad_norm": 1.1892857551574707,
"learning_rate": 0.00019168722688578998,
"loss": 2.2351,
"step": 136
},
{
"epoch": 0.14069319640564826,
"grad_norm": 1.3525614738464355,
"learning_rate": 0.00019155665069010497,
"loss": 2.4157,
"step": 137
},
{
"epoch": 0.1417201540436457,
"grad_norm": 1.760393500328064,
"learning_rate": 0.00019142510211627264,
"loss": 2.5114,
"step": 138
},
{
"epoch": 0.14274711168164314,
"grad_norm": 1.3307132720947266,
"learning_rate": 0.00019129258256140555,
"loss": 2.4521,
"step": 139
},
{
"epoch": 0.14377406931964057,
"grad_norm": 1.3006818294525146,
"learning_rate": 0.0001911590934329288,
"loss": 2.172,
"step": 140
},
{
"epoch": 0.144801026957638,
"grad_norm": 1.4609280824661255,
"learning_rate": 0.00019102463614856474,
"loss": 2.3233,
"step": 141
},
{
"epoch": 0.14582798459563542,
"grad_norm": 1.4396119117736816,
"learning_rate": 0.000190889212136318,
"loss": 1.9802,
"step": 142
},
{
"epoch": 0.14685494223363285,
"grad_norm": 1.3639447689056396,
"learning_rate": 0.00019075282283446043,
"loss": 1.9724,
"step": 143
},
{
"epoch": 0.1478818998716303,
"grad_norm": 1.2768394947052002,
"learning_rate": 0.0001906154696915157,
"loss": 2.2345,
"step": 144
},
{
"epoch": 0.14890885750962773,
"grad_norm": 1.6951898336410522,
"learning_rate": 0.00019047715416624402,
"loss": 2.1663,
"step": 145
},
{
"epoch": 0.14993581514762516,
"grad_norm": 1.5235414505004883,
"learning_rate": 0.00019033787772762645,
"loss": 2.3048,
"step": 146
},
{
"epoch": 0.1509627727856226,
"grad_norm": 1.5483514070510864,
"learning_rate": 0.0001901976418548496,
"loss": 1.9881,
"step": 147
},
{
"epoch": 0.15198973042362002,
"grad_norm": 1.3355050086975098,
"learning_rate": 0.00019005644803728967,
"loss": 2.1689,
"step": 148
},
{
"epoch": 0.15301668806161745,
"grad_norm": 1.7160605192184448,
"learning_rate": 0.00018991429777449672,
"loss": 2.363,
"step": 149
},
{
"epoch": 0.1540436456996149,
"grad_norm": 1.6425933837890625,
"learning_rate": 0.00018977119257617878,
"loss": 2.2472,
"step": 150
},
{
"epoch": 0.15507060333761233,
"grad_norm": 1.3999041318893433,
"learning_rate": 0.00018962713396218574,
"loss": 2.2772,
"step": 151
},
{
"epoch": 0.15609756097560976,
"grad_norm": 1.7348566055297852,
"learning_rate": 0.00018948212346249333,
"loss": 2.1869,
"step": 152
},
{
"epoch": 0.15712451861360718,
"grad_norm": 1.8590364456176758,
"learning_rate": 0.0001893361626171867,
"loss": 2.3029,
"step": 153
},
{
"epoch": 0.1581514762516046,
"grad_norm": 1.3040502071380615,
"learning_rate": 0.00018918925297644416,
"loss": 2.3171,
"step": 154
},
{
"epoch": 0.15917843388960207,
"grad_norm": 1.3961435556411743,
"learning_rate": 0.00018904139610052077,
"loss": 2.3829,
"step": 155
},
{
"epoch": 0.1602053915275995,
"grad_norm": 1.4264347553253174,
"learning_rate": 0.00018889259355973163,
"loss": 2.0808,
"step": 156
},
{
"epoch": 0.16123234916559692,
"grad_norm": 1.4675887823104858,
"learning_rate": 0.00018874284693443536,
"loss": 2.6039,
"step": 157
},
{
"epoch": 0.16225930680359435,
"grad_norm": 1.436353087425232,
"learning_rate": 0.00018859215781501725,
"loss": 2.0781,
"step": 158
},
{
"epoch": 0.16328626444159178,
"grad_norm": 1.6552093029022217,
"learning_rate": 0.0001884405278018722,
"loss": 2.3886,
"step": 159
},
{
"epoch": 0.1643132220795892,
"grad_norm": 1.694690465927124,
"learning_rate": 0.00018828795850538805,
"loss": 2.327,
"step": 160
},
{
"epoch": 0.16534017971758666,
"grad_norm": 1.3449724912643433,
"learning_rate": 0.00018813445154592826,
"loss": 2.347,
"step": 161
},
{
"epoch": 0.1663671373555841,
"grad_norm": 2.1681737899780273,
"learning_rate": 0.0001879800085538147,
"loss": 2.31,
"step": 162
},
{
"epoch": 0.16739409499358152,
"grad_norm": 1.3670250177383423,
"learning_rate": 0.00018782463116931043,
"loss": 2.263,
"step": 163
},
{
"epoch": 0.16842105263157894,
"grad_norm": 1.5687174797058105,
"learning_rate": 0.0001876683210426022,
"loss": 2.2114,
"step": 164
},
{
"epoch": 0.16944801026957637,
"grad_norm": 1.7429057359695435,
"learning_rate": 0.000187511079833783,
"loss": 2.2278,
"step": 165
},
{
"epoch": 0.1704749679075738,
"grad_norm": 1.7417389154434204,
"learning_rate": 0.0001873529092128343,
"loss": 2.294,
"step": 166
},
{
"epoch": 0.17150192554557125,
"grad_norm": 1.469560980796814,
"learning_rate": 0.0001871938108596085,
"loss": 2.2767,
"step": 167
},
{
"epoch": 0.17252888318356868,
"grad_norm": 1.9978508949279785,
"learning_rate": 0.00018703378646381098,
"loss": 2.4065,
"step": 168
},
{
"epoch": 0.1735558408215661,
"grad_norm": 1.6224966049194336,
"learning_rate": 0.00018687283772498206,
"loss": 2.2886,
"step": 169
},
{
"epoch": 0.17458279845956354,
"grad_norm": 1.9054279327392578,
"learning_rate": 0.00018671096635247914,
"loss": 2.471,
"step": 170
},
{
"epoch": 0.17560975609756097,
"grad_norm": 2.048701763153076,
"learning_rate": 0.00018654817406545845,
"loss": 2.2566,
"step": 171
},
{
"epoch": 0.17663671373555842,
"grad_norm": 1.327681303024292,
"learning_rate": 0.00018638446259285678,
"loss": 2.1956,
"step": 172
},
{
"epoch": 0.17766367137355585,
"grad_norm": 1.559768557548523,
"learning_rate": 0.00018621983367337315,
"loss": 2.4249,
"step": 173
},
{
"epoch": 0.17869062901155328,
"grad_norm": 1.3940227031707764,
"learning_rate": 0.00018605428905545032,
"loss": 2.435,
"step": 174
},
{
"epoch": 0.1797175866495507,
"grad_norm": 1.5285940170288086,
"learning_rate": 0.00018588783049725623,
"loss": 2.2861,
"step": 175
},
{
"epoch": 0.18074454428754813,
"grad_norm": 1.4059717655181885,
"learning_rate": 0.00018572045976666534,
"loss": 2.3312,
"step": 176
},
{
"epoch": 0.18177150192554556,
"grad_norm": 1.327272891998291,
"learning_rate": 0.0001855521786412399,
"loss": 2.267,
"step": 177
},
{
"epoch": 0.18279845956354301,
"grad_norm": 1.8960320949554443,
"learning_rate": 0.0001853829889082109,
"loss": 2.2957,
"step": 178
},
{
"epoch": 0.18382541720154044,
"grad_norm": 2.1734304428100586,
"learning_rate": 0.0001852128923644593,
"loss": 2.3058,
"step": 179
},
{
"epoch": 0.18485237483953787,
"grad_norm": 1.4910588264465332,
"learning_rate": 0.00018504189081649676,
"loss": 2.3334,
"step": 180
},
{
"epoch": 0.1858793324775353,
"grad_norm": 1.785188913345337,
"learning_rate": 0.00018486998608044667,
"loss": 2.3145,
"step": 181
},
{
"epoch": 0.18690629011553272,
"grad_norm": 1.7088462114334106,
"learning_rate": 0.00018469717998202462,
"loss": 2.3375,
"step": 182
},
{
"epoch": 0.18793324775353018,
"grad_norm": 1.542039155960083,
"learning_rate": 0.0001845234743565192,
"loss": 2.3347,
"step": 183
},
{
"epoch": 0.1889602053915276,
"grad_norm": 1.6210240125656128,
"learning_rate": 0.00018434887104877242,
"loss": 2.3396,
"step": 184
},
{
"epoch": 0.18998716302952504,
"grad_norm": 1.2093369960784912,
"learning_rate": 0.00018417337191316003,
"loss": 2.2205,
"step": 185
},
{
"epoch": 0.19101412066752246,
"grad_norm": 1.4520978927612305,
"learning_rate": 0.00018399697881357212,
"loss": 2.3782,
"step": 186
},
{
"epoch": 0.1920410783055199,
"grad_norm": 1.655849814414978,
"learning_rate": 0.00018381969362339298,
"loss": 2.4586,
"step": 187
},
{
"epoch": 0.19306803594351732,
"grad_norm": 1.3043372631072998,
"learning_rate": 0.00018364151822548142,
"loss": 2.3635,
"step": 188
},
{
"epoch": 0.19409499358151477,
"grad_norm": 2.0884976387023926,
"learning_rate": 0.00018346245451215067,
"loss": 2.2686,
"step": 189
},
{
"epoch": 0.1951219512195122,
"grad_norm": 1.5124359130859375,
"learning_rate": 0.00018328250438514836,
"loss": 2.4451,
"step": 190
},
{
"epoch": 0.19614890885750963,
"grad_norm": 1.4146149158477783,
"learning_rate": 0.00018310166975563625,
"loss": 2.2692,
"step": 191
},
{
"epoch": 0.19717586649550706,
"grad_norm": 1.1612133979797363,
"learning_rate": 0.00018291995254417,
"loss": 2.2671,
"step": 192
},
{
"epoch": 0.19820282413350448,
"grad_norm": 1.9117320775985718,
"learning_rate": 0.00018273735468067872,
"loss": 2.3375,
"step": 193
},
{
"epoch": 0.1992297817715019,
"grad_norm": 1.1931228637695312,
"learning_rate": 0.00018255387810444448,
"loss": 2.1428,
"step": 194
},
{
"epoch": 0.20025673940949937,
"grad_norm": 1.616768717765808,
"learning_rate": 0.0001823695247640817,
"loss": 2.3382,
"step": 195
},
{
"epoch": 0.2012836970474968,
"grad_norm": 1.5892282724380493,
"learning_rate": 0.0001821842966175166,
"loss": 2.4363,
"step": 196
},
{
"epoch": 0.20231065468549422,
"grad_norm": 1.5550988912582397,
"learning_rate": 0.00018199819563196617,
"loss": 2.2785,
"step": 197
},
{
"epoch": 0.20333761232349165,
"grad_norm": 2.0319631099700928,
"learning_rate": 0.0001818112237839174,
"loss": 2.3412,
"step": 198
},
{
"epoch": 0.20436456996148908,
"grad_norm": 1.500908613204956,
"learning_rate": 0.00018162338305910636,
"loss": 2.2796,
"step": 199
},
{
"epoch": 0.20539152759948653,
"grad_norm": 1.1863993406295776,
"learning_rate": 0.00018143467545249692,
"loss": 1.9584,
"step": 200
},
{
"epoch": 0.20641848523748396,
"grad_norm": 1.2843319177627563,
"learning_rate": 0.00018124510296825983,
"loss": 2.2401,
"step": 201
},
{
"epoch": 0.2074454428754814,
"grad_norm": 1.886354684829712,
"learning_rate": 0.00018105466761975109,
"loss": 2.4187,
"step": 202
},
{
"epoch": 0.20847240051347882,
"grad_norm": 1.7772157192230225,
"learning_rate": 0.00018086337142949094,
"loss": 2.4715,
"step": 203
},
{
"epoch": 0.20949935815147624,
"grad_norm": 1.5349968671798706,
"learning_rate": 0.00018067121642914206,
"loss": 2.2919,
"step": 204
},
{
"epoch": 0.21052631578947367,
"grad_norm": 1.6492220163345337,
"learning_rate": 0.00018047820465948817,
"loss": 2.2219,
"step": 205
},
{
"epoch": 0.21155327342747113,
"grad_norm": 1.8684515953063965,
"learning_rate": 0.00018028433817041236,
"loss": 2.4697,
"step": 206
},
{
"epoch": 0.21258023106546856,
"grad_norm": 1.287240743637085,
"learning_rate": 0.00018008961902087528,
"loss": 2.3128,
"step": 207
},
{
"epoch": 0.21360718870346598,
"grad_norm": 2.5276641845703125,
"learning_rate": 0.00017989404927889316,
"loss": 2.3647,
"step": 208
},
{
"epoch": 0.2146341463414634,
"grad_norm": 1.3976030349731445,
"learning_rate": 0.00017969763102151603,
"loss": 2.1668,
"step": 209
},
{
"epoch": 0.21566110397946084,
"grad_norm": 1.5781503915786743,
"learning_rate": 0.00017950036633480556,
"loss": 2.2656,
"step": 210
},
{
"epoch": 0.21668806161745827,
"grad_norm": 1.918054223060608,
"learning_rate": 0.00017930225731381302,
"loss": 2.5755,
"step": 211
},
{
"epoch": 0.21771501925545572,
"grad_norm": 1.5252821445465088,
"learning_rate": 0.00017910330606255682,
"loss": 2.1814,
"step": 212
},
{
"epoch": 0.21874197689345315,
"grad_norm": 1.4365805387496948,
"learning_rate": 0.00017890351469400034,
"loss": 2.4923,
"step": 213
},
{
"epoch": 0.21976893453145058,
"grad_norm": 1.3583849668502808,
"learning_rate": 0.00017870288533002938,
"loss": 2.3919,
"step": 214
},
{
"epoch": 0.220795892169448,
"grad_norm": 1.389434576034546,
"learning_rate": 0.00017850142010142982,
"loss": 2.3916,
"step": 215
},
{
"epoch": 0.22182284980744543,
"grad_norm": 1.134041428565979,
"learning_rate": 0.00017829912114786462,
"loss": 1.9883,
"step": 216
},
{
"epoch": 0.2228498074454429,
"grad_norm": 1.3355865478515625,
"learning_rate": 0.00017809599061785155,
"loss": 2.1283,
"step": 217
},
{
"epoch": 0.22387676508344032,
"grad_norm": 1.5802854299545288,
"learning_rate": 0.00017789203066873998,
"loss": 2.3526,
"step": 218
},
{
"epoch": 0.22490372272143774,
"grad_norm": 1.919191837310791,
"learning_rate": 0.0001776872434666882,
"loss": 2.2701,
"step": 219
},
{
"epoch": 0.22593068035943517,
"grad_norm": 1.7387783527374268,
"learning_rate": 0.0001774816311866404,
"loss": 2.4047,
"step": 220
},
{
"epoch": 0.2269576379974326,
"grad_norm": 1.3135710954666138,
"learning_rate": 0.0001772751960123034,
"loss": 2.2767,
"step": 221
},
{
"epoch": 0.22798459563543003,
"grad_norm": 2.7130353450775146,
"learning_rate": 0.00017706794013612364,
"loss": 2.269,
"step": 222
},
{
"epoch": 0.22901155327342748,
"grad_norm": 1.4474828243255615,
"learning_rate": 0.00017685986575926386,
"loss": 2.3064,
"step": 223
},
{
"epoch": 0.2300385109114249,
"grad_norm": 2.2177886962890625,
"learning_rate": 0.00017665097509157962,
"loss": 2.2529,
"step": 224
},
{
"epoch": 0.23106546854942234,
"grad_norm": 1.9783529043197632,
"learning_rate": 0.00017644127035159596,
"loss": 2.1634,
"step": 225
},
{
"epoch": 0.23209242618741976,
"grad_norm": 1.5306183099746704,
"learning_rate": 0.00017623075376648376,
"loss": 2.0591,
"step": 226
},
{
"epoch": 0.2331193838254172,
"grad_norm": 1.4206281900405884,
"learning_rate": 0.00017601942757203612,
"loss": 2.1645,
"step": 227
},
{
"epoch": 0.23414634146341465,
"grad_norm": 1.2981724739074707,
"learning_rate": 0.0001758072940126446,
"loss": 2.3063,
"step": 228
},
{
"epoch": 0.23517329910141208,
"grad_norm": 1.5360183715820312,
"learning_rate": 0.00017559435534127534,
"loss": 2.3622,
"step": 229
},
{
"epoch": 0.2362002567394095,
"grad_norm": 1.329416036605835,
"learning_rate": 0.00017538061381944524,
"loss": 2.3301,
"step": 230
},
{
"epoch": 0.23722721437740693,
"grad_norm": 1.191867470741272,
"learning_rate": 0.00017516607171719786,
"loss": 2.0651,
"step": 231
},
{
"epoch": 0.23825417201540436,
"grad_norm": 1.8169773817062378,
"learning_rate": 0.00017495073131307932,
"loss": 2.4059,
"step": 232
},
{
"epoch": 0.2392811296534018,
"grad_norm": 1.5405203104019165,
"learning_rate": 0.00017473459489411415,
"loss": 2.4716,
"step": 233
},
{
"epoch": 0.24030808729139924,
"grad_norm": 1.899759292602539,
"learning_rate": 0.0001745176647557809,
"loss": 2.3427,
"step": 234
},
{
"epoch": 0.24133504492939667,
"grad_norm": 1.3432451486587524,
"learning_rate": 0.00017429994320198786,
"loss": 2.1323,
"step": 235
},
{
"epoch": 0.2423620025673941,
"grad_norm": 1.5715628862380981,
"learning_rate": 0.00017408143254504856,
"loss": 2.3001,
"step": 236
},
{
"epoch": 0.24338896020539152,
"grad_norm": 1.7504726648330688,
"learning_rate": 0.00017386213510565715,
"loss": 2.2933,
"step": 237
},
{
"epoch": 0.24441591784338895,
"grad_norm": 2.6738271713256836,
"learning_rate": 0.00017364205321286394,
"loss": 2.1217,
"step": 238
},
{
"epoch": 0.24544287548138638,
"grad_norm": 1.704504132270813,
"learning_rate": 0.00017342118920405034,
"loss": 2.3705,
"step": 239
},
{
"epoch": 0.24646983311938384,
"grad_norm": 1.2226909399032593,
"learning_rate": 0.00017319954542490445,
"loss": 2.1303,
"step": 240
},
{
"epoch": 0.24749679075738126,
"grad_norm": 1.40041184425354,
"learning_rate": 0.00017297712422939573,
"loss": 2.2961,
"step": 241
},
{
"epoch": 0.2485237483953787,
"grad_norm": 1.618201494216919,
"learning_rate": 0.00017275392797975032,
"loss": 2.3552,
"step": 242
},
{
"epoch": 0.24955070603337612,
"grad_norm": 1.7513333559036255,
"learning_rate": 0.0001725299590464258,
"loss": 2.4547,
"step": 243
},
{
"epoch": 0.2505776636713736,
"grad_norm": 1.9280955791473389,
"learning_rate": 0.000172305219808086,
"loss": 2.3315,
"step": 244
},
{
"epoch": 0.2505776636713736,
"eval_loss": 2.3158633708953857,
"eval_runtime": 26.8137,
"eval_samples_per_second": 15.291,
"eval_steps_per_second": 7.645,
"step": 244
},
{
"epoch": 0.251604621309371,
"grad_norm": 1.8800264596939087,
"learning_rate": 0.00017207971265157586,
"loss": 2.2771,
"step": 245
},
{
"epoch": 0.25263157894736843,
"grad_norm": 2.2899258136749268,
"learning_rate": 0.00017185343997189588,
"loss": 2.4253,
"step": 246
},
{
"epoch": 0.25365853658536586,
"grad_norm": 1.3957444429397583,
"learning_rate": 0.00017162640417217695,
"loss": 2.2387,
"step": 247
},
{
"epoch": 0.2546854942233633,
"grad_norm": 1.471025824546814,
"learning_rate": 0.00017139860766365457,
"loss": 2.3249,
"step": 248
},
{
"epoch": 0.2557124518613607,
"grad_norm": 1.5242273807525635,
"learning_rate": 0.00017117005286564342,
"loss": 2.274,
"step": 249
},
{
"epoch": 0.25673940949935814,
"grad_norm": 2.3328254222869873,
"learning_rate": 0.00017094074220551158,
"loss": 2.3465,
"step": 250
},
{
"epoch": 0.25776636713735557,
"grad_norm": 1.3928115367889404,
"learning_rate": 0.00017071067811865476,
"loss": 2.1276,
"step": 251
},
{
"epoch": 0.258793324775353,
"grad_norm": 1.318634271621704,
"learning_rate": 0.00017047986304847044,
"loss": 2.3943,
"step": 252
},
{
"epoch": 0.2598202824133504,
"grad_norm": 1.8041123151779175,
"learning_rate": 0.00017024829944633195,
"loss": 2.2782,
"step": 253
},
{
"epoch": 0.2608472400513479,
"grad_norm": 1.4186384677886963,
"learning_rate": 0.0001700159897715624,
"loss": 2.2574,
"step": 254
},
{
"epoch": 0.26187419768934533,
"grad_norm": 1.4350296258926392,
"learning_rate": 0.00016978293649140853,
"loss": 2.103,
"step": 255
},
{
"epoch": 0.26290115532734276,
"grad_norm": 1.6985528469085693,
"learning_rate": 0.0001695491420810146,
"loss": 2.426,
"step": 256
},
{
"epoch": 0.2639281129653402,
"grad_norm": 1.5659419298171997,
"learning_rate": 0.00016931460902339608,
"loss": 2.3175,
"step": 257
},
{
"epoch": 0.2649550706033376,
"grad_norm": 1.5348899364471436,
"learning_rate": 0.00016907933980941312,
"loss": 2.0148,
"step": 258
},
{
"epoch": 0.26598202824133504,
"grad_norm": 1.5878114700317383,
"learning_rate": 0.00016884333693774437,
"loss": 2.3478,
"step": 259
},
{
"epoch": 0.26700898587933247,
"grad_norm": 1.6665490865707397,
"learning_rate": 0.0001686066029148602,
"loss": 2.2672,
"step": 260
},
{
"epoch": 0.2680359435173299,
"grad_norm": 2.0139546394348145,
"learning_rate": 0.00016836914025499623,
"loss": 2.2764,
"step": 261
},
{
"epoch": 0.2690629011553273,
"grad_norm": 1.5040104389190674,
"learning_rate": 0.0001681309514801265,
"loss": 2.4329,
"step": 262
},
{
"epoch": 0.27008985879332476,
"grad_norm": 1.9814090728759766,
"learning_rate": 0.0001678920391199369,
"loss": 2.1721,
"step": 263
},
{
"epoch": 0.2711168164313222,
"grad_norm": 1.6174941062927246,
"learning_rate": 0.00016765240571179802,
"loss": 2.2285,
"step": 264
},
{
"epoch": 0.27214377406931967,
"grad_norm": 1.6301368474960327,
"learning_rate": 0.00016741205380073842,
"loss": 2.1531,
"step": 265
},
{
"epoch": 0.2731707317073171,
"grad_norm": 1.551831603050232,
"learning_rate": 0.00016717098593941752,
"loss": 2.1856,
"step": 266
},
{
"epoch": 0.2741976893453145,
"grad_norm": 1.327530860900879,
"learning_rate": 0.00016692920468809846,
"loss": 2.4251,
"step": 267
},
{
"epoch": 0.27522464698331195,
"grad_norm": 1.6990398168563843,
"learning_rate": 0.00016668671261462102,
"loss": 2.2068,
"step": 268
},
{
"epoch": 0.2762516046213094,
"grad_norm": 2.0649585723876953,
"learning_rate": 0.00016644351229437416,
"loss": 2.5073,
"step": 269
},
{
"epoch": 0.2772785622593068,
"grad_norm": 2.4592831134796143,
"learning_rate": 0.00016619960631026888,
"loss": 2.507,
"step": 270
},
{
"epoch": 0.27830551989730423,
"grad_norm": 1.1428396701812744,
"learning_rate": 0.00016595499725271067,
"loss": 2.33,
"step": 271
},
{
"epoch": 0.27933247753530166,
"grad_norm": 1.5621798038482666,
"learning_rate": 0.00016570968771957196,
"loss": 2.4772,
"step": 272
},
{
"epoch": 0.2803594351732991,
"grad_norm": 2.22113037109375,
"learning_rate": 0.00016546368031616465,
"loss": 2.1417,
"step": 273
},
{
"epoch": 0.2813863928112965,
"grad_norm": 1.6532412767410278,
"learning_rate": 0.0001652169776552123,
"loss": 2.1814,
"step": 274
},
{
"epoch": 0.28241335044929394,
"grad_norm": 1.35118567943573,
"learning_rate": 0.0001649695823568226,
"loss": 2.4149,
"step": 275
},
{
"epoch": 0.2834403080872914,
"grad_norm": 1.3244826793670654,
"learning_rate": 0.00016472149704845927,
"loss": 2.3371,
"step": 276
},
{
"epoch": 0.28446726572528885,
"grad_norm": 1.3463726043701172,
"learning_rate": 0.00016447272436491433,
"loss": 2.1964,
"step": 277
},
{
"epoch": 0.2854942233632863,
"grad_norm": 1.2562531232833862,
"learning_rate": 0.00016422326694828007,
"loss": 2.3121,
"step": 278
},
{
"epoch": 0.2865211810012837,
"grad_norm": 1.1574952602386475,
"learning_rate": 0.000163973127447921,
"loss": 2.0812,
"step": 279
},
{
"epoch": 0.28754813863928114,
"grad_norm": 1.3550206422805786,
"learning_rate": 0.0001637223085204457,
"loss": 2.3015,
"step": 280
},
{
"epoch": 0.28857509627727856,
"grad_norm": 1.6482678651809692,
"learning_rate": 0.0001634708128296786,
"loss": 2.4281,
"step": 281
},
{
"epoch": 0.289602053915276,
"grad_norm": 1.4810844659805298,
"learning_rate": 0.00016321864304663173,
"loss": 2.4883,
"step": 282
},
{
"epoch": 0.2906290115532734,
"grad_norm": 1.8965210914611816,
"learning_rate": 0.00016296580184947633,
"loss": 2.5187,
"step": 283
},
{
"epoch": 0.29165596919127085,
"grad_norm": 1.3743977546691895,
"learning_rate": 0.00016271229192351428,
"loss": 2.1911,
"step": 284
},
{
"epoch": 0.2926829268292683,
"grad_norm": 1.7670851945877075,
"learning_rate": 0.0001624581159611499,
"loss": 2.0569,
"step": 285
},
{
"epoch": 0.2937098844672657,
"grad_norm": 1.865688443183899,
"learning_rate": 0.000162203276661861,
"loss": 2.3579,
"step": 286
},
{
"epoch": 0.29473684210526313,
"grad_norm": 1.762488842010498,
"learning_rate": 0.00016194777673217043,
"loss": 2.1302,
"step": 287
},
{
"epoch": 0.2957637997432606,
"grad_norm": 1.545452356338501,
"learning_rate": 0.00016169161888561723,
"loss": 2.3168,
"step": 288
},
{
"epoch": 0.29679075738125804,
"grad_norm": 1.5264406204223633,
"learning_rate": 0.00016143480584272793,
"loss": 2.3294,
"step": 289
},
{
"epoch": 0.29781771501925547,
"grad_norm": 1.4859471321105957,
"learning_rate": 0.00016117734033098744,
"loss": 2.4337,
"step": 290
},
{
"epoch": 0.2988446726572529,
"grad_norm": 2.1913015842437744,
"learning_rate": 0.0001609192250848104,
"loss": 2.314,
"step": 291
},
{
"epoch": 0.2998716302952503,
"grad_norm": 1.3715288639068604,
"learning_rate": 0.00016066046284551178,
"loss": 2.3759,
"step": 292
},
{
"epoch": 0.30089858793324775,
"grad_norm": 1.243543267250061,
"learning_rate": 0.00016040105636127807,
"loss": 2.289,
"step": 293
},
{
"epoch": 0.3019255455712452,
"grad_norm": 1.3851499557495117,
"learning_rate": 0.00016014100838713797,
"loss": 2.5364,
"step": 294
},
{
"epoch": 0.3029525032092426,
"grad_norm": 1.318569540977478,
"learning_rate": 0.000159880321684933,
"loss": 2.2201,
"step": 295
},
{
"epoch": 0.30397946084724004,
"grad_norm": 1.7080748081207275,
"learning_rate": 0.00015961899902328845,
"loss": 2.4458,
"step": 296
},
{
"epoch": 0.30500641848523746,
"grad_norm": 1.462457299232483,
"learning_rate": 0.0001593570431775837,
"loss": 2.1665,
"step": 297
},
{
"epoch": 0.3060333761232349,
"grad_norm": 1.3777559995651245,
"learning_rate": 0.000159094456929923,
"loss": 2.2534,
"step": 298
},
{
"epoch": 0.3070603337612324,
"grad_norm": 2.4722256660461426,
"learning_rate": 0.00015883124306910565,
"loss": 2.4955,
"step": 299
},
{
"epoch": 0.3080872913992298,
"grad_norm": 1.4936336278915405,
"learning_rate": 0.0001585674043905966,
"loss": 2.3775,
"step": 300
},
{
"epoch": 0.30911424903722723,
"grad_norm": 1.8284447193145752,
"learning_rate": 0.00015830294369649668,
"loss": 2.1124,
"step": 301
},
{
"epoch": 0.31014120667522466,
"grad_norm": 1.5503596067428589,
"learning_rate": 0.0001580378637955128,
"loss": 2.3898,
"step": 302
},
{
"epoch": 0.3111681643132221,
"grad_norm": 1.868800163269043,
"learning_rate": 0.00015777216750292823,
"loss": 2.2951,
"step": 303
},
{
"epoch": 0.3121951219512195,
"grad_norm": 1.6728500127792358,
"learning_rate": 0.0001575058576405725,
"loss": 2.2314,
"step": 304
},
{
"epoch": 0.31322207958921694,
"grad_norm": 1.3890676498413086,
"learning_rate": 0.00015723893703679172,
"loss": 2.1032,
"step": 305
},
{
"epoch": 0.31424903722721437,
"grad_norm": 1.9047855138778687,
"learning_rate": 0.00015697140852641834,
"loss": 2.1308,
"step": 306
},
{
"epoch": 0.3152759948652118,
"grad_norm": 1.7245334386825562,
"learning_rate": 0.00015670327495074103,
"loss": 2.0928,
"step": 307
},
{
"epoch": 0.3163029525032092,
"grad_norm": 1.295081615447998,
"learning_rate": 0.00015643453915747455,
"loss": 2.2695,
"step": 308
},
{
"epoch": 0.31732991014120665,
"grad_norm": 1.3265256881713867,
"learning_rate": 0.00015616520400072963,
"loss": 2.3772,
"step": 309
},
{
"epoch": 0.31835686777920413,
"grad_norm": 1.2527427673339844,
"learning_rate": 0.00015589527234098247,
"loss": 2.3704,
"step": 310
},
{
"epoch": 0.31938382541720156,
"grad_norm": 1.2341771125793457,
"learning_rate": 0.00015562474704504438,
"loss": 2.1607,
"step": 311
},
{
"epoch": 0.320410783055199,
"grad_norm": 1.3696041107177734,
"learning_rate": 0.00015535363098603152,
"loss": 2.3136,
"step": 312
},
{
"epoch": 0.3214377406931964,
"grad_norm": 1.7420108318328857,
"learning_rate": 0.00015508192704333413,
"loss": 2.4447,
"step": 313
},
{
"epoch": 0.32246469833119384,
"grad_norm": 1.5119073390960693,
"learning_rate": 0.00015480963810258613,
"loss": 2.3928,
"step": 314
},
{
"epoch": 0.32349165596919127,
"grad_norm": 1.2545535564422607,
"learning_rate": 0.00015453676705563444,
"loss": 2.4365,
"step": 315
},
{
"epoch": 0.3245186136071887,
"grad_norm": 1.4043221473693848,
"learning_rate": 0.00015426331680050824,
"loss": 2.5759,
"step": 316
},
{
"epoch": 0.3255455712451861,
"grad_norm": 1.4104640483856201,
"learning_rate": 0.00015398929024138807,
"loss": 2.4133,
"step": 317
},
{
"epoch": 0.32657252888318355,
"grad_norm": 1.2791924476623535,
"learning_rate": 0.00015371469028857532,
"loss": 2.0351,
"step": 318
},
{
"epoch": 0.327599486521181,
"grad_norm": 1.2710974216461182,
"learning_rate": 0.00015343951985846095,
"loss": 2.0818,
"step": 319
},
{
"epoch": 0.3286264441591784,
"grad_norm": 1.5024012327194214,
"learning_rate": 0.00015316378187349474,
"loss": 2.1922,
"step": 320
},
{
"epoch": 0.3296534017971759,
"grad_norm": 1.1681658029556274,
"learning_rate": 0.00015288747926215418,
"loss": 1.9304,
"step": 321
},
{
"epoch": 0.3306803594351733,
"grad_norm": 1.3742514848709106,
"learning_rate": 0.00015261061495891345,
"loss": 2.3971,
"step": 322
},
{
"epoch": 0.33170731707317075,
"grad_norm": 2.3611044883728027,
"learning_rate": 0.00015233319190421197,
"loss": 2.3189,
"step": 323
},
{
"epoch": 0.3327342747111682,
"grad_norm": 1.5957375764846802,
"learning_rate": 0.00015205521304442366,
"loss": 2.4309,
"step": 324
},
{
"epoch": 0.3337612323491656,
"grad_norm": 1.3179574012756348,
"learning_rate": 0.00015177668133182522,
"loss": 2.2626,
"step": 325
},
{
"epoch": 0.33478818998716303,
"grad_norm": 1.4424761533737183,
"learning_rate": 0.0001514975997245649,
"loss": 2.2325,
"step": 326
},
{
"epoch": 0.33581514762516046,
"grad_norm": 1.499611496925354,
"learning_rate": 0.00015121797118663124,
"loss": 2.5169,
"step": 327
},
{
"epoch": 0.3368421052631579,
"grad_norm": 2.3086583614349365,
"learning_rate": 0.0001509377986878213,
"loss": 2.4343,
"step": 328
},
{
"epoch": 0.3378690629011553,
"grad_norm": 2.3896758556365967,
"learning_rate": 0.00015065708520370944,
"loss": 2.2872,
"step": 329
},
{
"epoch": 0.33889602053915274,
"grad_norm": 2.070495367050171,
"learning_rate": 0.00015037583371561535,
"loss": 2.1122,
"step": 330
},
{
"epoch": 0.33992297817715017,
"grad_norm": 1.6736974716186523,
"learning_rate": 0.0001500940472105729,
"loss": 2.2805,
"step": 331
},
{
"epoch": 0.3409499358151476,
"grad_norm": 1.298316240310669,
"learning_rate": 0.00014981172868129786,
"loss": 2.2484,
"step": 332
},
{
"epoch": 0.3419768934531451,
"grad_norm": 1.763777732849121,
"learning_rate": 0.00014952888112615645,
"loss": 2.3688,
"step": 333
},
{
"epoch": 0.3430038510911425,
"grad_norm": 1.5646326541900635,
"learning_rate": 0.0001492455075491334,
"loss": 2.2685,
"step": 334
},
{
"epoch": 0.34403080872913994,
"grad_norm": 2.415956735610962,
"learning_rate": 0.00014896161095980008,
"loss": 2.2984,
"step": 335
},
{
"epoch": 0.34505776636713736,
"grad_norm": 1.9530178308486938,
"learning_rate": 0.00014867719437328252,
"loss": 2.3111,
"step": 336
},
{
"epoch": 0.3460847240051348,
"grad_norm": 1.4584389925003052,
"learning_rate": 0.00014839226081022938,
"loss": 2.4523,
"step": 337
},
{
"epoch": 0.3471116816431322,
"grad_norm": 2.4603986740112305,
"learning_rate": 0.00014810681329677987,
"loss": 2.2778,
"step": 338
},
{
"epoch": 0.34813863928112965,
"grad_norm": 1.2564218044281006,
"learning_rate": 0.00014782085486453154,
"loss": 2.171,
"step": 339
},
{
"epoch": 0.3491655969191271,
"grad_norm": 8.110305786132812,
"learning_rate": 0.00014753438855050828,
"loss": 2.3417,
"step": 340
},
{
"epoch": 0.3501925545571245,
"grad_norm": 1.9910926818847656,
"learning_rate": 0.00014724741739712794,
"loss": 2.0176,
"step": 341
},
{
"epoch": 0.35121951219512193,
"grad_norm": 1.5625698566436768,
"learning_rate": 0.00014695994445216985,
"loss": 2.4374,
"step": 342
},
{
"epoch": 0.35224646983311936,
"grad_norm": 1.5060590505599976,
"learning_rate": 0.00014667197276874286,
"loss": 2.3906,
"step": 343
},
{
"epoch": 0.35327342747111684,
"grad_norm": 1.2828539609909058,
"learning_rate": 0.00014638350540525246,
"loss": 2.2784,
"step": 344
},
{
"epoch": 0.35430038510911427,
"grad_norm": 1.1949716806411743,
"learning_rate": 0.0001460945454253687,
"loss": 2.2366,
"step": 345
},
{
"epoch": 0.3553273427471117,
"grad_norm": 1.9188710451126099,
"learning_rate": 0.00014580509589799329,
"loss": 2.2652,
"step": 346
},
{
"epoch": 0.3563543003851091,
"grad_norm": 1.220718502998352,
"learning_rate": 0.00014551515989722733,
"loss": 2.1333,
"step": 347
},
{
"epoch": 0.35738125802310655,
"grad_norm": 1.4678255319595337,
"learning_rate": 0.00014522474050233846,
"loss": 2.1317,
"step": 348
},
{
"epoch": 0.358408215661104,
"grad_norm": 1.2910618782043457,
"learning_rate": 0.00014493384079772813,
"loss": 2.4736,
"step": 349
},
{
"epoch": 0.3594351732991014,
"grad_norm": 1.3866629600524902,
"learning_rate": 0.00014464246387289913,
"loss": 2.1866,
"step": 350
},
{
"epoch": 0.36046213093709883,
"grad_norm": 1.3790191411972046,
"learning_rate": 0.00014435061282242232,
"loss": 2.2616,
"step": 351
},
{
"epoch": 0.36148908857509626,
"grad_norm": 1.481720209121704,
"learning_rate": 0.00014405829074590424,
"loss": 2.2923,
"step": 352
},
{
"epoch": 0.3625160462130937,
"grad_norm": 1.427502155303955,
"learning_rate": 0.00014376550074795375,
"loss": 2.3331,
"step": 353
},
{
"epoch": 0.3635430038510911,
"grad_norm": 1.3853408098220825,
"learning_rate": 0.00014347224593814944,
"loss": 2.3992,
"step": 354
},
{
"epoch": 0.3645699614890886,
"grad_norm": 1.7792441844940186,
"learning_rate": 0.00014317852943100643,
"loss": 2.4461,
"step": 355
},
{
"epoch": 0.36559691912708603,
"grad_norm": 1.5491658449172974,
"learning_rate": 0.00014288435434594315,
"loss": 2.1179,
"step": 356
},
{
"epoch": 0.36662387676508346,
"grad_norm": 1.3137096166610718,
"learning_rate": 0.00014258972380724858,
"loss": 2.0743,
"step": 357
},
{
"epoch": 0.3676508344030809,
"grad_norm": 1.8642239570617676,
"learning_rate": 0.00014229464094404865,
"loss": 2.3197,
"step": 358
},
{
"epoch": 0.3686777920410783,
"grad_norm": 1.9754403829574585,
"learning_rate": 0.00014199910889027334,
"loss": 2.4367,
"step": 359
},
{
"epoch": 0.36970474967907574,
"grad_norm": 1.3563640117645264,
"learning_rate": 0.00014170313078462317,
"loss": 2.3651,
"step": 360
},
{
"epoch": 0.37073170731707317,
"grad_norm": 1.7116272449493408,
"learning_rate": 0.00014140670977053603,
"loss": 2.2974,
"step": 361
},
{
"epoch": 0.3717586649550706,
"grad_norm": 1.1759010553359985,
"learning_rate": 0.00014110984899615367,
"loss": 2.0675,
"step": 362
},
{
"epoch": 0.372785622593068,
"grad_norm": 1.3649141788482666,
"learning_rate": 0.00014081255161428838,
"loss": 2.3528,
"step": 363
},
{
"epoch": 0.37381258023106545,
"grad_norm": 1.641750454902649,
"learning_rate": 0.00014051482078238932,
"loss": 2.439,
"step": 364
},
{
"epoch": 0.3748395378690629,
"grad_norm": 1.3235962390899658,
"learning_rate": 0.00014021665966250927,
"loss": 2.3409,
"step": 365
},
{
"epoch": 0.37586649550706036,
"grad_norm": 1.463796615600586,
"learning_rate": 0.0001399180714212708,
"loss": 2.427,
"step": 366
},
{
"epoch": 0.3768934531450578,
"grad_norm": 1.3167238235473633,
"learning_rate": 0.0001396190592298327,
"loss": 2.1034,
"step": 367
},
{
"epoch": 0.3779204107830552,
"grad_norm": 2.2455053329467773,
"learning_rate": 0.0001393196262638564,
"loss": 2.2139,
"step": 368
},
{
"epoch": 0.37894736842105264,
"grad_norm": 1.6802003383636475,
"learning_rate": 0.0001390197757034721,
"loss": 2.3074,
"step": 369
},
{
"epoch": 0.37997432605905007,
"grad_norm": 1.349055290222168,
"learning_rate": 0.00013871951073324507,
"loss": 2.2607,
"step": 370
},
{
"epoch": 0.3810012836970475,
"grad_norm": 1.5269880294799805,
"learning_rate": 0.00013841883454214195,
"loss": 2.3848,
"step": 371
},
{
"epoch": 0.3820282413350449,
"grad_norm": 1.4355286359786987,
"learning_rate": 0.00013811775032349655,
"loss": 2.2389,
"step": 372
},
{
"epoch": 0.38305519897304235,
"grad_norm": 1.4193495512008667,
"learning_rate": 0.00013781626127497631,
"loss": 2.574,
"step": 373
},
{
"epoch": 0.3840821566110398,
"grad_norm": 1.2355318069458008,
"learning_rate": 0.0001375143705985481,
"loss": 2.0485,
"step": 374
},
{
"epoch": 0.3851091142490372,
"grad_norm": 1.8594748973846436,
"learning_rate": 0.0001372120815004442,
"loss": 2.1466,
"step": 375
},
{
"epoch": 0.38613607188703464,
"grad_norm": 2.062476873397827,
"learning_rate": 0.0001369093971911285,
"loss": 2.264,
"step": 376
},
{
"epoch": 0.38716302952503207,
"grad_norm": 1.016082525253296,
"learning_rate": 0.00013660632088526213,
"loss": 1.8858,
"step": 377
},
{
"epoch": 0.38818998716302955,
"grad_norm": 1.8727707862854004,
"learning_rate": 0.00013630285580166945,
"loss": 2.3701,
"step": 378
},
{
"epoch": 0.389216944801027,
"grad_norm": 2.2555723190307617,
"learning_rate": 0.00013599900516330382,
"loss": 2.34,
"step": 379
},
{
"epoch": 0.3902439024390244,
"grad_norm": 1.6758098602294922,
"learning_rate": 0.00013569477219721335,
"loss": 2.3075,
"step": 380
},
{
"epoch": 0.39127086007702183,
"grad_norm": 1.789185643196106,
"learning_rate": 0.0001353901601345068,
"loss": 2.2605,
"step": 381
},
{
"epoch": 0.39229781771501926,
"grad_norm": 1.6649662256240845,
"learning_rate": 0.000135085172210319,
"loss": 2.3859,
"step": 382
},
{
"epoch": 0.3933247753530167,
"grad_norm": 1.5336881875991821,
"learning_rate": 0.00013477981166377663,
"loss": 2.4461,
"step": 383
},
{
"epoch": 0.3943517329910141,
"grad_norm": 1.6705595254898071,
"learning_rate": 0.00013447408173796385,
"loss": 2.2572,
"step": 384
},
{
"epoch": 0.39537869062901154,
"grad_norm": 1.7873886823654175,
"learning_rate": 0.00013416798567988784,
"loss": 2.1611,
"step": 385
},
{
"epoch": 0.39640564826700897,
"grad_norm": 1.7348014116287231,
"learning_rate": 0.00013386152674044422,
"loss": 2.2431,
"step": 386
},
{
"epoch": 0.3974326059050064,
"grad_norm": 1.687648057937622,
"learning_rate": 0.00013355470817438264,
"loss": 2.2149,
"step": 387
},
{
"epoch": 0.3984595635430038,
"grad_norm": 1.5006660223007202,
"learning_rate": 0.00013324753324027216,
"loss": 2.5911,
"step": 388
},
{
"epoch": 0.3994865211810013,
"grad_norm": 1.5582588911056519,
"learning_rate": 0.00013294000520046664,
"loss": 2.2855,
"step": 389
},
{
"epoch": 0.40051347881899874,
"grad_norm": 1.5377285480499268,
"learning_rate": 0.00013263212732107012,
"loss": 2.3232,
"step": 390
},
{
"epoch": 0.40154043645699616,
"grad_norm": 1.326240062713623,
"learning_rate": 0.00013232390287190208,
"loss": 2.205,
"step": 391
},
{
"epoch": 0.4025673940949936,
"grad_norm": 1.4398608207702637,
"learning_rate": 0.0001320153351264628,
"loss": 2.4615,
"step": 392
},
{
"epoch": 0.403594351732991,
"grad_norm": 1.499045729637146,
"learning_rate": 0.0001317064273618985,
"loss": 2.3906,
"step": 393
},
{
"epoch": 0.40462130937098845,
"grad_norm": 1.3966516256332397,
"learning_rate": 0.00013139718285896655,
"loss": 2.3382,
"step": 394
},
{
"epoch": 0.4056482670089859,
"grad_norm": 1.1530864238739014,
"learning_rate": 0.0001310876049020007,
"loss": 2.2139,
"step": 395
},
{
"epoch": 0.4066752246469833,
"grad_norm": 1.4689910411834717,
"learning_rate": 0.00013077769677887619,
"loss": 2.3199,
"step": 396
},
{
"epoch": 0.40770218228498073,
"grad_norm": 1.7335143089294434,
"learning_rate": 0.00013046746178097467,
"loss": 2.1957,
"step": 397
},
{
"epoch": 0.40872913992297816,
"grad_norm": 1.2896586656570435,
"learning_rate": 0.00013015690320314954,
"loss": 2.4031,
"step": 398
},
{
"epoch": 0.4097560975609756,
"grad_norm": 1.3342877626419067,
"learning_rate": 0.0001298460243436906,
"loss": 2.2962,
"step": 399
},
{
"epoch": 0.41078305519897307,
"grad_norm": 1.4110368490219116,
"learning_rate": 0.00012953482850428926,
"loss": 2.316,
"step": 400
},
{
"epoch": 0.4118100128369705,
"grad_norm": 1.4386307001113892,
"learning_rate": 0.00012922331899000353,
"loss": 2.269,
"step": 401
},
{
"epoch": 0.4128369704749679,
"grad_norm": 1.1738072633743286,
"learning_rate": 0.00012891149910922267,
"loss": 2.3723,
"step": 402
},
{
"epoch": 0.41386392811296535,
"grad_norm": 1.9212470054626465,
"learning_rate": 0.00012859937217363224,
"loss": 2.1092,
"step": 403
},
{
"epoch": 0.4148908857509628,
"grad_norm": 1.2879332304000854,
"learning_rate": 0.00012828694149817887,
"loss": 2.228,
"step": 404
},
{
"epoch": 0.4159178433889602,
"grad_norm": 1.5181375741958618,
"learning_rate": 0.00012797421040103513,
"loss": 2.5877,
"step": 405
},
{
"epoch": 0.41694480102695763,
"grad_norm": 1.5899111032485962,
"learning_rate": 0.00012766118220356408,
"loss": 2.3006,
"step": 406
},
{
"epoch": 0.41797175866495506,
"grad_norm": 1.445020318031311,
"learning_rate": 0.00012734786023028423,
"loss": 2.1708,
"step": 407
},
{
"epoch": 0.4189987163029525,
"grad_norm": 1.396371603012085,
"learning_rate": 0.0001270342478088342,
"loss": 2.1616,
"step": 408
},
{
"epoch": 0.4200256739409499,
"grad_norm": 1.4456965923309326,
"learning_rate": 0.00012672034826993715,
"loss": 2.285,
"step": 409
},
{
"epoch": 0.42105263157894735,
"grad_norm": 1.3749362230300903,
"learning_rate": 0.0001264061649473657,
"loss": 2.2865,
"step": 410
},
{
"epoch": 0.42207958921694483,
"grad_norm": 1.549967646598816,
"learning_rate": 0.0001260917011779064,
"loss": 2.2219,
"step": 411
},
{
"epoch": 0.42310654685494226,
"grad_norm": 1.52138090133667,
"learning_rate": 0.00012577696030132421,
"loss": 2.2663,
"step": 412
},
{
"epoch": 0.4241335044929397,
"grad_norm": 1.315598726272583,
"learning_rate": 0.00012546194566032714,
"loss": 2.5049,
"step": 413
},
{
"epoch": 0.4251604621309371,
"grad_norm": 1.5495781898498535,
"learning_rate": 0.00012514666060053076,
"loss": 2.2123,
"step": 414
},
{
"epoch": 0.42618741976893454,
"grad_norm": 1.3094850778579712,
"learning_rate": 0.00012483110847042256,
"loss": 2.3119,
"step": 415
},
{
"epoch": 0.42721437740693197,
"grad_norm": 1.3870692253112793,
"learning_rate": 0.0001245152926213265,
"loss": 2.2084,
"step": 416
},
{
"epoch": 0.4282413350449294,
"grad_norm": 1.5510514974594116,
"learning_rate": 0.0001241992164073674,
"loss": 2.3628,
"step": 417
},
{
"epoch": 0.4292682926829268,
"grad_norm": 1.489369511604309,
"learning_rate": 0.00012388288318543512,
"loss": 2.2547,
"step": 418
},
{
"epoch": 0.43029525032092425,
"grad_norm": 1.5452933311462402,
"learning_rate": 0.00012356629631514929,
"loss": 2.2346,
"step": 419
},
{
"epoch": 0.4313222079589217,
"grad_norm": 1.330915927886963,
"learning_rate": 0.00012324945915882332,
"loss": 2.0298,
"step": 420
},
{
"epoch": 0.4323491655969191,
"grad_norm": 1.3958618640899658,
"learning_rate": 0.00012293237508142877,
"loss": 2.2811,
"step": 421
},
{
"epoch": 0.43337612323491653,
"grad_norm": 1.4904903173446655,
"learning_rate": 0.00012261504745055964,
"loss": 2.1233,
"step": 422
},
{
"epoch": 0.434403080872914,
"grad_norm": 1.2206223011016846,
"learning_rate": 0.00012229747963639654,
"loss": 2.196,
"step": 423
},
{
"epoch": 0.43543003851091144,
"grad_norm": 1.7518811225891113,
"learning_rate": 0.00012197967501167112,
"loss": 2.4287,
"step": 424
},
{
"epoch": 0.43645699614890887,
"grad_norm": 1.3162510395050049,
"learning_rate": 0.00012166163695162983,
"loss": 2.1072,
"step": 425
},
{
"epoch": 0.4374839537869063,
"grad_norm": 1.5282611846923828,
"learning_rate": 0.00012134336883399855,
"loss": 2.2738,
"step": 426
},
{
"epoch": 0.4385109114249037,
"grad_norm": 1.561740517616272,
"learning_rate": 0.00012102487403894633,
"loss": 2.4247,
"step": 427
},
{
"epoch": 0.43953786906290115,
"grad_norm": 1.488586664199829,
"learning_rate": 0.00012070615594904977,
"loss": 2.3052,
"step": 428
},
{
"epoch": 0.4405648267008986,
"grad_norm": 1.209370493888855,
"learning_rate": 0.00012038721794925689,
"loss": 2.1714,
"step": 429
},
{
"epoch": 0.441591784338896,
"grad_norm": 1.4916038513183594,
"learning_rate": 0.00012006806342685126,
"loss": 2.1914,
"step": 430
},
{
"epoch": 0.44261874197689344,
"grad_norm": 1.618943691253662,
"learning_rate": 0.00011974869577141611,
"loss": 2.2324,
"step": 431
},
{
"epoch": 0.44364569961489086,
"grad_norm": 1.1960707902908325,
"learning_rate": 0.00011942911837479817,
"loss": 2.1326,
"step": 432
},
{
"epoch": 0.4446726572528883,
"grad_norm": 1.9583168029785156,
"learning_rate": 0.0001191093346310718,
"loss": 2.5026,
"step": 433
},
{
"epoch": 0.4456996148908858,
"grad_norm": 1.3039475679397583,
"learning_rate": 0.00011878934793650273,
"loss": 2.3416,
"step": 434
},
{
"epoch": 0.4467265725288832,
"grad_norm": 1.7960904836654663,
"learning_rate": 0.00011846916168951232,
"loss": 2.2458,
"step": 435
},
{
"epoch": 0.44775353016688063,
"grad_norm": 1.5260133743286133,
"learning_rate": 0.00011814877929064118,
"loss": 2.2471,
"step": 436
},
{
"epoch": 0.44878048780487806,
"grad_norm": 1.8603578805923462,
"learning_rate": 0.00011782820414251314,
"loss": 2.2966,
"step": 437
},
{
"epoch": 0.4498074454428755,
"grad_norm": 2.106266498565674,
"learning_rate": 0.00011750743964979918,
"loss": 2.3253,
"step": 438
},
{
"epoch": 0.4508344030808729,
"grad_norm": 1.6676055192947388,
"learning_rate": 0.00011718648921918112,
"loss": 2.302,
"step": 439
},
{
"epoch": 0.45186136071887034,
"grad_norm": 1.521597146987915,
"learning_rate": 0.00011686535625931565,
"loss": 2.4547,
"step": 440
},
{
"epoch": 0.45288831835686777,
"grad_norm": 2.344489097595215,
"learning_rate": 0.00011654404418079794,
"loss": 2.2382,
"step": 441
},
{
"epoch": 0.4539152759948652,
"grad_norm": 1.9089165925979614,
"learning_rate": 0.00011622255639612554,
"loss": 2.2597,
"step": 442
},
{
"epoch": 0.4549422336328626,
"grad_norm": 1.8408923149108887,
"learning_rate": 0.00011590089631966206,
"loss": 2.3862,
"step": 443
},
{
"epoch": 0.45596919127086005,
"grad_norm": 1.6886488199234009,
"learning_rate": 0.00011557906736760089,
"loss": 2.2685,
"step": 444
},
{
"epoch": 0.45699614890885754,
"grad_norm": 1.1247152090072632,
"learning_rate": 0.00011525707295792907,
"loss": 1.9322,
"step": 445
},
{
"epoch": 0.45802310654685496,
"grad_norm": 1.5255829095840454,
"learning_rate": 0.00011493491651039077,
"loss": 2.162,
"step": 446
},
{
"epoch": 0.4590500641848524,
"grad_norm": 1.3797430992126465,
"learning_rate": 0.00011461260144645119,
"loss": 2.2511,
"step": 447
},
{
"epoch": 0.4600770218228498,
"grad_norm": 1.63080632686615,
"learning_rate": 0.00011429013118926002,
"loss": 2.2,
"step": 448
},
{
"epoch": 0.46110397946084725,
"grad_norm": 1.5671621561050415,
"learning_rate": 0.00011396750916361524,
"loss": 2.036,
"step": 449
},
{
"epoch": 0.4621309370988447,
"grad_norm": 1.4500336647033691,
"learning_rate": 0.00011364473879592674,
"loss": 2.4217,
"step": 450
},
{
"epoch": 0.4631578947368421,
"grad_norm": 1.1536897420883179,
"learning_rate": 0.00011332182351417975,
"loss": 2.027,
"step": 451
},
{
"epoch": 0.46418485237483953,
"grad_norm": 1.3846821784973145,
"learning_rate": 0.00011299876674789864,
"loss": 2.3609,
"step": 452
},
{
"epoch": 0.46521181001283696,
"grad_norm": 1.307686448097229,
"learning_rate": 0.00011267557192811038,
"loss": 2.1803,
"step": 453
},
{
"epoch": 0.4662387676508344,
"grad_norm": 1.2983853816986084,
"learning_rate": 0.0001123522424873082,
"loss": 2.1518,
"step": 454
},
{
"epoch": 0.4672657252888318,
"grad_norm": 1.341827154159546,
"learning_rate": 0.00011202878185941501,
"loss": 2.2907,
"step": 455
},
{
"epoch": 0.4682926829268293,
"grad_norm": 1.4334162473678589,
"learning_rate": 0.00011170519347974704,
"loss": 2.1504,
"step": 456
},
{
"epoch": 0.4693196405648267,
"grad_norm": 1.19010591506958,
"learning_rate": 0.00011138148078497728,
"loss": 2.2356,
"step": 457
},
{
"epoch": 0.47034659820282415,
"grad_norm": 2.3501694202423096,
"learning_rate": 0.000111057647213099,
"loss": 2.3976,
"step": 458
},
{
"epoch": 0.4713735558408216,
"grad_norm": 1.4157503843307495,
"learning_rate": 0.00011073369620338928,
"loss": 2.2394,
"step": 459
},
{
"epoch": 0.472400513478819,
"grad_norm": 1.45404052734375,
"learning_rate": 0.0001104096311963724,
"loss": 2.354,
"step": 460
},
{
"epoch": 0.47342747111681643,
"grad_norm": 1.3356945514678955,
"learning_rate": 0.00011008545563378346,
"loss": 2.2715,
"step": 461
},
{
"epoch": 0.47445442875481386,
"grad_norm": 1.5381147861480713,
"learning_rate": 0.00010976117295853154,
"loss": 2.4286,
"step": 462
},
{
"epoch": 0.4754813863928113,
"grad_norm": 1.5712486505508423,
"learning_rate": 0.00010943678661466346,
"loss": 2.4077,
"step": 463
},
{
"epoch": 0.4765083440308087,
"grad_norm": 1.5998138189315796,
"learning_rate": 0.00010911230004732703,
"loss": 2.2756,
"step": 464
},
{
"epoch": 0.47753530166880614,
"grad_norm": 1.5580840110778809,
"learning_rate": 0.0001087877167027344,
"loss": 2.3054,
"step": 465
},
{
"epoch": 0.4785622593068036,
"grad_norm": 1.1897094249725342,
"learning_rate": 0.00010846304002812564,
"loss": 2.0802,
"step": 466
},
{
"epoch": 0.479589216944801,
"grad_norm": 1.3887132406234741,
"learning_rate": 0.00010813827347173195,
"loss": 2.1605,
"step": 467
},
{
"epoch": 0.4806161745827985,
"grad_norm": 1.958724856376648,
"learning_rate": 0.00010781342048273921,
"loss": 2.2558,
"step": 468
},
{
"epoch": 0.4816431322207959,
"grad_norm": 1.5566227436065674,
"learning_rate": 0.0001074884845112512,
"loss": 2.2141,
"step": 469
},
{
"epoch": 0.48267008985879334,
"grad_norm": 1.616424560546875,
"learning_rate": 0.00010716346900825299,
"loss": 2.1406,
"step": 470
},
{
"epoch": 0.48369704749679077,
"grad_norm": 1.6108357906341553,
"learning_rate": 0.00010683837742557436,
"loss": 2.4381,
"step": 471
},
{
"epoch": 0.4847240051347882,
"grad_norm": 1.4207314252853394,
"learning_rate": 0.00010651321321585315,
"loss": 2.1121,
"step": 472
},
{
"epoch": 0.4857509627727856,
"grad_norm": 1.2405275106430054,
"learning_rate": 0.00010618797983249841,
"loss": 2.2362,
"step": 473
},
{
"epoch": 0.48677792041078305,
"grad_norm": 1.5273611545562744,
"learning_rate": 0.00010586268072965396,
"loss": 2.0912,
"step": 474
},
{
"epoch": 0.4878048780487805,
"grad_norm": 1.4871629476547241,
"learning_rate": 0.00010553731936216149,
"loss": 2.1009,
"step": 475
},
{
"epoch": 0.4888318356867779,
"grad_norm": 1.320486068725586,
"learning_rate": 0.00010521189918552406,
"loss": 2.1408,
"step": 476
},
{
"epoch": 0.48985879332477533,
"grad_norm": 1.4156553745269775,
"learning_rate": 0.0001048864236558693,
"loss": 2.1874,
"step": 477
},
{
"epoch": 0.49088575096277276,
"grad_norm": 1.620944857597351,
"learning_rate": 0.00010456089622991263,
"loss": 2.0141,
"step": 478
},
{
"epoch": 0.49191270860077024,
"grad_norm": 1.3774328231811523,
"learning_rate": 0.00010423532036492077,
"loss": 2.0491,
"step": 479
},
{
"epoch": 0.49293966623876767,
"grad_norm": 1.6792056560516357,
"learning_rate": 0.00010390969951867482,
"loss": 2.3037,
"step": 480
},
{
"epoch": 0.4939666238767651,
"grad_norm": 1.4954179525375366,
"learning_rate": 0.00010358403714943357,
"loss": 2.3587,
"step": 481
},
{
"epoch": 0.4949935815147625,
"grad_norm": 1.3356521129608154,
"learning_rate": 0.00010325833671589687,
"loss": 2.205,
"step": 482
},
{
"epoch": 0.49602053915275995,
"grad_norm": 1.6689985990524292,
"learning_rate": 0.00010293260167716876,
"loss": 2.2734,
"step": 483
},
{
"epoch": 0.4970474967907574,
"grad_norm": 1.5499637126922607,
"learning_rate": 0.00010260683549272089,
"loss": 2.1623,
"step": 484
},
{
"epoch": 0.4980744544287548,
"grad_norm": 1.2015153169631958,
"learning_rate": 0.00010228104162235563,
"loss": 2.2428,
"step": 485
},
{
"epoch": 0.49910141206675224,
"grad_norm": 1.4437837600708008,
"learning_rate": 0.00010195522352616943,
"loss": 2.1359,
"step": 486
},
{
"epoch": 0.5001283697047497,
"grad_norm": 1.5427740812301636,
"learning_rate": 0.00010162938466451599,
"loss": 2.29,
"step": 487
},
{
"epoch": 0.5011553273427471,
"grad_norm": 1.5538071393966675,
"learning_rate": 0.00010130352849796958,
"loss": 2.1563,
"step": 488
},
{
"epoch": 0.5011553273427471,
"eval_loss": 2.2764453887939453,
"eval_runtime": 26.8115,
"eval_samples_per_second": 15.292,
"eval_steps_per_second": 7.646,
"step": 488
},
{
"epoch": 0.5021822849807446,
"grad_norm": 1.645787239074707,
"learning_rate": 0.00010097765848728823,
"loss": 2.4307,
"step": 489
},
{
"epoch": 0.503209242618742,
"grad_norm": 1.417846441268921,
"learning_rate": 0.00010065177809337702,
"loss": 2.2206,
"step": 490
},
{
"epoch": 0.5042362002567394,
"grad_norm": 2.257739782333374,
"learning_rate": 0.00010032589077725134,
"loss": 2.2328,
"step": 491
},
{
"epoch": 0.5052631578947369,
"grad_norm": 1.419081211090088,
"learning_rate": 0.0001,
"loss": 2.1812,
"step": 492
},
{
"epoch": 0.5062901155327343,
"grad_norm": 1.5131466388702393,
"learning_rate": 9.967410922274868e-05,
"loss": 2.3043,
"step": 493
},
{
"epoch": 0.5073170731707317,
"grad_norm": 1.4304825067520142,
"learning_rate": 9.934822190662299e-05,
"loss": 2.2831,
"step": 494
},
{
"epoch": 0.5083440308087291,
"grad_norm": 1.3889083862304688,
"learning_rate": 9.902234151271177e-05,
"loss": 2.2284,
"step": 495
},
{
"epoch": 0.5093709884467266,
"grad_norm": 1.4664725065231323,
"learning_rate": 9.869647150203046e-05,
"loss": 2.2311,
"step": 496
},
{
"epoch": 0.510397946084724,
"grad_norm": 1.3767751455307007,
"learning_rate": 9.837061533548403e-05,
"loss": 2.1475,
"step": 497
},
{
"epoch": 0.5114249037227214,
"grad_norm": 1.3076868057250977,
"learning_rate": 9.80447764738306e-05,
"loss": 2.198,
"step": 498
},
{
"epoch": 0.5124518613607189,
"grad_norm": 1.886834979057312,
"learning_rate": 9.771895837764439e-05,
"loss": 2.1954,
"step": 499
},
{
"epoch": 0.5134788189987163,
"grad_norm": 1.3807141780853271,
"learning_rate": 9.739316450727913e-05,
"loss": 1.8945,
"step": 500
},
{
"epoch": 0.5145057766367137,
"grad_norm": 1.3954834938049316,
"learning_rate": 9.706739832283127e-05,
"loss": 2.2309,
"step": 501
},
{
"epoch": 0.5155327342747111,
"grad_norm": 1.31797456741333,
"learning_rate": 9.674166328410318e-05,
"loss": 2.2839,
"step": 502
},
{
"epoch": 0.5165596919127086,
"grad_norm": 1.9284818172454834,
"learning_rate": 9.641596285056648e-05,
"loss": 2.3199,
"step": 503
},
{
"epoch": 0.517586649550706,
"grad_norm": 2.1243088245391846,
"learning_rate": 9.609030048132523e-05,
"loss": 2.3362,
"step": 504
},
{
"epoch": 0.5186136071887034,
"grad_norm": 1.7423335313796997,
"learning_rate": 9.576467963507925e-05,
"loss": 2.2644,
"step": 505
},
{
"epoch": 0.5196405648267008,
"grad_norm": 1.9456878900527954,
"learning_rate": 9.543910377008742e-05,
"loss": 2.3168,
"step": 506
},
{
"epoch": 0.5206675224646984,
"grad_norm": 1.2658746242523193,
"learning_rate": 9.511357634413075e-05,
"loss": 2.46,
"step": 507
},
{
"epoch": 0.5216944801026958,
"grad_norm": 1.1964818239212036,
"learning_rate": 9.478810081447595e-05,
"loss": 2.0302,
"step": 508
},
{
"epoch": 0.5227214377406932,
"grad_norm": 1.313723087310791,
"learning_rate": 9.446268063783853e-05,
"loss": 2.3145,
"step": 509
},
{
"epoch": 0.5237483953786907,
"grad_norm": 1.8463740348815918,
"learning_rate": 9.413731927034605e-05,
"loss": 2.1669,
"step": 510
},
{
"epoch": 0.5247753530166881,
"grad_norm": 1.1602845191955566,
"learning_rate": 9.381202016750158e-05,
"loss": 2.198,
"step": 511
},
{
"epoch": 0.5258023106546855,
"grad_norm": 1.203307032585144,
"learning_rate": 9.348678678414686e-05,
"loss": 2.2497,
"step": 512
},
{
"epoch": 0.526829268292683,
"grad_norm": 1.3734437227249146,
"learning_rate": 9.316162257442562e-05,
"loss": 2.5348,
"step": 513
},
{
"epoch": 0.5278562259306804,
"grad_norm": 1.2321196794509888,
"learning_rate": 9.283653099174704e-05,
"loss": 2.1953,
"step": 514
},
{
"epoch": 0.5288831835686778,
"grad_norm": 1.2227696180343628,
"learning_rate": 9.251151548874884e-05,
"loss": 2.2543,
"step": 515
},
{
"epoch": 0.5299101412066752,
"grad_norm": 1.7139776945114136,
"learning_rate": 9.21865795172608e-05,
"loss": 2.2894,
"step": 516
},
{
"epoch": 0.5309370988446727,
"grad_norm": 1.5949795246124268,
"learning_rate": 9.186172652826808e-05,
"loss": 2.2051,
"step": 517
},
{
"epoch": 0.5319640564826701,
"grad_norm": 1.3807586431503296,
"learning_rate": 9.15369599718744e-05,
"loss": 2.2512,
"step": 518
},
{
"epoch": 0.5329910141206675,
"grad_norm": 1.3648223876953125,
"learning_rate": 9.121228329726563e-05,
"loss": 2.4064,
"step": 519
},
{
"epoch": 0.5340179717586649,
"grad_norm": 1.2418193817138672,
"learning_rate": 9.0887699952673e-05,
"loss": 2.2983,
"step": 520
},
{
"epoch": 0.5350449293966624,
"grad_norm": 1.6278191804885864,
"learning_rate": 9.056321338533656e-05,
"loss": 2.5757,
"step": 521
},
{
"epoch": 0.5360718870346598,
"grad_norm": 4.154942989349365,
"learning_rate": 9.023882704146848e-05,
"loss": 2.1376,
"step": 522
},
{
"epoch": 0.5370988446726572,
"grad_norm": 1.5529977083206177,
"learning_rate": 8.991454436621657e-05,
"loss": 2.235,
"step": 523
},
{
"epoch": 0.5381258023106547,
"grad_norm": 1.4370229244232178,
"learning_rate": 8.959036880362763e-05,
"loss": 2.1873,
"step": 524
},
{
"epoch": 0.5391527599486521,
"grad_norm": 1.117822289466858,
"learning_rate": 8.926630379661075e-05,
"loss": 1.9378,
"step": 525
},
{
"epoch": 0.5401797175866495,
"grad_norm": 1.476722240447998,
"learning_rate": 8.894235278690104e-05,
"loss": 2.3088,
"step": 526
},
{
"epoch": 0.5412066752246469,
"grad_norm": 1.3249425888061523,
"learning_rate": 8.861851921502275e-05,
"loss": 2.2814,
"step": 527
},
{
"epoch": 0.5422336328626444,
"grad_norm": 1.800880789756775,
"learning_rate": 8.829480652025297e-05,
"loss": 2.2492,
"step": 528
},
{
"epoch": 0.5432605905006418,
"grad_norm": 1.4842054843902588,
"learning_rate": 8.797121814058501e-05,
"loss": 2.3106,
"step": 529
},
{
"epoch": 0.5442875481386393,
"grad_norm": 1.6607662439346313,
"learning_rate": 8.764775751269182e-05,
"loss": 2.437,
"step": 530
},
{
"epoch": 0.5453145057766368,
"grad_norm": 1.5801904201507568,
"learning_rate": 8.732442807188965e-05,
"loss": 2.4136,
"step": 531
},
{
"epoch": 0.5463414634146342,
"grad_norm": 1.321534514427185,
"learning_rate": 8.70012332521014e-05,
"loss": 2.3567,
"step": 532
},
{
"epoch": 0.5473684210526316,
"grad_norm": 1.2890071868896484,
"learning_rate": 8.66781764858203e-05,
"loss": 2.1654,
"step": 533
},
{
"epoch": 0.548395378690629,
"grad_norm": 1.3918139934539795,
"learning_rate": 8.635526120407329e-05,
"loss": 2.5423,
"step": 534
},
{
"epoch": 0.5494223363286265,
"grad_norm": 2.1062328815460205,
"learning_rate": 8.603249083638477e-05,
"loss": 2.1476,
"step": 535
},
{
"epoch": 0.5504492939666239,
"grad_norm": 1.81232750415802,
"learning_rate": 8.570986881074003e-05,
"loss": 2.2798,
"step": 536
},
{
"epoch": 0.5514762516046213,
"grad_norm": 1.62819242477417,
"learning_rate": 8.538739855354886e-05,
"loss": 2.2052,
"step": 537
},
{
"epoch": 0.5525032092426188,
"grad_norm": 1.3328535556793213,
"learning_rate": 8.506508348960924e-05,
"loss": 2.2646,
"step": 538
},
{
"epoch": 0.5535301668806162,
"grad_norm": 1.2471153736114502,
"learning_rate": 8.474292704207094e-05,
"loss": 2.23,
"step": 539
},
{
"epoch": 0.5545571245186136,
"grad_norm": 1.668730616569519,
"learning_rate": 8.442093263239912e-05,
"loss": 2.2528,
"step": 540
},
{
"epoch": 0.555584082156611,
"grad_norm": 2.2429211139678955,
"learning_rate": 8.409910368033795e-05,
"loss": 2.2875,
"step": 541
},
{
"epoch": 0.5566110397946085,
"grad_norm": 1.9282841682434082,
"learning_rate": 8.377744360387447e-05,
"loss": 2.1459,
"step": 542
},
{
"epoch": 0.5576379974326059,
"grad_norm": 1.3737177848815918,
"learning_rate": 8.345595581920205e-05,
"loss": 2.31,
"step": 543
},
{
"epoch": 0.5586649550706033,
"grad_norm": 1.400428056716919,
"learning_rate": 8.313464374068437e-05,
"loss": 1.9801,
"step": 544
},
{
"epoch": 0.5596919127086007,
"grad_norm": 1.5178526639938354,
"learning_rate": 8.28135107808189e-05,
"loss": 2.3441,
"step": 545
},
{
"epoch": 0.5607188703465982,
"grad_norm": 1.2568373680114746,
"learning_rate": 8.249256035020086e-05,
"loss": 2.2817,
"step": 546
},
{
"epoch": 0.5617458279845956,
"grad_norm": 1.4138679504394531,
"learning_rate": 8.217179585748688e-05,
"loss": 2.3209,
"step": 547
},
{
"epoch": 0.562772785622593,
"grad_norm": 1.4604218006134033,
"learning_rate": 8.185122070935884e-05,
"loss": 2.1436,
"step": 548
},
{
"epoch": 0.5637997432605905,
"grad_norm": 1.4326988458633423,
"learning_rate": 8.15308383104877e-05,
"loss": 2.4009,
"step": 549
},
{
"epoch": 0.5648267008985879,
"grad_norm": 1.4617232084274292,
"learning_rate": 8.121065206349729e-05,
"loss": 2.1219,
"step": 550
},
{
"epoch": 0.5658536585365853,
"grad_norm": 1.1367815732955933,
"learning_rate": 8.089066536892824e-05,
"loss": 2.1921,
"step": 551
},
{
"epoch": 0.5668806161745829,
"grad_norm": 1.0669057369232178,
"learning_rate": 8.057088162520186e-05,
"loss": 2.0411,
"step": 552
},
{
"epoch": 0.5679075738125803,
"grad_norm": 1.1332672834396362,
"learning_rate": 8.02513042285839e-05,
"loss": 2.1418,
"step": 553
},
{
"epoch": 0.5689345314505777,
"grad_norm": 1.5040186643600464,
"learning_rate": 7.993193657314875e-05,
"loss": 2.2761,
"step": 554
},
{
"epoch": 0.5699614890885751,
"grad_norm": 1.1837241649627686,
"learning_rate": 7.961278205074313e-05,
"loss": 2.2972,
"step": 555
},
{
"epoch": 0.5709884467265726,
"grad_norm": 1.3050509691238403,
"learning_rate": 7.929384405095025e-05,
"loss": 2.2907,
"step": 556
},
{
"epoch": 0.57201540436457,
"grad_norm": 1.4179966449737549,
"learning_rate": 7.897512596105368e-05,
"loss": 2.4073,
"step": 557
},
{
"epoch": 0.5730423620025674,
"grad_norm": 1.321635365486145,
"learning_rate": 7.865663116600148e-05,
"loss": 2.3307,
"step": 558
},
{
"epoch": 0.5740693196405648,
"grad_norm": 1.3887020349502563,
"learning_rate": 7.833836304837021e-05,
"loss": 2.1635,
"step": 559
},
{
"epoch": 0.5750962772785623,
"grad_norm": 1.4184433221817017,
"learning_rate": 7.802032498832895e-05,
"loss": 2.1665,
"step": 560
},
{
"epoch": 0.5761232349165597,
"grad_norm": 1.585056185722351,
"learning_rate": 7.770252036360351e-05,
"loss": 2.366,
"step": 561
},
{
"epoch": 0.5771501925545571,
"grad_norm": 1.7346497774124146,
"learning_rate": 7.738495254944042e-05,
"loss": 2.3033,
"step": 562
},
{
"epoch": 0.5781771501925546,
"grad_norm": 1.9621151685714722,
"learning_rate": 7.706762491857126e-05,
"loss": 2.2521,
"step": 563
},
{
"epoch": 0.579204107830552,
"grad_norm": 2.09137225151062,
"learning_rate": 7.675054084117672e-05,
"loss": 2.3874,
"step": 564
},
{
"epoch": 0.5802310654685494,
"grad_norm": 1.4368287324905396,
"learning_rate": 7.643370368485072e-05,
"loss": 2.1901,
"step": 565
},
{
"epoch": 0.5812580231065468,
"grad_norm": 1.4019243717193604,
"learning_rate": 7.611711681456493e-05,
"loss": 2.2313,
"step": 566
},
{
"epoch": 0.5822849807445443,
"grad_norm": 1.383966326713562,
"learning_rate": 7.580078359263267e-05,
"loss": 2.2297,
"step": 567
},
{
"epoch": 0.5833119383825417,
"grad_norm": 1.407023310661316,
"learning_rate": 7.54847073786735e-05,
"loss": 2.4025,
"step": 568
},
{
"epoch": 0.5843388960205391,
"grad_norm": 1.3167904615402222,
"learning_rate": 7.516889152957744e-05,
"loss": 2.2873,
"step": 569
},
{
"epoch": 0.5853658536585366,
"grad_norm": 1.6377153396606445,
"learning_rate": 7.485333939946926e-05,
"loss": 2.2427,
"step": 570
},
{
"epoch": 0.586392811296534,
"grad_norm": 2.095268726348877,
"learning_rate": 7.453805433967287e-05,
"loss": 2.3737,
"step": 571
},
{
"epoch": 0.5874197689345314,
"grad_norm": 1.2016150951385498,
"learning_rate": 7.422303969867581e-05,
"loss": 2.2338,
"step": 572
},
{
"epoch": 0.5884467265725288,
"grad_norm": 1.3786977529525757,
"learning_rate": 7.39082988220936e-05,
"loss": 2.1732,
"step": 573
},
{
"epoch": 0.5894736842105263,
"grad_norm": 1.3394079208374023,
"learning_rate": 7.359383505263431e-05,
"loss": 2.2412,
"step": 574
},
{
"epoch": 0.5905006418485238,
"grad_norm": 1.994136095046997,
"learning_rate": 7.327965173006286e-05,
"loss": 2.455,
"step": 575
},
{
"epoch": 0.5915275994865212,
"grad_norm": 1.2922275066375732,
"learning_rate": 7.296575219116582e-05,
"loss": 2.2607,
"step": 576
},
{
"epoch": 0.5925545571245187,
"grad_norm": 1.4127111434936523,
"learning_rate": 7.265213976971577e-05,
"loss": 2.3233,
"step": 577
},
{
"epoch": 0.5935815147625161,
"grad_norm": 1.5525174140930176,
"learning_rate": 7.233881779643594e-05,
"loss": 2.2218,
"step": 578
},
{
"epoch": 0.5946084724005135,
"grad_norm": 1.2909669876098633,
"learning_rate": 7.202578959896491e-05,
"loss": 2.4499,
"step": 579
},
{
"epoch": 0.5956354300385109,
"grad_norm": 1.5280107259750366,
"learning_rate": 7.171305850182113e-05,
"loss": 2.5343,
"step": 580
},
{
"epoch": 0.5966623876765084,
"grad_norm": 1.4098429679870605,
"learning_rate": 7.140062782636777e-05,
"loss": 2.2938,
"step": 581
},
{
"epoch": 0.5976893453145058,
"grad_norm": 1.3740651607513428,
"learning_rate": 7.108850089077735e-05,
"loss": 2.3292,
"step": 582
},
{
"epoch": 0.5987163029525032,
"grad_norm": 1.922861099243164,
"learning_rate": 7.077668100999648e-05,
"loss": 2.6001,
"step": 583
},
{
"epoch": 0.5997432605905006,
"grad_norm": 1.2894479036331177,
"learning_rate": 7.046517149571075e-05,
"loss": 2.1586,
"step": 584
},
{
"epoch": 0.6007702182284981,
"grad_norm": 1.6118210554122925,
"learning_rate": 7.015397565630944e-05,
"loss": 2.5227,
"step": 585
},
{
"epoch": 0.6017971758664955,
"grad_norm": 1.51421058177948,
"learning_rate": 6.98430967968505e-05,
"loss": 2.4502,
"step": 586
},
{
"epoch": 0.6028241335044929,
"grad_norm": 1.5112583637237549,
"learning_rate": 6.953253821902532e-05,
"loss": 2.1649,
"step": 587
},
{
"epoch": 0.6038510911424904,
"grad_norm": 1.23862624168396,
"learning_rate": 6.922230322112382e-05,
"loss": 2.2405,
"step": 588
},
{
"epoch": 0.6048780487804878,
"grad_norm": 1.5707911252975464,
"learning_rate": 6.891239509799931e-05,
"loss": 2.3034,
"step": 589
},
{
"epoch": 0.6059050064184852,
"grad_norm": 1.347216010093689,
"learning_rate": 6.86028171410335e-05,
"loss": 2.1914,
"step": 590
},
{
"epoch": 0.6069319640564826,
"grad_norm": 1.8234333992004395,
"learning_rate": 6.829357263810156e-05,
"loss": 2.215,
"step": 591
},
{
"epoch": 0.6079589216944801,
"grad_norm": 1.4702991247177124,
"learning_rate": 6.798466487353723e-05,
"loss": 2.3195,
"step": 592
},
{
"epoch": 0.6089858793324775,
"grad_norm": 1.6644995212554932,
"learning_rate": 6.767609712809793e-05,
"loss": 2.3335,
"step": 593
},
{
"epoch": 0.6100128369704749,
"grad_norm": 2.144955635070801,
"learning_rate": 6.736787267892991e-05,
"loss": 2.4013,
"step": 594
},
{
"epoch": 0.6110397946084724,
"grad_norm": 1.5009002685546875,
"learning_rate": 6.705999479953338e-05,
"loss": 2.1646,
"step": 595
},
{
"epoch": 0.6120667522464698,
"grad_norm": 1.6303348541259766,
"learning_rate": 6.675246675972789e-05,
"loss": 2.1712,
"step": 596
},
{
"epoch": 0.6130937098844673,
"grad_norm": 1.4405221939086914,
"learning_rate": 6.644529182561739e-05,
"loss": 2.2961,
"step": 597
},
{
"epoch": 0.6141206675224647,
"grad_norm": 1.6782268285751343,
"learning_rate": 6.613847325955578e-05,
"loss": 2.2071,
"step": 598
},
{
"epoch": 0.6151476251604622,
"grad_norm": 1.2012258768081665,
"learning_rate": 6.583201432011217e-05,
"loss": 1.8268,
"step": 599
},
{
"epoch": 0.6161745827984596,
"grad_norm": 1.4990489482879639,
"learning_rate": 6.552591826203616e-05,
"loss": 2.0925,
"step": 600
},
{
"epoch": 0.617201540436457,
"grad_norm": 1.6155550479888916,
"learning_rate": 6.522018833622338e-05,
"loss": 2.2946,
"step": 601
},
{
"epoch": 0.6182284980744545,
"grad_norm": 1.6897152662277222,
"learning_rate": 6.491482778968104e-05,
"loss": 2.212,
"step": 602
},
{
"epoch": 0.6192554557124519,
"grad_norm": 1.3177330493927002,
"learning_rate": 6.460983986549321e-05,
"loss": 1.7627,
"step": 603
},
{
"epoch": 0.6202824133504493,
"grad_norm": 1.4330686330795288,
"learning_rate": 6.430522780278663e-05,
"loss": 2.4068,
"step": 604
},
{
"epoch": 0.6213093709884467,
"grad_norm": 1.3086313009262085,
"learning_rate": 6.400099483669621e-05,
"loss": 2.2901,
"step": 605
},
{
"epoch": 0.6223363286264442,
"grad_norm": 1.5514755249023438,
"learning_rate": 6.369714419833056e-05,
"loss": 2.0377,
"step": 606
},
{
"epoch": 0.6233632862644416,
"grad_norm": 1.43644380569458,
"learning_rate": 6.339367911473788e-05,
"loss": 2.0917,
"step": 607
},
{
"epoch": 0.624390243902439,
"grad_norm": 1.5321091413497925,
"learning_rate": 6.309060280887151e-05,
"loss": 2.2432,
"step": 608
},
{
"epoch": 0.6254172015404365,
"grad_norm": 1.3088302612304688,
"learning_rate": 6.278791849955583e-05,
"loss": 2.2123,
"step": 609
},
{
"epoch": 0.6264441591784339,
"grad_norm": 1.256137728691101,
"learning_rate": 6.248562940145195e-05,
"loss": 2.2079,
"step": 610
},
{
"epoch": 0.6274711168164313,
"grad_norm": 1.4580830335617065,
"learning_rate": 6.21837387250237e-05,
"loss": 2.4283,
"step": 611
},
{
"epoch": 0.6284980744544287,
"grad_norm": 1.6171642541885376,
"learning_rate": 6.188224967650347e-05,
"loss": 2.3705,
"step": 612
},
{
"epoch": 0.6295250320924262,
"grad_norm": 1.3267604112625122,
"learning_rate": 6.158116545785809e-05,
"loss": 2.2335,
"step": 613
},
{
"epoch": 0.6305519897304236,
"grad_norm": 1.5025784969329834,
"learning_rate": 6.128048926675494e-05,
"loss": 2.3648,
"step": 614
},
{
"epoch": 0.631578947368421,
"grad_norm": 1.0958548784255981,
"learning_rate": 6.098022429652794e-05,
"loss": 2.1022,
"step": 615
},
{
"epoch": 0.6326059050064184,
"grad_norm": 1.2155437469482422,
"learning_rate": 6.068037373614364e-05,
"loss": 2.2298,
"step": 616
},
{
"epoch": 0.6336328626444159,
"grad_norm": 1.4382355213165283,
"learning_rate": 6.0380940770167336e-05,
"loss": 2.3538,
"step": 617
},
{
"epoch": 0.6346598202824133,
"grad_norm": 1.2357529401779175,
"learning_rate": 6.008192857872923e-05,
"loss": 1.8669,
"step": 618
},
{
"epoch": 0.6356867779204107,
"grad_norm": 1.3660916090011597,
"learning_rate": 5.9783340337490754e-05,
"loss": 2.3527,
"step": 619
},
{
"epoch": 0.6367137355584083,
"grad_norm": 1.899346947669983,
"learning_rate": 5.94851792176107e-05,
"loss": 2.3702,
"step": 620
},
{
"epoch": 0.6377406931964057,
"grad_norm": 1.207220196723938,
"learning_rate": 5.9187448385711685e-05,
"loss": 2.2377,
"step": 621
},
{
"epoch": 0.6387676508344031,
"grad_norm": 1.7264626026153564,
"learning_rate": 5.889015100384636e-05,
"loss": 2.3115,
"step": 622
},
{
"epoch": 0.6397946084724005,
"grad_norm": 1.4954859018325806,
"learning_rate": 5.859329022946399e-05,
"loss": 2.1911,
"step": 623
},
{
"epoch": 0.640821566110398,
"grad_norm": 1.2664867639541626,
"learning_rate": 5.8296869215376846e-05,
"loss": 2.0781,
"step": 624
},
{
"epoch": 0.6418485237483954,
"grad_norm": 1.6995880603790283,
"learning_rate": 5.8000891109726706e-05,
"loss": 2.2242,
"step": 625
},
{
"epoch": 0.6428754813863928,
"grad_norm": 1.3912755250930786,
"learning_rate": 5.770535905595138e-05,
"loss": 2.1297,
"step": 626
},
{
"epoch": 0.6439024390243903,
"grad_norm": 1.3863202333450317,
"learning_rate": 5.741027619275146e-05,
"loss": 2.4886,
"step": 627
},
{
"epoch": 0.6449293966623877,
"grad_norm": 1.896565318107605,
"learning_rate": 5.7115645654056815e-05,
"loss": 2.4106,
"step": 628
},
{
"epoch": 0.6459563543003851,
"grad_norm": 1.4082729816436768,
"learning_rate": 5.6821470568993606e-05,
"loss": 2.4035,
"step": 629
},
{
"epoch": 0.6469833119383825,
"grad_norm": 1.357062578201294,
"learning_rate": 5.6527754061850554e-05,
"loss": 2.0427,
"step": 630
},
{
"epoch": 0.64801026957638,
"grad_norm": 1.3760401010513306,
"learning_rate": 5.623449925204627e-05,
"loss": 2.0807,
"step": 631
},
{
"epoch": 0.6490372272143774,
"grad_norm": 1.399768590927124,
"learning_rate": 5.594170925409579e-05,
"loss": 2.1833,
"step": 632
},
{
"epoch": 0.6500641848523748,
"grad_norm": 1.6240482330322266,
"learning_rate": 5.564938717757766e-05,
"loss": 2.4324,
"step": 633
},
{
"epoch": 0.6510911424903723,
"grad_norm": 1.388628363609314,
"learning_rate": 5.5357536127100904e-05,
"loss": 2.2536,
"step": 634
},
{
"epoch": 0.6521181001283697,
"grad_norm": 1.4866918325424194,
"learning_rate": 5.506615920227186e-05,
"loss": 2.1934,
"step": 635
},
{
"epoch": 0.6531450577663671,
"grad_norm": 1.1647313833236694,
"learning_rate": 5.4775259497661555e-05,
"loss": 2.1951,
"step": 636
},
{
"epoch": 0.6541720154043645,
"grad_norm": 1.377734661102295,
"learning_rate": 5.448484010277267e-05,
"loss": 2.3885,
"step": 637
},
{
"epoch": 0.655198973042362,
"grad_norm": 1.5145090818405151,
"learning_rate": 5.419490410200675e-05,
"loss": 2.2318,
"step": 638
},
{
"epoch": 0.6562259306803594,
"grad_norm": 1.4447916746139526,
"learning_rate": 5.390545457463134e-05,
"loss": 2.1195,
"step": 639
},
{
"epoch": 0.6572528883183568,
"grad_norm": 2.2646050453186035,
"learning_rate": 5.361649459474756e-05,
"loss": 2.1927,
"step": 640
},
{
"epoch": 0.6582798459563542,
"grad_norm": 1.7940598726272583,
"learning_rate": 5.332802723125716e-05,
"loss": 2.2811,
"step": 641
},
{
"epoch": 0.6593068035943518,
"grad_norm": 1.3245995044708252,
"learning_rate": 5.304005554783015e-05,
"loss": 2.2386,
"step": 642
},
{
"epoch": 0.6603337612323492,
"grad_norm": 2.0891315937042236,
"learning_rate": 5.275258260287211e-05,
"loss": 2.2728,
"step": 643
},
{
"epoch": 0.6613607188703466,
"grad_norm": 1.4078350067138672,
"learning_rate": 5.246561144949173e-05,
"loss": 2.1705,
"step": 644
},
{
"epoch": 0.6623876765083441,
"grad_norm": 1.688930630683899,
"learning_rate": 5.217914513546848e-05,
"loss": 2.5033,
"step": 645
},
{
"epoch": 0.6634146341463415,
"grad_norm": 1.6725249290466309,
"learning_rate": 5.1893186703220165e-05,
"loss": 2.4268,
"step": 646
},
{
"epoch": 0.6644415917843389,
"grad_norm": 1.7176076173782349,
"learning_rate": 5.160773918977061e-05,
"loss": 2.388,
"step": 647
},
{
"epoch": 0.6654685494223364,
"grad_norm": 1.360771894454956,
"learning_rate": 5.13228056267175e-05,
"loss": 2.2381,
"step": 648
},
{
"epoch": 0.6664955070603338,
"grad_norm": 1.610059142112732,
"learning_rate": 5.103838904019993e-05,
"loss": 1.8292,
"step": 649
},
{
"epoch": 0.6675224646983312,
"grad_norm": 1.5176039934158325,
"learning_rate": 5.0754492450866607e-05,
"loss": 2.283,
"step": 650
},
{
"epoch": 0.6685494223363286,
"grad_norm": 1.2839123010635376,
"learning_rate": 5.047111887384357e-05,
"loss": 2.4285,
"step": 651
},
{
"epoch": 0.6695763799743261,
"grad_norm": 1.2786331176757812,
"learning_rate": 5.018827131870214e-05,
"loss": 2.2895,
"step": 652
},
{
"epoch": 0.6706033376123235,
"grad_norm": 1.634473204612732,
"learning_rate": 4.9905952789427126e-05,
"loss": 2.0957,
"step": 653
},
{
"epoch": 0.6716302952503209,
"grad_norm": 1.3561400175094604,
"learning_rate": 4.9624166284384656e-05,
"loss": 2.1695,
"step": 654
},
{
"epoch": 0.6726572528883183,
"grad_norm": 1.4681189060211182,
"learning_rate": 4.934291479629063e-05,
"loss": 2.415,
"step": 655
},
{
"epoch": 0.6736842105263158,
"grad_norm": 1.7720856666564941,
"learning_rate": 4.9062201312178725e-05,
"loss": 2.2386,
"step": 656
},
{
"epoch": 0.6747111681643132,
"grad_norm": 1.505091667175293,
"learning_rate": 4.8782028813368786e-05,
"loss": 2.2751,
"step": 657
},
{
"epoch": 0.6757381258023106,
"grad_norm": 1.2972487211227417,
"learning_rate": 4.850240027543509e-05,
"loss": 2.4914,
"step": 658
},
{
"epoch": 0.6767650834403081,
"grad_norm": 2.298083543777466,
"learning_rate": 4.822331866817478e-05,
"loss": 2.3478,
"step": 659
},
{
"epoch": 0.6777920410783055,
"grad_norm": 1.8115476369857788,
"learning_rate": 4.7944786955576313e-05,
"loss": 2.4564,
"step": 660
},
{
"epoch": 0.6788189987163029,
"grad_norm": 1.3283237218856812,
"learning_rate": 4.766680809578804e-05,
"loss": 2.0925,
"step": 661
},
{
"epoch": 0.6798459563543003,
"grad_norm": 2.0927574634552,
"learning_rate": 4.738938504108659e-05,
"loss": 2.1346,
"step": 662
},
{
"epoch": 0.6808729139922978,
"grad_norm": 1.2842458486557007,
"learning_rate": 4.7112520737845814e-05,
"loss": 2.176,
"step": 663
},
{
"epoch": 0.6818998716302952,
"grad_norm": 1.6555302143096924,
"learning_rate": 4.683621812650525e-05,
"loss": 2.3672,
"step": 664
},
{
"epoch": 0.6829268292682927,
"grad_norm": 1.4594167470932007,
"learning_rate": 4.6560480141539044e-05,
"loss": 2.2314,
"step": 665
},
{
"epoch": 0.6839537869062902,
"grad_norm": 1.2699614763259888,
"learning_rate": 4.628530971142471e-05,
"loss": 2.1865,
"step": 666
},
{
"epoch": 0.6849807445442876,
"grad_norm": 1.110546588897705,
"learning_rate": 4.601070975861194e-05,
"loss": 1.8842,
"step": 667
},
{
"epoch": 0.686007702182285,
"grad_norm": 1.594329833984375,
"learning_rate": 4.573668319949179e-05,
"loss": 2.3873,
"step": 668
},
{
"epoch": 0.6870346598202824,
"grad_norm": 1.4520328044891357,
"learning_rate": 4.5463232944365554e-05,
"loss": 2.3261,
"step": 669
},
{
"epoch": 0.6880616174582799,
"grad_norm": 1.9483394622802734,
"learning_rate": 4.519036189741386e-05,
"loss": 2.2976,
"step": 670
},
{
"epoch": 0.6890885750962773,
"grad_norm": 2.0844573974609375,
"learning_rate": 4.4918072956665915e-05,
"loss": 2.1116,
"step": 671
},
{
"epoch": 0.6901155327342747,
"grad_norm": 1.7560206651687622,
"learning_rate": 4.464636901396852e-05,
"loss": 2.3037,
"step": 672
},
{
"epoch": 0.6911424903722722,
"grad_norm": 1.24983549118042,
"learning_rate": 4.4375252954955635e-05,
"loss": 2.0582,
"step": 673
},
{
"epoch": 0.6921694480102696,
"grad_norm": 1.5504297018051147,
"learning_rate": 4.410472765901755e-05,
"loss": 2.4398,
"step": 674
},
{
"epoch": 0.693196405648267,
"grad_norm": 1.6074193716049194,
"learning_rate": 4.3834795999270364e-05,
"loss": 2.2749,
"step": 675
},
{
"epoch": 0.6942233632862644,
"grad_norm": 2.164050817489624,
"learning_rate": 4.356546084252548e-05,
"loss": 2.3882,
"step": 676
},
{
"epoch": 0.6952503209242619,
"grad_norm": 1.8083585500717163,
"learning_rate": 4.3296725049259015e-05,
"loss": 2.244,
"step": 677
},
{
"epoch": 0.6962772785622593,
"grad_norm": 1.108267068862915,
"learning_rate": 4.302859147358168e-05,
"loss": 1.9859,
"step": 678
},
{
"epoch": 0.6973042362002567,
"grad_norm": 1.5166776180267334,
"learning_rate": 4.2761062963208275e-05,
"loss": 2.296,
"step": 679
},
{
"epoch": 0.6983311938382541,
"grad_norm": 1.389072060585022,
"learning_rate": 4.249414235942755e-05,
"loss": 2.1956,
"step": 680
},
{
"epoch": 0.6993581514762516,
"grad_norm": 1.3236082792282104,
"learning_rate": 4.222783249707184e-05,
"loss": 2.0828,
"step": 681
},
{
"epoch": 0.700385109114249,
"grad_norm": 1.5802843570709229,
"learning_rate": 4.196213620448723e-05,
"loss": 2.2064,
"step": 682
},
{
"epoch": 0.7014120667522464,
"grad_norm": 1.5572673082351685,
"learning_rate": 4.169705630350335e-05,
"loss": 2.3091,
"step": 683
},
{
"epoch": 0.7024390243902439,
"grad_norm": 1.3279824256896973,
"learning_rate": 4.143259560940341e-05,
"loss": 2.1509,
"step": 684
},
{
"epoch": 0.7034659820282413,
"grad_norm": 1.4098100662231445,
"learning_rate": 4.116875693089439e-05,
"loss": 2.352,
"step": 685
},
{
"epoch": 0.7044929396662387,
"grad_norm": 1.4625084400177002,
"learning_rate": 4.0905543070077036e-05,
"loss": 2.0663,
"step": 686
},
{
"epoch": 0.7055198973042363,
"grad_norm": 1.8245489597320557,
"learning_rate": 4.064295682241631e-05,
"loss": 2.4885,
"step": 687
},
{
"epoch": 0.7065468549422337,
"grad_norm": 1.1943061351776123,
"learning_rate": 4.038100097671155e-05,
"loss": 2.2384,
"step": 688
},
{
"epoch": 0.7075738125802311,
"grad_norm": 1.149429440498352,
"learning_rate": 4.0119678315067025e-05,
"loss": 2.0727,
"step": 689
},
{
"epoch": 0.7086007702182285,
"grad_norm": 1.3723962306976318,
"learning_rate": 3.985899161286205e-05,
"loss": 2.3764,
"step": 690
},
{
"epoch": 0.709627727856226,
"grad_norm": 1.4548147916793823,
"learning_rate": 3.959894363872192e-05,
"loss": 2.1269,
"step": 691
},
{
"epoch": 0.7106546854942234,
"grad_norm": 1.5420607328414917,
"learning_rate": 3.933953715448822e-05,
"loss": 2.1683,
"step": 692
},
{
"epoch": 0.7116816431322208,
"grad_norm": 1.3956791162490845,
"learning_rate": 3.90807749151896e-05,
"loss": 2.2853,
"step": 693
},
{
"epoch": 0.7127086007702182,
"grad_norm": 1.335822582244873,
"learning_rate": 3.882265966901257e-05,
"loss": 2.4303,
"step": 694
},
{
"epoch": 0.7137355584082157,
"grad_norm": 1.4300553798675537,
"learning_rate": 3.85651941572721e-05,
"loss": 2.2918,
"step": 695
},
{
"epoch": 0.7147625160462131,
"grad_norm": 1.398274540901184,
"learning_rate": 3.8308381114382776e-05,
"loss": 2.3322,
"step": 696
},
{
"epoch": 0.7157894736842105,
"grad_norm": 2.2710065841674805,
"learning_rate": 3.805222326782958e-05,
"loss": 2.2784,
"step": 697
},
{
"epoch": 0.716816431322208,
"grad_norm": 1.4800862073898315,
"learning_rate": 3.7796723338138995e-05,
"loss": 2.276,
"step": 698
},
{
"epoch": 0.7178433889602054,
"grad_norm": 1.4330739974975586,
"learning_rate": 3.7541884038850125e-05,
"loss": 2.2517,
"step": 699
},
{
"epoch": 0.7188703465982028,
"grad_norm": 1.2923405170440674,
"learning_rate": 3.728770807648574e-05,
"loss": 2.0609,
"step": 700
},
{
"epoch": 0.7198973042362002,
"grad_norm": 1.1865580081939697,
"learning_rate": 3.703419815052371e-05,
"loss": 2.0863,
"step": 701
},
{
"epoch": 0.7209242618741977,
"grad_norm": 1.1879284381866455,
"learning_rate": 3.6781356953368284e-05,
"loss": 2.1486,
"step": 702
},
{
"epoch": 0.7219512195121951,
"grad_norm": 1.1846188306808472,
"learning_rate": 3.6529187170321446e-05,
"loss": 2.2775,
"step": 703
},
{
"epoch": 0.7229781771501925,
"grad_norm": 1.822152853012085,
"learning_rate": 3.627769147955433e-05,
"loss": 1.9519,
"step": 704
},
{
"epoch": 0.72400513478819,
"grad_norm": 1.2924715280532837,
"learning_rate": 3.602687255207903e-05,
"loss": 2.1845,
"step": 705
},
{
"epoch": 0.7250320924261874,
"grad_norm": 1.6589620113372803,
"learning_rate": 3.5776733051719936e-05,
"loss": 2.4154,
"step": 706
},
{
"epoch": 0.7260590500641848,
"grad_norm": 1.5250493288040161,
"learning_rate": 3.5527275635085666e-05,
"loss": 2.447,
"step": 707
},
{
"epoch": 0.7270860077021822,
"grad_norm": 1.4836504459381104,
"learning_rate": 3.527850295154075e-05,
"loss": 2.2225,
"step": 708
},
{
"epoch": 0.7281129653401797,
"grad_norm": 1.7604604959487915,
"learning_rate": 3.5030417643177415e-05,
"loss": 2.396,
"step": 709
},
{
"epoch": 0.7291399229781772,
"grad_norm": 1.4900736808776855,
"learning_rate": 3.47830223447877e-05,
"loss": 2.3763,
"step": 710
},
{
"epoch": 0.7301668806161746,
"grad_norm": 1.5196295976638794,
"learning_rate": 3.453631968383538e-05,
"loss": 2.2122,
"step": 711
},
{
"epoch": 0.7311938382541721,
"grad_norm": 1.2298855781555176,
"learning_rate": 3.4290312280428064e-05,
"loss": 2.1634,
"step": 712
},
{
"epoch": 0.7322207958921695,
"grad_norm": 1.4192733764648438,
"learning_rate": 3.404500274728938e-05,
"loss": 1.9621,
"step": 713
},
{
"epoch": 0.7332477535301669,
"grad_norm": 1.2010129690170288,
"learning_rate": 3.3800393689731146e-05,
"loss": 2.14,
"step": 714
},
{
"epoch": 0.7342747111681643,
"grad_norm": 1.346863031387329,
"learning_rate": 3.355648770562587e-05,
"loss": 2.3788,
"step": 715
},
{
"epoch": 0.7353016688061618,
"grad_norm": 1.6066118478775024,
"learning_rate": 3.331328738537902e-05,
"loss": 2.5079,
"step": 716
},
{
"epoch": 0.7363286264441592,
"grad_norm": 1.403447151184082,
"learning_rate": 3.307079531190155e-05,
"loss": 2.2288,
"step": 717
},
{
"epoch": 0.7373555840821566,
"grad_norm": 1.413053035736084,
"learning_rate": 3.28290140605825e-05,
"loss": 1.987,
"step": 718
},
{
"epoch": 0.738382541720154,
"grad_norm": 1.3956674337387085,
"learning_rate": 3.2587946199261586e-05,
"loss": 2.2841,
"step": 719
},
{
"epoch": 0.7394094993581515,
"grad_norm": 1.6530338525772095,
"learning_rate": 3.2347594288201976e-05,
"loss": 2.2004,
"step": 720
},
{
"epoch": 0.7404364569961489,
"grad_norm": 1.269932508468628,
"learning_rate": 3.2107960880063094e-05,
"loss": 2.1639,
"step": 721
},
{
"epoch": 0.7414634146341463,
"grad_norm": 1.6402631998062134,
"learning_rate": 3.186904851987351e-05,
"loss": 2.3119,
"step": 722
},
{
"epoch": 0.7424903722721438,
"grad_norm": 1.7420886754989624,
"learning_rate": 3.1630859745003794e-05,
"loss": 2.2652,
"step": 723
},
{
"epoch": 0.7435173299101412,
"grad_norm": 1.468622088432312,
"learning_rate": 3.139339708513981e-05,
"loss": 2.1195,
"step": 724
},
{
"epoch": 0.7445442875481386,
"grad_norm": 1.339032530784607,
"learning_rate": 3.115666306225562e-05,
"loss": 2.1675,
"step": 725
},
{
"epoch": 0.745571245186136,
"grad_norm": 1.675392746925354,
"learning_rate": 3.092066019058689e-05,
"loss": 2.2438,
"step": 726
},
{
"epoch": 0.7465982028241335,
"grad_norm": 1.2472889423370361,
"learning_rate": 3.0685390976603945e-05,
"loss": 2.2506,
"step": 727
},
{
"epoch": 0.7476251604621309,
"grad_norm": 1.4236087799072266,
"learning_rate": 3.0450857918985387e-05,
"loss": 2.0269,
"step": 728
},
{
"epoch": 0.7486521181001283,
"grad_norm": 1.3883495330810547,
"learning_rate": 3.021706350859147e-05,
"loss": 2.3359,
"step": 729
},
{
"epoch": 0.7496790757381258,
"grad_norm": 1.3572190999984741,
"learning_rate": 2.998401022843761e-05,
"loss": 2.0806,
"step": 730
},
{
"epoch": 0.7507060333761232,
"grad_norm": 1.2549355030059814,
"learning_rate": 2.9751700553668072e-05,
"loss": 2.1428,
"step": 731
},
{
"epoch": 0.7517329910141207,
"grad_norm": 1.676809310913086,
"learning_rate": 2.9520136951529576e-05,
"loss": 2.4592,
"step": 732
},
{
"epoch": 0.7517329910141207,
"eval_loss": 2.252448081970215,
"eval_runtime": 26.8305,
"eval_samples_per_second": 15.281,
"eval_steps_per_second": 7.641,
"step": 732
},
{
"epoch": 0.7527599486521181,
"grad_norm": 1.2231281995773315,
"learning_rate": 2.9289321881345254e-05,
"loss": 2.0821,
"step": 733
},
{
"epoch": 0.7537869062901156,
"grad_norm": 1.4395884275436401,
"learning_rate": 2.9059257794488424e-05,
"loss": 2.2183,
"step": 734
},
{
"epoch": 0.754813863928113,
"grad_norm": 1.5501489639282227,
"learning_rate": 2.882994713435658e-05,
"loss": 2.4913,
"step": 735
},
{
"epoch": 0.7558408215661104,
"grad_norm": 1.4353766441345215,
"learning_rate": 2.860139233634547e-05,
"loss": 2.2493,
"step": 736
},
{
"epoch": 0.7568677792041079,
"grad_norm": 1.2123923301696777,
"learning_rate": 2.8373595827823086e-05,
"loss": 2.3579,
"step": 737
},
{
"epoch": 0.7578947368421053,
"grad_norm": 1.1628131866455078,
"learning_rate": 2.8146560028104153e-05,
"loss": 2.1711,
"step": 738
},
{
"epoch": 0.7589216944801027,
"grad_norm": 1.0057865381240845,
"learning_rate": 2.792028734842418e-05,
"loss": 1.7073,
"step": 739
},
{
"epoch": 0.7599486521181001,
"grad_norm": 1.5157403945922852,
"learning_rate": 2.7694780191914006e-05,
"loss": 2.3561,
"step": 740
},
{
"epoch": 0.7609756097560976,
"grad_norm": 1.333644151687622,
"learning_rate": 2.7470040953574238e-05,
"loss": 2.3166,
"step": 741
},
{
"epoch": 0.762002567394095,
"grad_norm": 1.4937711954116821,
"learning_rate": 2.724607202024969e-05,
"loss": 2.2235,
"step": 742
},
{
"epoch": 0.7630295250320924,
"grad_norm": 1.6282092332839966,
"learning_rate": 2.7022875770604284e-05,
"loss": 2.3438,
"step": 743
},
{
"epoch": 0.7640564826700899,
"grad_norm": 1.3984050750732422,
"learning_rate": 2.6800454575095567e-05,
"loss": 2.2035,
"step": 744
},
{
"epoch": 0.7650834403080873,
"grad_norm": 1.3804353475570679,
"learning_rate": 2.6578810795949682e-05,
"loss": 2.4949,
"step": 745
},
{
"epoch": 0.7661103979460847,
"grad_norm": 1.403314232826233,
"learning_rate": 2.6357946787136113e-05,
"loss": 2.4336,
"step": 746
},
{
"epoch": 0.7671373555840821,
"grad_norm": 1.34967041015625,
"learning_rate": 2.613786489434287e-05,
"loss": 2.2571,
"step": 747
},
{
"epoch": 0.7681643132220796,
"grad_norm": 1.5277897119522095,
"learning_rate": 2.591856745495148e-05,
"loss": 2.1579,
"step": 748
},
{
"epoch": 0.769191270860077,
"grad_norm": 1.5782662630081177,
"learning_rate": 2.5700056798012163e-05,
"loss": 2.2695,
"step": 749
},
{
"epoch": 0.7702182284980744,
"grad_norm": 1.5577448606491089,
"learning_rate": 2.548233524421911e-05,
"loss": 2.234,
"step": 750
},
{
"epoch": 0.7712451861360718,
"grad_norm": 1.2873456478118896,
"learning_rate": 2.5265405105885855e-05,
"loss": 2.0918,
"step": 751
},
{
"epoch": 0.7722721437740693,
"grad_norm": 1.7132203578948975,
"learning_rate": 2.5049268686920667e-05,
"loss": 2.2356,
"step": 752
},
{
"epoch": 0.7732991014120667,
"grad_norm": 1.3685425519943237,
"learning_rate": 2.4833928282802132e-05,
"loss": 2.1547,
"step": 753
},
{
"epoch": 0.7743260590500641,
"grad_norm": 1.3102649450302124,
"learning_rate": 2.461938618055478e-05,
"loss": 2.1503,
"step": 754
},
{
"epoch": 0.7753530166880617,
"grad_norm": 1.4933326244354248,
"learning_rate": 2.440564465872469e-05,
"loss": 2.2991,
"step": 755
},
{
"epoch": 0.7763799743260591,
"grad_norm": 1.284360647201538,
"learning_rate": 2.4192705987355424e-05,
"loss": 2.145,
"step": 756
},
{
"epoch": 0.7774069319640565,
"grad_norm": 1.3690736293792725,
"learning_rate": 2.3980572427963887e-05,
"loss": 2.2586,
"step": 757
},
{
"epoch": 0.778433889602054,
"grad_norm": 1.6226760149002075,
"learning_rate": 2.3769246233516242e-05,
"loss": 2.4489,
"step": 758
},
{
"epoch": 0.7794608472400514,
"grad_norm": 1.2062180042266846,
"learning_rate": 2.3558729648404065e-05,
"loss": 2.013,
"step": 759
},
{
"epoch": 0.7804878048780488,
"grad_norm": 1.2239223718643188,
"learning_rate": 2.33490249084204e-05,
"loss": 2.2947,
"step": 760
},
{
"epoch": 0.7815147625160462,
"grad_norm": 1.7042533159255981,
"learning_rate": 2.3140134240736168e-05,
"loss": 2.1473,
"step": 761
},
{
"epoch": 0.7825417201540437,
"grad_norm": 1.2973995208740234,
"learning_rate": 2.2932059863876365e-05,
"loss": 2.1991,
"step": 762
},
{
"epoch": 0.7835686777920411,
"grad_norm": 1.2338628768920898,
"learning_rate": 2.272480398769662e-05,
"loss": 2.1853,
"step": 763
},
{
"epoch": 0.7845956354300385,
"grad_norm": 2.069164991378784,
"learning_rate": 2.2518368813359637e-05,
"loss": 2.2275,
"step": 764
},
{
"epoch": 0.785622593068036,
"grad_norm": 1.2554938793182373,
"learning_rate": 2.231275653331181e-05,
"loss": 2.2557,
"step": 765
},
{
"epoch": 0.7866495507060334,
"grad_norm": 1.2675412893295288,
"learning_rate": 2.2107969331260048e-05,
"loss": 2.3231,
"step": 766
},
{
"epoch": 0.7876765083440308,
"grad_norm": 1.4239006042480469,
"learning_rate": 2.1904009382148472e-05,
"loss": 2.2516,
"step": 767
},
{
"epoch": 0.7887034659820282,
"grad_norm": 1.2731049060821533,
"learning_rate": 2.170087885213541e-05,
"loss": 2.1673,
"step": 768
},
{
"epoch": 0.7897304236200257,
"grad_norm": 1.8812044858932495,
"learning_rate": 2.1498579898570227e-05,
"loss": 2.1782,
"step": 769
},
{
"epoch": 0.7907573812580231,
"grad_norm": 1.426591396331787,
"learning_rate": 2.1297114669970618e-05,
"loss": 1.9592,
"step": 770
},
{
"epoch": 0.7917843388960205,
"grad_norm": 1.302981972694397,
"learning_rate": 2.109648530599968e-05,
"loss": 2.2618,
"step": 771
},
{
"epoch": 0.7928112965340179,
"grad_norm": 1.8140078783035278,
"learning_rate": 2.089669393744319e-05,
"loss": 2.3927,
"step": 772
},
{
"epoch": 0.7938382541720154,
"grad_norm": 2.2119557857513428,
"learning_rate": 2.0697742686187017e-05,
"loss": 2.2843,
"step": 773
},
{
"epoch": 0.7948652118100128,
"grad_norm": 1.7960866689682007,
"learning_rate": 2.049963366519446e-05,
"loss": 2.4243,
"step": 774
},
{
"epoch": 0.7958921694480102,
"grad_norm": 1.459788203239441,
"learning_rate": 2.030236897848402e-05,
"loss": 2.2595,
"step": 775
},
{
"epoch": 0.7969191270860077,
"grad_norm": 1.232600450515747,
"learning_rate": 2.0105950721106894e-05,
"loss": 2.1295,
"step": 776
},
{
"epoch": 0.7979460847240052,
"grad_norm": 1.4180529117584229,
"learning_rate": 1.9910380979124754e-05,
"loss": 2.1814,
"step": 777
},
{
"epoch": 0.7989730423620026,
"grad_norm": 1.501356601715088,
"learning_rate": 1.971566182958765e-05,
"loss": 2.4455,
"step": 778
},
{
"epoch": 0.8,
"grad_norm": 1.6094731092453003,
"learning_rate": 1.952179534051183e-05,
"loss": 2.3726,
"step": 779
},
{
"epoch": 0.8010269576379975,
"grad_norm": 1.3195991516113281,
"learning_rate": 1.9328783570857957e-05,
"loss": 2.2679,
"step": 780
},
{
"epoch": 0.8020539152759949,
"grad_norm": 1.4381358623504639,
"learning_rate": 1.9136628570509063e-05,
"loss": 2.0012,
"step": 781
},
{
"epoch": 0.8030808729139923,
"grad_norm": 1.612783670425415,
"learning_rate": 1.8945332380248913e-05,
"loss": 2.2622,
"step": 782
},
{
"epoch": 0.8041078305519898,
"grad_norm": 1.2924425601959229,
"learning_rate": 1.8754897031740192e-05,
"loss": 2.0476,
"step": 783
},
{
"epoch": 0.8051347881899872,
"grad_norm": 1.5139434337615967,
"learning_rate": 1.856532454750307e-05,
"loss": 2.0912,
"step": 784
},
{
"epoch": 0.8061617458279846,
"grad_norm": 1.4951143264770508,
"learning_rate": 1.8376616940893654e-05,
"loss": 2.3838,
"step": 785
},
{
"epoch": 0.807188703465982,
"grad_norm": 1.432755947113037,
"learning_rate": 1.8188776216082603e-05,
"loss": 2.2717,
"step": 786
},
{
"epoch": 0.8082156611039795,
"grad_norm": 1.244824767112732,
"learning_rate": 1.800180436803386e-05,
"loss": 1.8481,
"step": 787
},
{
"epoch": 0.8092426187419769,
"grad_norm": 1.570754885673523,
"learning_rate": 1.7815703382483417e-05,
"loss": 2.3096,
"step": 788
},
{
"epoch": 0.8102695763799743,
"grad_norm": 1.3875128030776978,
"learning_rate": 1.7630475235918308e-05,
"loss": 2.0865,
"step": 789
},
{
"epoch": 0.8112965340179717,
"grad_norm": 1.4651671648025513,
"learning_rate": 1.7446121895555555e-05,
"loss": 2.4152,
"step": 790
},
{
"epoch": 0.8123234916559692,
"grad_norm": 1.1608806848526,
"learning_rate": 1.7262645319321324e-05,
"loss": 2.214,
"step": 791
},
{
"epoch": 0.8133504492939666,
"grad_norm": 1.622000813484192,
"learning_rate": 1.708004745583003e-05,
"loss": 2.2874,
"step": 792
},
{
"epoch": 0.814377406931964,
"grad_norm": 1.3545804023742676,
"learning_rate": 1.689833024436377e-05,
"loss": 2.3003,
"step": 793
},
{
"epoch": 0.8154043645699615,
"grad_norm": 1.4039793014526367,
"learning_rate": 1.6717495614851652e-05,
"loss": 1.8358,
"step": 794
},
{
"epoch": 0.8164313222079589,
"grad_norm": 1.3507652282714844,
"learning_rate": 1.6537545487849336e-05,
"loss": 2.2534,
"step": 795
},
{
"epoch": 0.8174582798459563,
"grad_norm": 1.4168148040771484,
"learning_rate": 1.6358481774518606e-05,
"loss": 2.2372,
"step": 796
},
{
"epoch": 0.8184852374839537,
"grad_norm": 1.266169786453247,
"learning_rate": 1.6180306376607035e-05,
"loss": 2.387,
"step": 797
},
{
"epoch": 0.8195121951219512,
"grad_norm": 1.277815818786621,
"learning_rate": 1.6003021186427893e-05,
"loss": 2.2015,
"step": 798
},
{
"epoch": 0.8205391527599486,
"grad_norm": 1.7137157917022705,
"learning_rate": 1.5826628086839968e-05,
"loss": 2.1036,
"step": 799
},
{
"epoch": 0.8215661103979461,
"grad_norm": 1.4451020956039429,
"learning_rate": 1.5651128951227612e-05,
"loss": 2.0347,
"step": 800
},
{
"epoch": 0.8225930680359436,
"grad_norm": 1.3843578100204468,
"learning_rate": 1.547652564348082e-05,
"loss": 2.1694,
"step": 801
},
{
"epoch": 0.823620025673941,
"grad_norm": 1.3804799318313599,
"learning_rate": 1.5302820017975394e-05,
"loss": 2.0706,
"step": 802
},
{
"epoch": 0.8246469833119384,
"grad_norm": 1.2779897451400757,
"learning_rate": 1.5130013919553355e-05,
"loss": 2.2537,
"step": 803
},
{
"epoch": 0.8256739409499358,
"grad_norm": 1.3886756896972656,
"learning_rate": 1.4958109183503243e-05,
"loss": 2.412,
"step": 804
},
{
"epoch": 0.8267008985879333,
"grad_norm": 1.5321778059005737,
"learning_rate": 1.4787107635540732e-05,
"loss": 2.0181,
"step": 805
},
{
"epoch": 0.8277278562259307,
"grad_norm": 1.277282953262329,
"learning_rate": 1.4617011091789135e-05,
"loss": 2.2371,
"step": 806
},
{
"epoch": 0.8287548138639281,
"grad_norm": 1.8173282146453857,
"learning_rate": 1.4447821358760127e-05,
"loss": 2.3445,
"step": 807
},
{
"epoch": 0.8297817715019256,
"grad_norm": 2.2534754276275635,
"learning_rate": 1.4279540233334665e-05,
"loss": 2.3177,
"step": 808
},
{
"epoch": 0.830808729139923,
"grad_norm": 1.2119274139404297,
"learning_rate": 1.4112169502743799e-05,
"loss": 2.0937,
"step": 809
},
{
"epoch": 0.8318356867779204,
"grad_norm": 1.1014364957809448,
"learning_rate": 1.3945710944549706e-05,
"loss": 2.0666,
"step": 810
},
{
"epoch": 0.8328626444159178,
"grad_norm": 1.0724173784255981,
"learning_rate": 1.3780166326626876e-05,
"loss": 2.383,
"step": 811
},
{
"epoch": 0.8338896020539153,
"grad_norm": 1.3168890476226807,
"learning_rate": 1.361553740714323e-05,
"loss": 2.0837,
"step": 812
},
{
"epoch": 0.8349165596919127,
"grad_norm": 1.224159598350525,
"learning_rate": 1.3451825934541551e-05,
"loss": 2.3732,
"step": 813
},
{
"epoch": 0.8359435173299101,
"grad_norm": 1.204418659210205,
"learning_rate": 1.3289033647520877e-05,
"loss": 2.2253,
"step": 814
},
{
"epoch": 0.8369704749679076,
"grad_norm": 1.26673424243927,
"learning_rate": 1.3127162275017957e-05,
"loss": 2.2487,
"step": 815
},
{
"epoch": 0.837997432605905,
"grad_norm": 1.131012201309204,
"learning_rate": 1.2966213536189032e-05,
"loss": 1.9738,
"step": 816
},
{
"epoch": 0.8390243902439024,
"grad_norm": 1.244232177734375,
"learning_rate": 1.2806189140391489e-05,
"loss": 2.1122,
"step": 817
},
{
"epoch": 0.8400513478818998,
"grad_norm": 1.446234941482544,
"learning_rate": 1.2647090787165694e-05,
"loss": 2.3183,
"step": 818
},
{
"epoch": 0.8410783055198973,
"grad_norm": 1.4866114854812622,
"learning_rate": 1.2488920166217032e-05,
"loss": 2.273,
"step": 819
},
{
"epoch": 0.8421052631578947,
"grad_norm": 1.590286374092102,
"learning_rate": 1.2331678957397819e-05,
"loss": 2.2149,
"step": 820
},
{
"epoch": 0.8431322207958921,
"grad_norm": 1.6288813352584839,
"learning_rate": 1.2175368830689593e-05,
"loss": 2.204,
"step": 821
},
{
"epoch": 0.8441591784338897,
"grad_norm": 1.644627571105957,
"learning_rate": 1.2019991446185309e-05,
"loss": 2.3602,
"step": 822
},
{
"epoch": 0.8451861360718871,
"grad_norm": 3.2406749725341797,
"learning_rate": 1.186554845407174e-05,
"loss": 2.2797,
"step": 823
},
{
"epoch": 0.8462130937098845,
"grad_norm": 1.201799988746643,
"learning_rate": 1.1712041494611958e-05,
"loss": 2.2419,
"step": 824
},
{
"epoch": 0.8472400513478819,
"grad_norm": 3.3227691650390625,
"learning_rate": 1.1559472198127818e-05,
"loss": 2.3199,
"step": 825
},
{
"epoch": 0.8482670089858794,
"grad_norm": 1.7569915056228638,
"learning_rate": 1.1407842184982786e-05,
"loss": 2.2096,
"step": 826
},
{
"epoch": 0.8492939666238768,
"grad_norm": 1.4387773275375366,
"learning_rate": 1.125715306556464e-05,
"loss": 2.2704,
"step": 827
},
{
"epoch": 0.8503209242618742,
"grad_norm": 1.222449779510498,
"learning_rate": 1.1107406440268376e-05,
"loss": 2.1525,
"step": 828
},
{
"epoch": 0.8513478818998717,
"grad_norm": 1.3529927730560303,
"learning_rate": 1.0958603899479281e-05,
"loss": 2.3474,
"step": 829
},
{
"epoch": 0.8523748395378691,
"grad_norm": 1.2619078159332275,
"learning_rate": 1.0810747023555878e-05,
"loss": 2.1297,
"step": 830
},
{
"epoch": 0.8534017971758665,
"grad_norm": 1.312843918800354,
"learning_rate": 1.0663837382813336e-05,
"loss": 2.1388,
"step": 831
},
{
"epoch": 0.8544287548138639,
"grad_norm": 2.356374979019165,
"learning_rate": 1.0517876537506687e-05,
"loss": 2.2537,
"step": 832
},
{
"epoch": 0.8554557124518614,
"grad_norm": 1.5887354612350464,
"learning_rate": 1.0372866037814277e-05,
"loss": 2.1951,
"step": 833
},
{
"epoch": 0.8564826700898588,
"grad_norm": 1.3657351732254028,
"learning_rate": 1.0228807423821263e-05,
"loss": 2.3044,
"step": 834
},
{
"epoch": 0.8575096277278562,
"grad_norm": 1.2927043437957764,
"learning_rate": 1.0085702225503313e-05,
"loss": 2.0492,
"step": 835
},
{
"epoch": 0.8585365853658536,
"grad_norm": 1.8013806343078613,
"learning_rate": 9.943551962710362e-06,
"loss": 2.2268,
"step": 836
},
{
"epoch": 0.8595635430038511,
"grad_norm": 1.4299966096878052,
"learning_rate": 9.802358145150425e-06,
"loss": 2.404,
"step": 837
},
{
"epoch": 0.8605905006418485,
"grad_norm": 1.9017884731292725,
"learning_rate": 9.662122272373575e-06,
"loss": 2.2539,
"step": 838
},
{
"epoch": 0.8616174582798459,
"grad_norm": 1.2827292680740356,
"learning_rate": 9.522845833756e-06,
"loss": 2.1102,
"step": 839
},
{
"epoch": 0.8626444159178434,
"grad_norm": 1.567421555519104,
"learning_rate": 9.384530308484273e-06,
"loss": 2.4192,
"step": 840
},
{
"epoch": 0.8636713735558408,
"grad_norm": 1.4824482202529907,
"learning_rate": 9.247177165539556e-06,
"loss": 2.3457,
"step": 841
},
{
"epoch": 0.8646983311938382,
"grad_norm": 2.1519415378570557,
"learning_rate": 9.110787863682002e-06,
"loss": 2.2903,
"step": 842
},
{
"epoch": 0.8657252888318356,
"grad_norm": 1.3354597091674805,
"learning_rate": 8.97536385143527e-06,
"loss": 2.2387,
"step": 843
},
{
"epoch": 0.8667522464698331,
"grad_norm": 1.363411784172058,
"learning_rate": 8.840906567071194e-06,
"loss": 2.3414,
"step": 844
},
{
"epoch": 0.8677792041078306,
"grad_norm": 1.2312959432601929,
"learning_rate": 8.707417438594445e-06,
"loss": 2.1462,
"step": 845
},
{
"epoch": 0.868806161745828,
"grad_norm": 1.4682419300079346,
"learning_rate": 8.574897883727384e-06,
"loss": 2.3364,
"step": 846
},
{
"epoch": 0.8698331193838255,
"grad_norm": 1.9188342094421387,
"learning_rate": 8.443349309895032e-06,
"loss": 2.1837,
"step": 847
},
{
"epoch": 0.8708600770218229,
"grad_norm": 1.4827263355255127,
"learning_rate": 8.312773114210049e-06,
"loss": 2.2226,
"step": 848
},
{
"epoch": 0.8718870346598203,
"grad_norm": 1.3923516273498535,
"learning_rate": 8.183170683457986e-06,
"loss": 2.4293,
"step": 849
},
{
"epoch": 0.8729139922978177,
"grad_norm": 1.3553410768508911,
"learning_rate": 8.054543394082504e-06,
"loss": 2.0766,
"step": 850
},
{
"epoch": 0.8739409499358152,
"grad_norm": 3.3262252807617188,
"learning_rate": 7.926892612170777e-06,
"loss": 2.2739,
"step": 851
},
{
"epoch": 0.8749679075738126,
"grad_norm": 1.1720492839813232,
"learning_rate": 7.800219693438981e-06,
"loss": 2.1775,
"step": 852
},
{
"epoch": 0.87599486521181,
"grad_norm": 1.3928182125091553,
"learning_rate": 7.674525983217828e-06,
"loss": 2.1329,
"step": 853
},
{
"epoch": 0.8770218228498075,
"grad_norm": 2.7582273483276367,
"learning_rate": 7.5498128164383955e-06,
"loss": 2.2974,
"step": 854
},
{
"epoch": 0.8780487804878049,
"grad_norm": 1.4647380113601685,
"learning_rate": 7.426081517617889e-06,
"loss": 2.532,
"step": 855
},
{
"epoch": 0.8790757381258023,
"grad_norm": 1.2066142559051514,
"learning_rate": 7.30333340084558e-06,
"loss": 2.1876,
"step": 856
},
{
"epoch": 0.8801026957637997,
"grad_norm": 1.2079867124557495,
"learning_rate": 7.181569769768792e-06,
"loss": 2.201,
"step": 857
},
{
"epoch": 0.8811296534017972,
"grad_norm": 1.4736223220825195,
"learning_rate": 7.0607919175791796e-06,
"loss": 2.2242,
"step": 858
},
{
"epoch": 0.8821566110397946,
"grad_norm": 1.428174376487732,
"learning_rate": 6.941001126998892e-06,
"loss": 2.3792,
"step": 859
},
{
"epoch": 0.883183568677792,
"grad_norm": 1.695448875427246,
"learning_rate": 6.822198670266988e-06,
"loss": 2.3633,
"step": 860
},
{
"epoch": 0.8842105263157894,
"grad_norm": 1.5708576440811157,
"learning_rate": 6.7043858091259235e-06,
"loss": 2.297,
"step": 861
},
{
"epoch": 0.8852374839537869,
"grad_norm": 1.4929567575454712,
"learning_rate": 6.587563794808127e-06,
"loss": 2.3336,
"step": 862
},
{
"epoch": 0.8862644415917843,
"grad_norm": 1.305184245109558,
"learning_rate": 6.471733868022744e-06,
"loss": 2.4024,
"step": 863
},
{
"epoch": 0.8872913992297817,
"grad_norm": 1.4627037048339844,
"learning_rate": 6.356897258942451e-06,
"loss": 2.1668,
"step": 864
},
{
"epoch": 0.8883183568677792,
"grad_norm": 1.36262845993042,
"learning_rate": 6.243055187190383e-06,
"loss": 2.0239,
"step": 865
},
{
"epoch": 0.8893453145057766,
"grad_norm": 1.3705987930297852,
"learning_rate": 6.130208861827202e-06,
"loss": 2.1388,
"step": 866
},
{
"epoch": 0.8903722721437741,
"grad_norm": 1.4911210536956787,
"learning_rate": 6.018359481338176e-06,
"loss": 2.3255,
"step": 867
},
{
"epoch": 0.8913992297817716,
"grad_norm": 1.2989243268966675,
"learning_rate": 5.907508233620573e-06,
"loss": 2.2913,
"step": 868
},
{
"epoch": 0.892426187419769,
"grad_norm": 1.4845342636108398,
"learning_rate": 5.797656295970955e-06,
"loss": 2.3531,
"step": 869
},
{
"epoch": 0.8934531450577664,
"grad_norm": 1.3806096315383911,
"learning_rate": 5.688804835072748e-06,
"loss": 2.1876,
"step": 870
},
{
"epoch": 0.8944801026957638,
"grad_norm": 1.2086135149002075,
"learning_rate": 5.580955006983735e-06,
"loss": 2.2474,
"step": 871
},
{
"epoch": 0.8955070603337613,
"grad_norm": 1.488832712173462,
"learning_rate": 5.474107957123886e-06,
"loss": 2.2586,
"step": 872
},
{
"epoch": 0.8965340179717587,
"grad_norm": 2.129424571990967,
"learning_rate": 5.3682648202631695e-06,
"loss": 2.2862,
"step": 873
},
{
"epoch": 0.8975609756097561,
"grad_norm": 1.4406683444976807,
"learning_rate": 5.263426720509468e-06,
"loss": 1.9062,
"step": 874
},
{
"epoch": 0.8985879332477535,
"grad_norm": 1.2508238554000854,
"learning_rate": 5.159594771296683e-06,
"loss": 2.0821,
"step": 875
},
{
"epoch": 0.899614890885751,
"grad_norm": 1.6048517227172852,
"learning_rate": 5.056770075372841e-06,
"loss": 2.0023,
"step": 876
},
{
"epoch": 0.9006418485237484,
"grad_norm": 1.4277167320251465,
"learning_rate": 4.954953724788469e-06,
"loss": 2.3978,
"step": 877
},
{
"epoch": 0.9016688061617458,
"grad_norm": 1.236256718635559,
"learning_rate": 4.8541468008849285e-06,
"loss": 2.0407,
"step": 878
},
{
"epoch": 0.9026957637997433,
"grad_norm": 1.405112624168396,
"learning_rate": 4.754350374283001e-06,
"loss": 2.2371,
"step": 879
},
{
"epoch": 0.9037227214377407,
"grad_norm": 1.4664572477340698,
"learning_rate": 4.6555655048713955e-06,
"loss": 2.4453,
"step": 880
},
{
"epoch": 0.9047496790757381,
"grad_norm": 1.3141064643859863,
"learning_rate": 4.5577932417956495e-06,
"loss": 2.1578,
"step": 881
},
{
"epoch": 0.9057766367137355,
"grad_norm": 1.4012268781661987,
"learning_rate": 4.461034623446847e-06,
"loss": 2.4731,
"step": 882
},
{
"epoch": 0.906803594351733,
"grad_norm": 1.1742385625839233,
"learning_rate": 4.3652906774506955e-06,
"loss": 2.1617,
"step": 883
},
{
"epoch": 0.9078305519897304,
"grad_norm": 1.2050607204437256,
"learning_rate": 4.270562420656543e-06,
"loss": 2.2211,
"step": 884
},
{
"epoch": 0.9088575096277278,
"grad_norm": 1.6956837177276611,
"learning_rate": 4.176850859126591e-06,
"loss": 2.3175,
"step": 885
},
{
"epoch": 0.9098844672657252,
"grad_norm": 1.1956061124801636,
"learning_rate": 4.084156988125231e-06,
"loss": 2.2572,
"step": 886
},
{
"epoch": 0.9109114249037227,
"grad_norm": 1.2421371936798096,
"learning_rate": 3.992481792108493e-06,
"loss": 2.0939,
"step": 887
},
{
"epoch": 0.9119383825417201,
"grad_norm": 1.1449053287506104,
"learning_rate": 3.901826244713525e-06,
"loss": 2.0271,
"step": 888
},
{
"epoch": 0.9129653401797175,
"grad_norm": 1.5248461961746216,
"learning_rate": 3.812191308748303e-06,
"loss": 2.3376,
"step": 889
},
{
"epoch": 0.9139922978177151,
"grad_norm": 1.607293725013733,
"learning_rate": 3.723577936181366e-06,
"loss": 2.3735,
"step": 890
},
{
"epoch": 0.9150192554557125,
"grad_norm": 1.536257266998291,
"learning_rate": 3.6359870681317743e-06,
"loss": 2.2889,
"step": 891
},
{
"epoch": 0.9160462130937099,
"grad_norm": 1.1060168743133545,
"learning_rate": 3.5494196348590415e-06,
"loss": 1.8348,
"step": 892
},
{
"epoch": 0.9170731707317074,
"grad_norm": 1.3008726835250854,
"learning_rate": 3.4638765557532983e-06,
"loss": 2.2412,
"step": 893
},
{
"epoch": 0.9181001283697048,
"grad_norm": 1.4789313077926636,
"learning_rate": 3.3793587393255e-06,
"loss": 2.1869,
"step": 894
},
{
"epoch": 0.9191270860077022,
"grad_norm": 1.9860131740570068,
"learning_rate": 3.295867083197801e-06,
"loss": 2.3982,
"step": 895
},
{
"epoch": 0.9201540436456996,
"grad_norm": 1.4276963472366333,
"learning_rate": 3.213402474093996e-06,
"loss": 2.4385,
"step": 896
},
{
"epoch": 0.9211810012836971,
"grad_norm": 1.2232952117919922,
"learning_rate": 3.131965787830149e-06,
"loss": 2.1511,
"step": 897
},
{
"epoch": 0.9222079589216945,
"grad_norm": 1.288060188293457,
"learning_rate": 3.0515578893052344e-06,
"loss": 2.2703,
"step": 898
},
{
"epoch": 0.9232349165596919,
"grad_norm": 2.902099847793579,
"learning_rate": 2.9721796324919893e-06,
"loss": 2.4419,
"step": 899
},
{
"epoch": 0.9242618741976893,
"grad_norm": 1.4718948602676392,
"learning_rate": 2.8938318604278314e-06,
"loss": 2.2219,
"step": 900
},
{
"epoch": 0.9252888318356868,
"grad_norm": 1.2647035121917725,
"learning_rate": 2.8165154052058997e-06,
"loss": 1.9561,
"step": 901
},
{
"epoch": 0.9263157894736842,
"grad_norm": 1.5724482536315918,
"learning_rate": 2.7402310879662497e-06,
"loss": 2.2152,
"step": 902
},
{
"epoch": 0.9273427471116816,
"grad_norm": 1.4992693662643433,
"learning_rate": 2.664979718887073e-06,
"loss": 2.3404,
"step": 903
},
{
"epoch": 0.9283697047496791,
"grad_norm": 1.3174737691879272,
"learning_rate": 2.590762097176136e-06,
"loss": 2.1052,
"step": 904
},
{
"epoch": 0.9293966623876765,
"grad_norm": 1.3780856132507324,
"learning_rate": 2.517579011062299e-06,
"loss": 2.1754,
"step": 905
},
{
"epoch": 0.9304236200256739,
"grad_norm": 1.4942290782928467,
"learning_rate": 2.44543123778711e-06,
"loss": 2.2278,
"step": 906
},
{
"epoch": 0.9314505776636713,
"grad_norm": 1.3182977437973022,
"learning_rate": 2.3743195435966036e-06,
"loss": 2.416,
"step": 907
},
{
"epoch": 0.9324775353016688,
"grad_norm": 1.462436318397522,
"learning_rate": 2.304244683733059e-06,
"loss": 2.4912,
"step": 908
},
{
"epoch": 0.9335044929396662,
"grad_norm": 1.334877610206604,
"learning_rate": 2.2352074024271195e-06,
"loss": 2.1424,
"step": 909
},
{
"epoch": 0.9345314505776636,
"grad_norm": 1.6479849815368652,
"learning_rate": 2.167208432889789e-06,
"loss": 2.1548,
"step": 910
},
{
"epoch": 0.935558408215661,
"grad_norm": 1.6695164442062378,
"learning_rate": 2.1002484973046577e-06,
"loss": 2.2078,
"step": 911
},
{
"epoch": 0.9365853658536586,
"grad_norm": 1.5034846067428589,
"learning_rate": 2.034328306820288e-06,
"loss": 2.3183,
"step": 912
},
{
"epoch": 0.937612323491656,
"grad_norm": 1.3325846195220947,
"learning_rate": 1.969448561542553e-06,
"loss": 2.1897,
"step": 913
},
{
"epoch": 0.9386392811296534,
"grad_norm": 1.5285733938217163,
"learning_rate": 1.9056099505273427e-06,
"loss": 2.4954,
"step": 914
},
{
"epoch": 0.9396662387676509,
"grad_norm": 1.4107942581176758,
"learning_rate": 1.8428131517731373e-06,
"loss": 2.4118,
"step": 915
},
{
"epoch": 0.9406931964056483,
"grad_norm": 1.5880571603775024,
"learning_rate": 1.7810588322138222e-06,
"loss": 2.1729,
"step": 916
},
{
"epoch": 0.9417201540436457,
"grad_norm": 1.3712589740753174,
"learning_rate": 1.7203476477116843e-06,
"loss": 2.0461,
"step": 917
},
{
"epoch": 0.9427471116816432,
"grad_norm": 1.2501283884048462,
"learning_rate": 1.6606802430503166e-06,
"loss": 2.2321,
"step": 918
},
{
"epoch": 0.9437740693196406,
"grad_norm": 1.9402281045913696,
"learning_rate": 1.6020572519278908e-06,
"loss": 2.2923,
"step": 919
},
{
"epoch": 0.944801026957638,
"grad_norm": 1.619081735610962,
"learning_rate": 1.5444792969503407e-06,
"loss": 2.4231,
"step": 920
},
{
"epoch": 0.9458279845956354,
"grad_norm": 1.4488154649734497,
"learning_rate": 1.487946989624811e-06,
"loss": 2.3659,
"step": 921
},
{
"epoch": 0.9468549422336329,
"grad_norm": 1.4263108968734741,
"learning_rate": 1.43246093035313e-06,
"loss": 2.2714,
"step": 922
},
{
"epoch": 0.9478818998716303,
"grad_norm": 1.1710644960403442,
"learning_rate": 1.3780217084254366e-06,
"loss": 2.128,
"step": 923
},
{
"epoch": 0.9489088575096277,
"grad_norm": 1.3074594736099243,
"learning_rate": 1.3246299020139185e-06,
"loss": 2.0772,
"step": 924
},
{
"epoch": 0.9499358151476252,
"grad_norm": 1.3507771492004395,
"learning_rate": 1.2722860781666956e-06,
"loss": 2.2176,
"step": 925
},
{
"epoch": 0.9509627727856226,
"grad_norm": 1.6228179931640625,
"learning_rate": 1.2209907928017795e-06,
"loss": 2.3079,
"step": 926
},
{
"epoch": 0.95198973042362,
"grad_norm": 1.375373125076294,
"learning_rate": 1.1707445907011339e-06,
"loss": 2.1246,
"step": 927
},
{
"epoch": 0.9530166880616174,
"grad_norm": 1.8349575996398926,
"learning_rate": 1.1215480055049798e-06,
"loss": 2.2632,
"step": 928
},
{
"epoch": 0.9540436456996149,
"grad_norm": 1.3294469118118286,
"learning_rate": 1.073401559706022e-06,
"loss": 2.1936,
"step": 929
},
{
"epoch": 0.9550706033376123,
"grad_norm": 1.4897345304489136,
"learning_rate": 1.0263057646440199e-06,
"loss": 2.2939,
"step": 930
},
{
"epoch": 0.9560975609756097,
"grad_norm": 1.434954047203064,
"learning_rate": 9.802611205002032e-07,
"loss": 2.1473,
"step": 931
},
{
"epoch": 0.9571245186136071,
"grad_norm": 1.6770613193511963,
"learning_rate": 9.352681162920984e-07,
"loss": 2.2041,
"step": 932
},
{
"epoch": 0.9581514762516046,
"grad_norm": 1.771236538887024,
"learning_rate": 8.913272298682773e-07,
"loss": 2.1005,
"step": 933
},
{
"epoch": 0.959178433889602,
"grad_norm": 1.6060534715652466,
"learning_rate": 8.484389279032834e-07,
"loss": 2.3045,
"step": 934
},
{
"epoch": 0.9602053915275995,
"grad_norm": 1.5178483724594116,
"learning_rate": 8.066036658926579e-07,
"loss": 2.2922,
"step": 935
},
{
"epoch": 0.961232349165597,
"grad_norm": 1.6939451694488525,
"learning_rate": 7.658218881481439e-07,
"loss": 2.2083,
"step": 936
},
{
"epoch": 0.9622593068035944,
"grad_norm": 1.2474678754806519,
"learning_rate": 7.260940277929451e-07,
"loss": 2.1818,
"step": 937
},
{
"epoch": 0.9632862644415918,
"grad_norm": 1.459787368774414,
"learning_rate": 6.874205067571083e-07,
"loss": 2.3195,
"step": 938
},
{
"epoch": 0.9643132220795892,
"grad_norm": 1.278882384300232,
"learning_rate": 6.498017357731034e-07,
"loss": 2.1238,
"step": 939
},
{
"epoch": 0.9653401797175867,
"grad_norm": 1.3337883949279785,
"learning_rate": 6.132381143713728e-07,
"loss": 2.1721,
"step": 940
},
{
"epoch": 0.9663671373555841,
"grad_norm": 1.6333835124969482,
"learning_rate": 5.777300308761446e-07,
"loss": 2.2194,
"step": 941
},
{
"epoch": 0.9673940949935815,
"grad_norm": 1.384678602218628,
"learning_rate": 5.432778624013257e-07,
"loss": 2.4414,
"step": 942
},
{
"epoch": 0.968421052631579,
"grad_norm": 1.3769394159317017,
"learning_rate": 5.098819748464378e-07,
"loss": 2.3543,
"step": 943
},
{
"epoch": 0.9694480102695764,
"grad_norm": 1.353691577911377,
"learning_rate": 4.775427228927765e-07,
"loss": 2.1787,
"step": 944
},
{
"epoch": 0.9704749679075738,
"grad_norm": 1.647281289100647,
"learning_rate": 4.462604499996248e-07,
"loss": 2.2027,
"step": 945
},
{
"epoch": 0.9715019255455712,
"grad_norm": 1.2839142084121704,
"learning_rate": 4.1603548840062345e-07,
"loss": 2.1197,
"step": 946
},
{
"epoch": 0.9725288831835687,
"grad_norm": 1.1656153202056885,
"learning_rate": 3.8686815910021767e-07,
"loss": 2.1826,
"step": 947
},
{
"epoch": 0.9735558408215661,
"grad_norm": 1.508293628692627,
"learning_rate": 3.5875877187024896e-07,
"loss": 2.4654,
"step": 948
},
{
"epoch": 0.9745827984595635,
"grad_norm": 1.6085854768753052,
"learning_rate": 3.317076252467133e-07,
"loss": 2.2999,
"step": 949
},
{
"epoch": 0.975609756097561,
"grad_norm": 1.520034670829773,
"learning_rate": 3.0571500652651907e-07,
"loss": 1.7566,
"step": 950
},
{
"epoch": 0.9766367137355584,
"grad_norm": 1.2050639390945435,
"learning_rate": 2.807811917644898e-07,
"loss": 2.2555,
"step": 951
},
{
"epoch": 0.9776636713735558,
"grad_norm": 1.7655271291732788,
"learning_rate": 2.5690644577039956e-07,
"loss": 2.2677,
"step": 952
},
{
"epoch": 0.9786906290115532,
"grad_norm": 1.795967698097229,
"learning_rate": 2.340910221061754e-07,
"loss": 2.399,
"step": 953
},
{
"epoch": 0.9797175866495507,
"grad_norm": 1.9675099849700928,
"learning_rate": 2.1233516308323264e-07,
"loss": 2.2401,
"step": 954
},
{
"epoch": 0.9807445442875481,
"grad_norm": 1.5977956056594849,
"learning_rate": 1.9163909975982164e-07,
"loss": 1.8277,
"step": 955
},
{
"epoch": 0.9817715019255455,
"grad_norm": 1.4226264953613281,
"learning_rate": 1.7200305193866284e-07,
"loss": 2.127,
"step": 956
},
{
"epoch": 0.9827984595635431,
"grad_norm": 1.2429808378219604,
"learning_rate": 1.534272281645488e-07,
"loss": 2.3365,
"step": 957
},
{
"epoch": 0.9838254172015405,
"grad_norm": 1.4925501346588135,
"learning_rate": 1.359118257221903e-07,
"loss": 2.2891,
"step": 958
},
{
"epoch": 0.9848523748395379,
"grad_norm": 1.5961631536483765,
"learning_rate": 1.1945703063402924e-07,
"loss": 2.2596,
"step": 959
},
{
"epoch": 0.9858793324775353,
"grad_norm": 1.4775968790054321,
"learning_rate": 1.0406301765837345e-07,
"loss": 2.2753,
"step": 960
},
{
"epoch": 0.9869062901155328,
"grad_norm": 1.4262019395828247,
"learning_rate": 8.972995028745379e-08,
"loss": 2.2151,
"step": 961
},
{
"epoch": 0.9879332477535302,
"grad_norm": 1.8904521465301514,
"learning_rate": 7.645798074572552e-08,
"loss": 2.3732,
"step": 962
},
{
"epoch": 0.9889602053915276,
"grad_norm": 1.2109746932983398,
"learning_rate": 6.424724998825848e-08,
"loss": 2.2444,
"step": 963
},
{
"epoch": 0.989987163029525,
"grad_norm": 1.230819582939148,
"learning_rate": 5.3097887699193885e-08,
"loss": 2.1996,
"step": 964
},
{
"epoch": 0.9910141206675225,
"grad_norm": 1.2803691625595093,
"learning_rate": 4.3010012290445324e-08,
"loss": 2.3936,
"step": 965
},
{
"epoch": 0.9920410783055199,
"grad_norm": 1.3575743436813354,
"learning_rate": 3.3983730900377655e-08,
"loss": 2.3142,
"step": 966
},
{
"epoch": 0.9930680359435173,
"grad_norm": 1.3123668432235718,
"learning_rate": 2.601913939266343e-08,
"loss": 2.158,
"step": 967
},
{
"epoch": 0.9940949935815148,
"grad_norm": 1.5591670274734497,
"learning_rate": 1.9116322355339222e-08,
"loss": 2.1941,
"step": 968
},
{
"epoch": 0.9951219512195122,
"grad_norm": 1.1492890119552612,
"learning_rate": 1.3275353099795329e-08,
"loss": 1.9275,
"step": 969
},
{
"epoch": 0.9961489088575096,
"grad_norm": 1.3780685663223267,
"learning_rate": 8.496293660120724e-09,
"loss": 2.2059,
"step": 970
},
{
"epoch": 0.997175866495507,
"grad_norm": 1.3340226411819458,
"learning_rate": 4.779194792348119e-09,
"loss": 2.2606,
"step": 971
},
{
"epoch": 0.9982028241335045,
"grad_norm": 1.5144405364990234,
"learning_rate": 2.124095973954354e-09,
"loss": 2.181,
"step": 972
},
{
"epoch": 0.9992297817715019,
"grad_norm": 1.2708326578140259,
"learning_rate": 5.310254034274209e-10,
"loss": 1.9886,
"step": 973
},
{
"epoch": 1.0005134788189987,
"grad_norm": 2.398379325866699,
"learning_rate": 0.0,
"loss": 3.4023,
"step": 974
}
],
"logging_steps": 1,
"max_steps": 974,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 244,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.7208949874753536e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}