{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6848, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.0004999993423087359, "loss": 1.3569, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.240234375, "learning_rate": 0.000499997369238404, "loss": 1.4018, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.28125, "learning_rate": 0.0004999940807993858, "loss": 1.1722, "step": 15 }, { "epoch": 0.0, "grad_norm": 0.2099609375, "learning_rate": 0.0004999894770089834, "loss": 1.2406, "step": 20 }, { "epoch": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0004999835578914198, "loss": 1.1451, "step": 25 }, { "epoch": 0.0, "grad_norm": 0.14453125, "learning_rate": 0.0004999763234778388, "loss": 1.2433, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.138671875, "learning_rate": 0.0004999677738063041, "loss": 0.9514, "step": 35 }, { "epoch": 0.01, "grad_norm": 0.12890625, "learning_rate": 0.0004999579089218003, "loss": 1.1503, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.11328125, "learning_rate": 0.0004999467288762319, "loss": 1.0971, "step": 45 }, { "epoch": 0.01, "grad_norm": 0.140625, "learning_rate": 0.0004999342337284227, "loss": 1.1548, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.134765625, "learning_rate": 0.0004999204235441167, "loss": 1.0912, "step": 55 }, { "epoch": 0.01, "grad_norm": 0.12060546875, "learning_rate": 0.0004999052983959764, "loss": 1.0526, "step": 60 }, { "epoch": 0.01, "grad_norm": 0.111328125, "learning_rate": 0.0004998888583635831, "loss": 0.9175, "step": 65 }, { "epoch": 0.01, "grad_norm": 0.14453125, "learning_rate": 0.0004998711035334366, "loss": 1.1837, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.130859375, "learning_rate": 0.0004998520339989548, "loss": 1.1483, "step": 75 }, { "epoch": 0.01, "grad_norm": 0.130859375, "learning_rate": 0.0004998316498604721, "loss": 1.1718, "step": 80 }, { "epoch": 0.01, "grad_norm": 0.115234375, "learning_rate": 0.0004998099512252406, "loss": 1.1358, "step": 85 }, { "epoch": 0.01, "grad_norm": 0.1455078125, "learning_rate": 0.0004997869382074284, "loss": 1.0682, "step": 90 }, { "epoch": 0.01, "grad_norm": 0.1484375, "learning_rate": 0.0004997626109281188, "loss": 1.202, "step": 95 }, { "epoch": 0.01, "grad_norm": 0.1171875, "learning_rate": 0.0004997369695153109, "loss": 1.022, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.1123046875, "learning_rate": 0.0004997100141039175, "loss": 1.1212, "step": 105 }, { "epoch": 0.02, "grad_norm": 0.1298828125, "learning_rate": 0.0004996817448357654, "loss": 0.9973, "step": 110 }, { "epoch": 0.02, "grad_norm": 0.138671875, "learning_rate": 0.0004996521618595943, "loss": 1.1896, "step": 115 }, { "epoch": 0.02, "grad_norm": 0.1494140625, "learning_rate": 0.0004996212653310557, "loss": 1.2285, "step": 120 }, { "epoch": 0.02, "grad_norm": 0.1337890625, "learning_rate": 0.0004995890554127129, "loss": 1.0276, "step": 125 }, { "epoch": 0.02, "grad_norm": 0.181640625, "learning_rate": 0.0004995555322740391, "loss": 1.0489, "step": 130 }, { "epoch": 0.02, "grad_norm": 0.1083984375, "learning_rate": 0.0004995206960914175, "loss": 1.1461, "step": 135 }, { "epoch": 0.02, "grad_norm": 0.12109375, "learning_rate": 0.0004994845470481397, "loss": 1.1738, "step": 140 }, { "epoch": 0.02, "grad_norm": 0.1728515625, "learning_rate": 0.0004994470853344047, "loss": 1.1, "step": 145 }, { "epoch": 0.02, "grad_norm": 0.11376953125, "learning_rate": 0.0004994083111473188, "loss": 1.0523, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.099609375, "learning_rate": 0.0004993682246908933, "loss": 1.1993, "step": 155 }, { "epoch": 0.02, "grad_norm": 0.1455078125, "learning_rate": 0.0004993268261760445, "loss": 1.2094, "step": 160 }, { "epoch": 0.02, "grad_norm": 0.1396484375, "learning_rate": 0.0004992841158205917, "loss": 0.9554, "step": 165 }, { "epoch": 0.02, "grad_norm": 0.1904296875, "learning_rate": 0.0004992400938492571, "loss": 1.3459, "step": 170 }, { "epoch": 0.03, "grad_norm": 0.1630859375, "learning_rate": 0.0004991947604936632, "loss": 1.0545, "step": 175 }, { "epoch": 0.03, "grad_norm": 0.1650390625, "learning_rate": 0.000499148115992333, "loss": 1.0093, "step": 180 }, { "epoch": 0.03, "grad_norm": 0.119140625, "learning_rate": 0.0004991001605906879, "loss": 1.0769, "step": 185 }, { "epoch": 0.03, "grad_norm": 0.1396484375, "learning_rate": 0.0004990508945410469, "loss": 1.2767, "step": 190 }, { "epoch": 0.03, "grad_norm": 0.154296875, "learning_rate": 0.0004990003181026245, "loss": 0.9031, "step": 195 }, { "epoch": 0.03, "grad_norm": 0.162109375, "learning_rate": 0.0004989484315415304, "loss": 1.1937, "step": 200 }, { "epoch": 0.03, "grad_norm": 0.130859375, "learning_rate": 0.000498895235130767, "loss": 1.0573, "step": 205 }, { "epoch": 0.03, "grad_norm": 0.1630859375, "learning_rate": 0.0004988407291502293, "loss": 1.0702, "step": 210 }, { "epoch": 0.03, "grad_norm": 0.1181640625, "learning_rate": 0.0004987849138867016, "loss": 0.9613, "step": 215 }, { "epoch": 0.03, "grad_norm": 0.12255859375, "learning_rate": 0.0004987277896338581, "loss": 1.1099, "step": 220 }, { "epoch": 0.03, "grad_norm": 0.11572265625, "learning_rate": 0.0004986693566922595, "loss": 1.0443, "step": 225 }, { "epoch": 0.03, "grad_norm": 0.138671875, "learning_rate": 0.0004986096153693524, "loss": 1.0332, "step": 230 }, { "epoch": 0.03, "grad_norm": 0.1337890625, "learning_rate": 0.0004985485659794676, "loss": 0.9495, "step": 235 }, { "epoch": 0.04, "grad_norm": 0.126953125, "learning_rate": 0.0004984862088438187, "loss": 0.9367, "step": 240 }, { "epoch": 0.04, "grad_norm": 0.11083984375, "learning_rate": 0.0004984225442904991, "loss": 1.0334, "step": 245 }, { "epoch": 0.04, "grad_norm": 0.1328125, "learning_rate": 0.000498357572654482, "loss": 1.042, "step": 250 }, { "epoch": 0.04, "grad_norm": 0.134765625, "learning_rate": 0.0004982912942776178, "loss": 1.0528, "step": 255 }, { "epoch": 0.04, "grad_norm": 0.1396484375, "learning_rate": 0.0004982237095086319, "loss": 0.9818, "step": 260 }, { "epoch": 0.04, "grad_norm": 0.169921875, "learning_rate": 0.0004981548187031236, "loss": 1.2563, "step": 265 }, { "epoch": 0.04, "grad_norm": 0.158203125, "learning_rate": 0.0004980846222235642, "loss": 1.2442, "step": 270 }, { "epoch": 0.04, "grad_norm": 0.12451171875, "learning_rate": 0.0004980131204392943, "loss": 1.1043, "step": 275 }, { "epoch": 0.04, "grad_norm": 0.1328125, "learning_rate": 0.0004979403137265229, "loss": 1.0199, "step": 280 }, { "epoch": 0.04, "grad_norm": 0.11669921875, "learning_rate": 0.0004978662024683247, "loss": 1.0435, "step": 285 }, { "epoch": 0.04, "grad_norm": 0.12255859375, "learning_rate": 0.0004977907870546382, "loss": 0.9776, "step": 290 }, { "epoch": 0.04, "grad_norm": 0.1337890625, "learning_rate": 0.000497714067882264, "loss": 1.2056, "step": 295 }, { "epoch": 0.04, "grad_norm": 0.142578125, "learning_rate": 0.0004976360453548621, "loss": 1.0263, "step": 300 }, { "epoch": 0.04, "grad_norm": 0.1376953125, "learning_rate": 0.0004975567198829506, "loss": 1.1623, "step": 305 }, { "epoch": 0.05, "grad_norm": 0.1533203125, "learning_rate": 0.0004974760918839028, "loss": 1.1521, "step": 310 }, { "epoch": 0.05, "grad_norm": 0.1396484375, "learning_rate": 0.0004973941617819453, "loss": 0.9725, "step": 315 }, { "epoch": 0.05, "grad_norm": 0.1181640625, "learning_rate": 0.0004973109300081559, "loss": 0.9627, "step": 320 }, { "epoch": 0.05, "grad_norm": 0.1435546875, "learning_rate": 0.0004972263970004611, "loss": 1.224, "step": 325 }, { "epoch": 0.05, "grad_norm": 0.16796875, "learning_rate": 0.0004971405632036337, "loss": 1.1421, "step": 330 }, { "epoch": 0.05, "grad_norm": 0.189453125, "learning_rate": 0.0004970534290692909, "loss": 1.2515, "step": 335 }, { "epoch": 0.05, "grad_norm": 0.1591796875, "learning_rate": 0.0004969649950558915, "loss": 1.2142, "step": 340 }, { "epoch": 0.05, "grad_norm": 0.1328125, "learning_rate": 0.0004968752616287339, "loss": 1.0142, "step": 345 }, { "epoch": 0.05, "grad_norm": 0.1142578125, "learning_rate": 0.0004967842292599532, "loss": 1.352, "step": 350 }, { "epoch": 0.05, "grad_norm": 0.146484375, "learning_rate": 0.0004966918984285187, "loss": 1.3625, "step": 355 }, { "epoch": 0.05, "grad_norm": 0.1337890625, "learning_rate": 0.0004965982696202321, "loss": 1.1559, "step": 360 }, { "epoch": 0.05, "grad_norm": 0.1650390625, "learning_rate": 0.000496503343327724, "loss": 1.2461, "step": 365 }, { "epoch": 0.05, "grad_norm": 0.1142578125, "learning_rate": 0.0004964071200504521, "loss": 0.9961, "step": 370 }, { "epoch": 0.05, "grad_norm": 0.115234375, "learning_rate": 0.0004963096002946982, "loss": 0.9184, "step": 375 }, { "epoch": 0.06, "grad_norm": 0.1796875, "learning_rate": 0.0004962107845735652, "loss": 1.1593, "step": 380 }, { "epoch": 0.06, "grad_norm": 0.10986328125, "learning_rate": 0.0004961106734069751, "loss": 0.9015, "step": 385 }, { "epoch": 0.06, "grad_norm": 0.16015625, "learning_rate": 0.0004960092673216656, "loss": 0.9922, "step": 390 }, { "epoch": 0.06, "grad_norm": 0.134765625, "learning_rate": 0.0004959065668511881, "loss": 1.223, "step": 395 }, { "epoch": 0.06, "grad_norm": 0.1298828125, "learning_rate": 0.0004958025725359043, "loss": 0.9188, "step": 400 }, { "epoch": 0.06, "grad_norm": 0.1728515625, "learning_rate": 0.0004956972849229831, "loss": 1.1122, "step": 405 }, { "epoch": 0.06, "grad_norm": 0.11962890625, "learning_rate": 0.0004955907045663988, "loss": 1.2144, "step": 410 }, { "epoch": 0.06, "grad_norm": 0.15234375, "learning_rate": 0.0004954828320269268, "loss": 0.8657, "step": 415 }, { "epoch": 0.06, "grad_norm": 0.154296875, "learning_rate": 0.000495373667872142, "loss": 1.0499, "step": 420 }, { "epoch": 0.06, "grad_norm": 0.1728515625, "learning_rate": 0.0004952632126764148, "loss": 1.2801, "step": 425 }, { "epoch": 0.06, "grad_norm": 0.1484375, "learning_rate": 0.0004951514670209086, "loss": 1.0664, "step": 430 }, { "epoch": 0.06, "grad_norm": 0.1328125, "learning_rate": 0.0004950384314935764, "loss": 0.9355, "step": 435 }, { "epoch": 0.06, "grad_norm": 0.1259765625, "learning_rate": 0.000494924106689158, "loss": 1.0729, "step": 440 }, { "epoch": 0.06, "grad_norm": 0.13671875, "learning_rate": 0.000494808493209177, "loss": 0.8503, "step": 445 }, { "epoch": 0.07, "grad_norm": 0.279296875, "learning_rate": 0.0004946915916619371, "loss": 1.0489, "step": 450 }, { "epoch": 0.07, "grad_norm": 0.1259765625, "learning_rate": 0.0004945734026625193, "loss": 0.9458, "step": 455 }, { "epoch": 0.07, "grad_norm": 0.173828125, "learning_rate": 0.0004944539268327786, "loss": 1.1663, "step": 460 }, { "epoch": 0.07, "grad_norm": 0.142578125, "learning_rate": 0.0004943331648013406, "loss": 1.0852, "step": 465 }, { "epoch": 0.07, "grad_norm": 0.123046875, "learning_rate": 0.0004942111172035984, "loss": 1.0142, "step": 470 }, { "epoch": 0.07, "grad_norm": 0.119140625, "learning_rate": 0.0004940877846817092, "loss": 0.9625, "step": 475 }, { "epoch": 0.07, "grad_norm": 0.169921875, "learning_rate": 0.0004939631678845907, "loss": 1.1333, "step": 480 }, { "epoch": 0.07, "grad_norm": 0.150390625, "learning_rate": 0.000493837267467918, "loss": 0.851, "step": 485 }, { "epoch": 0.07, "grad_norm": 0.1259765625, "learning_rate": 0.0004937100840941199, "loss": 1.0439, "step": 490 }, { "epoch": 0.07, "grad_norm": 0.1298828125, "learning_rate": 0.0004935816184323754, "loss": 1.1395, "step": 495 }, { "epoch": 0.07, "grad_norm": 0.1708984375, "learning_rate": 0.0004934518711586106, "loss": 1.2047, "step": 500 }, { "epoch": 0.07, "grad_norm": 0.1484375, "learning_rate": 0.0004933208429554948, "loss": 1.006, "step": 505 }, { "epoch": 0.07, "grad_norm": 0.240234375, "learning_rate": 0.0004931885345124366, "loss": 0.8703, "step": 510 }, { "epoch": 0.08, "grad_norm": 0.2001953125, "learning_rate": 0.000493054946525581, "loss": 1.0765, "step": 515 }, { "epoch": 0.08, "grad_norm": 0.208984375, "learning_rate": 0.0004929200796978052, "loss": 1.1185, "step": 520 }, { "epoch": 0.08, "grad_norm": 0.12451171875, "learning_rate": 0.000492783934738715, "loss": 1.0236, "step": 525 }, { "epoch": 0.08, "grad_norm": 0.1455078125, "learning_rate": 0.0004926465123646413, "loss": 1.0684, "step": 530 }, { "epoch": 0.08, "grad_norm": 0.150390625, "learning_rate": 0.000492507813298636, "loss": 0.9493, "step": 535 }, { "epoch": 0.08, "grad_norm": 0.1259765625, "learning_rate": 0.0004923678382704684, "loss": 0.9562, "step": 540 }, { "epoch": 0.08, "grad_norm": 0.25, "learning_rate": 0.0004922265880166214, "loss": 1.2393, "step": 545 }, { "epoch": 0.08, "grad_norm": 0.14453125, "learning_rate": 0.0004920840632802874, "loss": 1.2773, "step": 550 }, { "epoch": 0.08, "grad_norm": 0.1533203125, "learning_rate": 0.0004919402648113646, "loss": 0.9897, "step": 555 }, { "epoch": 0.08, "grad_norm": 0.15234375, "learning_rate": 0.000491795193366453, "loss": 1.0711, "step": 560 }, { "epoch": 0.08, "grad_norm": 0.11962890625, "learning_rate": 0.0004916488497088504, "loss": 1.1958, "step": 565 }, { "epoch": 0.08, "grad_norm": 0.1455078125, "learning_rate": 0.0004915012346085483, "loss": 1.2852, "step": 570 }, { "epoch": 0.08, "grad_norm": 0.142578125, "learning_rate": 0.0004913523488422279, "loss": 1.2695, "step": 575 }, { "epoch": 0.08, "grad_norm": 0.146484375, "learning_rate": 0.0004912021931932564, "loss": 1.1283, "step": 580 }, { "epoch": 0.09, "grad_norm": 0.1455078125, "learning_rate": 0.0004910507684516821, "loss": 0.8647, "step": 585 }, { "epoch": 0.09, "grad_norm": 0.1650390625, "learning_rate": 0.0004908980754142308, "loss": 0.958, "step": 590 }, { "epoch": 0.09, "grad_norm": 0.1650390625, "learning_rate": 0.0004907441148843016, "loss": 0.977, "step": 595 }, { "epoch": 0.09, "grad_norm": 0.158203125, "learning_rate": 0.0004905888876719624, "loss": 1.2838, "step": 600 }, { "epoch": 0.09, "grad_norm": 0.14453125, "learning_rate": 0.000490432394593946, "loss": 1.0568, "step": 605 }, { "epoch": 0.09, "grad_norm": 0.119140625, "learning_rate": 0.0004902746364736452, "loss": 1.1851, "step": 610 }, { "epoch": 0.09, "grad_norm": 0.154296875, "learning_rate": 0.0004901156141411094, "loss": 1.3584, "step": 615 }, { "epoch": 0.09, "grad_norm": 0.134765625, "learning_rate": 0.0004899553284330392, "loss": 1.0006, "step": 620 }, { "epoch": 0.09, "grad_norm": 0.1201171875, "learning_rate": 0.0004897937801927826, "loss": 1.1368, "step": 625 }, { "epoch": 0.09, "grad_norm": 0.138671875, "learning_rate": 0.0004896309702703308, "loss": 1.1571, "step": 630 }, { "epoch": 0.09, "grad_norm": 0.1357421875, "learning_rate": 0.0004894668995223129, "loss": 1.0559, "step": 635 }, { "epoch": 0.09, "grad_norm": 0.1416015625, "learning_rate": 0.000489301568811992, "loss": 1.0644, "step": 640 }, { "epoch": 0.09, "grad_norm": 0.1474609375, "learning_rate": 0.0004891349790092609, "loss": 1.1105, "step": 645 }, { "epoch": 0.09, "grad_norm": 0.1416015625, "learning_rate": 0.0004889671309906366, "loss": 1.3242, "step": 650 }, { "epoch": 0.1, "grad_norm": 0.1650390625, "learning_rate": 0.0004887980256392567, "loss": 1.1187, "step": 655 }, { "epoch": 0.1, "grad_norm": 0.1376953125, "learning_rate": 0.000488627663844874, "loss": 1.0096, "step": 660 }, { "epoch": 0.1, "grad_norm": 0.1728515625, "learning_rate": 0.0004884560465038523, "loss": 1.0949, "step": 665 }, { "epoch": 0.1, "grad_norm": 0.146484375, "learning_rate": 0.0004882831745191611, "loss": 1.0701, "step": 670 }, { "epoch": 0.1, "grad_norm": 0.1337890625, "learning_rate": 0.00048810904880037186, "loss": 1.141, "step": 675 }, { "epoch": 0.1, "grad_norm": 0.404296875, "learning_rate": 0.0004879336702636523, "loss": 1.0199, "step": 680 }, { "epoch": 0.1, "grad_norm": 0.1435546875, "learning_rate": 0.00048775703983176176, "loss": 1.0905, "step": 685 }, { "epoch": 0.1, "grad_norm": 0.15234375, "learning_rate": 0.00048757915843404656, "loss": 1.1913, "step": 690 }, { "epoch": 0.1, "grad_norm": 0.140625, "learning_rate": 0.0004874000270064351, "loss": 1.118, "step": 695 }, { "epoch": 0.1, "grad_norm": 0.125, "learning_rate": 0.0004872196464914328, "loss": 0.9663, "step": 700 }, { "epoch": 0.1, "grad_norm": 0.15234375, "learning_rate": 0.00048703801783811707, "loss": 0.9864, "step": 705 }, { "epoch": 0.1, "grad_norm": 0.2041015625, "learning_rate": 0.00048685514200213256, "loss": 1.2736, "step": 710 }, { "epoch": 0.1, "grad_norm": 0.1484375, "learning_rate": 0.0004866710199456861, "loss": 0.9167, "step": 715 }, { "epoch": 0.11, "grad_norm": 0.1552734375, "learning_rate": 0.0004864856526375413, "loss": 1.1975, "step": 720 }, { "epoch": 0.11, "grad_norm": 0.1494140625, "learning_rate": 0.00048629904105301395, "loss": 1.0447, "step": 725 }, { "epoch": 0.11, "grad_norm": 0.1318359375, "learning_rate": 0.0004861111861739664, "loss": 1.0386, "step": 730 }, { "epoch": 0.11, "grad_norm": 0.150390625, "learning_rate": 0.0004859220889888028, "loss": 1.1379, "step": 735 }, { "epoch": 0.11, "grad_norm": 0.1279296875, "learning_rate": 0.0004857317504924638, "loss": 0.957, "step": 740 }, { "epoch": 0.11, "grad_norm": 0.1533203125, "learning_rate": 0.0004855401716864209, "loss": 1.1209, "step": 745 }, { "epoch": 0.11, "grad_norm": 0.134765625, "learning_rate": 0.000485347353578672, "loss": 0.9555, "step": 750 }, { "epoch": 0.11, "grad_norm": 0.1611328125, "learning_rate": 0.00048515329718373514, "loss": 1.3033, "step": 755 }, { "epoch": 0.11, "grad_norm": 0.1376953125, "learning_rate": 0.00048495800352264405, "loss": 0.9557, "step": 760 }, { "epoch": 0.11, "grad_norm": 0.130859375, "learning_rate": 0.00048476147362294225, "loss": 1.2029, "step": 765 }, { "epoch": 0.11, "grad_norm": 0.126953125, "learning_rate": 0.00048456370851867757, "loss": 1.1479, "step": 770 }, { "epoch": 0.11, "grad_norm": 0.138671875, "learning_rate": 0.0004843647092503972, "loss": 0.9966, "step": 775 }, { "epoch": 0.11, "grad_norm": 0.16796875, "learning_rate": 0.0004841644768651417, "loss": 0.9547, "step": 780 }, { "epoch": 0.11, "grad_norm": 0.123046875, "learning_rate": 0.0004839630124164398, "loss": 1.0046, "step": 785 }, { "epoch": 0.12, "grad_norm": 0.1533203125, "learning_rate": 0.0004837603169643028, "loss": 1.0157, "step": 790 }, { "epoch": 0.12, "grad_norm": 0.2265625, "learning_rate": 0.00048355639157521887, "loss": 1.1056, "step": 795 }, { "epoch": 0.12, "grad_norm": 0.1689453125, "learning_rate": 0.0004833512373221476, "loss": 1.3283, "step": 800 }, { "epoch": 0.12, "grad_norm": 0.2490234375, "learning_rate": 0.00048314485528451435, "loss": 1.0102, "step": 805 }, { "epoch": 0.12, "grad_norm": 0.12255859375, "learning_rate": 0.00048293724654820434, "loss": 0.9784, "step": 810 }, { "epoch": 0.12, "grad_norm": 0.1416015625, "learning_rate": 0.0004827284122055572, "loss": 1.0426, "step": 815 }, { "epoch": 0.12, "grad_norm": 0.1650390625, "learning_rate": 0.0004825183533553611, "loss": 1.0352, "step": 820 }, { "epoch": 0.12, "grad_norm": 0.1181640625, "learning_rate": 0.00048230707110284704, "loss": 1.1475, "step": 825 }, { "epoch": 0.12, "grad_norm": 0.1337890625, "learning_rate": 0.000482094566559683, "loss": 0.9966, "step": 830 }, { "epoch": 0.12, "grad_norm": 0.1416015625, "learning_rate": 0.00048188084084396787, "loss": 1.261, "step": 835 }, { "epoch": 0.12, "grad_norm": 0.1943359375, "learning_rate": 0.0004816658950802262, "loss": 1.0961, "step": 840 }, { "epoch": 0.12, "grad_norm": 0.185546875, "learning_rate": 0.00048144973039940134, "loss": 1.1438, "step": 845 }, { "epoch": 0.12, "grad_norm": 0.146484375, "learning_rate": 0.00048123234793885037, "loss": 1.1274, "step": 850 }, { "epoch": 0.12, "grad_norm": 0.12060546875, "learning_rate": 0.0004810137488423376, "loss": 1.0945, "step": 855 }, { "epoch": 0.13, "grad_norm": 0.17578125, "learning_rate": 0.0004807939342600289, "loss": 0.9427, "step": 860 }, { "epoch": 0.13, "grad_norm": 0.13671875, "learning_rate": 0.0004805729053484852, "loss": 1.1931, "step": 865 }, { "epoch": 0.13, "grad_norm": 0.130859375, "learning_rate": 0.0004803506632706569, "loss": 1.0598, "step": 870 }, { "epoch": 0.13, "grad_norm": 0.1826171875, "learning_rate": 0.00048012720919587715, "loss": 1.1676, "step": 875 }, { "epoch": 0.13, "grad_norm": 0.1904296875, "learning_rate": 0.00047990254429985653, "loss": 1.2744, "step": 880 }, { "epoch": 0.13, "grad_norm": 0.1689453125, "learning_rate": 0.000479676669764676, "loss": 1.0646, "step": 885 }, { "epoch": 0.13, "grad_norm": 0.15625, "learning_rate": 0.0004794495867787813, "loss": 1.1586, "step": 890 }, { "epoch": 0.13, "grad_norm": 0.150390625, "learning_rate": 0.00047922129653697634, "loss": 1.2567, "step": 895 }, { "epoch": 0.13, "grad_norm": 0.15625, "learning_rate": 0.00047899180024041723, "loss": 1.0267, "step": 900 }, { "epoch": 0.13, "grad_norm": 0.12890625, "learning_rate": 0.00047876109909660545, "loss": 1.0467, "step": 905 }, { "epoch": 0.13, "grad_norm": 0.1484375, "learning_rate": 0.0004785291943193822, "loss": 0.8516, "step": 910 }, { "epoch": 0.13, "grad_norm": 0.125, "learning_rate": 0.0004782960871289214, "loss": 1.2609, "step": 915 }, { "epoch": 0.13, "grad_norm": 0.146484375, "learning_rate": 0.00047806177875172343, "loss": 0.9976, "step": 920 }, { "epoch": 0.14, "grad_norm": 0.1455078125, "learning_rate": 0.000477826270420609, "loss": 1.0972, "step": 925 }, { "epoch": 0.14, "grad_norm": 0.169921875, "learning_rate": 0.0004775895633747123, "loss": 0.9034, "step": 930 }, { "epoch": 0.14, "grad_norm": 0.1474609375, "learning_rate": 0.00047735165885947446, "loss": 0.964, "step": 935 }, { "epoch": 0.14, "grad_norm": 0.125, "learning_rate": 0.0004771125581266373, "loss": 0.9712, "step": 940 }, { "epoch": 0.14, "grad_norm": 0.1943359375, "learning_rate": 0.00047687226243423665, "loss": 0.9746, "step": 945 }, { "epoch": 0.14, "grad_norm": 0.1845703125, "learning_rate": 0.0004766307730465954, "loss": 1.2146, "step": 950 }, { "epoch": 0.14, "grad_norm": 0.146484375, "learning_rate": 0.00047638809123431725, "loss": 1.0272, "step": 955 }, { "epoch": 0.14, "grad_norm": 0.244140625, "learning_rate": 0.0004761442182742799, "loss": 1.1749, "step": 960 }, { "epoch": 0.14, "grad_norm": 0.1337890625, "learning_rate": 0.00047589915544962827, "loss": 1.0741, "step": 965 }, { "epoch": 0.14, "grad_norm": 0.123046875, "learning_rate": 0.00047565290404976774, "loss": 1.0154, "step": 970 }, { "epoch": 0.14, "grad_norm": 0.1591796875, "learning_rate": 0.0004754054653703575, "loss": 1.0645, "step": 975 }, { "epoch": 0.14, "grad_norm": 0.1533203125, "learning_rate": 0.0004751568407133036, "loss": 1.1635, "step": 980 }, { "epoch": 0.14, "grad_norm": 0.494140625, "learning_rate": 0.0004749070313867522, "loss": 0.9108, "step": 985 }, { "epoch": 0.14, "grad_norm": 0.1298828125, "learning_rate": 0.00047465603870508255, "loss": 0.943, "step": 990 }, { "epoch": 0.15, "grad_norm": 0.150390625, "learning_rate": 0.0004744038639889002, "loss": 1.0041, "step": 995 }, { "epoch": 0.15, "grad_norm": 0.134765625, "learning_rate": 0.00047415050856503005, "loss": 0.9916, "step": 1000 }, { "epoch": 0.15, "grad_norm": 0.146484375, "learning_rate": 0.0004738959737665093, "loss": 0.967, "step": 1005 }, { "epoch": 0.15, "grad_norm": 0.1494140625, "learning_rate": 0.00047364026093258045, "loss": 1.2214, "step": 1010 }, { "epoch": 0.15, "grad_norm": 0.142578125, "learning_rate": 0.0004733833714086842, "loss": 0.9998, "step": 1015 }, { "epoch": 0.15, "grad_norm": 0.1494140625, "learning_rate": 0.0004731253065464526, "loss": 1.0818, "step": 1020 }, { "epoch": 0.15, "grad_norm": 0.166015625, "learning_rate": 0.00047286606770370165, "loss": 0.9446, "step": 1025 }, { "epoch": 0.15, "grad_norm": 0.1591796875, "learning_rate": 0.0004726056562444244, "loss": 1.0604, "step": 1030 }, { "epoch": 0.15, "grad_norm": 0.154296875, "learning_rate": 0.00047234407353878344, "loss": 0.9393, "step": 1035 }, { "epoch": 0.15, "grad_norm": 0.154296875, "learning_rate": 0.0004720813209631042, "loss": 0.9309, "step": 1040 }, { "epoch": 0.15, "grad_norm": 0.1650390625, "learning_rate": 0.0004718173998998672, "loss": 1.1868, "step": 1045 }, { "epoch": 0.15, "grad_norm": 0.1357421875, "learning_rate": 0.00047155231173770106, "loss": 0.9496, "step": 1050 }, { "epoch": 0.15, "grad_norm": 0.1474609375, "learning_rate": 0.00047128605787137513, "loss": 0.8653, "step": 1055 }, { "epoch": 0.15, "grad_norm": 0.1376953125, "learning_rate": 0.0004710186397017922, "loss": 1.111, "step": 1060 }, { "epoch": 0.16, "grad_norm": 0.15234375, "learning_rate": 0.000470750058635981, "loss": 1.0898, "step": 1065 }, { "epoch": 0.16, "grad_norm": 0.1630859375, "learning_rate": 0.00047048031608708875, "loss": 1.2158, "step": 1070 }, { "epoch": 0.16, "grad_norm": 0.1396484375, "learning_rate": 0.0004702094134743742, "loss": 0.8783, "step": 1075 }, { "epoch": 0.16, "grad_norm": 0.1337890625, "learning_rate": 0.0004699373522231996, "loss": 0.9491, "step": 1080 }, { "epoch": 0.16, "grad_norm": 0.1943359375, "learning_rate": 0.00046966413376502315, "loss": 1.0776, "step": 1085 }, { "epoch": 0.16, "grad_norm": 0.1982421875, "learning_rate": 0.00046938975953739225, "loss": 1.2107, "step": 1090 }, { "epoch": 0.16, "grad_norm": 0.123046875, "learning_rate": 0.0004691142309839351, "loss": 1.1185, "step": 1095 }, { "epoch": 0.16, "grad_norm": 0.1298828125, "learning_rate": 0.0004688375495543535, "loss": 0.8819, "step": 1100 }, { "epoch": 0.16, "grad_norm": 0.1279296875, "learning_rate": 0.0004685597167044152, "loss": 1.0172, "step": 1105 }, { "epoch": 0.16, "grad_norm": 0.138671875, "learning_rate": 0.0004682807338959458, "loss": 0.8787, "step": 1110 }, { "epoch": 0.16, "grad_norm": 0.1630859375, "learning_rate": 0.0004680006025968221, "loss": 1.0478, "step": 1115 }, { "epoch": 0.16, "grad_norm": 0.11572265625, "learning_rate": 0.0004677193242809632, "loss": 1.0733, "step": 1120 }, { "epoch": 0.16, "grad_norm": 0.138671875, "learning_rate": 0.0004674369004283234, "loss": 1.0968, "step": 1125 }, { "epoch": 0.17, "grad_norm": 0.11328125, "learning_rate": 0.00046715333252488435, "loss": 1.1467, "step": 1130 }, { "epoch": 0.17, "grad_norm": 0.130859375, "learning_rate": 0.0004668686220626471, "loss": 1.078, "step": 1135 }, { "epoch": 0.17, "grad_norm": 0.166015625, "learning_rate": 0.0004665827705396244, "loss": 1.0399, "step": 1140 }, { "epoch": 0.17, "grad_norm": 0.197265625, "learning_rate": 0.0004662957794598325, "loss": 1.1142, "step": 1145 }, { "epoch": 0.17, "grad_norm": 0.15234375, "learning_rate": 0.00046600765033328375, "loss": 1.1665, "step": 1150 }, { "epoch": 0.17, "grad_norm": 0.1279296875, "learning_rate": 0.00046571838467597814, "loss": 1.1634, "step": 1155 }, { "epoch": 0.17, "grad_norm": 0.134765625, "learning_rate": 0.0004654279840098957, "loss": 1.1111, "step": 1160 }, { "epoch": 0.17, "grad_norm": 0.1337890625, "learning_rate": 0.0004651364498629882, "loss": 1.0922, "step": 1165 }, { "epoch": 0.17, "grad_norm": 0.14453125, "learning_rate": 0.00046484378376917136, "loss": 1.1662, "step": 1170 }, { "epoch": 0.17, "grad_norm": 0.126953125, "learning_rate": 0.00046454998726831666, "loss": 1.1064, "step": 1175 }, { "epoch": 0.17, "grad_norm": 0.138671875, "learning_rate": 0.0004642550619062432, "loss": 1.2852, "step": 1180 }, { "epoch": 0.17, "grad_norm": 0.142578125, "learning_rate": 0.0004639590092347098, "loss": 0.8237, "step": 1185 }, { "epoch": 0.17, "grad_norm": 0.142578125, "learning_rate": 0.00046366183081140636, "loss": 1.1068, "step": 1190 }, { "epoch": 0.17, "grad_norm": 0.1728515625, "learning_rate": 0.0004633635281999461, "loss": 1.0434, "step": 1195 }, { "epoch": 0.18, "grad_norm": 0.154296875, "learning_rate": 0.00046306410296985726, "loss": 1.1346, "step": 1200 }, { "epoch": 0.18, "grad_norm": 0.14453125, "learning_rate": 0.0004627635566965747, "loss": 1.1974, "step": 1205 }, { "epoch": 0.18, "grad_norm": 0.1416015625, "learning_rate": 0.0004624618909614316, "loss": 0.9354, "step": 1210 }, { "epoch": 0.18, "grad_norm": 0.134765625, "learning_rate": 0.00046215910735165146, "loss": 1.1143, "step": 1215 }, { "epoch": 0.18, "grad_norm": 0.162109375, "learning_rate": 0.00046185520746033917, "loss": 1.036, "step": 1220 }, { "epoch": 0.18, "grad_norm": 0.146484375, "learning_rate": 0.0004615501928864733, "loss": 0.9453, "step": 1225 }, { "epoch": 0.18, "grad_norm": 0.1484375, "learning_rate": 0.0004612440652348972, "loss": 0.9825, "step": 1230 }, { "epoch": 0.18, "grad_norm": 0.13671875, "learning_rate": 0.00046093682611631073, "loss": 1.3258, "step": 1235 }, { "epoch": 0.18, "grad_norm": 0.109375, "learning_rate": 0.0004606284771472616, "loss": 0.9973, "step": 1240 }, { "epoch": 0.18, "grad_norm": 0.1533203125, "learning_rate": 0.00046031901995013746, "loss": 1.0622, "step": 1245 }, { "epoch": 0.18, "grad_norm": 0.150390625, "learning_rate": 0.00046000845615315657, "loss": 1.0037, "step": 1250 }, { "epoch": 0.18, "grad_norm": 0.1494140625, "learning_rate": 0.00045969678739035956, "loss": 0.9876, "step": 1255 }, { "epoch": 0.18, "grad_norm": 0.1875, "learning_rate": 0.00045938401530160126, "loss": 1.1052, "step": 1260 }, { "epoch": 0.18, "grad_norm": 0.1494140625, "learning_rate": 0.00045907014153254123, "loss": 1.1145, "step": 1265 }, { "epoch": 0.19, "grad_norm": 0.1318359375, "learning_rate": 0.0004587551677346359, "loss": 0.7833, "step": 1270 }, { "epoch": 0.19, "grad_norm": 0.1328125, "learning_rate": 0.0004584390955651292, "loss": 0.9334, "step": 1275 }, { "epoch": 0.19, "grad_norm": 0.166015625, "learning_rate": 0.00045812192668704456, "loss": 1.157, "step": 1280 }, { "epoch": 0.19, "grad_norm": 0.1611328125, "learning_rate": 0.0004578036627691755, "loss": 1.0041, "step": 1285 }, { "epoch": 0.19, "grad_norm": 0.140625, "learning_rate": 0.0004574843054860772, "loss": 0.9626, "step": 1290 }, { "epoch": 0.19, "grad_norm": 0.2236328125, "learning_rate": 0.0004571638565180577, "loss": 1.1321, "step": 1295 }, { "epoch": 0.19, "grad_norm": 0.140625, "learning_rate": 0.0004568423175511688, "loss": 0.9283, "step": 1300 }, { "epoch": 0.19, "grad_norm": 0.154296875, "learning_rate": 0.0004565196902771975, "loss": 1.0294, "step": 1305 }, { "epoch": 0.19, "grad_norm": 0.1474609375, "learning_rate": 0.00045619597639365685, "loss": 1.0769, "step": 1310 }, { "epoch": 0.19, "grad_norm": 0.126953125, "learning_rate": 0.00045587117760377733, "loss": 0.9879, "step": 1315 }, { "epoch": 0.19, "grad_norm": 0.15625, "learning_rate": 0.00045554529561649744, "loss": 1.0914, "step": 1320 }, { "epoch": 0.19, "grad_norm": 0.1416015625, "learning_rate": 0.00045521833214645516, "loss": 1.0999, "step": 1325 }, { "epoch": 0.19, "grad_norm": 0.1484375, "learning_rate": 0.0004548902889139785, "loss": 1.0667, "step": 1330 }, { "epoch": 0.19, "grad_norm": 0.1884765625, "learning_rate": 0.00045456116764507694, "loss": 1.1249, "step": 1335 }, { "epoch": 0.2, "grad_norm": 0.12158203125, "learning_rate": 0.0004542309700714319, "loss": 1.0936, "step": 1340 }, { "epoch": 0.2, "grad_norm": 0.146484375, "learning_rate": 0.0004538996979303878, "loss": 0.9589, "step": 1345 }, { "epoch": 0.2, "grad_norm": 0.17578125, "learning_rate": 0.0004535673529649431, "loss": 1.088, "step": 1350 }, { "epoch": 0.2, "grad_norm": 0.1435546875, "learning_rate": 0.0004532339369237408, "loss": 1.065, "step": 1355 }, { "epoch": 0.2, "grad_norm": 0.142578125, "learning_rate": 0.00045289945156105937, "loss": 1.0804, "step": 1360 }, { "epoch": 0.2, "grad_norm": 0.189453125, "learning_rate": 0.00045256389863680365, "loss": 1.1125, "step": 1365 }, { "epoch": 0.2, "grad_norm": 0.15234375, "learning_rate": 0.0004522272799164955, "loss": 1.0844, "step": 1370 }, { "epoch": 0.2, "grad_norm": 0.1533203125, "learning_rate": 0.00045188959717126445, "loss": 1.0587, "step": 1375 }, { "epoch": 0.2, "grad_norm": 0.138671875, "learning_rate": 0.00045155085217783844, "loss": 1.0235, "step": 1380 }, { "epoch": 0.2, "grad_norm": 0.1298828125, "learning_rate": 0.0004512110467185343, "loss": 0.9903, "step": 1385 }, { "epoch": 0.2, "grad_norm": 0.208984375, "learning_rate": 0.0004508701825812489, "loss": 1.3046, "step": 1390 }, { "epoch": 0.2, "grad_norm": 0.1826171875, "learning_rate": 0.0004505282615594491, "loss": 1.0024, "step": 1395 }, { "epoch": 0.2, "grad_norm": 0.150390625, "learning_rate": 0.0004501852854521626, "loss": 1.1726, "step": 1400 }, { "epoch": 0.21, "grad_norm": 0.134765625, "learning_rate": 0.0004498412560639685, "loss": 1.1269, "step": 1405 }, { "epoch": 0.21, "grad_norm": 0.134765625, "learning_rate": 0.00044949617520498785, "loss": 0.9801, "step": 1410 }, { "epoch": 0.21, "grad_norm": 0.142578125, "learning_rate": 0.00044915004469087404, "loss": 1.0223, "step": 1415 }, { "epoch": 0.21, "grad_norm": 0.1376953125, "learning_rate": 0.00044880286634280304, "loss": 0.9374, "step": 1420 }, { "epoch": 0.21, "grad_norm": 0.1318359375, "learning_rate": 0.00044845464198746426, "loss": 1.0407, "step": 1425 }, { "epoch": 0.21, "grad_norm": 0.1171875, "learning_rate": 0.0004481053734570507, "loss": 1.0165, "step": 1430 }, { "epoch": 0.21, "grad_norm": 0.15625, "learning_rate": 0.00044775506258924904, "loss": 1.2979, "step": 1435 }, { "epoch": 0.21, "grad_norm": 0.162109375, "learning_rate": 0.0004474037112272307, "loss": 1.0358, "step": 1440 }, { "epoch": 0.21, "grad_norm": 0.134765625, "learning_rate": 0.00044705132121964134, "loss": 1.0453, "step": 1445 }, { "epoch": 0.21, "grad_norm": 0.125, "learning_rate": 0.00044669789442059154, "loss": 0.7933, "step": 1450 }, { "epoch": 0.21, "grad_norm": 0.1328125, "learning_rate": 0.00044634343268964717, "loss": 1.1367, "step": 1455 }, { "epoch": 0.21, "grad_norm": 0.1533203125, "learning_rate": 0.00044598793789181923, "loss": 1.0895, "step": 1460 }, { "epoch": 0.21, "grad_norm": 0.181640625, "learning_rate": 0.0004456314118975543, "loss": 0.9276, "step": 1465 }, { "epoch": 0.21, "grad_norm": 0.1552734375, "learning_rate": 0.00044527385658272466, "loss": 1.0582, "step": 1470 }, { "epoch": 0.22, "grad_norm": 0.146484375, "learning_rate": 0.0004449152738286184, "loss": 1.0049, "step": 1475 }, { "epoch": 0.22, "grad_norm": 0.15625, "learning_rate": 0.0004445556655219294, "loss": 1.0436, "step": 1480 }, { "epoch": 0.22, "grad_norm": 0.14453125, "learning_rate": 0.00044419503355474767, "loss": 1.1623, "step": 1485 }, { "epoch": 0.22, "grad_norm": 0.154296875, "learning_rate": 0.00044383337982454907, "loss": 1.1004, "step": 1490 }, { "epoch": 0.22, "grad_norm": 0.1357421875, "learning_rate": 0.0004434707062341856, "loss": 0.9782, "step": 1495 }, { "epoch": 0.22, "grad_norm": 0.1552734375, "learning_rate": 0.0004431070146918754, "loss": 0.8625, "step": 1500 }, { "epoch": 0.22, "grad_norm": 0.1513671875, "learning_rate": 0.00044274230711119247, "loss": 1.007, "step": 1505 }, { "epoch": 0.22, "grad_norm": 0.140625, "learning_rate": 0.0004423765854110565, "loss": 1.0016, "step": 1510 }, { "epoch": 0.22, "grad_norm": 0.181640625, "learning_rate": 0.0004420098515157235, "loss": 1.1458, "step": 1515 }, { "epoch": 0.22, "grad_norm": 0.125, "learning_rate": 0.0004416421073547747, "loss": 1.1517, "step": 1520 }, { "epoch": 0.22, "grad_norm": 0.1455078125, "learning_rate": 0.0004412733548631072, "loss": 1.0341, "step": 1525 }, { "epoch": 0.22, "grad_norm": 0.1513671875, "learning_rate": 0.00044090359598092333, "loss": 0.9788, "step": 1530 }, { "epoch": 0.22, "grad_norm": 0.1767578125, "learning_rate": 0.0004405328326537205, "loss": 1.0459, "step": 1535 }, { "epoch": 0.22, "grad_norm": 0.140625, "learning_rate": 0.0004401610668322812, "loss": 1.021, "step": 1540 }, { "epoch": 0.23, "grad_norm": 0.138671875, "learning_rate": 0.00043978830047266247, "loss": 0.9662, "step": 1545 }, { "epoch": 0.23, "grad_norm": 0.1484375, "learning_rate": 0.0004394145355361857, "loss": 0.8826, "step": 1550 }, { "epoch": 0.23, "grad_norm": 0.1630859375, "learning_rate": 0.0004390397739894265, "loss": 1.3257, "step": 1555 }, { "epoch": 0.23, "grad_norm": 0.12890625, "learning_rate": 0.0004386640178042038, "loss": 1.13, "step": 1560 }, { "epoch": 0.23, "grad_norm": 0.1376953125, "learning_rate": 0.0004382872689575703, "loss": 1.1859, "step": 1565 }, { "epoch": 0.23, "grad_norm": 0.13671875, "learning_rate": 0.0004379095294318013, "loss": 1.0333, "step": 1570 }, { "epoch": 0.23, "grad_norm": 0.1435546875, "learning_rate": 0.00043753080121438465, "loss": 0.9258, "step": 1575 }, { "epoch": 0.23, "grad_norm": 0.1201171875, "learning_rate": 0.0004371510862980104, "loss": 1.008, "step": 1580 }, { "epoch": 0.23, "grad_norm": 0.1416015625, "learning_rate": 0.00043677038668055987, "loss": 1.1719, "step": 1585 }, { "epoch": 0.23, "grad_norm": 0.1435546875, "learning_rate": 0.00043638870436509567, "loss": 1.3385, "step": 1590 }, { "epoch": 0.23, "grad_norm": 0.1865234375, "learning_rate": 0.0004360060413598507, "loss": 0.9583, "step": 1595 }, { "epoch": 0.23, "grad_norm": 0.1171875, "learning_rate": 0.000435622399678218, "loss": 0.921, "step": 1600 }, { "epoch": 0.23, "grad_norm": 0.15234375, "learning_rate": 0.0004352377813387398, "loss": 1.1462, "step": 1605 }, { "epoch": 0.24, "grad_norm": 0.1630859375, "learning_rate": 0.000434852188365097, "loss": 1.1806, "step": 1610 }, { "epoch": 0.24, "grad_norm": 0.279296875, "learning_rate": 0.0004344656227860987, "loss": 1.239, "step": 1615 }, { "epoch": 0.24, "grad_norm": 0.14453125, "learning_rate": 0.0004340780866356713, "loss": 1.1628, "step": 1620 }, { "epoch": 0.24, "grad_norm": 0.1552734375, "learning_rate": 0.000433689581952848, "loss": 1.1506, "step": 1625 }, { "epoch": 0.24, "grad_norm": 0.19140625, "learning_rate": 0.0004333001107817578, "loss": 1.1992, "step": 1630 }, { "epoch": 0.24, "grad_norm": 0.150390625, "learning_rate": 0.00043290967517161505, "loss": 1.1323, "step": 1635 }, { "epoch": 0.24, "grad_norm": 0.134765625, "learning_rate": 0.00043251827717670846, "loss": 1.2102, "step": 1640 }, { "epoch": 0.24, "grad_norm": 0.162109375, "learning_rate": 0.0004321259188563904, "loss": 1.0335, "step": 1645 }, { "epoch": 0.24, "grad_norm": 0.177734375, "learning_rate": 0.00043173260227506586, "loss": 1.286, "step": 1650 }, { "epoch": 0.24, "grad_norm": 0.2236328125, "learning_rate": 0.000431338329502182, "loss": 1.155, "step": 1655 }, { "epoch": 0.24, "grad_norm": 0.150390625, "learning_rate": 0.0004309431026122169, "loss": 1.1915, "step": 1660 }, { "epoch": 0.24, "grad_norm": 0.142578125, "learning_rate": 0.0004305469236846686, "loss": 0.8947, "step": 1665 }, { "epoch": 0.24, "grad_norm": 0.1416015625, "learning_rate": 0.0004301497948040446, "loss": 1.0983, "step": 1670 }, { "epoch": 0.24, "grad_norm": 0.15234375, "learning_rate": 0.0004297517180598504, "loss": 1.1136, "step": 1675 }, { "epoch": 0.25, "grad_norm": 0.134765625, "learning_rate": 0.00042935269554657875, "loss": 1.1264, "step": 1680 }, { "epoch": 0.25, "grad_norm": 0.16015625, "learning_rate": 0.00042895272936369875, "loss": 0.9479, "step": 1685 }, { "epoch": 0.25, "grad_norm": 0.1552734375, "learning_rate": 0.0004285518216156444, "loss": 1.1757, "step": 1690 }, { "epoch": 0.25, "grad_norm": 0.181640625, "learning_rate": 0.00042814997441180386, "loss": 1.0151, "step": 1695 }, { "epoch": 0.25, "grad_norm": 0.11083984375, "learning_rate": 0.0004277471898665084, "loss": 0.9314, "step": 1700 }, { "epoch": 0.25, "grad_norm": 0.138671875, "learning_rate": 0.00042734347009902097, "loss": 1.1818, "step": 1705 }, { "epoch": 0.25, "grad_norm": 0.142578125, "learning_rate": 0.0004269388172335253, "loss": 1.0081, "step": 1710 }, { "epoch": 0.25, "grad_norm": 0.15234375, "learning_rate": 0.00042653323339911464, "loss": 1.2873, "step": 1715 }, { "epoch": 0.25, "grad_norm": 0.150390625, "learning_rate": 0.0004261267207297804, "loss": 0.9362, "step": 1720 }, { "epoch": 0.25, "grad_norm": 0.16015625, "learning_rate": 0.0004257192813644015, "loss": 1.0657, "step": 1725 }, { "epoch": 0.25, "grad_norm": 0.1416015625, "learning_rate": 0.00042531091744673233, "loss": 1.0267, "step": 1730 }, { "epoch": 0.25, "grad_norm": 0.1416015625, "learning_rate": 0.0004249016311253918, "loss": 0.9564, "step": 1735 }, { "epoch": 0.25, "grad_norm": 0.1572265625, "learning_rate": 0.00042449142455385235, "loss": 1.1318, "step": 1740 }, { "epoch": 0.25, "grad_norm": 0.1494140625, "learning_rate": 0.00042408029989042816, "loss": 1.1812, "step": 1745 }, { "epoch": 0.26, "grad_norm": 0.142578125, "learning_rate": 0.00042366825929826416, "loss": 1.166, "step": 1750 }, { "epoch": 0.26, "grad_norm": 0.171875, "learning_rate": 0.000423255304945324, "loss": 0.9768, "step": 1755 }, { "epoch": 0.26, "grad_norm": 0.1435546875, "learning_rate": 0.00042284143900437986, "loss": 0.9451, "step": 1760 }, { "epoch": 0.26, "grad_norm": 0.2080078125, "learning_rate": 0.0004224266636529995, "loss": 1.0424, "step": 1765 }, { "epoch": 0.26, "grad_norm": 0.109375, "learning_rate": 0.0004220109810735362, "loss": 1.1772, "step": 1770 }, { "epoch": 0.26, "grad_norm": 0.138671875, "learning_rate": 0.0004215943934531162, "loss": 0.9938, "step": 1775 }, { "epoch": 0.26, "grad_norm": 0.1962890625, "learning_rate": 0.00042117690298362775, "loss": 1.0018, "step": 1780 }, { "epoch": 0.26, "grad_norm": 0.1376953125, "learning_rate": 0.0004207585118617098, "loss": 1.0415, "step": 1785 }, { "epoch": 0.26, "grad_norm": 0.466796875, "learning_rate": 0.00042033922228873954, "loss": 1.203, "step": 1790 }, { "epoch": 0.26, "grad_norm": 0.154296875, "learning_rate": 0.0004199190364708218, "loss": 1.0172, "step": 1795 }, { "epoch": 0.26, "grad_norm": 0.1455078125, "learning_rate": 0.000419497956618777, "loss": 1.0916, "step": 1800 }, { "epoch": 0.26, "grad_norm": 0.1259765625, "learning_rate": 0.00041907598494812933, "loss": 1.0867, "step": 1805 }, { "epoch": 0.26, "grad_norm": 0.1494140625, "learning_rate": 0.00041865312367909545, "loss": 1.0811, "step": 1810 }, { "epoch": 0.27, "grad_norm": 0.14453125, "learning_rate": 0.00041822937503657276, "loss": 1.0382, "step": 1815 }, { "epoch": 0.27, "grad_norm": 0.1416015625, "learning_rate": 0.0004178047412501275, "loss": 1.0956, "step": 1820 }, { "epoch": 0.27, "grad_norm": 0.1494140625, "learning_rate": 0.00041737922455398305, "loss": 1.0508, "step": 1825 }, { "epoch": 0.27, "grad_norm": 0.1923828125, "learning_rate": 0.00041695282718700834, "loss": 1.1273, "step": 1830 }, { "epoch": 0.27, "grad_norm": 0.64453125, "learning_rate": 0.00041652555139270585, "loss": 1.1146, "step": 1835 }, { "epoch": 0.27, "grad_norm": 0.1396484375, "learning_rate": 0.0004160973994192003, "loss": 0.9018, "step": 1840 }, { "epoch": 0.27, "grad_norm": 0.1474609375, "learning_rate": 0.0004156683735192259, "loss": 0.973, "step": 1845 }, { "epoch": 0.27, "grad_norm": 0.1494140625, "learning_rate": 0.0004152384759501156, "loss": 0.999, "step": 1850 }, { "epoch": 0.27, "grad_norm": 0.181640625, "learning_rate": 0.00041480770897378826, "loss": 1.1376, "step": 1855 }, { "epoch": 0.27, "grad_norm": 0.1513671875, "learning_rate": 0.0004143760748567373, "loss": 1.0657, "step": 1860 }, { "epoch": 0.27, "grad_norm": 0.1435546875, "learning_rate": 0.0004139435758700186, "loss": 0.9538, "step": 1865 }, { "epoch": 0.27, "grad_norm": 0.2041015625, "learning_rate": 0.0004135102142892387, "loss": 1.0505, "step": 1870 }, { "epoch": 0.27, "grad_norm": 0.12451171875, "learning_rate": 0.0004130759923945426, "loss": 0.8248, "step": 1875 }, { "epoch": 0.27, "grad_norm": 0.125, "learning_rate": 0.0004126409124706018, "loss": 1.1633, "step": 1880 }, { "epoch": 0.28, "grad_norm": 0.1533203125, "learning_rate": 0.0004122049768066025, "loss": 1.0844, "step": 1885 }, { "epoch": 0.28, "grad_norm": 0.1669921875, "learning_rate": 0.00041176818769623325, "loss": 1.1534, "step": 1890 }, { "epoch": 0.28, "grad_norm": 0.7265625, "learning_rate": 0.00041133054743767305, "loss": 1.004, "step": 1895 }, { "epoch": 0.28, "grad_norm": 0.140625, "learning_rate": 0.00041089205833357944, "loss": 1.1132, "step": 1900 }, { "epoch": 0.28, "grad_norm": 0.185546875, "learning_rate": 0.00041045272269107604, "loss": 1.2416, "step": 1905 }, { "epoch": 0.28, "grad_norm": 0.1689453125, "learning_rate": 0.0004100125428217405, "loss": 1.0682, "step": 1910 }, { "epoch": 0.28, "grad_norm": 0.1318359375, "learning_rate": 0.0004095715210415925, "loss": 1.2839, "step": 1915 }, { "epoch": 0.28, "grad_norm": 0.146484375, "learning_rate": 0.0004091296596710812, "loss": 1.0805, "step": 1920 }, { "epoch": 0.28, "grad_norm": 0.166015625, "learning_rate": 0.00040868696103507385, "loss": 1.1538, "step": 1925 }, { "epoch": 0.28, "grad_norm": 0.142578125, "learning_rate": 0.0004082434274628425, "loss": 1.0307, "step": 1930 }, { "epoch": 0.28, "grad_norm": 0.1591796875, "learning_rate": 0.00040779906128805235, "loss": 1.1501, "step": 1935 }, { "epoch": 0.28, "grad_norm": 0.150390625, "learning_rate": 0.0004073538648487495, "loss": 0.9575, "step": 1940 }, { "epoch": 0.28, "grad_norm": 0.1748046875, "learning_rate": 0.0004069078404873483, "loss": 0.9019, "step": 1945 }, { "epoch": 0.28, "grad_norm": 0.146484375, "learning_rate": 0.0004064609905506195, "loss": 1.0083, "step": 1950 }, { "epoch": 0.29, "grad_norm": 0.140625, "learning_rate": 0.00040601331738967736, "loss": 1.176, "step": 1955 }, { "epoch": 0.29, "grad_norm": 0.126953125, "learning_rate": 0.0004055648233599678, "loss": 0.9837, "step": 1960 }, { "epoch": 0.29, "grad_norm": 0.1572265625, "learning_rate": 0.00040511551082125563, "loss": 1.1932, "step": 1965 }, { "epoch": 0.29, "grad_norm": 0.12890625, "learning_rate": 0.0004046653821376123, "loss": 0.9395, "step": 1970 }, { "epoch": 0.29, "grad_norm": 0.13671875, "learning_rate": 0.00040421443967740344, "loss": 1.1488, "step": 1975 }, { "epoch": 0.29, "grad_norm": 0.1904296875, "learning_rate": 0.00040376268581327645, "loss": 1.0614, "step": 1980 }, { "epoch": 0.29, "grad_norm": 0.1240234375, "learning_rate": 0.00040331012292214776, "loss": 1.0927, "step": 1985 }, { "epoch": 0.29, "grad_norm": 0.1328125, "learning_rate": 0.00040285675338519065, "loss": 0.9929, "step": 1990 }, { "epoch": 0.29, "grad_norm": 0.130859375, "learning_rate": 0.0004024025795878228, "loss": 1.0894, "step": 1995 }, { "epoch": 0.29, "grad_norm": 0.14453125, "learning_rate": 0.00040194760391969305, "loss": 1.2229, "step": 2000 }, { "epoch": 0.29, "grad_norm": 0.125, "learning_rate": 0.00040149182877466974, "loss": 1.1571, "step": 2005 }, { "epoch": 0.29, "grad_norm": 0.14453125, "learning_rate": 0.00040103525655082746, "loss": 0.8849, "step": 2010 }, { "epoch": 0.29, "grad_norm": 0.181640625, "learning_rate": 0.00040057788965043475, "loss": 1.0083, "step": 2015 }, { "epoch": 0.29, "grad_norm": 0.1611328125, "learning_rate": 0.00040011973047994134, "loss": 1.0084, "step": 2020 }, { "epoch": 0.3, "grad_norm": 0.1552734375, "learning_rate": 0.0003996607814499654, "loss": 0.945, "step": 2025 }, { "epoch": 0.3, "grad_norm": 0.142578125, "learning_rate": 0.0003992010449752812, "loss": 0.9116, "step": 2030 }, { "epoch": 0.3, "grad_norm": 0.16015625, "learning_rate": 0.00039874052347480606, "loss": 1.1155, "step": 2035 }, { "epoch": 0.3, "grad_norm": 0.1748046875, "learning_rate": 0.0003982792193715875, "loss": 1.0788, "step": 2040 }, { "epoch": 0.3, "grad_norm": 0.1435546875, "learning_rate": 0.00039781713509279125, "loss": 1.1903, "step": 2045 }, { "epoch": 0.3, "grad_norm": 0.216796875, "learning_rate": 0.00039735427306968746, "loss": 1.258, "step": 2050 }, { "epoch": 0.3, "grad_norm": 0.1572265625, "learning_rate": 0.0003968906357376387, "loss": 0.9277, "step": 2055 }, { "epoch": 0.3, "grad_norm": 0.1494140625, "learning_rate": 0.0003964262255360867, "loss": 1.1963, "step": 2060 }, { "epoch": 0.3, "grad_norm": 0.1455078125, "learning_rate": 0.0003959610449085397, "loss": 0.841, "step": 2065 }, { "epoch": 0.3, "grad_norm": 0.1298828125, "learning_rate": 0.0003954950963025595, "loss": 1.0462, "step": 2070 }, { "epoch": 0.3, "grad_norm": 0.1640625, "learning_rate": 0.000395028382169749, "loss": 1.0065, "step": 2075 }, { "epoch": 0.3, "grad_norm": 0.1396484375, "learning_rate": 0.00039456090496573843, "loss": 0.9813, "step": 2080 }, { "epoch": 0.3, "grad_norm": 0.16796875, "learning_rate": 0.00039409266715017323, "loss": 1.0473, "step": 2085 }, { "epoch": 0.31, "grad_norm": 0.1640625, "learning_rate": 0.00039362367118670084, "loss": 1.1596, "step": 2090 }, { "epoch": 0.31, "grad_norm": 0.1513671875, "learning_rate": 0.00039315391954295755, "loss": 1.1368, "step": 2095 }, { "epoch": 0.31, "grad_norm": 0.1708984375, "learning_rate": 0.0003926834146905558, "loss": 1.2268, "step": 2100 }, { "epoch": 0.31, "grad_norm": 0.1806640625, "learning_rate": 0.00039221215910507104, "loss": 1.0425, "step": 2105 }, { "epoch": 0.31, "grad_norm": 0.177734375, "learning_rate": 0.0003917401552660287, "loss": 1.1053, "step": 2110 }, { "epoch": 0.31, "grad_norm": 0.162109375, "learning_rate": 0.00039126740565689126, "loss": 1.053, "step": 2115 }, { "epoch": 0.31, "grad_norm": 0.15625, "learning_rate": 0.00039079391276504504, "loss": 1.2397, "step": 2120 }, { "epoch": 0.31, "grad_norm": 0.1767578125, "learning_rate": 0.00039031967908178703, "loss": 1.067, "step": 2125 }, { "epoch": 0.31, "grad_norm": 0.1650390625, "learning_rate": 0.00038984470710231217, "loss": 1.1106, "step": 2130 }, { "epoch": 0.31, "grad_norm": 0.1337890625, "learning_rate": 0.0003893689993256997, "loss": 1.0861, "step": 2135 }, { "epoch": 0.31, "grad_norm": 0.162109375, "learning_rate": 0.00038889255825490053, "loss": 0.9505, "step": 2140 }, { "epoch": 0.31, "grad_norm": 0.142578125, "learning_rate": 0.00038841538639672367, "loss": 1.1636, "step": 2145 }, { "epoch": 0.31, "grad_norm": 0.375, "learning_rate": 0.00038793748626182316, "loss": 1.0212, "step": 2150 }, { "epoch": 0.31, "grad_norm": 0.1298828125, "learning_rate": 0.000387458860364685, "loss": 1.0993, "step": 2155 }, { "epoch": 0.32, "grad_norm": 0.140625, "learning_rate": 0.00038697951122361376, "loss": 1.1088, "step": 2160 }, { "epoch": 0.32, "grad_norm": 0.1650390625, "learning_rate": 0.0003864994413607194, "loss": 1.1727, "step": 2165 }, { "epoch": 0.32, "grad_norm": 0.162109375, "learning_rate": 0.0003860186533019039, "loss": 0.9805, "step": 2170 }, { "epoch": 0.32, "grad_norm": 0.1484375, "learning_rate": 0.0003855371495768482, "loss": 1.0571, "step": 2175 }, { "epoch": 0.32, "grad_norm": 0.158203125, "learning_rate": 0.0003850549327189985, "loss": 1.2412, "step": 2180 }, { "epoch": 0.32, "grad_norm": 0.150390625, "learning_rate": 0.00038457200526555347, "loss": 1.0324, "step": 2185 }, { "epoch": 0.32, "grad_norm": 0.146484375, "learning_rate": 0.00038408836975745034, "loss": 1.1788, "step": 2190 }, { "epoch": 0.32, "grad_norm": 0.1474609375, "learning_rate": 0.00038360402873935194, "loss": 1.0882, "step": 2195 }, { "epoch": 0.32, "grad_norm": 0.1640625, "learning_rate": 0.00038311898475963316, "loss": 0.904, "step": 2200 }, { "epoch": 0.32, "grad_norm": 0.1376953125, "learning_rate": 0.0003826332403703675, "loss": 1.0482, "step": 2205 }, { "epoch": 0.32, "grad_norm": 0.1708984375, "learning_rate": 0.0003821467981273136, "loss": 1.176, "step": 2210 }, { "epoch": 0.32, "grad_norm": 0.177734375, "learning_rate": 0.00038165966058990213, "loss": 1.1139, "step": 2215 }, { "epoch": 0.32, "grad_norm": 0.142578125, "learning_rate": 0.0003811718303212218, "loss": 0.9585, "step": 2220 }, { "epoch": 0.32, "grad_norm": 0.1279296875, "learning_rate": 0.00038068330988800625, "loss": 1.1465, "step": 2225 }, { "epoch": 0.33, "grad_norm": 0.162109375, "learning_rate": 0.0003801941018606204, "loss": 0.9928, "step": 2230 }, { "epoch": 0.33, "grad_norm": 0.17578125, "learning_rate": 0.0003797042088130472, "loss": 0.9144, "step": 2235 }, { "epoch": 0.33, "grad_norm": 0.158203125, "learning_rate": 0.0003792136333228735, "loss": 0.8392, "step": 2240 }, { "epoch": 0.33, "grad_norm": 0.1689453125, "learning_rate": 0.00037872237797127717, "loss": 1.096, "step": 2245 }, { "epoch": 0.33, "grad_norm": 0.1357421875, "learning_rate": 0.00037823044534301286, "loss": 1.0438, "step": 2250 }, { "epoch": 0.33, "grad_norm": 0.1806640625, "learning_rate": 0.00037773783802639904, "loss": 1.0796, "step": 2255 }, { "epoch": 0.33, "grad_norm": 0.13671875, "learning_rate": 0.0003772445586133039, "loss": 1.013, "step": 2260 }, { "epoch": 0.33, "grad_norm": 0.1640625, "learning_rate": 0.00037675060969913186, "loss": 1.03, "step": 2265 }, { "epoch": 0.33, "grad_norm": 0.158203125, "learning_rate": 0.0003762559938828101, "loss": 0.8575, "step": 2270 }, { "epoch": 0.33, "grad_norm": 0.1669921875, "learning_rate": 0.00037576071376677463, "loss": 1.0536, "step": 2275 }, { "epoch": 0.33, "grad_norm": 0.150390625, "learning_rate": 0.00037526477195695663, "loss": 0.9726, "step": 2280 }, { "epoch": 0.33, "grad_norm": 0.140625, "learning_rate": 0.0003747681710627689, "loss": 1.0853, "step": 2285 }, { "epoch": 0.33, "grad_norm": 0.1591796875, "learning_rate": 0.000374270913697092, "loss": 1.1967, "step": 2290 }, { "epoch": 0.34, "grad_norm": 0.1318359375, "learning_rate": 0.00037377300247626056, "loss": 0.995, "step": 2295 }, { "epoch": 0.34, "grad_norm": 0.13671875, "learning_rate": 0.0003732744400200494, "loss": 1.1294, "step": 2300 }, { "epoch": 0.34, "grad_norm": 0.138671875, "learning_rate": 0.0003727752289516599, "loss": 0.9954, "step": 2305 }, { "epoch": 0.34, "grad_norm": 0.140625, "learning_rate": 0.00037227537189770624, "loss": 1.0589, "step": 2310 }, { "epoch": 0.34, "grad_norm": 0.138671875, "learning_rate": 0.0003717748714882012, "loss": 1.0241, "step": 2315 }, { "epoch": 0.34, "grad_norm": 0.1748046875, "learning_rate": 0.00037127373035654286, "loss": 1.0478, "step": 2320 }, { "epoch": 0.34, "grad_norm": 0.150390625, "learning_rate": 0.0003707719511395004, "loss": 1.0938, "step": 2325 }, { "epoch": 0.34, "grad_norm": 0.1376953125, "learning_rate": 0.0003702695364772002, "loss": 1.1013, "step": 2330 }, { "epoch": 0.34, "grad_norm": 0.158203125, "learning_rate": 0.0003697664890131121, "loss": 0.9581, "step": 2335 }, { "epoch": 0.34, "grad_norm": 0.1474609375, "learning_rate": 0.0003692628113940356, "loss": 1.0603, "step": 2340 }, { "epoch": 0.34, "grad_norm": 0.2099609375, "learning_rate": 0.0003687585062700856, "loss": 0.9877, "step": 2345 }, { "epoch": 0.34, "grad_norm": 0.13671875, "learning_rate": 0.0003682535762946788, "loss": 1.1015, "step": 2350 }, { "epoch": 0.34, "grad_norm": 0.134765625, "learning_rate": 0.00036774802412451927, "loss": 0.834, "step": 2355 }, { "epoch": 0.34, "grad_norm": 0.2197265625, "learning_rate": 0.0003672418524195851, "loss": 1.2594, "step": 2360 }, { "epoch": 0.35, "grad_norm": 0.1591796875, "learning_rate": 0.0003667350638431139, "loss": 1.1488, "step": 2365 }, { "epoch": 0.35, "grad_norm": 0.1884765625, "learning_rate": 0.0003662276610615891, "loss": 1.3167, "step": 2370 }, { "epoch": 0.35, "grad_norm": 0.1396484375, "learning_rate": 0.00036571964674472567, "loss": 0.9609, "step": 2375 }, { "epoch": 0.35, "grad_norm": 0.1376953125, "learning_rate": 0.0003652110235654563, "loss": 1.2101, "step": 2380 }, { "epoch": 0.35, "grad_norm": 0.1494140625, "learning_rate": 0.00036470179419991703, "loss": 1.0423, "step": 2385 }, { "epoch": 0.35, "grad_norm": 0.158203125, "learning_rate": 0.0003641919613274336, "loss": 1.0078, "step": 2390 }, { "epoch": 0.35, "grad_norm": 0.1611328125, "learning_rate": 0.00036368152763050697, "loss": 0.9718, "step": 2395 }, { "epoch": 0.35, "grad_norm": 0.166015625, "learning_rate": 0.00036317049579479954, "loss": 1.115, "step": 2400 }, { "epoch": 0.35, "grad_norm": 0.177734375, "learning_rate": 0.00036265886850912044, "loss": 1.1202, "step": 2405 }, { "epoch": 0.35, "grad_norm": 0.150390625, "learning_rate": 0.0003621466484654123, "loss": 0.9148, "step": 2410 }, { "epoch": 0.35, "grad_norm": 0.1494140625, "learning_rate": 0.0003616338383587362, "loss": 1.1323, "step": 2415 }, { "epoch": 0.35, "grad_norm": 0.1416015625, "learning_rate": 0.00036112044088725806, "loss": 1.0034, "step": 2420 }, { "epoch": 0.35, "grad_norm": 0.1298828125, "learning_rate": 0.00036060645875223387, "loss": 1.0439, "step": 2425 }, { "epoch": 0.35, "grad_norm": 0.1513671875, "learning_rate": 0.00036009189465799633, "loss": 0.9872, "step": 2430 }, { "epoch": 0.36, "grad_norm": 0.1181640625, "learning_rate": 0.0003595767513119399, "loss": 1.0703, "step": 2435 }, { "epoch": 0.36, "grad_norm": 0.2333984375, "learning_rate": 0.00035906103142450675, "loss": 1.1819, "step": 2440 }, { "epoch": 0.36, "grad_norm": 0.14453125, "learning_rate": 0.00035854473770917264, "loss": 1.106, "step": 2445 }, { "epoch": 0.36, "grad_norm": 0.171875, "learning_rate": 0.0003580278728824324, "loss": 1.0991, "step": 2450 }, { "epoch": 0.36, "grad_norm": 0.1591796875, "learning_rate": 0.00035751043966378613, "loss": 1.0253, "step": 2455 }, { "epoch": 0.36, "grad_norm": 0.125, "learning_rate": 0.000356992440775724, "loss": 1.0299, "step": 2460 }, { "epoch": 0.36, "grad_norm": 0.15234375, "learning_rate": 0.00035647387894371284, "loss": 0.9544, "step": 2465 }, { "epoch": 0.36, "grad_norm": 0.1552734375, "learning_rate": 0.0003559547568961815, "loss": 1.0262, "step": 2470 }, { "epoch": 0.36, "grad_norm": 0.1318359375, "learning_rate": 0.00035543507736450616, "loss": 1.0388, "step": 2475 }, { "epoch": 0.36, "grad_norm": 0.140625, "learning_rate": 0.0003549148430829962, "loss": 1.2653, "step": 2480 }, { "epoch": 0.36, "grad_norm": 0.15625, "learning_rate": 0.00035439405678888014, "loss": 1.045, "step": 2485 }, { "epoch": 0.36, "grad_norm": 0.17578125, "learning_rate": 0.00035387272122229063, "loss": 1.2579, "step": 2490 }, { "epoch": 0.36, "grad_norm": 0.173828125, "learning_rate": 0.00035335083912625063, "loss": 1.4965, "step": 2495 }, { "epoch": 0.37, "grad_norm": 0.19921875, "learning_rate": 0.00035282841324665826, "loss": 1.2086, "step": 2500 }, { "epoch": 0.37, "grad_norm": 0.16796875, "learning_rate": 0.0003523054463322732, "loss": 1.0837, "step": 2505 }, { "epoch": 0.37, "grad_norm": 0.1552734375, "learning_rate": 0.0003517819411347015, "loss": 0.969, "step": 2510 }, { "epoch": 0.37, "grad_norm": 0.142578125, "learning_rate": 0.00035125790040838164, "loss": 0.9484, "step": 2515 }, { "epoch": 0.37, "grad_norm": 0.1298828125, "learning_rate": 0.0003507333269105695, "loss": 1.0408, "step": 2520 }, { "epoch": 0.37, "grad_norm": 0.1494140625, "learning_rate": 0.0003502082234013246, "loss": 0.8761, "step": 2525 }, { "epoch": 0.37, "grad_norm": 0.1748046875, "learning_rate": 0.0003496825926434947, "loss": 1.0746, "step": 2530 }, { "epoch": 0.37, "grad_norm": 0.138671875, "learning_rate": 0.0003491564374027018, "loss": 1.0468, "step": 2535 }, { "epoch": 0.37, "grad_norm": 0.40234375, "learning_rate": 0.00034862976044732765, "loss": 1.0049, "step": 2540 }, { "epoch": 0.37, "grad_norm": 0.146484375, "learning_rate": 0.00034810256454849886, "loss": 1.2078, "step": 2545 }, { "epoch": 0.37, "grad_norm": 0.1435546875, "learning_rate": 0.0003475748524800726, "loss": 0.9254, "step": 2550 }, { "epoch": 0.37, "grad_norm": 0.1455078125, "learning_rate": 0.00034704662701862165, "loss": 1.0607, "step": 2555 }, { "epoch": 0.37, "grad_norm": 0.1767578125, "learning_rate": 0.0003465178909434204, "loss": 1.216, "step": 2560 }, { "epoch": 0.37, "grad_norm": 0.1708984375, "learning_rate": 0.0003459886470364295, "loss": 1.1468, "step": 2565 }, { "epoch": 0.38, "grad_norm": 0.1982421875, "learning_rate": 0.0003454588980822817, "loss": 1.1476, "step": 2570 }, { "epoch": 0.38, "grad_norm": 0.154296875, "learning_rate": 0.00034492864686826716, "loss": 0.9272, "step": 2575 }, { "epoch": 0.38, "grad_norm": 0.1826171875, "learning_rate": 0.0003443978961843185, "loss": 1.2118, "step": 2580 }, { "epoch": 0.38, "grad_norm": 0.1376953125, "learning_rate": 0.00034386664882299643, "loss": 0.8843, "step": 2585 }, { "epoch": 0.38, "grad_norm": 0.150390625, "learning_rate": 0.0003433349075794751, "loss": 1.1248, "step": 2590 }, { "epoch": 0.38, "grad_norm": 0.1484375, "learning_rate": 0.0003428026752515269, "loss": 1.0207, "step": 2595 }, { "epoch": 0.38, "grad_norm": 0.1611328125, "learning_rate": 0.00034226995463950837, "loss": 0.9798, "step": 2600 }, { "epoch": 0.38, "grad_norm": 0.158203125, "learning_rate": 0.00034173674854634495, "loss": 0.8772, "step": 2605 }, { "epoch": 0.38, "grad_norm": 0.134765625, "learning_rate": 0.00034120305977751653, "loss": 1.0818, "step": 2610 }, { "epoch": 0.38, "grad_norm": 0.162109375, "learning_rate": 0.0003406688911410428, "loss": 1.0153, "step": 2615 }, { "epoch": 0.38, "grad_norm": 0.1474609375, "learning_rate": 0.000340134245447468, "loss": 1.0314, "step": 2620 }, { "epoch": 0.38, "grad_norm": 0.150390625, "learning_rate": 0.00033959912550984655, "loss": 0.9562, "step": 2625 }, { "epoch": 0.38, "grad_norm": 0.154296875, "learning_rate": 0.0003390635341437282, "loss": 0.9481, "step": 2630 }, { "epoch": 0.38, "grad_norm": 0.1669921875, "learning_rate": 0.0003385274741671429, "loss": 1.224, "step": 2635 }, { "epoch": 0.39, "grad_norm": 0.1376953125, "learning_rate": 0.0003379909484005865, "loss": 1.1988, "step": 2640 }, { "epoch": 0.39, "grad_norm": 0.2216796875, "learning_rate": 0.0003374539596670055, "loss": 0.9781, "step": 2645 }, { "epoch": 0.39, "grad_norm": 0.1474609375, "learning_rate": 0.00033691651079178223, "loss": 0.9193, "step": 2650 }, { "epoch": 0.39, "grad_norm": 0.1494140625, "learning_rate": 0.00033637860460272, "loss": 1.043, "step": 2655 }, { "epoch": 0.39, "grad_norm": 0.1630859375, "learning_rate": 0.00033584024393002864, "loss": 1.0526, "step": 2660 }, { "epoch": 0.39, "grad_norm": 0.171875, "learning_rate": 0.00033530143160630894, "loss": 1.1236, "step": 2665 }, { "epoch": 0.39, "grad_norm": 0.13671875, "learning_rate": 0.00033476217046653813, "loss": 1.185, "step": 2670 }, { "epoch": 0.39, "grad_norm": 0.1357421875, "learning_rate": 0.00033422246334805503, "loss": 1.1898, "step": 2675 }, { "epoch": 0.39, "grad_norm": 0.1572265625, "learning_rate": 0.0003336823130905449, "loss": 0.9399, "step": 2680 }, { "epoch": 0.39, "grad_norm": 0.1728515625, "learning_rate": 0.00033314172253602435, "loss": 1.031, "step": 2685 }, { "epoch": 0.39, "grad_norm": 0.1669921875, "learning_rate": 0.00033260069452882725, "loss": 0.9471, "step": 2690 }, { "epoch": 0.39, "grad_norm": 0.181640625, "learning_rate": 0.00033205923191558835, "loss": 1.087, "step": 2695 }, { "epoch": 0.39, "grad_norm": 0.146484375, "learning_rate": 0.00033151733754522986, "loss": 1.3382, "step": 2700 }, { "epoch": 0.4, "grad_norm": 0.1328125, "learning_rate": 0.000330975014268945, "loss": 1.048, "step": 2705 }, { "epoch": 0.4, "grad_norm": 0.1162109375, "learning_rate": 0.0003304322649401843, "loss": 0.8484, "step": 2710 }, { "epoch": 0.4, "grad_norm": 0.1884765625, "learning_rate": 0.0003298890924146395, "loss": 0.8403, "step": 2715 }, { "epoch": 0.4, "grad_norm": 0.1650390625, "learning_rate": 0.0003293454995502293, "loss": 1.211, "step": 2720 }, { "epoch": 0.4, "grad_norm": 0.15234375, "learning_rate": 0.00032880148920708384, "loss": 0.9348, "step": 2725 }, { "epoch": 0.4, "grad_norm": 0.240234375, "learning_rate": 0.0003282570642475301, "loss": 1.1156, "step": 2730 }, { "epoch": 0.4, "grad_norm": 0.1884765625, "learning_rate": 0.0003277122275360762, "loss": 1.0835, "step": 2735 }, { "epoch": 0.4, "grad_norm": 0.138671875, "learning_rate": 0.00032716698193939697, "loss": 0.9661, "step": 2740 }, { "epoch": 0.4, "grad_norm": 0.1376953125, "learning_rate": 0.0003266213303263186, "loss": 0.9852, "step": 2745 }, { "epoch": 0.4, "grad_norm": 0.162109375, "learning_rate": 0.0003260752755678034, "loss": 1.0605, "step": 2750 }, { "epoch": 0.4, "grad_norm": 0.1767578125, "learning_rate": 0.00032552882053693495, "loss": 1.3394, "step": 2755 }, { "epoch": 0.4, "grad_norm": 0.1416015625, "learning_rate": 0.00032498196810890284, "loss": 1.1218, "step": 2760 }, { "epoch": 0.4, "grad_norm": 0.14453125, "learning_rate": 0.00032443472116098766, "loss": 1.2519, "step": 2765 }, { "epoch": 0.4, "grad_norm": 0.150390625, "learning_rate": 0.0003238870825725456, "loss": 0.8018, "step": 2770 }, { "epoch": 0.41, "grad_norm": 0.1689453125, "learning_rate": 0.00032333905522499376, "loss": 1.1112, "step": 2775 }, { "epoch": 0.41, "grad_norm": 0.1337890625, "learning_rate": 0.0003227906420017942, "loss": 1.0559, "step": 2780 }, { "epoch": 0.41, "grad_norm": 0.1650390625, "learning_rate": 0.00032224184578843996, "loss": 1.1349, "step": 2785 }, { "epoch": 0.41, "grad_norm": 0.14453125, "learning_rate": 0.0003216926694724386, "loss": 1.0435, "step": 2790 }, { "epoch": 0.41, "grad_norm": 0.142578125, "learning_rate": 0.000321143115943298, "loss": 0.7837, "step": 2795 }, { "epoch": 0.41, "grad_norm": 0.17578125, "learning_rate": 0.00032059318809251046, "loss": 0.9842, "step": 2800 }, { "epoch": 0.41, "grad_norm": 0.16015625, "learning_rate": 0.00032004288881353816, "loss": 0.9675, "step": 2805 }, { "epoch": 0.41, "grad_norm": 0.169921875, "learning_rate": 0.0003194922210017971, "loss": 1.2367, "step": 2810 }, { "epoch": 0.41, "grad_norm": 0.1484375, "learning_rate": 0.00031894118755464266, "loss": 1.1448, "step": 2815 }, { "epoch": 0.41, "grad_norm": 0.1640625, "learning_rate": 0.0003183897913713538, "loss": 1.2115, "step": 2820 }, { "epoch": 0.41, "grad_norm": 0.1630859375, "learning_rate": 0.0003178380353531183, "loss": 1.0475, "step": 2825 }, { "epoch": 0.41, "grad_norm": 0.1884765625, "learning_rate": 0.00031728592240301697, "loss": 1.2608, "step": 2830 }, { "epoch": 0.41, "grad_norm": 0.1865234375, "learning_rate": 0.00031673345542600873, "loss": 0.9939, "step": 2835 }, { "epoch": 0.41, "grad_norm": 0.1455078125, "learning_rate": 0.0003161806373289153, "loss": 1.1296, "step": 2840 }, { "epoch": 0.42, "grad_norm": 0.1474609375, "learning_rate": 0.0003156274710204056, "loss": 0.9441, "step": 2845 }, { "epoch": 0.42, "grad_norm": 0.1708984375, "learning_rate": 0.0003150739594109809, "loss": 1.1399, "step": 2850 }, { "epoch": 0.42, "grad_norm": 0.2041015625, "learning_rate": 0.0003145201054129592, "loss": 1.0609, "step": 2855 }, { "epoch": 0.42, "grad_norm": 0.1630859375, "learning_rate": 0.00031396591194046003, "loss": 1.0824, "step": 2860 }, { "epoch": 0.42, "grad_norm": 0.162109375, "learning_rate": 0.00031341138190938897, "loss": 1.0619, "step": 2865 }, { "epoch": 0.42, "grad_norm": 0.1318359375, "learning_rate": 0.00031285651823742246, "loss": 1.1303, "step": 2870 }, { "epoch": 0.42, "grad_norm": 0.15234375, "learning_rate": 0.0003123013238439925, "loss": 1.2609, "step": 2875 }, { "epoch": 0.42, "grad_norm": 0.271484375, "learning_rate": 0.00031174580165027106, "loss": 1.2319, "step": 2880 }, { "epoch": 0.42, "grad_norm": 0.138671875, "learning_rate": 0.0003111899545791549, "loss": 0.8276, "step": 2885 }, { "epoch": 0.42, "grad_norm": 0.1494140625, "learning_rate": 0.00031063378555525007, "loss": 1.0235, "step": 2890 }, { "epoch": 0.42, "grad_norm": 0.1513671875, "learning_rate": 0.0003100772975048567, "loss": 0.8105, "step": 2895 }, { "epoch": 0.42, "grad_norm": 0.146484375, "learning_rate": 0.00030952049335595345, "loss": 1.1349, "step": 2900 }, { "epoch": 0.42, "grad_norm": 0.1650390625, "learning_rate": 0.0003089633760381821, "loss": 1.0796, "step": 2905 }, { "epoch": 0.42, "grad_norm": 0.1630859375, "learning_rate": 0.0003084059484828321, "loss": 0.9593, "step": 2910 }, { "epoch": 0.43, "grad_norm": 0.1640625, "learning_rate": 0.00030784821362282537, "loss": 1.0396, "step": 2915 }, { "epoch": 0.43, "grad_norm": 0.45703125, "learning_rate": 0.00030729017439270074, "loss": 0.9951, "step": 2920 }, { "epoch": 0.43, "grad_norm": 0.228515625, "learning_rate": 0.00030673183372859834, "loss": 1.0801, "step": 2925 }, { "epoch": 0.43, "grad_norm": 0.162109375, "learning_rate": 0.00030617319456824446, "loss": 0.9871, "step": 2930 }, { "epoch": 0.43, "grad_norm": 0.1611328125, "learning_rate": 0.0003056142598509358, "loss": 0.8944, "step": 2935 }, { "epoch": 0.43, "grad_norm": 0.1533203125, "learning_rate": 0.00030505503251752425, "loss": 0.9589, "step": 2940 }, { "epoch": 0.43, "grad_norm": 0.15234375, "learning_rate": 0.00030449551551040124, "loss": 1.0908, "step": 2945 }, { "epoch": 0.43, "grad_norm": 0.158203125, "learning_rate": 0.00030393571177348235, "loss": 0.9541, "step": 2950 }, { "epoch": 0.43, "grad_norm": 0.1572265625, "learning_rate": 0.0003033756242521918, "loss": 1.004, "step": 2955 }, { "epoch": 0.43, "grad_norm": 0.1767578125, "learning_rate": 0.00030281525589344696, "loss": 1.2345, "step": 2960 }, { "epoch": 0.43, "grad_norm": 0.1591796875, "learning_rate": 0.0003022546096456428, "loss": 1.3482, "step": 2965 }, { "epoch": 0.43, "grad_norm": 0.1904296875, "learning_rate": 0.0003016936884586364, "loss": 1.1067, "step": 2970 }, { "epoch": 0.43, "grad_norm": 0.1298828125, "learning_rate": 0.00030113249528373163, "loss": 1.0384, "step": 2975 }, { "epoch": 0.44, "grad_norm": 0.146484375, "learning_rate": 0.0003005710330736632, "loss": 0.8763, "step": 2980 }, { "epoch": 0.44, "grad_norm": 0.1572265625, "learning_rate": 0.00030000930478258134, "loss": 1.0176, "step": 2985 }, { "epoch": 0.44, "grad_norm": 0.185546875, "learning_rate": 0.00029944731336603646, "loss": 1.1598, "step": 2990 }, { "epoch": 0.44, "grad_norm": 0.14453125, "learning_rate": 0.0002988850617809633, "loss": 0.9417, "step": 2995 }, { "epoch": 0.44, "grad_norm": 0.1416015625, "learning_rate": 0.00029832255298566557, "loss": 0.8104, "step": 3000 }, { "epoch": 0.44, "grad_norm": 0.1767578125, "learning_rate": 0.0002977597899398001, "loss": 1.0644, "step": 3005 }, { "epoch": 0.44, "grad_norm": 0.16015625, "learning_rate": 0.0002971967756043617, "loss": 1.1664, "step": 3010 }, { "epoch": 0.44, "grad_norm": 0.1533203125, "learning_rate": 0.0002966335129416672, "loss": 1.0205, "step": 3015 }, { "epoch": 0.44, "grad_norm": 0.1650390625, "learning_rate": 0.0002960700049153402, "loss": 1.1998, "step": 3020 }, { "epoch": 0.44, "grad_norm": 0.140625, "learning_rate": 0.0002955062544902949, "loss": 1.0464, "step": 3025 }, { "epoch": 0.44, "grad_norm": 0.16015625, "learning_rate": 0.0002949422646327214, "loss": 1.3176, "step": 3030 }, { "epoch": 0.44, "grad_norm": 0.1484375, "learning_rate": 0.00029437803831006907, "loss": 0.9588, "step": 3035 }, { "epoch": 0.44, "grad_norm": 0.1552734375, "learning_rate": 0.0002938135784910319, "loss": 1.0234, "step": 3040 }, { "epoch": 0.44, "grad_norm": 0.1552734375, "learning_rate": 0.00029324888814553213, "loss": 1.1289, "step": 3045 }, { "epoch": 0.45, "grad_norm": 0.162109375, "learning_rate": 0.00029268397024470507, "loss": 1.0552, "step": 3050 }, { "epoch": 0.45, "grad_norm": 0.1640625, "learning_rate": 0.0002921188277608832, "loss": 1.2105, "step": 3055 }, { "epoch": 0.45, "grad_norm": 0.1796875, "learning_rate": 0.00029155346366758074, "loss": 1.0885, "step": 3060 }, { "epoch": 0.45, "grad_norm": 0.1328125, "learning_rate": 0.0002909878809394779, "loss": 0.8802, "step": 3065 }, { "epoch": 0.45, "grad_norm": 0.150390625, "learning_rate": 0.0002904220825524052, "loss": 1.1406, "step": 3070 }, { "epoch": 0.45, "grad_norm": 0.154296875, "learning_rate": 0.000289856071483328, "loss": 1.0026, "step": 3075 }, { "epoch": 0.45, "grad_norm": 0.1513671875, "learning_rate": 0.0002892898507103305, "loss": 0.9414, "step": 3080 }, { "epoch": 0.45, "grad_norm": 0.1376953125, "learning_rate": 0.0002887234232126004, "loss": 0.9942, "step": 3085 }, { "epoch": 0.45, "grad_norm": 0.15234375, "learning_rate": 0.00028815679197041283, "loss": 0.942, "step": 3090 }, { "epoch": 0.45, "grad_norm": 0.130859375, "learning_rate": 0.0002875899599651155, "loss": 1.0531, "step": 3095 }, { "epoch": 0.45, "grad_norm": 0.166015625, "learning_rate": 0.0002870229301791117, "loss": 1.0638, "step": 3100 }, { "epoch": 0.45, "grad_norm": 0.15625, "learning_rate": 0.000286455705595846, "loss": 0.9991, "step": 3105 }, { "epoch": 0.45, "grad_norm": 0.14453125, "learning_rate": 0.0002858882891997874, "loss": 1.2492, "step": 3110 }, { "epoch": 0.45, "grad_norm": 0.142578125, "learning_rate": 0.00028532068397641466, "loss": 0.8871, "step": 3115 }, { "epoch": 0.46, "grad_norm": 0.1708984375, "learning_rate": 0.0002847528929121994, "loss": 1.0626, "step": 3120 }, { "epoch": 0.46, "grad_norm": 0.48828125, "learning_rate": 0.00028418491899459173, "loss": 1.2234, "step": 3125 }, { "epoch": 0.46, "grad_norm": 0.16796875, "learning_rate": 0.00028361676521200324, "loss": 1.254, "step": 3130 }, { "epoch": 0.46, "grad_norm": 0.1513671875, "learning_rate": 0.00028304843455379236, "loss": 1.267, "step": 3135 }, { "epoch": 0.46, "grad_norm": 0.19921875, "learning_rate": 0.00028247993001024793, "loss": 1.1695, "step": 3140 }, { "epoch": 0.46, "grad_norm": 0.16796875, "learning_rate": 0.0002819112545725736, "loss": 1.1273, "step": 3145 }, { "epoch": 0.46, "grad_norm": 0.1337890625, "learning_rate": 0.0002813424112328724, "loss": 0.9973, "step": 3150 }, { "epoch": 0.46, "grad_norm": 0.1533203125, "learning_rate": 0.0002807734029841307, "loss": 1.049, "step": 3155 }, { "epoch": 0.46, "grad_norm": 0.1494140625, "learning_rate": 0.00028020423282020256, "loss": 0.904, "step": 3160 }, { "epoch": 0.46, "grad_norm": 0.1796875, "learning_rate": 0.00027963490373579384, "loss": 1.1445, "step": 3165 }, { "epoch": 0.46, "grad_norm": 0.16796875, "learning_rate": 0.0002790654187264468, "loss": 1.2037, "step": 3170 }, { "epoch": 0.46, "grad_norm": 0.1416015625, "learning_rate": 0.0002784957807885238, "loss": 0.8471, "step": 3175 }, { "epoch": 0.46, "grad_norm": 0.14453125, "learning_rate": 0.00027792599291919217, "loss": 1.1505, "step": 3180 }, { "epoch": 0.47, "grad_norm": 0.15234375, "learning_rate": 0.0002773560581164078, "loss": 1.1143, "step": 3185 }, { "epoch": 0.47, "grad_norm": 0.1611328125, "learning_rate": 0.0002767859793789, "loss": 0.9737, "step": 3190 }, { "epoch": 0.47, "grad_norm": 0.1728515625, "learning_rate": 0.000276215759706155, "loss": 1.0739, "step": 3195 }, { "epoch": 0.47, "grad_norm": 0.1689453125, "learning_rate": 0.0002756454020984009, "loss": 1.0384, "step": 3200 }, { "epoch": 0.47, "grad_norm": 0.1455078125, "learning_rate": 0.0002750749095565914, "loss": 1.2285, "step": 3205 }, { "epoch": 0.47, "grad_norm": 0.17578125, "learning_rate": 0.0002745042850823902, "loss": 1.2136, "step": 3210 }, { "epoch": 0.47, "grad_norm": 0.158203125, "learning_rate": 0.0002739335316781551, "loss": 0.9552, "step": 3215 }, { "epoch": 0.47, "grad_norm": 0.173828125, "learning_rate": 0.00027336265234692244, "loss": 1.1343, "step": 3220 }, { "epoch": 0.47, "grad_norm": 0.171875, "learning_rate": 0.0002727916500923909, "loss": 0.8818, "step": 3225 }, { "epoch": 0.47, "grad_norm": 0.189453125, "learning_rate": 0.00027222052791890607, "loss": 0.9985, "step": 3230 }, { "epoch": 0.47, "grad_norm": 0.1787109375, "learning_rate": 0.00027164928883144444, "loss": 0.8941, "step": 3235 }, { "epoch": 0.47, "grad_norm": 0.13671875, "learning_rate": 0.00027107793583559776, "loss": 1.0627, "step": 3240 }, { "epoch": 0.47, "grad_norm": 0.1376953125, "learning_rate": 0.00027050647193755696, "loss": 1.1494, "step": 3245 }, { "epoch": 0.47, "grad_norm": 0.1318359375, "learning_rate": 0.0002699349001440965, "loss": 0.956, "step": 3250 }, { "epoch": 0.48, "grad_norm": 0.1484375, "learning_rate": 0.0002693632234625586, "loss": 0.9758, "step": 3255 }, { "epoch": 0.48, "grad_norm": 0.1455078125, "learning_rate": 0.0002687914449008375, "loss": 0.9468, "step": 3260 }, { "epoch": 0.48, "grad_norm": 0.17578125, "learning_rate": 0.0002682195674673631, "loss": 1.0775, "step": 3265 }, { "epoch": 0.48, "grad_norm": 0.146484375, "learning_rate": 0.00026764759417108585, "loss": 1.0727, "step": 3270 }, { "epoch": 0.48, "grad_norm": 0.1884765625, "learning_rate": 0.0002670755280214605, "loss": 1.2369, "step": 3275 }, { "epoch": 0.48, "grad_norm": 0.1650390625, "learning_rate": 0.0002665033720284302, "loss": 1.0324, "step": 3280 }, { "epoch": 0.48, "grad_norm": 0.1220703125, "learning_rate": 0.00026593112920241104, "loss": 1.0534, "step": 3285 }, { "epoch": 0.48, "grad_norm": 0.169921875, "learning_rate": 0.0002653588025542759, "loss": 1.025, "step": 3290 }, { "epoch": 0.48, "grad_norm": 0.1357421875, "learning_rate": 0.00026478639509533853, "loss": 1.1727, "step": 3295 }, { "epoch": 0.48, "grad_norm": 0.1904296875, "learning_rate": 0.0002642139098373382, "loss": 1.1025, "step": 3300 }, { "epoch": 0.48, "grad_norm": 0.1376953125, "learning_rate": 0.0002636413497924231, "loss": 0.936, "step": 3305 }, { "epoch": 0.48, "grad_norm": 0.1572265625, "learning_rate": 0.0002630687179731354, "loss": 1.075, "step": 3310 }, { "epoch": 0.48, "grad_norm": 0.1240234375, "learning_rate": 0.00026249601739239447, "loss": 0.7971, "step": 3315 }, { "epoch": 0.48, "grad_norm": 0.15625, "learning_rate": 0.00026192325106348186, "loss": 1.0379, "step": 3320 }, { "epoch": 0.49, "grad_norm": 0.15234375, "learning_rate": 0.00026135042200002464, "loss": 0.8698, "step": 3325 }, { "epoch": 0.49, "grad_norm": 0.1474609375, "learning_rate": 0.00026077753321598034, "loss": 0.9574, "step": 3330 }, { "epoch": 0.49, "grad_norm": 0.1494140625, "learning_rate": 0.0002602045877256204, "loss": 0.9652, "step": 3335 }, { "epoch": 0.49, "grad_norm": 0.1376953125, "learning_rate": 0.00025963158854351494, "loss": 1.0468, "step": 3340 }, { "epoch": 0.49, "grad_norm": 0.1357421875, "learning_rate": 0.0002590585386845162, "loss": 1.0618, "step": 3345 }, { "epoch": 0.49, "grad_norm": 0.1337890625, "learning_rate": 0.00025848544116374353, "loss": 1.0876, "step": 3350 }, { "epoch": 0.49, "grad_norm": 0.140625, "learning_rate": 0.0002579122989965666, "loss": 1.1683, "step": 3355 }, { "epoch": 0.49, "grad_norm": 0.1708984375, "learning_rate": 0.00025733911519859023, "loss": 1.0189, "step": 3360 }, { "epoch": 0.49, "grad_norm": 0.140625, "learning_rate": 0.0002567658927856382, "loss": 1.0906, "step": 3365 }, { "epoch": 0.49, "grad_norm": 0.1767578125, "learning_rate": 0.0002561926347737376, "loss": 1.2646, "step": 3370 }, { "epoch": 0.49, "grad_norm": 0.1591796875, "learning_rate": 0.00025561934417910265, "loss": 1.1042, "step": 3375 }, { "epoch": 0.49, "grad_norm": 0.1435546875, "learning_rate": 0.00025504602401811913, "loss": 0.8099, "step": 3380 }, { "epoch": 0.49, "grad_norm": 0.1552734375, "learning_rate": 0.0002544726773073282, "loss": 0.9141, "step": 3385 }, { "epoch": 0.5, "grad_norm": 0.17578125, "learning_rate": 0.00025389930706341097, "loss": 1.2311, "step": 3390 }, { "epoch": 0.5, "grad_norm": 0.1923828125, "learning_rate": 0.00025332591630317227, "loss": 0.9525, "step": 3395 }, { "epoch": 0.5, "grad_norm": 0.1650390625, "learning_rate": 0.00025275250804352464, "loss": 1.1637, "step": 3400 }, { "epoch": 0.5, "grad_norm": 0.1669921875, "learning_rate": 0.0002521790853014732, "loss": 1.1291, "step": 3405 }, { "epoch": 0.5, "grad_norm": 0.125, "learning_rate": 0.00025160565109409873, "loss": 0.9412, "step": 3410 }, { "epoch": 0.5, "grad_norm": 0.1435546875, "learning_rate": 0.00025103220843854273, "loss": 0.9922, "step": 3415 }, { "epoch": 0.5, "grad_norm": 0.30078125, "learning_rate": 0.0002504587603519909, "loss": 0.9436, "step": 3420 }, { "epoch": 0.5, "grad_norm": 0.1640625, "learning_rate": 0.0002498853098516577, "loss": 1.463, "step": 3425 }, { "epoch": 0.5, "grad_norm": 0.1591796875, "learning_rate": 0.0002493118599547702, "loss": 0.9141, "step": 3430 }, { "epoch": 0.5, "grad_norm": 0.1513671875, "learning_rate": 0.00024873841367855216, "loss": 0.9547, "step": 3435 }, { "epoch": 0.5, "grad_norm": 0.1787109375, "learning_rate": 0.0002481649740402086, "loss": 1.0718, "step": 3440 }, { "epoch": 0.5, "grad_norm": 0.134765625, "learning_rate": 0.00024759154405690946, "loss": 0.9502, "step": 3445 }, { "epoch": 0.5, "grad_norm": 0.1259765625, "learning_rate": 0.00024701812674577376, "loss": 1.0506, "step": 3450 }, { "epoch": 0.5, "grad_norm": 0.1533203125, "learning_rate": 0.000246444725123854, "loss": 0.9621, "step": 3455 }, { "epoch": 0.51, "grad_norm": 0.162109375, "learning_rate": 0.0002458713422081199, "loss": 1.2094, "step": 3460 }, { "epoch": 0.51, "grad_norm": 0.138671875, "learning_rate": 0.00024529798101544326, "loss": 1.1523, "step": 3465 }, { "epoch": 0.51, "grad_norm": 0.15234375, "learning_rate": 0.000244724644562581, "loss": 1.1936, "step": 3470 }, { "epoch": 0.51, "grad_norm": 0.1455078125, "learning_rate": 0.00024415133586616036, "loss": 1.1543, "step": 3475 }, { "epoch": 0.51, "grad_norm": 0.13671875, "learning_rate": 0.00024357805794266205, "loss": 1.1209, "step": 3480 }, { "epoch": 0.51, "grad_norm": 0.1396484375, "learning_rate": 0.00024300481380840527, "loss": 1.0296, "step": 3485 }, { "epoch": 0.51, "grad_norm": 0.1611328125, "learning_rate": 0.00024243160647953132, "loss": 0.9974, "step": 3490 }, { "epoch": 0.51, "grad_norm": 0.12255859375, "learning_rate": 0.00024185843897198784, "loss": 1.1089, "step": 3495 }, { "epoch": 0.51, "grad_norm": 0.140625, "learning_rate": 0.00024128531430151276, "loss": 0.8318, "step": 3500 }, { "epoch": 0.51, "grad_norm": 0.1435546875, "learning_rate": 0.00024071223548361895, "loss": 1.0249, "step": 3505 }, { "epoch": 0.51, "grad_norm": 0.1474609375, "learning_rate": 0.00024013920553357784, "loss": 0.9857, "step": 3510 }, { "epoch": 0.51, "grad_norm": 0.1416015625, "learning_rate": 0.0002395662274664038, "loss": 1.117, "step": 3515 }, { "epoch": 0.51, "grad_norm": 0.15625, "learning_rate": 0.0002389933042968381, "loss": 1.0439, "step": 3520 }, { "epoch": 0.51, "grad_norm": 0.1845703125, "learning_rate": 0.0002384204390393332, "loss": 1.0804, "step": 3525 }, { "epoch": 0.52, "grad_norm": 0.1455078125, "learning_rate": 0.00023784763470803712, "loss": 1.1855, "step": 3530 }, { "epoch": 0.52, "grad_norm": 0.140625, "learning_rate": 0.00023727489431677688, "loss": 1.0133, "step": 3535 }, { "epoch": 0.52, "grad_norm": 0.15625, "learning_rate": 0.00023670222087904342, "loss": 1.3227, "step": 3540 }, { "epoch": 0.52, "grad_norm": 0.177734375, "learning_rate": 0.00023612961740797516, "loss": 1.3998, "step": 3545 }, { "epoch": 0.52, "grad_norm": 0.1630859375, "learning_rate": 0.0002355570869163427, "loss": 1.0318, "step": 3550 }, { "epoch": 0.52, "grad_norm": 0.171875, "learning_rate": 0.0002349846324165323, "loss": 1.0445, "step": 3555 }, { "epoch": 0.52, "grad_norm": 0.15234375, "learning_rate": 0.0002344122569205306, "loss": 1.1273, "step": 3560 }, { "epoch": 0.52, "grad_norm": 0.1630859375, "learning_rate": 0.0002338399634399084, "loss": 1.0658, "step": 3565 }, { "epoch": 0.52, "grad_norm": 0.1474609375, "learning_rate": 0.00023326775498580535, "loss": 1.0565, "step": 3570 }, { "epoch": 0.52, "grad_norm": 0.16796875, "learning_rate": 0.00023269563456891322, "loss": 0.9532, "step": 3575 }, { "epoch": 0.52, "grad_norm": 0.1650390625, "learning_rate": 0.00023212360519946095, "loss": 1.1475, "step": 3580 }, { "epoch": 0.52, "grad_norm": 0.1630859375, "learning_rate": 0.00023155166988719805, "loss": 1.128, "step": 3585 }, { "epoch": 0.52, "grad_norm": 0.146484375, "learning_rate": 0.00023097983164137976, "loss": 0.8851, "step": 3590 }, { "epoch": 0.52, "grad_norm": 0.1474609375, "learning_rate": 0.00023040809347074997, "loss": 1.0787, "step": 3595 }, { "epoch": 0.53, "grad_norm": 0.150390625, "learning_rate": 0.00022983645838352642, "loss": 1.0876, "step": 3600 }, { "epoch": 0.53, "grad_norm": 0.1416015625, "learning_rate": 0.0002292649293873842, "loss": 0.948, "step": 3605 }, { "epoch": 0.53, "grad_norm": 0.1416015625, "learning_rate": 0.00022869350948944038, "loss": 1.1694, "step": 3610 }, { "epoch": 0.53, "grad_norm": 0.140625, "learning_rate": 0.00022812220169623798, "loss": 1.1419, "step": 3615 }, { "epoch": 0.53, "grad_norm": 0.1533203125, "learning_rate": 0.00022755100901373026, "loss": 1.0157, "step": 3620 }, { "epoch": 0.53, "grad_norm": 0.1220703125, "learning_rate": 0.00022697993444726456, "loss": 1.11, "step": 3625 }, { "epoch": 0.53, "grad_norm": 0.1328125, "learning_rate": 0.00022640898100156686, "loss": 1.25, "step": 3630 }, { "epoch": 0.53, "grad_norm": 0.1435546875, "learning_rate": 0.00022583815168072613, "loss": 0.9598, "step": 3635 }, { "epoch": 0.53, "grad_norm": 0.171875, "learning_rate": 0.00022526744948817793, "loss": 1.3431, "step": 3640 }, { "epoch": 0.53, "grad_norm": 0.162109375, "learning_rate": 0.00022469687742668898, "loss": 0.9564, "step": 3645 }, { "epoch": 0.53, "grad_norm": 0.1455078125, "learning_rate": 0.00022412643849834132, "loss": 1.142, "step": 3650 }, { "epoch": 0.53, "grad_norm": 0.1328125, "learning_rate": 0.00022355613570451677, "loss": 1.0124, "step": 3655 }, { "epoch": 0.53, "grad_norm": 0.16796875, "learning_rate": 0.00022298597204588043, "loss": 1.013, "step": 3660 }, { "epoch": 0.54, "grad_norm": 0.140625, "learning_rate": 0.00022241595052236566, "loss": 1.0421, "step": 3665 }, { "epoch": 0.54, "grad_norm": 0.2578125, "learning_rate": 0.00022184607413315788, "loss": 0.9687, "step": 3670 }, { "epoch": 0.54, "grad_norm": 0.1748046875, "learning_rate": 0.00022127634587667874, "loss": 0.9082, "step": 3675 }, { "epoch": 0.54, "grad_norm": 0.15234375, "learning_rate": 0.0002207067687505708, "loss": 1.0893, "step": 3680 }, { "epoch": 0.54, "grad_norm": 0.1376953125, "learning_rate": 0.00022013734575168116, "loss": 1.2716, "step": 3685 }, { "epoch": 0.54, "grad_norm": 0.1572265625, "learning_rate": 0.00021956807987604616, "loss": 1.1467, "step": 3690 }, { "epoch": 0.54, "grad_norm": 0.154296875, "learning_rate": 0.00021899897411887517, "loss": 1.1248, "step": 3695 }, { "epoch": 0.54, "grad_norm": 0.171875, "learning_rate": 0.00021843003147453544, "loss": 1.0846, "step": 3700 }, { "epoch": 0.54, "grad_norm": 0.1650390625, "learning_rate": 0.00021786125493653582, "loss": 0.9875, "step": 3705 }, { "epoch": 0.54, "grad_norm": 0.162109375, "learning_rate": 0.0002172926474975111, "loss": 1.1556, "step": 3710 }, { "epoch": 0.54, "grad_norm": 0.1572265625, "learning_rate": 0.0002167242121492064, "loss": 0.9676, "step": 3715 }, { "epoch": 0.54, "grad_norm": 0.1787109375, "learning_rate": 0.0002161559518824616, "loss": 1.0359, "step": 3720 }, { "epoch": 0.54, "grad_norm": 0.1455078125, "learning_rate": 0.00021558786968719514, "loss": 1.0299, "step": 3725 }, { "epoch": 0.54, "grad_norm": 0.1533203125, "learning_rate": 0.00021501996855238852, "loss": 1.1488, "step": 3730 }, { "epoch": 0.55, "grad_norm": 0.166015625, "learning_rate": 0.00021445225146607064, "loss": 1.1796, "step": 3735 }, { "epoch": 0.55, "grad_norm": 0.1552734375, "learning_rate": 0.00021388472141530218, "loss": 1.0853, "step": 3740 }, { "epoch": 0.55, "grad_norm": 0.166015625, "learning_rate": 0.00021331738138615958, "loss": 0.8584, "step": 3745 }, { "epoch": 0.55, "grad_norm": 0.158203125, "learning_rate": 0.0002127502343637194, "loss": 0.9085, "step": 3750 }, { "epoch": 0.55, "grad_norm": 0.17578125, "learning_rate": 0.00021218328333204284, "loss": 1.1501, "step": 3755 }, { "epoch": 0.55, "grad_norm": 0.1982421875, "learning_rate": 0.00021161653127415968, "loss": 1.1757, "step": 3760 }, { "epoch": 0.55, "grad_norm": 0.1611328125, "learning_rate": 0.00021104998117205316, "loss": 1.1097, "step": 3765 }, { "epoch": 0.55, "grad_norm": 0.1572265625, "learning_rate": 0.0002104836360066436, "loss": 1.4605, "step": 3770 }, { "epoch": 0.55, "grad_norm": 0.1435546875, "learning_rate": 0.00020991749875777315, "loss": 0.9118, "step": 3775 }, { "epoch": 0.55, "grad_norm": 0.134765625, "learning_rate": 0.0002093515724041899, "loss": 1.0906, "step": 3780 }, { "epoch": 0.55, "grad_norm": 0.146484375, "learning_rate": 0.00020878585992353252, "loss": 0.9324, "step": 3785 }, { "epoch": 0.55, "grad_norm": 0.138671875, "learning_rate": 0.00020822036429231426, "loss": 0.9894, "step": 3790 }, { "epoch": 0.55, "grad_norm": 0.142578125, "learning_rate": 0.00020765508848590738, "loss": 0.9968, "step": 3795 }, { "epoch": 0.55, "grad_norm": 0.1669921875, "learning_rate": 0.0002070900354785275, "loss": 0.9817, "step": 3800 }, { "epoch": 0.56, "grad_norm": 0.16015625, "learning_rate": 0.00020652520824321813, "loss": 1.1311, "step": 3805 }, { "epoch": 0.56, "grad_norm": 0.146484375, "learning_rate": 0.00020596060975183473, "loss": 0.9224, "step": 3810 }, { "epoch": 0.56, "grad_norm": 0.15234375, "learning_rate": 0.00020539624297502933, "loss": 1.1655, "step": 3815 }, { "epoch": 0.56, "grad_norm": 0.138671875, "learning_rate": 0.00020483211088223458, "loss": 1.0407, "step": 3820 }, { "epoch": 0.56, "grad_norm": 0.154296875, "learning_rate": 0.00020426821644164858, "loss": 1.0116, "step": 3825 }, { "epoch": 0.56, "grad_norm": 0.1357421875, "learning_rate": 0.00020370456262021897, "loss": 1.0392, "step": 3830 }, { "epoch": 0.56, "grad_norm": 0.1357421875, "learning_rate": 0.00020314115238362712, "loss": 0.9167, "step": 3835 }, { "epoch": 0.56, "grad_norm": 0.126953125, "learning_rate": 0.00020257798869627308, "loss": 1.0841, "step": 3840 }, { "epoch": 0.56, "grad_norm": 0.1796875, "learning_rate": 0.0002020150745212595, "loss": 0.8587, "step": 3845 }, { "epoch": 0.56, "grad_norm": 0.181640625, "learning_rate": 0.00020145241282037638, "loss": 1.0392, "step": 3850 }, { "epoch": 0.56, "grad_norm": 0.14453125, "learning_rate": 0.00020089000655408503, "loss": 0.97, "step": 3855 }, { "epoch": 0.56, "grad_norm": 0.1513671875, "learning_rate": 0.00020032785868150308, "loss": 0.989, "step": 3860 }, { "epoch": 0.56, "grad_norm": 0.15234375, "learning_rate": 0.00019976597216038835, "loss": 0.8304, "step": 3865 }, { "epoch": 0.57, "grad_norm": 0.150390625, "learning_rate": 0.00019920434994712393, "loss": 1.0367, "step": 3870 }, { "epoch": 0.57, "grad_norm": 0.1748046875, "learning_rate": 0.0001986429949967018, "loss": 1.2286, "step": 3875 }, { "epoch": 0.57, "grad_norm": 0.1591796875, "learning_rate": 0.00019808191026270805, "loss": 1.0935, "step": 3880 }, { "epoch": 0.57, "grad_norm": 0.134765625, "learning_rate": 0.0001975210986973067, "loss": 1.0016, "step": 3885 }, { "epoch": 0.57, "grad_norm": 0.1953125, "learning_rate": 0.00019696056325122502, "loss": 1.0657, "step": 3890 }, { "epoch": 0.57, "grad_norm": 0.1640625, "learning_rate": 0.0001964003068737369, "loss": 0.9734, "step": 3895 }, { "epoch": 0.57, "grad_norm": 0.1728515625, "learning_rate": 0.00019584033251264824, "loss": 1.0914, "step": 3900 }, { "epoch": 0.57, "grad_norm": 0.166015625, "learning_rate": 0.00019528064311428092, "loss": 1.0734, "step": 3905 }, { "epoch": 0.57, "grad_norm": 0.1376953125, "learning_rate": 0.00019472124162345756, "loss": 0.7545, "step": 3910 }, { "epoch": 0.57, "grad_norm": 0.158203125, "learning_rate": 0.00019416213098348605, "loss": 1.0816, "step": 3915 }, { "epoch": 0.57, "grad_norm": 0.1474609375, "learning_rate": 0.00019360331413614384, "loss": 1.0022, "step": 3920 }, { "epoch": 0.57, "grad_norm": 0.177734375, "learning_rate": 0.00019304479402166256, "loss": 1.2418, "step": 3925 }, { "epoch": 0.57, "grad_norm": 0.1474609375, "learning_rate": 0.00019248657357871256, "loss": 1.0155, "step": 3930 }, { "epoch": 0.57, "grad_norm": 0.16015625, "learning_rate": 0.0001919286557443876, "loss": 0.9751, "step": 3935 }, { "epoch": 0.58, "grad_norm": 0.158203125, "learning_rate": 0.0001913710434541892, "loss": 0.986, "step": 3940 }, { "epoch": 0.58, "grad_norm": 0.146484375, "learning_rate": 0.00019081373964201107, "loss": 0.9691, "step": 3945 }, { "epoch": 0.58, "grad_norm": 0.1533203125, "learning_rate": 0.00019025674724012403, "loss": 1.2395, "step": 3950 }, { "epoch": 0.58, "grad_norm": 0.150390625, "learning_rate": 0.00018970006917916059, "loss": 1.0455, "step": 3955 }, { "epoch": 0.58, "grad_norm": 0.158203125, "learning_rate": 0.00018914370838809885, "loss": 1.0043, "step": 3960 }, { "epoch": 0.58, "grad_norm": 0.181640625, "learning_rate": 0.00018858766779424803, "loss": 1.1731, "step": 3965 }, { "epoch": 0.58, "grad_norm": 0.2177734375, "learning_rate": 0.00018803195032323231, "loss": 0.9617, "step": 3970 }, { "epoch": 0.58, "grad_norm": 0.1640625, "learning_rate": 0.00018747655889897612, "loss": 1.0404, "step": 3975 }, { "epoch": 0.58, "grad_norm": 0.2060546875, "learning_rate": 0.000186921496443688, "loss": 0.9949, "step": 3980 }, { "epoch": 0.58, "grad_norm": 0.1337890625, "learning_rate": 0.00018636676587784585, "loss": 0.8834, "step": 3985 }, { "epoch": 0.58, "grad_norm": 0.158203125, "learning_rate": 0.0001858123701201812, "loss": 0.9534, "step": 3990 }, { "epoch": 0.58, "grad_norm": 0.1845703125, "learning_rate": 0.00018525831208766397, "loss": 0.9737, "step": 3995 }, { "epoch": 0.58, "grad_norm": 0.1474609375, "learning_rate": 0.00018470459469548727, "loss": 0.9685, "step": 4000 }, { "epoch": 0.58, "grad_norm": 0.1611328125, "learning_rate": 0.00018415122085705188, "loss": 1.2834, "step": 4005 }, { "epoch": 0.59, "grad_norm": 0.138671875, "learning_rate": 0.0001835981934839508, "loss": 0.991, "step": 4010 }, { "epoch": 0.59, "grad_norm": 0.1630859375, "learning_rate": 0.00018304551548595422, "loss": 1.2289, "step": 4015 }, { "epoch": 0.59, "grad_norm": 0.140625, "learning_rate": 0.0001824931897709941, "loss": 1.0199, "step": 4020 }, { "epoch": 0.59, "grad_norm": 1.1953125, "learning_rate": 0.00018194121924514892, "loss": 1.0528, "step": 4025 }, { "epoch": 0.59, "grad_norm": 0.15625, "learning_rate": 0.00018138960681262807, "loss": 0.9923, "step": 4030 }, { "epoch": 0.59, "grad_norm": 0.16796875, "learning_rate": 0.00018083835537575693, "loss": 1.0761, "step": 4035 }, { "epoch": 0.59, "grad_norm": 0.1513671875, "learning_rate": 0.00018028746783496175, "loss": 1.1735, "step": 4040 }, { "epoch": 0.59, "grad_norm": 0.1484375, "learning_rate": 0.00017973694708875383, "loss": 0.9852, "step": 4045 }, { "epoch": 0.59, "grad_norm": 0.1259765625, "learning_rate": 0.00017918679603371455, "loss": 0.9332, "step": 4050 }, { "epoch": 0.59, "grad_norm": 0.166015625, "learning_rate": 0.00017863701756448035, "loss": 1.1593, "step": 4055 }, { "epoch": 0.59, "grad_norm": 0.1318359375, "learning_rate": 0.00017808761457372706, "loss": 1.1001, "step": 4060 }, { "epoch": 0.59, "grad_norm": 0.134765625, "learning_rate": 0.00017753858995215539, "loss": 1.0817, "step": 4065 }, { "epoch": 0.59, "grad_norm": 0.169921875, "learning_rate": 0.00017698994658847455, "loss": 1.2082, "step": 4070 }, { "epoch": 0.6, "grad_norm": 0.1357421875, "learning_rate": 0.00017644168736938843, "loss": 1.0919, "step": 4075 }, { "epoch": 0.6, "grad_norm": 0.146484375, "learning_rate": 0.00017589381517957902, "loss": 0.988, "step": 4080 }, { "epoch": 0.6, "grad_norm": 0.142578125, "learning_rate": 0.0001753463329016928, "loss": 1.0371, "step": 4085 }, { "epoch": 0.6, "grad_norm": 0.150390625, "learning_rate": 0.00017479924341632397, "loss": 1.0561, "step": 4090 }, { "epoch": 0.6, "grad_norm": 0.1630859375, "learning_rate": 0.00017425254960200048, "loss": 0.9408, "step": 4095 }, { "epoch": 0.6, "grad_norm": 0.1650390625, "learning_rate": 0.00017370625433516814, "loss": 1.1421, "step": 4100 }, { "epoch": 0.6, "grad_norm": 0.140625, "learning_rate": 0.0001731603604901761, "loss": 1.1159, "step": 4105 }, { "epoch": 0.6, "grad_norm": 0.15234375, "learning_rate": 0.00017261487093926117, "loss": 1.1286, "step": 4110 }, { "epoch": 0.6, "grad_norm": 0.1640625, "learning_rate": 0.00017206978855253315, "loss": 0.9616, "step": 4115 }, { "epoch": 0.6, "grad_norm": 0.1376953125, "learning_rate": 0.00017152511619795925, "loss": 0.9986, "step": 4120 }, { "epoch": 0.6, "grad_norm": 0.1611328125, "learning_rate": 0.00017098085674134962, "loss": 1.0591, "step": 4125 }, { "epoch": 0.6, "grad_norm": 0.1689453125, "learning_rate": 0.0001704370130463418, "loss": 1.1427, "step": 4130 }, { "epoch": 0.6, "grad_norm": 0.169921875, "learning_rate": 0.00016989358797438563, "loss": 0.8409, "step": 4135 }, { "epoch": 0.6, "grad_norm": 0.15234375, "learning_rate": 0.00016935058438472862, "loss": 1.1067, "step": 4140 }, { "epoch": 0.61, "grad_norm": 0.1962890625, "learning_rate": 0.00016880800513440037, "loss": 1.2404, "step": 4145 }, { "epoch": 0.61, "grad_norm": 0.1416015625, "learning_rate": 0.00016826585307819813, "loss": 1.1614, "step": 4150 }, { "epoch": 0.61, "grad_norm": 0.146484375, "learning_rate": 0.00016772413106867112, "loss": 1.0634, "step": 4155 }, { "epoch": 0.61, "grad_norm": 0.1328125, "learning_rate": 0.00016718284195610606, "loss": 1.0528, "step": 4160 }, { "epoch": 0.61, "grad_norm": 0.1611328125, "learning_rate": 0.0001666419885885118, "loss": 0.9877, "step": 4165 }, { "epoch": 0.61, "grad_norm": 0.154296875, "learning_rate": 0.0001661015738116049, "loss": 1.1577, "step": 4170 }, { "epoch": 0.61, "grad_norm": 0.197265625, "learning_rate": 0.00016556160046879377, "loss": 1.1811, "step": 4175 }, { "epoch": 0.61, "grad_norm": 0.138671875, "learning_rate": 0.00016502207140116446, "loss": 0.9547, "step": 4180 }, { "epoch": 0.61, "grad_norm": 0.1494140625, "learning_rate": 0.00016448298944746526, "loss": 1.1406, "step": 4185 }, { "epoch": 0.61, "grad_norm": 0.1572265625, "learning_rate": 0.00016394435744409238, "loss": 1.0288, "step": 4190 }, { "epoch": 0.61, "grad_norm": 0.1591796875, "learning_rate": 0.00016340617822507416, "loss": 1.0023, "step": 4195 }, { "epoch": 0.61, "grad_norm": 0.158203125, "learning_rate": 0.00016286845462205684, "loss": 1.1052, "step": 4200 }, { "epoch": 0.61, "grad_norm": 0.140625, "learning_rate": 0.00016233118946428927, "loss": 0.9259, "step": 4205 }, { "epoch": 0.61, "grad_norm": 0.171875, "learning_rate": 0.0001617943855786082, "loss": 0.9792, "step": 4210 }, { "epoch": 0.62, "grad_norm": 0.15625, "learning_rate": 0.00016125804578942364, "loss": 1.2032, "step": 4215 }, { "epoch": 0.62, "grad_norm": 0.1328125, "learning_rate": 0.00016072217291870344, "loss": 1.0617, "step": 4220 }, { "epoch": 0.62, "grad_norm": 0.2216796875, "learning_rate": 0.0001601867697859588, "loss": 1.2913, "step": 4225 }, { "epoch": 0.62, "grad_norm": 0.1220703125, "learning_rate": 0.0001596518392082294, "loss": 0.8958, "step": 4230 }, { "epoch": 0.62, "grad_norm": 0.1416015625, "learning_rate": 0.00015911738400006874, "loss": 0.9306, "step": 4235 }, { "epoch": 0.62, "grad_norm": 0.16796875, "learning_rate": 0.00015858340697352892, "loss": 0.9702, "step": 4240 }, { "epoch": 0.62, "grad_norm": 0.1611328125, "learning_rate": 0.00015804991093814607, "loss": 0.9372, "step": 4245 }, { "epoch": 0.62, "grad_norm": 0.185546875, "learning_rate": 0.00015751689870092565, "loss": 1.1663, "step": 4250 }, { "epoch": 0.62, "grad_norm": 0.134765625, "learning_rate": 0.00015698437306632768, "loss": 0.7672, "step": 4255 }, { "epoch": 0.62, "grad_norm": 0.201171875, "learning_rate": 0.00015645233683625188, "loss": 0.9745, "step": 4260 }, { "epoch": 0.62, "grad_norm": 0.17578125, "learning_rate": 0.0001559207928100227, "loss": 1.058, "step": 4265 }, { "epoch": 0.62, "grad_norm": 0.1376953125, "learning_rate": 0.00015538974378437513, "loss": 0.9706, "step": 4270 }, { "epoch": 0.62, "grad_norm": 0.1630859375, "learning_rate": 0.00015485919255343972, "loss": 1.1274, "step": 4275 }, { "epoch": 0.62, "grad_norm": 0.134765625, "learning_rate": 0.00015432914190872756, "loss": 0.9899, "step": 4280 }, { "epoch": 0.63, "grad_norm": 0.1328125, "learning_rate": 0.00015379959463911622, "loss": 1.0313, "step": 4285 }, { "epoch": 0.63, "grad_norm": 0.162109375, "learning_rate": 0.0001532705535308346, "loss": 0.9323, "step": 4290 }, { "epoch": 0.63, "grad_norm": 0.146484375, "learning_rate": 0.00015274202136744824, "loss": 0.8996, "step": 4295 }, { "epoch": 0.63, "grad_norm": 0.1328125, "learning_rate": 0.00015221400092984527, "loss": 0.8551, "step": 4300 }, { "epoch": 0.63, "grad_norm": 0.1455078125, "learning_rate": 0.00015168649499622104, "loss": 0.9615, "step": 4305 }, { "epoch": 0.63, "grad_norm": 0.1513671875, "learning_rate": 0.0001511595063420638, "loss": 1.0077, "step": 4310 }, { "epoch": 0.63, "grad_norm": 0.169921875, "learning_rate": 0.00015063303774014036, "loss": 0.9879, "step": 4315 }, { "epoch": 0.63, "grad_norm": 0.1533203125, "learning_rate": 0.00015010709196048105, "loss": 1.1628, "step": 4320 }, { "epoch": 0.63, "grad_norm": 0.1748046875, "learning_rate": 0.00014958167177036552, "loss": 1.1096, "step": 4325 }, { "epoch": 0.63, "grad_norm": 0.16015625, "learning_rate": 0.0001490567799343078, "loss": 0.9157, "step": 4330 }, { "epoch": 0.63, "grad_norm": 0.142578125, "learning_rate": 0.00014853241921404208, "loss": 1.311, "step": 4335 }, { "epoch": 0.63, "grad_norm": 0.390625, "learning_rate": 0.00014800859236850823, "loss": 1.1399, "step": 4340 }, { "epoch": 0.63, "grad_norm": 0.138671875, "learning_rate": 0.00014748530215383693, "loss": 1.2342, "step": 4345 }, { "epoch": 0.64, "grad_norm": 0.150390625, "learning_rate": 0.00014696255132333529, "loss": 1.196, "step": 4350 }, { "epoch": 0.64, "grad_norm": 0.134765625, "learning_rate": 0.00014644034262747255, "loss": 0.9654, "step": 4355 }, { "epoch": 0.64, "grad_norm": 0.1591796875, "learning_rate": 0.0001459186788138656, "loss": 1.0736, "step": 4360 }, { "epoch": 0.64, "grad_norm": 0.1484375, "learning_rate": 0.00014539756262726435, "loss": 1.0585, "step": 4365 }, { "epoch": 0.64, "grad_norm": 0.1787109375, "learning_rate": 0.0001448769968095372, "loss": 0.9221, "step": 4370 }, { "epoch": 0.64, "grad_norm": 0.1591796875, "learning_rate": 0.00014435698409965674, "loss": 1.0557, "step": 4375 }, { "epoch": 0.64, "grad_norm": 0.16796875, "learning_rate": 0.0001438375272336857, "loss": 1.1002, "step": 4380 }, { "epoch": 0.64, "grad_norm": 0.15234375, "learning_rate": 0.00014331862894476205, "loss": 1.1207, "step": 4385 }, { "epoch": 0.64, "grad_norm": 0.146484375, "learning_rate": 0.00014280029196308473, "loss": 0.9124, "step": 4390 }, { "epoch": 0.64, "grad_norm": 0.1337890625, "learning_rate": 0.0001422825190158992, "loss": 1.3247, "step": 4395 }, { "epoch": 0.64, "grad_norm": 0.212890625, "learning_rate": 0.00014176531282748356, "loss": 1.0844, "step": 4400 }, { "epoch": 0.64, "grad_norm": 0.14453125, "learning_rate": 0.00014124867611913384, "loss": 1.1084, "step": 4405 }, { "epoch": 0.64, "grad_norm": 0.14453125, "learning_rate": 0.00014073261160914952, "loss": 1.1076, "step": 4410 }, { "epoch": 0.64, "grad_norm": 0.1357421875, "learning_rate": 0.00014021712201281943, "loss": 0.9841, "step": 4415 }, { "epoch": 0.65, "grad_norm": 0.14453125, "learning_rate": 0.00013970221004240782, "loss": 1.0774, "step": 4420 }, { "epoch": 0.65, "grad_norm": 0.150390625, "learning_rate": 0.00013918787840713954, "loss": 1.3116, "step": 4425 }, { "epoch": 0.65, "grad_norm": 0.1728515625, "learning_rate": 0.00013867412981318584, "loss": 1.1418, "step": 4430 }, { "epoch": 0.65, "grad_norm": 0.1484375, "learning_rate": 0.00013816096696365042, "loss": 0.9842, "step": 4435 }, { "epoch": 0.65, "grad_norm": 0.1953125, "learning_rate": 0.0001376483925585552, "loss": 1.1092, "step": 4440 }, { "epoch": 0.65, "grad_norm": 0.173828125, "learning_rate": 0.00013713640929482562, "loss": 0.9737, "step": 4445 }, { "epoch": 0.65, "grad_norm": 0.1494140625, "learning_rate": 0.00013662501986627732, "loss": 1.1233, "step": 4450 }, { "epoch": 0.65, "grad_norm": 0.1474609375, "learning_rate": 0.00013611422696360094, "loss": 1.0444, "step": 4455 }, { "epoch": 0.65, "grad_norm": 0.1474609375, "learning_rate": 0.0001356040332743489, "loss": 1.2116, "step": 4460 }, { "epoch": 0.65, "grad_norm": 0.1357421875, "learning_rate": 0.0001350944414829205, "loss": 1.0156, "step": 4465 }, { "epoch": 0.65, "grad_norm": 0.13671875, "learning_rate": 0.00013458545427054842, "loss": 1.1135, "step": 4470 }, { "epoch": 0.65, "grad_norm": 0.1806640625, "learning_rate": 0.0001340770743152842, "loss": 1.3632, "step": 4475 }, { "epoch": 0.65, "grad_norm": 0.177734375, "learning_rate": 0.0001335693042919841, "loss": 1.0433, "step": 4480 }, { "epoch": 0.65, "grad_norm": 0.1630859375, "learning_rate": 0.00013306214687229558, "loss": 0.8539, "step": 4485 }, { "epoch": 0.66, "grad_norm": 0.16015625, "learning_rate": 0.0001325556047246427, "loss": 1.0244, "step": 4490 }, { "epoch": 0.66, "grad_norm": 0.1630859375, "learning_rate": 0.00013204968051421214, "loss": 0.9696, "step": 4495 }, { "epoch": 0.66, "grad_norm": 0.1513671875, "learning_rate": 0.0001315443769029393, "loss": 1.2288, "step": 4500 }, { "epoch": 0.66, "grad_norm": 0.140625, "learning_rate": 0.0001310396965494944, "loss": 0.9919, "step": 4505 }, { "epoch": 0.66, "grad_norm": 0.12890625, "learning_rate": 0.00013053564210926844, "loss": 0.9188, "step": 4510 }, { "epoch": 0.66, "grad_norm": 0.19921875, "learning_rate": 0.00013003221623435895, "loss": 1.2513, "step": 4515 }, { "epoch": 0.66, "grad_norm": 0.1689453125, "learning_rate": 0.00012952942157355617, "loss": 1.0146, "step": 4520 }, { "epoch": 0.66, "grad_norm": 0.1689453125, "learning_rate": 0.00012902726077232966, "loss": 0.9834, "step": 4525 }, { "epoch": 0.66, "grad_norm": 0.130859375, "learning_rate": 0.00012852573647281328, "loss": 1.067, "step": 4530 }, { "epoch": 0.66, "grad_norm": 0.146484375, "learning_rate": 0.0001280248513137925, "loss": 0.9357, "step": 4535 }, { "epoch": 0.66, "grad_norm": 0.1416015625, "learning_rate": 0.0001275246079306895, "loss": 0.9427, "step": 4540 }, { "epoch": 0.66, "grad_norm": 0.1357421875, "learning_rate": 0.00012702500895554996, "loss": 1.0688, "step": 4545 }, { "epoch": 0.66, "grad_norm": 0.1484375, "learning_rate": 0.00012652605701702892, "loss": 1.0269, "step": 4550 }, { "epoch": 0.67, "grad_norm": 0.1689453125, "learning_rate": 0.0001260277547403771, "loss": 1.1006, "step": 4555 }, { "epoch": 0.67, "grad_norm": 0.1513671875, "learning_rate": 0.00012553010474742674, "loss": 0.9569, "step": 4560 }, { "epoch": 0.67, "grad_norm": 0.2275390625, "learning_rate": 0.00012503310965657853, "loss": 1.0812, "step": 4565 }, { "epoch": 0.67, "grad_norm": 0.140625, "learning_rate": 0.00012453677208278685, "loss": 1.1446, "step": 4570 }, { "epoch": 0.67, "grad_norm": 0.1669921875, "learning_rate": 0.00012404109463754704, "loss": 0.8849, "step": 4575 }, { "epoch": 0.67, "grad_norm": 0.1572265625, "learning_rate": 0.00012354607992888064, "loss": 1.0462, "step": 4580 }, { "epoch": 0.67, "grad_norm": 0.169921875, "learning_rate": 0.0001230517305613227, "loss": 1.0097, "step": 4585 }, { "epoch": 0.67, "grad_norm": 0.154296875, "learning_rate": 0.00012255804913590722, "loss": 1.2159, "step": 4590 }, { "epoch": 0.67, "grad_norm": 0.126953125, "learning_rate": 0.00012206503825015373, "loss": 1.0315, "step": 4595 }, { "epoch": 0.67, "grad_norm": 0.169921875, "learning_rate": 0.00012157270049805405, "loss": 1.2781, "step": 4600 }, { "epoch": 0.67, "grad_norm": 0.1494140625, "learning_rate": 0.00012108103847005791, "loss": 0.9739, "step": 4605 }, { "epoch": 0.67, "grad_norm": 0.1484375, "learning_rate": 0.00012059005475306011, "loss": 1.2582, "step": 4610 }, { "epoch": 0.67, "grad_norm": 0.15234375, "learning_rate": 0.00012009975193038603, "loss": 0.9291, "step": 4615 }, { "epoch": 0.67, "grad_norm": 0.1435546875, "learning_rate": 0.00011961013258177892, "loss": 0.7847, "step": 4620 }, { "epoch": 0.68, "grad_norm": 0.1416015625, "learning_rate": 0.00011912119928338553, "loss": 0.9385, "step": 4625 }, { "epoch": 0.68, "grad_norm": 0.142578125, "learning_rate": 0.00011863295460774334, "loss": 0.982, "step": 4630 }, { "epoch": 0.68, "grad_norm": 0.1357421875, "learning_rate": 0.00011814540112376623, "loss": 1.2993, "step": 4635 }, { "epoch": 0.68, "grad_norm": 0.140625, "learning_rate": 0.00011765854139673174, "loss": 0.9739, "step": 4640 }, { "epoch": 0.68, "grad_norm": 0.1591796875, "learning_rate": 0.00011717237798826677, "loss": 0.9895, "step": 4645 }, { "epoch": 0.68, "grad_norm": 0.1748046875, "learning_rate": 0.00011668691345633492, "loss": 1.1416, "step": 4650 }, { "epoch": 0.68, "grad_norm": 0.1220703125, "learning_rate": 0.00011620215035522225, "loss": 1.1118, "step": 4655 }, { "epoch": 0.68, "grad_norm": 0.185546875, "learning_rate": 0.00011571809123552462, "loss": 1.258, "step": 4660 }, { "epoch": 0.68, "grad_norm": 0.15625, "learning_rate": 0.0001152347386441335, "loss": 1.038, "step": 4665 }, { "epoch": 0.68, "grad_norm": 0.1494140625, "learning_rate": 0.00011475209512422324, "loss": 1.078, "step": 4670 }, { "epoch": 0.68, "grad_norm": 0.15234375, "learning_rate": 0.00011427016321523717, "loss": 0.9228, "step": 4675 }, { "epoch": 0.68, "grad_norm": 0.203125, "learning_rate": 0.0001137889454528745, "loss": 1.118, "step": 4680 }, { "epoch": 0.68, "grad_norm": 0.16796875, "learning_rate": 0.000113308444369077, "loss": 1.1574, "step": 4685 }, { "epoch": 0.68, "grad_norm": 0.14453125, "learning_rate": 0.00011282866249201574, "loss": 1.1618, "step": 4690 }, { "epoch": 0.69, "grad_norm": 0.16796875, "learning_rate": 0.00011234960234607739, "loss": 1.2177, "step": 4695 }, { "epoch": 0.69, "grad_norm": 0.1962890625, "learning_rate": 0.00011187126645185122, "loss": 1.225, "step": 4700 }, { "epoch": 0.69, "grad_norm": 0.15234375, "learning_rate": 0.00011139365732611611, "loss": 1.0852, "step": 4705 }, { "epoch": 0.69, "grad_norm": 0.1484375, "learning_rate": 0.00011091677748182688, "loss": 1.0386, "step": 4710 }, { "epoch": 0.69, "grad_norm": 0.1484375, "learning_rate": 0.00011044062942810113, "loss": 0.9717, "step": 4715 }, { "epoch": 0.69, "grad_norm": 0.1455078125, "learning_rate": 0.00010996521567020607, "loss": 0.915, "step": 4720 }, { "epoch": 0.69, "grad_norm": 0.1640625, "learning_rate": 0.0001094905387095457, "loss": 1.0127, "step": 4725 }, { "epoch": 0.69, "grad_norm": 0.1396484375, "learning_rate": 0.00010901660104364697, "loss": 0.9142, "step": 4730 }, { "epoch": 0.69, "grad_norm": 0.1630859375, "learning_rate": 0.00010854340516614736, "loss": 1.1216, "step": 4735 }, { "epoch": 0.69, "grad_norm": 0.189453125, "learning_rate": 0.00010807095356678112, "loss": 1.0968, "step": 4740 }, { "epoch": 0.69, "grad_norm": 0.169921875, "learning_rate": 0.00010759924873136647, "loss": 1.0457, "step": 4745 }, { "epoch": 0.69, "grad_norm": 0.16796875, "learning_rate": 0.00010712829314179274, "loss": 0.9762, "step": 4750 }, { "epoch": 0.69, "grad_norm": 0.1318359375, "learning_rate": 0.000106658089276007, "loss": 0.971, "step": 4755 }, { "epoch": 0.7, "grad_norm": 0.1337890625, "learning_rate": 0.00010618863960800096, "loss": 0.9612, "step": 4760 }, { "epoch": 0.7, "grad_norm": 0.1484375, "learning_rate": 0.0001057199466077981, "loss": 0.9232, "step": 4765 }, { "epoch": 0.7, "grad_norm": 0.1611328125, "learning_rate": 0.00010525201274144087, "loss": 1.1631, "step": 4770 }, { "epoch": 0.7, "grad_norm": 0.1533203125, "learning_rate": 0.00010478484047097745, "loss": 1.1005, "step": 4775 }, { "epoch": 0.7, "grad_norm": 0.140625, "learning_rate": 0.00010431843225444873, "loss": 0.9329, "step": 4780 }, { "epoch": 0.7, "grad_norm": 0.134765625, "learning_rate": 0.00010385279054587551, "loss": 1.0208, "step": 4785 }, { "epoch": 0.7, "grad_norm": 0.16796875, "learning_rate": 0.00010338791779524573, "loss": 1.0081, "step": 4790 }, { "epoch": 0.7, "grad_norm": 0.28515625, "learning_rate": 0.00010292381644850149, "loss": 1.1968, "step": 4795 }, { "epoch": 0.7, "grad_norm": 0.1728515625, "learning_rate": 0.00010246048894752588, "loss": 1.2157, "step": 4800 }, { "epoch": 0.7, "grad_norm": 0.12890625, "learning_rate": 0.00010199793773013041, "loss": 0.9195, "step": 4805 }, { "epoch": 0.7, "grad_norm": 0.1611328125, "learning_rate": 0.00010153616523004233, "loss": 0.959, "step": 4810 }, { "epoch": 0.7, "grad_norm": 0.142578125, "learning_rate": 0.00010107517387689166, "loss": 1.0178, "step": 4815 }, { "epoch": 0.7, "grad_norm": 0.166015625, "learning_rate": 0.0001006149660961982, "loss": 0.8926, "step": 4820 }, { "epoch": 0.7, "grad_norm": 0.171875, "learning_rate": 0.00010015554430935902, "loss": 1.151, "step": 4825 }, { "epoch": 0.71, "grad_norm": 0.12451171875, "learning_rate": 9.969691093363553e-05, "loss": 0.9893, "step": 4830 }, { "epoch": 0.71, "grad_norm": 0.1357421875, "learning_rate": 9.923906838214138e-05, "loss": 1.0514, "step": 4835 }, { "epoch": 0.71, "grad_norm": 0.1416015625, "learning_rate": 9.878201906382881e-05, "loss": 1.0175, "step": 4840 }, { "epoch": 0.71, "grad_norm": 0.1533203125, "learning_rate": 9.832576538347652e-05, "loss": 1.1843, "step": 4845 }, { "epoch": 0.71, "grad_norm": 0.140625, "learning_rate": 9.787030974167694e-05, "loss": 1.0639, "step": 4850 }, { "epoch": 0.71, "grad_norm": 0.1640625, "learning_rate": 9.741565453482368e-05, "loss": 1.0156, "step": 4855 }, { "epoch": 0.71, "grad_norm": 0.162109375, "learning_rate": 9.696180215509892e-05, "loss": 1.1617, "step": 4860 }, { "epoch": 0.71, "grad_norm": 0.1494140625, "learning_rate": 9.650875499046058e-05, "loss": 0.9796, "step": 4865 }, { "epoch": 0.71, "grad_norm": 0.1455078125, "learning_rate": 9.605651542462979e-05, "loss": 0.7412, "step": 4870 }, { "epoch": 0.71, "grad_norm": 0.1533203125, "learning_rate": 9.560508583707878e-05, "loss": 1.1196, "step": 4875 }, { "epoch": 0.71, "grad_norm": 0.1474609375, "learning_rate": 9.515446860301802e-05, "loss": 0.7949, "step": 4880 }, { "epoch": 0.71, "grad_norm": 0.1435546875, "learning_rate": 9.470466609338351e-05, "loss": 1.1594, "step": 4885 }, { "epoch": 0.71, "grad_norm": 0.169921875, "learning_rate": 9.425568067482468e-05, "loss": 1.0454, "step": 4890 }, { "epoch": 0.71, "grad_norm": 0.193359375, "learning_rate": 9.380751470969186e-05, "loss": 1.2346, "step": 4895 }, { "epoch": 0.72, "grad_norm": 0.1455078125, "learning_rate": 9.33601705560239e-05, "loss": 0.9849, "step": 4900 }, { "epoch": 0.72, "grad_norm": 0.1865234375, "learning_rate": 9.291365056753543e-05, "loss": 1.1268, "step": 4905 }, { "epoch": 0.72, "grad_norm": 0.1494140625, "learning_rate": 9.246795709360472e-05, "loss": 0.9571, "step": 4910 }, { "epoch": 0.72, "grad_norm": 0.1494140625, "learning_rate": 9.20230924792615e-05, "loss": 1.2293, "step": 4915 }, { "epoch": 0.72, "grad_norm": 0.15234375, "learning_rate": 9.157905906517444e-05, "loss": 0.9942, "step": 4920 }, { "epoch": 0.72, "grad_norm": 0.140625, "learning_rate": 9.113585918763858e-05, "loss": 1.0211, "step": 4925 }, { "epoch": 0.72, "grad_norm": 0.1416015625, "learning_rate": 9.06934951785634e-05, "loss": 1.0026, "step": 4930 }, { "epoch": 0.72, "grad_norm": 0.1611328125, "learning_rate": 9.025196936546051e-05, "loss": 1.0391, "step": 4935 }, { "epoch": 0.72, "grad_norm": 0.1748046875, "learning_rate": 8.981128407143136e-05, "loss": 1.0796, "step": 4940 }, { "epoch": 0.72, "grad_norm": 0.142578125, "learning_rate": 8.937144161515481e-05, "loss": 1.2019, "step": 4945 }, { "epoch": 0.72, "grad_norm": 0.1865234375, "learning_rate": 8.893244431087516e-05, "loss": 1.1303, "step": 4950 }, { "epoch": 0.72, "grad_norm": 0.1669921875, "learning_rate": 8.849429446838983e-05, "loss": 1.098, "step": 4955 }, { "epoch": 0.72, "grad_norm": 0.142578125, "learning_rate": 8.805699439303772e-05, "loss": 1.205, "step": 4960 }, { "epoch": 0.73, "grad_norm": 0.162109375, "learning_rate": 8.762054638568614e-05, "loss": 1.1974, "step": 4965 }, { "epoch": 0.73, "grad_norm": 0.162109375, "learning_rate": 8.718495274271951e-05, "loss": 0.8459, "step": 4970 }, { "epoch": 0.73, "grad_norm": 0.14453125, "learning_rate": 8.675021575602674e-05, "loss": 1.0461, "step": 4975 }, { "epoch": 0.73, "grad_norm": 0.1416015625, "learning_rate": 8.631633771298969e-05, "loss": 1.0227, "step": 4980 }, { "epoch": 0.73, "grad_norm": 0.13671875, "learning_rate": 8.588332089647083e-05, "loss": 0.9869, "step": 4985 }, { "epoch": 0.73, "grad_norm": 0.1474609375, "learning_rate": 8.545116758480109e-05, "loss": 1.0406, "step": 4990 }, { "epoch": 0.73, "grad_norm": 0.12158203125, "learning_rate": 8.501988005176805e-05, "loss": 0.9332, "step": 4995 }, { "epoch": 0.73, "grad_norm": 0.1875, "learning_rate": 8.458946056660413e-05, "loss": 1.1838, "step": 5000 }, { "epoch": 0.73, "grad_norm": 0.1455078125, "learning_rate": 8.415991139397452e-05, "loss": 1.1195, "step": 5005 }, { "epoch": 0.73, "grad_norm": 0.171875, "learning_rate": 8.373123479396505e-05, "loss": 1.1509, "step": 5010 }, { "epoch": 0.73, "grad_norm": 0.16796875, "learning_rate": 8.330343302207042e-05, "loss": 1.0687, "step": 5015 }, { "epoch": 0.73, "grad_norm": 0.177734375, "learning_rate": 8.287650832918267e-05, "loss": 1.1246, "step": 5020 }, { "epoch": 0.73, "grad_norm": 0.142578125, "learning_rate": 8.245046296157898e-05, "loss": 1.0336, "step": 5025 }, { "epoch": 0.73, "grad_norm": 0.1845703125, "learning_rate": 8.202529916090983e-05, "loss": 1.1219, "step": 5030 }, { "epoch": 0.74, "grad_norm": 0.154296875, "learning_rate": 8.160101916418728e-05, "loss": 1.0273, "step": 5035 }, { "epoch": 0.74, "grad_norm": 0.1435546875, "learning_rate": 8.117762520377334e-05, "loss": 0.9658, "step": 5040 }, { "epoch": 0.74, "grad_norm": 0.1708984375, "learning_rate": 8.075511950736819e-05, "loss": 1.0894, "step": 5045 }, { "epoch": 0.74, "grad_norm": 0.23828125, "learning_rate": 8.033350429799821e-05, "loss": 1.1085, "step": 5050 }, { "epoch": 0.74, "grad_norm": 0.1767578125, "learning_rate": 7.99127817940044e-05, "loss": 0.945, "step": 5055 }, { "epoch": 0.74, "grad_norm": 0.1455078125, "learning_rate": 7.94929542090311e-05, "loss": 1.1813, "step": 5060 }, { "epoch": 0.74, "grad_norm": 0.1396484375, "learning_rate": 7.907402375201362e-05, "loss": 0.9865, "step": 5065 }, { "epoch": 0.74, "grad_norm": 0.1572265625, "learning_rate": 7.86559926271673e-05, "loss": 1.1039, "step": 5070 }, { "epoch": 0.74, "grad_norm": 0.1650390625, "learning_rate": 7.823886303397543e-05, "loss": 0.9412, "step": 5075 }, { "epoch": 0.74, "grad_norm": 0.13671875, "learning_rate": 7.782263716717783e-05, "loss": 0.8976, "step": 5080 }, { "epoch": 0.74, "grad_norm": 0.19921875, "learning_rate": 7.740731721675953e-05, "loss": 1.0748, "step": 5085 }, { "epoch": 0.74, "grad_norm": 0.1513671875, "learning_rate": 7.699290536793904e-05, "loss": 0.9474, "step": 5090 }, { "epoch": 0.74, "grad_norm": 0.1640625, "learning_rate": 7.657940380115668e-05, "loss": 1.0187, "step": 5095 }, { "epoch": 0.74, "grad_norm": 0.140625, "learning_rate": 7.616681469206335e-05, "loss": 1.0857, "step": 5100 }, { "epoch": 0.75, "grad_norm": 0.1328125, "learning_rate": 7.575514021150915e-05, "loss": 0.992, "step": 5105 }, { "epoch": 0.75, "grad_norm": 0.16015625, "learning_rate": 7.534438252553177e-05, "loss": 0.9423, "step": 5110 }, { "epoch": 0.75, "grad_norm": 0.1689453125, "learning_rate": 7.493454379534515e-05, "loss": 0.9686, "step": 5115 }, { "epoch": 0.75, "grad_norm": 0.134765625, "learning_rate": 7.452562617732795e-05, "loss": 0.965, "step": 5120 }, { "epoch": 0.75, "grad_norm": 0.150390625, "learning_rate": 7.411763182301275e-05, "loss": 0.9155, "step": 5125 }, { "epoch": 0.75, "grad_norm": 0.1630859375, "learning_rate": 7.371056287907396e-05, "loss": 1.1172, "step": 5130 }, { "epoch": 0.75, "grad_norm": 0.1396484375, "learning_rate": 7.330442148731726e-05, "loss": 1.1276, "step": 5135 }, { "epoch": 0.75, "grad_norm": 0.1630859375, "learning_rate": 7.289920978466758e-05, "loss": 0.887, "step": 5140 }, { "epoch": 0.75, "grad_norm": 0.158203125, "learning_rate": 7.249492990315873e-05, "loss": 1.0794, "step": 5145 }, { "epoch": 0.75, "grad_norm": 0.1767578125, "learning_rate": 7.209158396992133e-05, "loss": 1.1647, "step": 5150 }, { "epoch": 0.75, "grad_norm": 0.16015625, "learning_rate": 7.168917410717224e-05, "loss": 1.1844, "step": 5155 }, { "epoch": 0.75, "grad_norm": 0.13671875, "learning_rate": 7.128770243220298e-05, "loss": 1.1793, "step": 5160 }, { "epoch": 0.75, "grad_norm": 0.1982421875, "learning_rate": 7.088717105736897e-05, "loss": 1.2339, "step": 5165 }, { "epoch": 0.75, "grad_norm": 0.1650390625, "learning_rate": 7.048758209007797e-05, "loss": 1.114, "step": 5170 }, { "epoch": 0.76, "grad_norm": 0.14453125, "learning_rate": 7.008893763277952e-05, "loss": 1.1371, "step": 5175 }, { "epoch": 0.76, "grad_norm": 0.150390625, "learning_rate": 6.969123978295325e-05, "loss": 0.9877, "step": 5180 }, { "epoch": 0.76, "grad_norm": 0.1923828125, "learning_rate": 6.929449063309857e-05, "loss": 1.1437, "step": 5185 }, { "epoch": 0.76, "grad_norm": 0.1923828125, "learning_rate": 6.88986922707229e-05, "loss": 1.1434, "step": 5190 }, { "epoch": 0.76, "grad_norm": 0.140625, "learning_rate": 6.85038467783314e-05, "loss": 0.9752, "step": 5195 }, { "epoch": 0.76, "grad_norm": 0.1572265625, "learning_rate": 6.810995623341546e-05, "loss": 0.9277, "step": 5200 }, { "epoch": 0.76, "grad_norm": 0.1494140625, "learning_rate": 6.77170227084419e-05, "loss": 1.3186, "step": 5205 }, { "epoch": 0.76, "grad_norm": 0.142578125, "learning_rate": 6.732504827084255e-05, "loss": 1.0487, "step": 5210 }, { "epoch": 0.76, "grad_norm": 0.15234375, "learning_rate": 6.693403498300249e-05, "loss": 0.9536, "step": 5215 }, { "epoch": 0.76, "grad_norm": 0.1689453125, "learning_rate": 6.654398490225008e-05, "loss": 1.0277, "step": 5220 }, { "epoch": 0.76, "grad_norm": 0.1396484375, "learning_rate": 6.61549000808454e-05, "loss": 1.0484, "step": 5225 }, { "epoch": 0.76, "grad_norm": 0.1650390625, "learning_rate": 6.576678256597016e-05, "loss": 1.1071, "step": 5230 }, { "epoch": 0.76, "grad_norm": 0.19140625, "learning_rate": 6.537963439971612e-05, "loss": 1.1738, "step": 5235 }, { "epoch": 0.77, "grad_norm": 0.162109375, "learning_rate": 6.499345761907521e-05, "loss": 1.1875, "step": 5240 }, { "epoch": 0.77, "grad_norm": 0.2197265625, "learning_rate": 6.460825425592807e-05, "loss": 1.3482, "step": 5245 }, { "epoch": 0.77, "grad_norm": 0.134765625, "learning_rate": 6.422402633703389e-05, "loss": 0.9738, "step": 5250 }, { "epoch": 0.77, "grad_norm": 0.1572265625, "learning_rate": 6.384077588401932e-05, "loss": 1.2898, "step": 5255 }, { "epoch": 0.77, "grad_norm": 0.140625, "learning_rate": 6.345850491336826e-05, "loss": 0.9434, "step": 5260 }, { "epoch": 0.77, "grad_norm": 0.1787109375, "learning_rate": 6.307721543641088e-05, "loss": 1.2934, "step": 5265 }, { "epoch": 0.77, "grad_norm": 0.1337890625, "learning_rate": 6.269690945931328e-05, "loss": 1.0275, "step": 5270 }, { "epoch": 0.77, "grad_norm": 0.15234375, "learning_rate": 6.23175889830668e-05, "loss": 0.928, "step": 5275 }, { "epoch": 0.77, "grad_norm": 0.1220703125, "learning_rate": 6.19392560034775e-05, "loss": 0.6696, "step": 5280 }, { "epoch": 0.77, "grad_norm": 0.177734375, "learning_rate": 6.156191251115575e-05, "loss": 1.0608, "step": 5285 }, { "epoch": 0.77, "grad_norm": 0.21875, "learning_rate": 6.11855604915058e-05, "loss": 1.1805, "step": 5290 }, { "epoch": 0.77, "grad_norm": 0.154296875, "learning_rate": 6.081020192471509e-05, "loss": 0.9473, "step": 5295 }, { "epoch": 0.77, "grad_norm": 0.1650390625, "learning_rate": 6.0435838785743905e-05, "loss": 0.8069, "step": 5300 }, { "epoch": 0.77, "grad_norm": 0.1943359375, "learning_rate": 6.006247304431528e-05, "loss": 0.8966, "step": 5305 }, { "epoch": 0.78, "grad_norm": 0.169921875, "learning_rate": 5.9690106664904346e-05, "loss": 0.9691, "step": 5310 }, { "epoch": 0.78, "grad_norm": 0.1591796875, "learning_rate": 5.931874160672798e-05, "loss": 1.1929, "step": 5315 }, { "epoch": 0.78, "grad_norm": 0.1455078125, "learning_rate": 5.894837982373455e-05, "loss": 1.0995, "step": 5320 }, { "epoch": 0.78, "grad_norm": 0.1591796875, "learning_rate": 5.857902326459377e-05, "loss": 0.9066, "step": 5325 }, { "epoch": 0.78, "grad_norm": 0.15234375, "learning_rate": 5.8210673872686385e-05, "loss": 1.0044, "step": 5330 }, { "epoch": 0.78, "grad_norm": 0.1669921875, "learning_rate": 5.7843333586093754e-05, "loss": 1.0552, "step": 5335 }, { "epoch": 0.78, "grad_norm": 0.17578125, "learning_rate": 5.747700433758776e-05, "loss": 0.9869, "step": 5340 }, { "epoch": 0.78, "grad_norm": 0.1640625, "learning_rate": 5.7111688054620945e-05, "loss": 1.1215, "step": 5345 }, { "epoch": 0.78, "grad_norm": 0.134765625, "learning_rate": 5.6747386659315755e-05, "loss": 1.0677, "step": 5350 }, { "epoch": 0.78, "grad_norm": 0.1689453125, "learning_rate": 5.638410206845512e-05, "loss": 1.0609, "step": 5355 }, { "epoch": 0.78, "grad_norm": 0.134765625, "learning_rate": 5.602183619347179e-05, "loss": 1.1, "step": 5360 }, { "epoch": 0.78, "grad_norm": 0.1591796875, "learning_rate": 5.566059094043849e-05, "loss": 1.0852, "step": 5365 }, { "epoch": 0.78, "grad_norm": 0.158203125, "learning_rate": 5.5300368210058075e-05, "loss": 1.2309, "step": 5370 }, { "epoch": 0.78, "grad_norm": 0.1640625, "learning_rate": 5.4941169897653354e-05, "loss": 1.0994, "step": 5375 }, { "epoch": 0.79, "grad_norm": 0.1865234375, "learning_rate": 5.458299789315699e-05, "loss": 1.2385, "step": 5380 }, { "epoch": 0.79, "grad_norm": 0.146484375, "learning_rate": 5.4225854081101697e-05, "loss": 0.9504, "step": 5385 }, { "epoch": 0.79, "grad_norm": 0.16796875, "learning_rate": 5.3869740340610515e-05, "loss": 1.1537, "step": 5390 }, { "epoch": 0.79, "grad_norm": 0.1533203125, "learning_rate": 5.351465854538659e-05, "loss": 1.3266, "step": 5395 }, { "epoch": 0.79, "grad_norm": 0.1533203125, "learning_rate": 5.31606105637035e-05, "loss": 0.9765, "step": 5400 }, { "epoch": 0.79, "grad_norm": 0.142578125, "learning_rate": 5.2807598258395234e-05, "loss": 1.0391, "step": 5405 }, { "epoch": 0.79, "grad_norm": 0.1865234375, "learning_rate": 5.2455623486846765e-05, "loss": 1.2316, "step": 5410 }, { "epoch": 0.79, "grad_norm": 0.171875, "learning_rate": 5.210468810098398e-05, "loss": 0.9606, "step": 5415 }, { "epoch": 0.79, "grad_norm": 0.171875, "learning_rate": 5.1754793947263984e-05, "loss": 1.2529, "step": 5420 }, { "epoch": 0.79, "grad_norm": 0.1416015625, "learning_rate": 5.140594286666528e-05, "loss": 1.1235, "step": 5425 }, { "epoch": 0.79, "grad_norm": 0.1650390625, "learning_rate": 5.105813669467843e-05, "loss": 1.0962, "step": 5430 }, { "epoch": 0.79, "grad_norm": 0.1708984375, "learning_rate": 5.0711377261296126e-05, "loss": 0.9264, "step": 5435 }, { "epoch": 0.79, "grad_norm": 0.224609375, "learning_rate": 5.036566639100351e-05, "loss": 1.2677, "step": 5440 }, { "epoch": 0.8, "grad_norm": 0.15625, "learning_rate": 5.0021005902768726e-05, "loss": 0.9795, "step": 5445 }, { "epoch": 0.8, "grad_norm": 0.17578125, "learning_rate": 4.96773976100332e-05, "loss": 0.9653, "step": 5450 }, { "epoch": 0.8, "grad_norm": 0.1708984375, "learning_rate": 4.933484332070257e-05, "loss": 1.2085, "step": 5455 }, { "epoch": 0.8, "grad_norm": 0.1845703125, "learning_rate": 4.899334483713641e-05, "loss": 1.3929, "step": 5460 }, { "epoch": 0.8, "grad_norm": 0.1337890625, "learning_rate": 4.865290395613928e-05, "loss": 0.9019, "step": 5465 }, { "epoch": 0.8, "grad_norm": 0.1337890625, "learning_rate": 4.831352246895104e-05, "loss": 0.9831, "step": 5470 }, { "epoch": 0.8, "grad_norm": 0.1953125, "learning_rate": 4.797520216123769e-05, "loss": 1.0529, "step": 5475 }, { "epoch": 0.8, "grad_norm": 0.169921875, "learning_rate": 4.763794481308176e-05, "loss": 1.1049, "step": 5480 }, { "epoch": 0.8, "grad_norm": 0.11865234375, "learning_rate": 4.730175219897292e-05, "loss": 0.9096, "step": 5485 }, { "epoch": 0.8, "grad_norm": 0.1484375, "learning_rate": 4.6966626087798614e-05, "loss": 1.3048, "step": 5490 }, { "epoch": 0.8, "grad_norm": 0.1533203125, "learning_rate": 4.66325682428351e-05, "loss": 1.1603, "step": 5495 }, { "epoch": 0.8, "grad_norm": 0.17578125, "learning_rate": 4.629958042173779e-05, "loss": 0.9917, "step": 5500 }, { "epoch": 0.8, "grad_norm": 0.1318359375, "learning_rate": 4.5967664376532076e-05, "loss": 0.9204, "step": 5505 }, { "epoch": 0.8, "grad_norm": 0.1455078125, "learning_rate": 4.563682185360421e-05, "loss": 0.9311, "step": 5510 }, { "epoch": 0.81, "grad_norm": 0.138671875, "learning_rate": 4.530705459369211e-05, "loss": 0.9714, "step": 5515 }, { "epoch": 0.81, "grad_norm": 0.1982421875, "learning_rate": 4.497836433187619e-05, "loss": 1.1269, "step": 5520 }, { "epoch": 0.81, "grad_norm": 0.162109375, "learning_rate": 4.4650752797570165e-05, "loss": 1.2484, "step": 5525 }, { "epoch": 0.81, "grad_norm": 0.14453125, "learning_rate": 4.432422171451187e-05, "loss": 0.9576, "step": 5530 }, { "epoch": 0.81, "grad_norm": 0.1376953125, "learning_rate": 4.399877280075448e-05, "loss": 0.9191, "step": 5535 }, { "epoch": 0.81, "grad_norm": 0.1591796875, "learning_rate": 4.3674407768657384e-05, "loss": 1.2496, "step": 5540 }, { "epoch": 0.81, "grad_norm": 0.1650390625, "learning_rate": 4.335112832487684e-05, "loss": 0.9345, "step": 5545 }, { "epoch": 0.81, "grad_norm": 0.146484375, "learning_rate": 4.302893617035741e-05, "loss": 0.9389, "step": 5550 }, { "epoch": 0.81, "grad_norm": 0.1669921875, "learning_rate": 4.270783300032269e-05, "loss": 1.2902, "step": 5555 }, { "epoch": 0.81, "grad_norm": 0.1650390625, "learning_rate": 4.23878205042669e-05, "loss": 0.9465, "step": 5560 }, { "epoch": 0.81, "grad_norm": 0.1484375, "learning_rate": 4.206890036594535e-05, "loss": 0.9596, "step": 5565 }, { "epoch": 0.81, "grad_norm": 0.154296875, "learning_rate": 4.175107426336591e-05, "loss": 1.0553, "step": 5570 }, { "epoch": 0.81, "grad_norm": 0.1923828125, "learning_rate": 4.1434343868780086e-05, "loss": 1.1287, "step": 5575 }, { "epoch": 0.81, "grad_norm": 0.1572265625, "learning_rate": 4.111871084867461e-05, "loss": 1.2026, "step": 5580 }, { "epoch": 0.82, "grad_norm": 0.14453125, "learning_rate": 4.080417686376203e-05, "loss": 1.0681, "step": 5585 }, { "epoch": 0.82, "grad_norm": 0.140625, "learning_rate": 4.0490743568972366e-05, "loss": 1.1642, "step": 5590 }, { "epoch": 0.82, "grad_norm": 0.130859375, "learning_rate": 4.017841261344427e-05, "loss": 1.1061, "step": 5595 }, { "epoch": 0.82, "grad_norm": 0.14453125, "learning_rate": 3.986718564051653e-05, "loss": 0.8276, "step": 5600 }, { "epoch": 0.82, "grad_norm": 0.146484375, "learning_rate": 3.9557064287719294e-05, "loss": 0.9965, "step": 5605 }, { "epoch": 0.82, "grad_norm": 0.134765625, "learning_rate": 3.924805018676536e-05, "loss": 1.0255, "step": 5610 }, { "epoch": 0.82, "grad_norm": 0.1611328125, "learning_rate": 3.894014496354165e-05, "loss": 1.0776, "step": 5615 }, { "epoch": 0.82, "grad_norm": 0.142578125, "learning_rate": 3.863335023810083e-05, "loss": 0.9152, "step": 5620 }, { "epoch": 0.82, "grad_norm": 0.1689453125, "learning_rate": 3.8327667624652645e-05, "loss": 1.1285, "step": 5625 }, { "epoch": 0.82, "grad_norm": 0.1484375, "learning_rate": 3.802309873155529e-05, "loss": 1.2754, "step": 5630 }, { "epoch": 0.82, "grad_norm": 0.1591796875, "learning_rate": 3.771964516130716e-05, "loss": 1.0922, "step": 5635 }, { "epoch": 0.82, "grad_norm": 0.162109375, "learning_rate": 3.741730851053834e-05, "loss": 1.0333, "step": 5640 }, { "epoch": 0.82, "grad_norm": 0.1728515625, "learning_rate": 3.7116090370002356e-05, "loss": 1.1931, "step": 5645 }, { "epoch": 0.83, "grad_norm": 0.1630859375, "learning_rate": 3.681599232456742e-05, "loss": 1.2224, "step": 5650 }, { "epoch": 0.83, "grad_norm": 0.1787109375, "learning_rate": 3.651701595320839e-05, "loss": 0.9946, "step": 5655 }, { "epoch": 0.83, "grad_norm": 0.1142578125, "learning_rate": 3.621916282899856e-05, "loss": 0.9051, "step": 5660 }, { "epoch": 0.83, "grad_norm": 0.1669921875, "learning_rate": 3.592243451910102e-05, "loss": 1.0493, "step": 5665 }, { "epoch": 0.83, "grad_norm": 0.16015625, "learning_rate": 3.5626832584760766e-05, "loss": 1.0392, "step": 5670 }, { "epoch": 0.83, "grad_norm": 0.1708984375, "learning_rate": 3.5332358581296214e-05, "loss": 1.0141, "step": 5675 }, { "epoch": 0.83, "grad_norm": 0.154296875, "learning_rate": 3.503901405809129e-05, "loss": 1.1582, "step": 5680 }, { "epoch": 0.83, "grad_norm": 0.158203125, "learning_rate": 3.474680055858695e-05, "loss": 1.1519, "step": 5685 }, { "epoch": 0.83, "grad_norm": 0.15625, "learning_rate": 3.4455719620273386e-05, "loss": 1.164, "step": 5690 }, { "epoch": 0.83, "grad_norm": 0.1591796875, "learning_rate": 3.416577277468172e-05, "loss": 1.0292, "step": 5695 }, { "epoch": 0.83, "grad_norm": 0.162109375, "learning_rate": 3.387696154737591e-05, "loss": 1.2348, "step": 5700 }, { "epoch": 0.83, "grad_norm": 0.138671875, "learning_rate": 3.3589287457945014e-05, "loss": 1.0566, "step": 5705 }, { "epoch": 0.83, "grad_norm": 0.1787109375, "learning_rate": 3.330275201999494e-05, "loss": 1.1678, "step": 5710 }, { "epoch": 0.83, "grad_norm": 0.142578125, "learning_rate": 3.3017356741140506e-05, "loss": 0.9696, "step": 5715 }, { "epoch": 0.84, "grad_norm": 0.1572265625, "learning_rate": 3.2733103122997486e-05, "loss": 1.0827, "step": 5720 }, { "epoch": 0.84, "grad_norm": 0.15234375, "learning_rate": 3.2449992661174905e-05, "loss": 0.9318, "step": 5725 }, { "epoch": 0.84, "grad_norm": 0.154296875, "learning_rate": 3.216802684526704e-05, "loss": 1.1558, "step": 5730 }, { "epoch": 0.84, "grad_norm": 0.1474609375, "learning_rate": 3.1887207158845465e-05, "loss": 1.2175, "step": 5735 }, { "epoch": 0.84, "grad_norm": 0.189453125, "learning_rate": 3.160753507945138e-05, "loss": 1.1071, "step": 5740 }, { "epoch": 0.84, "grad_norm": 0.1455078125, "learning_rate": 3.132901207858796e-05, "loss": 1.0666, "step": 5745 }, { "epoch": 0.84, "grad_norm": 0.171875, "learning_rate": 3.10516396217122e-05, "loss": 1.2645, "step": 5750 }, { "epoch": 0.84, "grad_norm": 0.15625, "learning_rate": 3.077541916822782e-05, "loss": 0.9798, "step": 5755 }, { "epoch": 0.84, "grad_norm": 0.1494140625, "learning_rate": 3.0500352171476896e-05, "loss": 0.7795, "step": 5760 }, { "epoch": 0.84, "grad_norm": 0.154296875, "learning_rate": 3.0226440078732846e-05, "loss": 0.9405, "step": 5765 }, { "epoch": 0.84, "grad_norm": 0.1669921875, "learning_rate": 2.9953684331192278e-05, "loss": 1.0061, "step": 5770 }, { "epoch": 0.84, "grad_norm": 0.1572265625, "learning_rate": 2.9682086363967852e-05, "loss": 1.1415, "step": 5775 }, { "epoch": 0.84, "grad_norm": 0.1396484375, "learning_rate": 2.9411647606080395e-05, "loss": 1.1644, "step": 5780 }, { "epoch": 0.84, "grad_norm": 0.16015625, "learning_rate": 2.9142369480451652e-05, "loss": 1.0313, "step": 5785 }, { "epoch": 0.85, "grad_norm": 0.16015625, "learning_rate": 2.8874253403896487e-05, "loss": 1.2192, "step": 5790 }, { "epoch": 0.85, "grad_norm": 0.138671875, "learning_rate": 2.8607300787115818e-05, "loss": 0.9965, "step": 5795 }, { "epoch": 0.85, "grad_norm": 0.1474609375, "learning_rate": 2.8341513034688766e-05, "loss": 1.0245, "step": 5800 }, { "epoch": 0.85, "grad_norm": 0.142578125, "learning_rate": 2.807689154506571e-05, "loss": 1.036, "step": 5805 }, { "epoch": 0.85, "grad_norm": 0.185546875, "learning_rate": 2.7813437710560498e-05, "loss": 1.1713, "step": 5810 }, { "epoch": 0.85, "grad_norm": 0.16796875, "learning_rate": 2.755115291734342e-05, "loss": 1.2636, "step": 5815 }, { "epoch": 0.85, "grad_norm": 0.2138671875, "learning_rate": 2.729003854543388e-05, "loss": 1.0071, "step": 5820 }, { "epoch": 0.85, "grad_norm": 0.1416015625, "learning_rate": 2.7030095968692937e-05, "loss": 0.9706, "step": 5825 }, { "epoch": 0.85, "grad_norm": 0.1416015625, "learning_rate": 2.6771326554816345e-05, "loss": 1.1242, "step": 5830 }, { "epoch": 0.85, "grad_norm": 0.1298828125, "learning_rate": 2.6513731665327085e-05, "loss": 0.9072, "step": 5835 }, { "epoch": 0.85, "grad_norm": 0.138671875, "learning_rate": 2.6257312655568522e-05, "loss": 0.9881, "step": 5840 }, { "epoch": 0.85, "grad_norm": 0.1337890625, "learning_rate": 2.6002070874696915e-05, "loss": 1.1109, "step": 5845 }, { "epoch": 0.85, "grad_norm": 0.1318359375, "learning_rate": 2.5748007665674646e-05, "loss": 1.0871, "step": 5850 }, { "epoch": 0.85, "grad_norm": 0.1611328125, "learning_rate": 2.5495124365262885e-05, "loss": 1.0898, "step": 5855 }, { "epoch": 0.86, "grad_norm": 0.1494140625, "learning_rate": 2.5243422304014802e-05, "loss": 1.0403, "step": 5860 }, { "epoch": 0.86, "grad_norm": 0.1650390625, "learning_rate": 2.4992902806268258e-05, "loss": 1.0823, "step": 5865 }, { "epoch": 0.86, "grad_norm": 0.16015625, "learning_rate": 2.4743567190139287e-05, "loss": 0.9326, "step": 5870 }, { "epoch": 0.86, "grad_norm": 0.1396484375, "learning_rate": 2.44954167675146e-05, "loss": 1.0215, "step": 5875 }, { "epoch": 0.86, "grad_norm": 0.1474609375, "learning_rate": 2.42484528440452e-05, "loss": 0.9758, "step": 5880 }, { "epoch": 0.86, "grad_norm": 0.154296875, "learning_rate": 2.4002676719139166e-05, "loss": 0.8542, "step": 5885 }, { "epoch": 0.86, "grad_norm": 0.1884765625, "learning_rate": 2.3758089685955025e-05, "loss": 1.1913, "step": 5890 }, { "epoch": 0.86, "grad_norm": 0.16015625, "learning_rate": 2.3514693031394806e-05, "loss": 0.7058, "step": 5895 }, { "epoch": 0.86, "grad_norm": 0.1484375, "learning_rate": 2.3272488036097273e-05, "loss": 0.9125, "step": 5900 }, { "epoch": 0.86, "grad_norm": 0.1513671875, "learning_rate": 2.3031475974431337e-05, "loss": 1.0825, "step": 5905 }, { "epoch": 0.86, "grad_norm": 0.1611328125, "learning_rate": 2.27916581144893e-05, "loss": 0.8948, "step": 5910 }, { "epoch": 0.86, "grad_norm": 0.1708984375, "learning_rate": 2.2553035718079955e-05, "loss": 1.2793, "step": 5915 }, { "epoch": 0.86, "grad_norm": 0.166015625, "learning_rate": 2.2315610040722245e-05, "loss": 1.1075, "step": 5920 }, { "epoch": 0.87, "grad_norm": 0.15625, "learning_rate": 2.2079382331638503e-05, "loss": 1.0355, "step": 5925 }, { "epoch": 0.87, "grad_norm": 0.140625, "learning_rate": 2.1844353833748027e-05, "loss": 0.8773, "step": 5930 }, { "epoch": 0.87, "grad_norm": 0.142578125, "learning_rate": 2.161052578366024e-05, "loss": 1.0663, "step": 5935 }, { "epoch": 0.87, "grad_norm": 0.1494140625, "learning_rate": 2.1377899411668483e-05, "loss": 1.1142, "step": 5940 }, { "epoch": 0.87, "grad_norm": 0.169921875, "learning_rate": 2.1146475941743485e-05, "loss": 1.1089, "step": 5945 }, { "epoch": 0.87, "grad_norm": 0.2001953125, "learning_rate": 2.091625659152671e-05, "loss": 1.1264, "step": 5950 }, { "epoch": 0.87, "grad_norm": 0.16015625, "learning_rate": 2.0687242572324268e-05, "loss": 1.1726, "step": 5955 }, { "epoch": 0.87, "grad_norm": 0.1455078125, "learning_rate": 2.0459435089100255e-05, "loss": 0.9742, "step": 5960 }, { "epoch": 0.87, "grad_norm": 0.13671875, "learning_rate": 2.0232835340470683e-05, "loss": 0.9213, "step": 5965 }, { "epoch": 0.87, "grad_norm": 0.13671875, "learning_rate": 2.0007444518696876e-05, "loss": 1.0536, "step": 5970 }, { "epoch": 0.87, "grad_norm": 0.1767578125, "learning_rate": 1.9783263809679512e-05, "loss": 1.0193, "step": 5975 }, { "epoch": 0.87, "grad_norm": 0.15234375, "learning_rate": 1.9560294392952093e-05, "loss": 1.0072, "step": 5980 }, { "epoch": 0.87, "grad_norm": 0.16015625, "learning_rate": 1.9338537441674853e-05, "loss": 1.0281, "step": 5985 }, { "epoch": 0.87, "grad_norm": 0.185546875, "learning_rate": 1.9117994122628767e-05, "loss": 1.0693, "step": 5990 }, { "epoch": 0.88, "grad_norm": 0.1767578125, "learning_rate": 1.8898665596209175e-05, "loss": 0.9922, "step": 5995 }, { "epoch": 0.88, "grad_norm": 0.1806640625, "learning_rate": 1.8680553016419632e-05, "loss": 1.1259, "step": 6000 }, { "epoch": 0.88, "grad_norm": 0.1513671875, "learning_rate": 1.8463657530866068e-05, "loss": 1.0088, "step": 6005 }, { "epoch": 0.88, "grad_norm": 0.146484375, "learning_rate": 1.8247980280750616e-05, "loss": 1.2548, "step": 6010 }, { "epoch": 0.88, "grad_norm": 0.15234375, "learning_rate": 1.803352240086567e-05, "loss": 1.0571, "step": 6015 }, { "epoch": 0.88, "grad_norm": 0.150390625, "learning_rate": 1.7820285019587795e-05, "loss": 1.2816, "step": 6020 }, { "epoch": 0.88, "grad_norm": 0.18359375, "learning_rate": 1.7608269258871823e-05, "loss": 1.1032, "step": 6025 }, { "epoch": 0.88, "grad_norm": 0.1396484375, "learning_rate": 1.739747623424512e-05, "loss": 0.9287, "step": 6030 }, { "epoch": 0.88, "grad_norm": 0.1455078125, "learning_rate": 1.718790705480161e-05, "loss": 0.8832, "step": 6035 }, { "epoch": 0.88, "grad_norm": 0.171875, "learning_rate": 1.6979562823195738e-05, "loss": 1.0844, "step": 6040 }, { "epoch": 0.88, "grad_norm": 0.1748046875, "learning_rate": 1.6772444635636986e-05, "loss": 0.9688, "step": 6045 }, { "epoch": 0.88, "grad_norm": 0.126953125, "learning_rate": 1.656655358188386e-05, "loss": 1.0039, "step": 6050 }, { "epoch": 0.88, "grad_norm": 0.1357421875, "learning_rate": 1.6361890745238472e-05, "loss": 1.1248, "step": 6055 }, { "epoch": 0.88, "grad_norm": 0.2197265625, "learning_rate": 1.6158457202540472e-05, "loss": 0.9911, "step": 6060 }, { "epoch": 0.89, "grad_norm": 0.138671875, "learning_rate": 1.595625402416151e-05, "loss": 1.1771, "step": 6065 }, { "epoch": 0.89, "grad_norm": 0.1416015625, "learning_rate": 1.5755282273999694e-05, "loss": 0.9097, "step": 6070 }, { "epoch": 0.89, "grad_norm": 0.1650390625, "learning_rate": 1.5555543009473953e-05, "loss": 1.3623, "step": 6075 }, { "epoch": 0.89, "grad_norm": 0.1845703125, "learning_rate": 1.5357037281518522e-05, "loss": 1.1559, "step": 6080 }, { "epoch": 0.89, "grad_norm": 0.16796875, "learning_rate": 1.5159766134577174e-05, "loss": 1.0551, "step": 6085 }, { "epoch": 0.89, "grad_norm": 0.1650390625, "learning_rate": 1.4963730606598003e-05, "loss": 1.0872, "step": 6090 }, { "epoch": 0.89, "grad_norm": 0.158203125, "learning_rate": 1.476893172902785e-05, "loss": 1.1479, "step": 6095 }, { "epoch": 0.89, "grad_norm": 0.1396484375, "learning_rate": 1.4575370526806936e-05, "loss": 0.9236, "step": 6100 }, { "epoch": 0.89, "grad_norm": 0.134765625, "learning_rate": 1.4383048018363292e-05, "loss": 1.2065, "step": 6105 }, { "epoch": 0.89, "grad_norm": 0.162109375, "learning_rate": 1.4191965215607561e-05, "loss": 0.9892, "step": 6110 }, { "epoch": 0.89, "grad_norm": 0.2080078125, "learning_rate": 1.4002123123927678e-05, "loss": 1.0428, "step": 6115 }, { "epoch": 0.89, "grad_norm": 0.14453125, "learning_rate": 1.3813522742183615e-05, "loss": 1.0641, "step": 6120 }, { "epoch": 0.89, "grad_norm": 0.140625, "learning_rate": 1.3626165062701867e-05, "loss": 1.1613, "step": 6125 }, { "epoch": 0.9, "grad_norm": 0.14453125, "learning_rate": 1.3440051071270526e-05, "loss": 1.176, "step": 6130 }, { "epoch": 0.9, "grad_norm": 0.146484375, "learning_rate": 1.3255181747133944e-05, "loss": 0.9431, "step": 6135 }, { "epoch": 0.9, "grad_norm": 0.162109375, "learning_rate": 1.307155806298771e-05, "loss": 0.8613, "step": 6140 }, { "epoch": 0.9, "grad_norm": 0.171875, "learning_rate": 1.2889180984973298e-05, "loss": 1.0397, "step": 6145 }, { "epoch": 0.9, "grad_norm": 0.1396484375, "learning_rate": 1.2708051472673204e-05, "loss": 1.1134, "step": 6150 }, { "epoch": 0.9, "grad_norm": 0.150390625, "learning_rate": 1.2528170479105816e-05, "loss": 1.1562, "step": 6155 }, { "epoch": 0.9, "grad_norm": 0.1357421875, "learning_rate": 1.234953895072044e-05, "loss": 0.9906, "step": 6160 }, { "epoch": 0.9, "grad_norm": 0.1552734375, "learning_rate": 1.2172157827392228e-05, "loss": 0.9779, "step": 6165 }, { "epoch": 0.9, "grad_norm": 0.1396484375, "learning_rate": 1.1996028042417257e-05, "loss": 1.2537, "step": 6170 }, { "epoch": 0.9, "grad_norm": 0.1552734375, "learning_rate": 1.1821150522507652e-05, "loss": 0.9378, "step": 6175 }, { "epoch": 0.9, "grad_norm": 0.1318359375, "learning_rate": 1.1647526187786889e-05, "loss": 1.0642, "step": 6180 }, { "epoch": 0.9, "grad_norm": 0.140625, "learning_rate": 1.1475155951784582e-05, "loss": 1.3321, "step": 6185 }, { "epoch": 0.9, "grad_norm": 0.1572265625, "learning_rate": 1.1304040721431874e-05, "loss": 1.1456, "step": 6190 }, { "epoch": 0.9, "grad_norm": 0.1875, "learning_rate": 1.1134181397056686e-05, "loss": 1.101, "step": 6195 }, { "epoch": 0.91, "grad_norm": 0.154296875, "learning_rate": 1.0965578872379035e-05, "loss": 1.2256, "step": 6200 }, { "epoch": 0.91, "grad_norm": 0.1494140625, "learning_rate": 1.0798234034506166e-05, "loss": 0.918, "step": 6205 }, { "epoch": 0.91, "grad_norm": 0.2001953125, "learning_rate": 1.0632147763927985e-05, "loss": 1.3359, "step": 6210 }, { "epoch": 0.91, "grad_norm": 0.15625, "learning_rate": 1.0467320934512359e-05, "loss": 0.8908, "step": 6215 }, { "epoch": 0.91, "grad_norm": 0.1435546875, "learning_rate": 1.0303754413500648e-05, "loss": 1.0354, "step": 6220 }, { "epoch": 0.91, "grad_norm": 0.158203125, "learning_rate": 1.014144906150305e-05, "loss": 1.2781, "step": 6225 }, { "epoch": 0.91, "grad_norm": 0.1416015625, "learning_rate": 9.980405732494036e-06, "loss": 1.2613, "step": 6230 }, { "epoch": 0.91, "grad_norm": 0.140625, "learning_rate": 9.82062527380792e-06, "loss": 1.1093, "step": 6235 }, { "epoch": 0.91, "grad_norm": 0.1611328125, "learning_rate": 9.662108526134417e-06, "loss": 1.0236, "step": 6240 }, { "epoch": 0.91, "grad_norm": 0.1416015625, "learning_rate": 9.504856323514165e-06, "loss": 0.9299, "step": 6245 }, { "epoch": 0.91, "grad_norm": 0.1455078125, "learning_rate": 9.348869493334384e-06, "loss": 0.8213, "step": 6250 }, { "epoch": 0.91, "grad_norm": 0.1416015625, "learning_rate": 9.194148856324413e-06, "loss": 1.0443, "step": 6255 }, { "epoch": 0.91, "grad_norm": 0.142578125, "learning_rate": 9.040695226551599e-06, "loss": 0.9715, "step": 6260 }, { "epoch": 0.91, "grad_norm": 0.1455078125, "learning_rate": 8.888509411416861e-06, "loss": 0.9815, "step": 6265 }, { "epoch": 0.92, "grad_norm": 0.1708984375, "learning_rate": 8.73759221165038e-06, "loss": 1.045, "step": 6270 }, { "epoch": 0.92, "grad_norm": 0.1689453125, "learning_rate": 8.587944421307559e-06, "loss": 1.1522, "step": 6275 }, { "epoch": 0.92, "grad_norm": 0.1533203125, "learning_rate": 8.439566827764811e-06, "loss": 0.9543, "step": 6280 }, { "epoch": 0.92, "grad_norm": 0.1875, "learning_rate": 8.29246021171523e-06, "loss": 1.1236, "step": 6285 }, { "epoch": 0.92, "grad_norm": 0.154296875, "learning_rate": 8.146625347164788e-06, "loss": 1.2259, "step": 6290 }, { "epoch": 0.92, "grad_norm": 0.142578125, "learning_rate": 8.002063001427974e-06, "loss": 1.2055, "step": 6295 }, { "epoch": 0.92, "grad_norm": 0.1630859375, "learning_rate": 7.85877393512388e-06, "loss": 1.1735, "step": 6300 }, { "epoch": 0.92, "grad_norm": 0.1513671875, "learning_rate": 7.716758902172328e-06, "loss": 1.0167, "step": 6305 }, { "epoch": 0.92, "grad_norm": 0.1572265625, "learning_rate": 7.576018649789657e-06, "loss": 1.3354, "step": 6310 }, { "epoch": 0.92, "grad_norm": 0.1533203125, "learning_rate": 7.436553918484912e-06, "loss": 1.0759, "step": 6315 }, { "epoch": 0.92, "grad_norm": 0.1513671875, "learning_rate": 7.298365442056004e-06, "loss": 1.2152, "step": 6320 }, { "epoch": 0.92, "grad_norm": 0.1689453125, "learning_rate": 7.161453947585772e-06, "loss": 1.2261, "step": 6325 }, { "epoch": 0.92, "grad_norm": 0.14453125, "learning_rate": 7.025820155438151e-06, "loss": 1.2291, "step": 6330 }, { "epoch": 0.93, "grad_norm": 0.1357421875, "learning_rate": 6.891464779254452e-06, "loss": 1.0477, "step": 6335 }, { "epoch": 0.93, "grad_norm": 0.1767578125, "learning_rate": 6.758388525949483e-06, "loss": 0.9963, "step": 6340 }, { "epoch": 0.93, "grad_norm": 0.1689453125, "learning_rate": 6.626592095707984e-06, "loss": 1.0916, "step": 6345 }, { "epoch": 0.93, "grad_norm": 0.169921875, "learning_rate": 6.4960761819808365e-06, "loss": 0.8691, "step": 6350 }, { "epoch": 0.93, "grad_norm": 0.1455078125, "learning_rate": 6.366841471481477e-06, "loss": 1.0464, "step": 6355 }, { "epoch": 0.93, "grad_norm": 0.1376953125, "learning_rate": 6.238888644182178e-06, "loss": 1.017, "step": 6360 }, { "epoch": 0.93, "grad_norm": 0.169921875, "learning_rate": 6.112218373310635e-06, "loss": 0.9035, "step": 6365 }, { "epoch": 0.93, "grad_norm": 0.1650390625, "learning_rate": 5.9868313253462756e-06, "loss": 1.1066, "step": 6370 }, { "epoch": 0.93, "grad_norm": 0.1806640625, "learning_rate": 5.862728160016844e-06, "loss": 1.1089, "step": 6375 }, { "epoch": 0.93, "grad_norm": 0.12451171875, "learning_rate": 5.739909530294824e-06, "loss": 0.9805, "step": 6380 }, { "epoch": 0.93, "grad_norm": 0.1787109375, "learning_rate": 5.6183760823942385e-06, "loss": 0.9165, "step": 6385 }, { "epoch": 0.93, "grad_norm": 0.1640625, "learning_rate": 5.49812845576686e-06, "loss": 1.0781, "step": 6390 }, { "epoch": 0.93, "grad_norm": 0.1650390625, "learning_rate": 5.379167283099312e-06, "loss": 1.2878, "step": 6395 }, { "epoch": 0.93, "grad_norm": 0.15625, "learning_rate": 5.261493190309302e-06, "loss": 0.7727, "step": 6400 }, { "epoch": 0.94, "grad_norm": 0.1748046875, "learning_rate": 5.145106796542676e-06, "loss": 1.3831, "step": 6405 }, { "epoch": 0.94, "grad_norm": 0.1640625, "learning_rate": 5.030008714169892e-06, "loss": 0.8447, "step": 6410 }, { "epoch": 0.94, "grad_norm": 0.16015625, "learning_rate": 4.916199548783029e-06, "loss": 1.0932, "step": 6415 }, { "epoch": 0.94, "grad_norm": 0.1416015625, "learning_rate": 4.803679899192393e-06, "loss": 0.958, "step": 6420 }, { "epoch": 0.94, "grad_norm": 0.171875, "learning_rate": 4.692450357423522e-06, "loss": 1.147, "step": 6425 }, { "epoch": 0.94, "grad_norm": 0.1630859375, "learning_rate": 4.582511508714027e-06, "loss": 1.072, "step": 6430 }, { "epoch": 0.94, "grad_norm": 0.166015625, "learning_rate": 4.473863931510447e-06, "loss": 1.0882, "step": 6435 }, { "epoch": 0.94, "grad_norm": 0.1640625, "learning_rate": 4.366508197465285e-06, "loss": 1.2075, "step": 6440 }, { "epoch": 0.94, "grad_norm": 0.16015625, "learning_rate": 4.260444871433927e-06, "loss": 0.979, "step": 6445 }, { "epoch": 0.94, "grad_norm": 0.146484375, "learning_rate": 4.155674511471835e-06, "loss": 1.0348, "step": 6450 }, { "epoch": 0.94, "grad_norm": 0.2275390625, "learning_rate": 4.052197668831359e-06, "loss": 1.0764, "step": 6455 }, { "epoch": 0.94, "grad_norm": 0.1650390625, "learning_rate": 3.950014887959069e-06, "loss": 1.0324, "step": 6460 }, { "epoch": 0.94, "grad_norm": 0.150390625, "learning_rate": 3.849126706492679e-06, "loss": 1.1, "step": 6465 }, { "epoch": 0.94, "grad_norm": 0.1396484375, "learning_rate": 3.7495336552584302e-06, "loss": 1.0531, "step": 6470 }, { "epoch": 0.95, "grad_norm": 0.12890625, "learning_rate": 3.651236258268159e-06, "loss": 0.9983, "step": 6475 }, { "epoch": 0.95, "grad_norm": 0.14453125, "learning_rate": 3.554235032716596e-06, "loss": 1.1773, "step": 6480 }, { "epoch": 0.95, "grad_norm": 0.171875, "learning_rate": 3.4585304889785675e-06, "loss": 0.9983, "step": 6485 }, { "epoch": 0.95, "grad_norm": 0.16796875, "learning_rate": 3.3641231306064666e-06, "loss": 1.2404, "step": 6490 }, { "epoch": 0.95, "grad_norm": 0.1572265625, "learning_rate": 3.2710134543273986e-06, "loss": 1.0835, "step": 6495 }, { "epoch": 0.95, "grad_norm": 0.1591796875, "learning_rate": 3.1792019500407885e-06, "loss": 0.9551, "step": 6500 }, { "epoch": 0.95, "grad_norm": 0.150390625, "learning_rate": 3.088689100815556e-06, "loss": 1.1014, "step": 6505 }, { "epoch": 0.95, "grad_norm": 0.1484375, "learning_rate": 2.9994753828878617e-06, "loss": 1.1044, "step": 6510 }, { "epoch": 0.95, "grad_norm": 0.146484375, "learning_rate": 2.911561265658308e-06, "loss": 1.0531, "step": 6515 }, { "epoch": 0.95, "grad_norm": 0.1357421875, "learning_rate": 2.8249472116896878e-06, "loss": 0.9011, "step": 6520 }, { "epoch": 0.95, "grad_norm": 0.1494140625, "learning_rate": 2.739633676704462e-06, "loss": 0.9713, "step": 6525 }, { "epoch": 0.95, "grad_norm": 0.150390625, "learning_rate": 2.655621109582368e-06, "loss": 1.0419, "step": 6530 }, { "epoch": 0.95, "grad_norm": 0.1337890625, "learning_rate": 2.572909952358066e-06, "loss": 1.0271, "step": 6535 }, { "epoch": 0.96, "grad_norm": 0.1474609375, "learning_rate": 2.4915006402187467e-06, "loss": 1.0825, "step": 6540 }, { "epoch": 0.96, "grad_norm": 0.1611328125, "learning_rate": 2.4113936015020266e-06, "loss": 1.1312, "step": 6545 }, { "epoch": 0.96, "grad_norm": 0.1748046875, "learning_rate": 2.332589257693446e-06, "loss": 1.1144, "step": 6550 }, { "epoch": 0.96, "grad_norm": 0.1650390625, "learning_rate": 2.255088023424501e-06, "loss": 1.0819, "step": 6555 }, { "epoch": 0.96, "grad_norm": 0.146484375, "learning_rate": 2.1788903064701717e-06, "loss": 1.1108, "step": 6560 }, { "epoch": 0.96, "grad_norm": 0.140625, "learning_rate": 2.103996507747147e-06, "loss": 1.0079, "step": 6565 }, { "epoch": 0.96, "grad_norm": 0.146484375, "learning_rate": 2.0304070213112978e-06, "loss": 0.8256, "step": 6570 }, { "epoch": 0.96, "grad_norm": 0.1689453125, "learning_rate": 1.9581222343560134e-06, "loss": 1.0926, "step": 6575 }, { "epoch": 0.96, "grad_norm": 0.1484375, "learning_rate": 1.8871425272098676e-06, "loss": 1.0518, "step": 6580 }, { "epoch": 0.96, "grad_norm": 0.1650390625, "learning_rate": 1.8174682733346493e-06, "loss": 1.0388, "step": 6585 }, { "epoch": 0.96, "grad_norm": 0.1845703125, "learning_rate": 1.7490998393236145e-06, "loss": 1.0957, "step": 6590 }, { "epoch": 0.96, "grad_norm": 0.14453125, "learning_rate": 1.6820375848993208e-06, "loss": 0.988, "step": 6595 }, { "epoch": 0.96, "grad_norm": 0.166015625, "learning_rate": 1.6162818629118227e-06, "loss": 1.0691, "step": 6600 }, { "epoch": 0.96, "grad_norm": 0.1435546875, "learning_rate": 1.551833019336868e-06, "loss": 1.3649, "step": 6605 }, { "epoch": 0.97, "grad_norm": 0.2041015625, "learning_rate": 1.4886913932739278e-06, "loss": 1.1142, "step": 6610 }, { "epoch": 0.97, "grad_norm": 0.15625, "learning_rate": 1.4268573169446408e-06, "loss": 1.1082, "step": 6615 }, { "epoch": 0.97, "grad_norm": 0.16796875, "learning_rate": 1.3663311156908164e-06, "loss": 0.9187, "step": 6620 }, { "epoch": 0.97, "grad_norm": 0.193359375, "learning_rate": 1.307113107972907e-06, "loss": 1.0999, "step": 6625 }, { "epoch": 0.97, "grad_norm": 0.1513671875, "learning_rate": 1.2492036053682043e-06, "loss": 0.9749, "step": 6630 }, { "epoch": 0.97, "grad_norm": 0.244140625, "learning_rate": 1.1926029125693406e-06, "loss": 0.9945, "step": 6635 }, { "epoch": 0.97, "grad_norm": 0.21484375, "learning_rate": 1.1373113273825675e-06, "loss": 1.1462, "step": 6640 }, { "epoch": 0.97, "grad_norm": 0.1572265625, "learning_rate": 1.0833291407262025e-06, "loss": 0.9899, "step": 6645 }, { "epoch": 0.97, "grad_norm": 0.1572265625, "learning_rate": 1.0306566366291292e-06, "loss": 0.9839, "step": 6650 }, { "epoch": 0.97, "grad_norm": 0.1669921875, "learning_rate": 9.792940922293825e-07, "loss": 1.1274, "step": 6655 }, { "epoch": 0.97, "grad_norm": 0.173828125, "learning_rate": 9.292417777724827e-07, "loss": 1.2596, "step": 6660 }, { "epoch": 0.97, "grad_norm": 0.1201171875, "learning_rate": 8.804999566101868e-07, "loss": 0.9965, "step": 6665 }, { "epoch": 0.97, "grad_norm": 0.14453125, "learning_rate": 8.330688851990453e-07, "loss": 1.2265, "step": 6670 }, { "epoch": 0.97, "grad_norm": 0.146484375, "learning_rate": 7.869488130991254e-07, "loss": 0.9328, "step": 6675 }, { "epoch": 0.98, "grad_norm": 0.154296875, "learning_rate": 7.421399829725394e-07, "loss": 0.9776, "step": 6680 }, { "epoch": 0.98, "grad_norm": 0.1552734375, "learning_rate": 6.98642630582308e-07, "loss": 1.2955, "step": 6685 }, { "epoch": 0.98, "grad_norm": 0.1357421875, "learning_rate": 6.564569847910262e-07, "loss": 1.0571, "step": 6690 }, { "epoch": 0.98, "grad_norm": 0.17578125, "learning_rate": 6.15583267559755e-07, "loss": 1.1025, "step": 6695 }, { "epoch": 0.98, "grad_norm": 0.140625, "learning_rate": 5.760216939468266e-07, "loss": 1.13, "step": 6700 }, { "epoch": 0.98, "grad_norm": 0.1279296875, "learning_rate": 5.377724721066235e-07, "loss": 1.0371, "step": 6705 }, { "epoch": 0.98, "grad_norm": 0.1435546875, "learning_rate": 5.008358032885518e-07, "loss": 0.9811, "step": 6710 }, { "epoch": 0.98, "grad_norm": 0.1455078125, "learning_rate": 4.652118818360418e-07, "loss": 0.9162, "step": 6715 }, { "epoch": 0.98, "grad_norm": 0.18359375, "learning_rate": 4.309008951854099e-07, "loss": 1.0896, "step": 6720 }, { "epoch": 0.98, "grad_norm": 0.1552734375, "learning_rate": 3.9790302386499835e-07, "loss": 1.1249, "step": 6725 }, { "epoch": 0.98, "grad_norm": 0.130859375, "learning_rate": 3.6621844149400954e-07, "loss": 0.928, "step": 6730 }, { "epoch": 0.98, "grad_norm": 0.1640625, "learning_rate": 3.3584731478189524e-07, "loss": 1.1065, "step": 6735 }, { "epoch": 0.98, "grad_norm": 0.154296875, "learning_rate": 3.067898035272465e-07, "loss": 1.0509, "step": 6740 }, { "epoch": 0.98, "grad_norm": 0.142578125, "learning_rate": 2.7904606061698865e-07, "loss": 0.8672, "step": 6745 }, { "epoch": 0.99, "grad_norm": 0.1611328125, "learning_rate": 2.5261623202571527e-07, "loss": 0.9601, "step": 6750 }, { "epoch": 0.99, "grad_norm": 0.1484375, "learning_rate": 2.275004568147443e-07, "loss": 1.0955, "step": 6755 }, { "epoch": 0.99, "grad_norm": 0.2177734375, "learning_rate": 2.036988671314799e-07, "loss": 1.1564, "step": 6760 }, { "epoch": 0.99, "grad_norm": 0.1513671875, "learning_rate": 1.8121158820869066e-07, "loss": 1.0113, "step": 6765 }, { "epoch": 0.99, "grad_norm": 0.15625, "learning_rate": 1.6003873836389904e-07, "loss": 0.9622, "step": 6770 }, { "epoch": 0.99, "grad_norm": 0.1181640625, "learning_rate": 1.4018042899868744e-07, "loss": 1.1171, "step": 6775 }, { "epoch": 0.99, "grad_norm": 0.17578125, "learning_rate": 1.216367645981431e-07, "loss": 1.1264, "step": 6780 }, { "epoch": 0.99, "grad_norm": 0.1484375, "learning_rate": 1.0440784273030302e-07, "loss": 0.8995, "step": 6785 }, { "epoch": 0.99, "grad_norm": 0.1552734375, "learning_rate": 8.84937540456543e-08, "loss": 0.9804, "step": 6790 }, { "epoch": 0.99, "grad_norm": 0.1435546875, "learning_rate": 7.389458227666234e-08, "loss": 1.2118, "step": 6795 }, { "epoch": 0.99, "grad_norm": 0.146484375, "learning_rate": 6.061040423735453e-08, "loss": 1.2899, "step": 6800 }, { "epoch": 0.99, "grad_norm": 0.1689453125, "learning_rate": 4.864128982276506e-08, "loss": 1.1498, "step": 6805 }, { "epoch": 0.99, "grad_norm": 0.1279296875, "learning_rate": 3.798730200876843e-08, "loss": 1.0363, "step": 6810 }, { "epoch": 1.0, "grad_norm": 0.1513671875, "learning_rate": 2.8648496851663152e-08, "loss": 1.0867, "step": 6815 }, { "epoch": 1.0, "grad_norm": 0.13671875, "learning_rate": 2.062492348783862e-08, "loss": 0.9538, "step": 6820 }, { "epoch": 1.0, "grad_norm": 0.1591796875, "learning_rate": 1.3916624133580857e-08, "loss": 1.1978, "step": 6825 }, { "epoch": 1.0, "grad_norm": 0.140625, "learning_rate": 8.523634084794951e-09, "loss": 0.9851, "step": 6830 }, { "epoch": 1.0, "grad_norm": 0.1455078125, "learning_rate": 4.445981716866276e-09, "loss": 1.0124, "step": 6835 }, { "epoch": 1.0, "grad_norm": 0.15625, "learning_rate": 1.6836884844662059e-09, "loss": 1.0845, "step": 6840 }, { "epoch": 1.0, "grad_norm": 0.1513671875, "learning_rate": 2.3676892152435516e-10, "loss": 1.007, "step": 6845 }, { "epoch": 1.0, "eval_loss": 1.0585945844650269, "eval_runtime": 12437.9638, "eval_samples_per_second": 0.11, "eval_steps_per_second": 0.11, "step": 6848 } ], "logging_steps": 5, "max_steps": 6848, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 5.565477044449444e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }