{
  "best_global_step": 6681,
  "best_metric": 3.168698787689209,
  "best_model_checkpoint": "./luc-bat-poet-model\\checkpoint-6681",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 6681,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00044917324050419694,
      "grad_norm": 5.473126411437988,
      "learning_rate": 0.0,
      "loss": 5.4778,
      "step": 1
    },
    {
      "epoch": 0.011229331012604924,
      "grad_norm": 0.8930467963218689,
      "learning_rate": 0.00012,
      "loss": 5.0452,
      "step": 25
    },
    {
      "epoch": 0.02245866202520985,
      "grad_norm": 0.7617964744567871,
      "learning_rate": 0.000245,
      "loss": 4.5144,
      "step": 50
    },
    {
      "epoch": 0.03368799303781477,
      "grad_norm": 0.8281979560852051,
      "learning_rate": 0.00037,
      "loss": 4.3613,
      "step": 75
    },
    {
      "epoch": 0.0449173240504197,
      "grad_norm": 0.8189941644668579,
      "learning_rate": 0.000495,
      "loss": 4.2239,
      "step": 100
    },
    {
      "epoch": 0.05614665506302462,
      "grad_norm": 1.01946222782135,
      "learning_rate": 0.00062,
      "loss": 4.1386,
      "step": 125
    },
    {
      "epoch": 0.06737598607562954,
      "grad_norm": 0.9489204287528992,
      "learning_rate": 0.000745,
      "loss": 4.0065,
      "step": 150
    },
    {
      "epoch": 0.07860531708823447,
      "grad_norm": 1.6589257717132568,
      "learning_rate": 0.00087,
      "loss": 3.9373,
      "step": 175
    },
    {
      "epoch": 0.0898346481008394,
      "grad_norm": 1.1351829767227173,
      "learning_rate": 0.000995,
      "loss": 3.9028,
      "step": 200
    },
    {
      "epoch": 0.10106397911344432,
      "grad_norm": 0.7443220615386963,
      "learning_rate": 0.000999995416032659,
      "loss": 3.8687,
      "step": 225
    },
    {
      "epoch": 0.11229331012604923,
      "grad_norm": 0.8907812833786011,
      "learning_rate": 0.0009999808922703088,
      "loss": 3.8144,
      "step": 250
    },
    {
      "epoch": 0.12352264113865416,
      "grad_norm": 0.651372492313385,
      "learning_rate": 0.0009999564210436207,
      "loss": 3.8007,
      "step": 275
    },
    {
      "epoch": 0.1347519721512591,
      "grad_norm": 0.7908700704574585,
      "learning_rate": 0.000999922002839467,
      "loss": 3.7684,
      "step": 300
    },
    {
      "epoch": 0.145981303163864,
      "grad_norm": 0.6001281142234802,
      "learning_rate": 0.0009998776383426215,
      "loss": 3.743,
      "step": 325
    },
    {
      "epoch": 0.15721063417646894,
      "grad_norm": 0.6484293937683105,
      "learning_rate": 0.0009998233284357462,
      "loss": 3.7416,
      "step": 350
    },
    {
      "epoch": 0.16843996518907386,
      "grad_norm": 0.7714053988456726,
      "learning_rate": 0.0009997590741993743,
      "loss": 3.7108,
      "step": 375
    },
    {
      "epoch": 0.1796692962016788,
      "grad_norm": 0.6280301213264465,
      "learning_rate": 0.0009996848769118882,
      "loss": 3.6854,
      "step": 400
    },
    {
      "epoch": 0.19089862721428372,
      "grad_norm": 0.644794225692749,
      "learning_rate": 0.0009996007380494937,
      "loss": 3.6737,
      "step": 425
    },
    {
      "epoch": 0.20212795822688864,
      "grad_norm": 0.6872850060462952,
      "learning_rate": 0.0009995066592861919,
      "loss": 3.662,
      "step": 450
    },
    {
      "epoch": 0.21335728923949357,
      "grad_norm": 0.601828396320343,
      "learning_rate": 0.0009994026424937441,
      "loss": 3.678,
      "step": 475
    },
    {
      "epoch": 0.22458662025209847,
      "grad_norm": 0.6280804872512817,
      "learning_rate": 0.0009992886897416365,
      "loss": 3.6652,
      "step": 500
    },
    {
      "epoch": 0.2358159512647034,
      "grad_norm": 0.5711939930915833,
      "learning_rate": 0.0009991648032970373,
      "loss": 3.627,
      "step": 525
    },
    {
      "epoch": 0.24704528227730832,
      "grad_norm": 0.5883836150169373,
      "learning_rate": 0.000999030985624753,
      "loss": 3.6433,
      "step": 550
    },
    {
      "epoch": 0.25827461328991325,
      "grad_norm": 0.5956864356994629,
      "learning_rate": 0.000998887239387178,
      "loss": 3.6207,
      "step": 575
    },
    {
      "epoch": 0.2695039443025182,
      "grad_norm": 0.577458918094635,
      "learning_rate": 0.000998733567444243,
      "loss": 3.6074,
      "step": 600
    },
    {
      "epoch": 0.2807332753151231,
      "grad_norm": 0.8079866170883179,
      "learning_rate": 0.0009985699728533573,
      "loss": 3.6119,
      "step": 625
    },
    {
      "epoch": 0.291962606327728,
      "grad_norm": 0.5645362138748169,
      "learning_rate": 0.0009983964588693478,
      "loss": 3.6411,
      "step": 650
    },
    {
      "epoch": 0.30319193734033295,
      "grad_norm": 0.5666602253913879,
      "learning_rate": 0.0009982130289443944,
      "loss": 3.5725,
      "step": 675
    },
    {
      "epoch": 0.3144212683529379,
      "grad_norm": 0.6160576343536377,
      "learning_rate": 0.0009980196867279626,
      "loss": 3.6055,
      "step": 700
    },
    {
      "epoch": 0.3256505993655428,
      "grad_norm": 0.6656046509742737,
      "learning_rate": 0.0009978164360667286,
      "loss": 3.5776,
      "step": 725
    },
    {
      "epoch": 0.33687993037814773,
      "grad_norm": 0.6236292123794556,
      "learning_rate": 0.0009976032810045043,
      "loss": 3.5509,
      "step": 750
    },
    {
      "epoch": 0.34810926139075266,
      "grad_norm": 0.5736802816390991,
      "learning_rate": 0.0009973802257821566,
      "loss": 3.557,
      "step": 775
    },
    {
      "epoch": 0.3593385924033576,
      "grad_norm": 0.5729912519454956,
      "learning_rate": 0.000997147274837523,
      "loss": 3.5374,
      "step": 800
    },
    {
      "epoch": 0.3705679234159625,
      "grad_norm": 0.6009777188301086,
      "learning_rate": 0.000996904432805323,
      "loss": 3.5564,
      "step": 825
    },
    {
      "epoch": 0.38179725442856743,
      "grad_norm": 0.5442889928817749,
      "learning_rate": 0.0009966517045170659,
      "loss": 3.5109,
      "step": 850
    },
    {
      "epoch": 0.39302658544117236,
      "grad_norm": 0.5199196338653564,
      "learning_rate": 0.0009963890950009549,
      "loss": 3.526,
      "step": 875
    },
    {
      "epoch": 0.4042559164537773,
      "grad_norm": 0.5444336533546448,
      "learning_rate": 0.000996116609481788,
      "loss": 3.5177,
      "step": 900
    },
    {
      "epoch": 0.4154852474663822,
      "grad_norm": 0.5710541009902954,
      "learning_rate": 0.000995834253380852,
      "loss": 3.4767,
      "step": 925
    },
    {
      "epoch": 0.42671457847898714,
      "grad_norm": 0.5203866362571716,
      "learning_rate": 0.000995542032315816,
      "loss": 3.4972,
      "step": 950
    },
    {
      "epoch": 0.43794390949159206,
      "grad_norm": 0.545213520526886,
      "learning_rate": 0.0009952399521006192,
      "loss": 3.4623,
      "step": 975
    },
    {
      "epoch": 0.44917324050419694,
      "grad_norm": 0.5384295582771301,
      "learning_rate": 0.0009949280187453561,
      "loss": 3.4367,
      "step": 1000
    },
    {
      "epoch": 0.46040257151680186,
      "grad_norm": 0.5814492106437683,
      "learning_rate": 0.0009946062384561555,
      "loss": 3.441,
      "step": 1025
    },
    {
      "epoch": 0.4716319025294068,
      "grad_norm": 0.5313106179237366,
      "learning_rate": 0.000994274617635058,
      "loss": 3.4336,
      "step": 1050
    },
    {
      "epoch": 0.4828612335420117,
      "grad_norm": 0.6174953579902649,
      "learning_rate": 0.0009939331628798882,
      "loss": 3.3919,
      "step": 1075
    },
    {
      "epoch": 0.49409056455461664,
      "grad_norm": 0.566247284412384,
      "learning_rate": 0.0009935818809841239,
      "loss": 3.4281,
      "step": 1100
    },
    {
      "epoch": 0.5053198955672216,
      "grad_norm": 0.5577934384346008,
      "learning_rate": 0.0009932207789367603,
      "loss": 3.4043,
      "step": 1125
    },
    {
      "epoch": 0.5165492265798265,
      "grad_norm": 0.5082374811172485,
      "learning_rate": 0.0009928498639221715,
      "loss": 3.3853,
      "step": 1150
    },
    {
      "epoch": 0.5277785575924314,
      "grad_norm": 0.49451902508735657,
      "learning_rate": 0.0009924691433199674,
      "loss": 3.3794,
      "step": 1175
    },
    {
      "epoch": 0.5390078886050363,
      "grad_norm": 0.5019585490226746,
      "learning_rate": 0.0009920786247048464,
      "loss": 3.4127,
      "step": 1200
    },
    {
      "epoch": 0.5502372196176413,
      "grad_norm": 0.599949300289154,
      "learning_rate": 0.0009916783158464455,
      "loss": 3.3923,
      "step": 1225
    },
    {
      "epoch": 0.5614665506302462,
      "grad_norm": 0.5792480707168579,
      "learning_rate": 0.0009912682247091853,
      "loss": 3.3656,
      "step": 1250
    },
    {
      "epoch": 0.5726958816428511,
      "grad_norm": 0.5839366912841797,
      "learning_rate": 0.0009908483594521116,
      "loss": 3.3895,
      "step": 1275
    },
    {
      "epoch": 0.583925212655456,
      "grad_norm": 0.5739409923553467,
      "learning_rate": 0.0009904187284287332,
      "loss": 3.3506,
      "step": 1300
    },
    {
      "epoch": 0.595154543668061,
      "grad_norm": 0.5897430777549744,
      "learning_rate": 0.0009899793401868546,
      "loss": 3.3247,
      "step": 1325
    },
    {
      "epoch": 0.6063838746806659,
      "grad_norm": 0.5063382983207703,
      "learning_rate": 0.0009895302034684083,
      "loss": 3.3369,
      "step": 1350
    },
    {
      "epoch": 0.6176132056932708,
      "grad_norm": 0.5431721806526184,
      "learning_rate": 0.0009890713272092786,
      "loss": 3.3446,
      "step": 1375
    },
    {
      "epoch": 0.6288425367058758,
      "grad_norm": 0.4924439489841461,
      "learning_rate": 0.0009886027205391248,
      "loss": 3.3377,
      "step": 1400
    },
    {
      "epoch": 0.6400718677184807,
      "grad_norm": 0.54843670129776,
      "learning_rate": 0.0009881243927811992,
      "loss": 3.345,
      "step": 1425
    },
    {
      "epoch": 0.6513011987310856,
      "grad_norm": 0.5399184226989746,
      "learning_rate": 0.0009876363534521626,
      "loss": 3.3344,
      "step": 1450
    },
    {
      "epoch": 0.6625305297436905,
      "grad_norm": 0.5677058696746826,
      "learning_rate": 0.0009871386122618933,
      "loss": 3.3328,
      "step": 1475
    },
    {
      "epoch": 0.6737598607562955,
      "grad_norm": 0.482010155916214,
      "learning_rate": 0.0009866311791132953,
      "loss": 3.3433,
      "step": 1500
    },
    {
      "epoch": 0.6849891917689004,
      "grad_norm": 0.49796995520591736,
      "learning_rate": 0.000986114064102101,
      "loss": 3.3413,
      "step": 1525
    },
    {
      "epoch": 0.6962185227815053,
      "grad_norm": 0.4976087808609009,
      "learning_rate": 0.0009855872775166696,
      "loss": 3.3176,
      "step": 1550
    },
    {
      "epoch": 0.7074478537941102,
      "grad_norm": 0.5586963891983032,
      "learning_rate": 0.0009850508298377832,
      "loss": 3.3188,
      "step": 1575
    },
    {
      "epoch": 0.7186771848067152,
      "grad_norm": 0.45419466495513916,
      "learning_rate": 0.0009845047317384378,
      "loss": 3.2902,
      "step": 1600
    },
    {
      "epoch": 0.7299065158193201,
      "grad_norm": 0.5402281880378723,
      "learning_rate": 0.0009839489940836317,
      "loss": 3.2893,
      "step": 1625
    },
    {
      "epoch": 0.741135846831925,
      "grad_norm": 0.5439639687538147,
      "learning_rate": 0.0009833836279301484,
      "loss": 3.3065,
      "step": 1650
    },
    {
      "epoch": 0.7523651778445299,
      "grad_norm": 0.49050387740135193,
      "learning_rate": 0.0009828086445263368,
      "loss": 3.2796,
      "step": 1675
    },
    {
      "epoch": 0.7635945088571349,
      "grad_norm": 0.5266304016113281,
      "learning_rate": 0.000982224055311888,
      "loss": 3.29,
      "step": 1700
    },
    {
      "epoch": 0.7748238398697398,
      "grad_norm": 0.5335237979888916,
      "learning_rate": 0.0009816298719176073,
      "loss": 3.2889,
      "step": 1725
    },
    {
      "epoch": 0.7860531708823447,
      "grad_norm": 0.5236843824386597,
      "learning_rate": 0.0009810261061651826,
      "loss": 3.2758,
      "step": 1750
    },
    {
      "epoch": 0.7972825018949496,
      "grad_norm": 0.541266679763794,
      "learning_rate": 0.0009804127700669496,
      "loss": 3.3053,
      "step": 1775
    },
    {
      "epoch": 0.8085118329075546,
      "grad_norm": 0.47298872470855713,
      "learning_rate": 0.0009797898758256525,
      "loss": 3.277,
      "step": 1800
    },
    {
      "epoch": 0.8197411639201595,
      "grad_norm": 0.5097940564155579,
      "learning_rate": 0.0009791574358342014,
      "loss": 3.2537,
      "step": 1825
    },
    {
      "epoch": 0.8309704949327644,
      "grad_norm": 0.5668537616729736,
      "learning_rate": 0.0009785154626754259,
      "loss": 3.2514,
      "step": 1850
    },
    {
      "epoch": 0.8421998259453694,
      "grad_norm": 0.5071256160736084,
      "learning_rate": 0.000977863969121824,
      "loss": 3.2467,
      "step": 1875
    },
    {
      "epoch": 0.8534291569579743,
      "grad_norm": 0.4892118573188782,
      "learning_rate": 0.000977202968135309,
      "loss": 3.2772,
      "step": 1900
    },
    {
      "epoch": 0.8646584879705792,
      "grad_norm": 0.463008850812912,
      "learning_rate": 0.000976532472866951,
      "loss": 3.2492,
      "step": 1925
    },
    {
      "epoch": 0.8758878189831841,
      "grad_norm": 0.5179749727249146,
      "learning_rate": 0.0009758524966567152,
      "loss": 3.2289,
      "step": 1950
    },
    {
      "epoch": 0.8871171499957891,
      "grad_norm": 0.5276876091957092,
      "learning_rate": 0.000975163053033197,
      "loss": 3.2474,
      "step": 1975
    },
    {
      "epoch": 0.8983464810083939,
      "grad_norm": 0.5618298649787903,
      "learning_rate": 0.000974464155713352,
      "loss": 3.2652,
      "step": 2000
    },
    {
      "epoch": 0.9095758120209988,
      "grad_norm": 0.4800003468990326,
      "learning_rate": 0.0009737558186022242,
      "loss": 3.2424,
      "step": 2025
    },
    {
      "epoch": 0.9208051430336037,
      "grad_norm": 0.47998154163360596,
      "learning_rate": 0.0009730380557926682,
      "loss": 3.2737,
      "step": 2050
    },
    {
      "epoch": 0.9320344740462086,
      "grad_norm": 0.5447694659233093,
      "learning_rate": 0.00097231088156507,
      "loss": 3.249,
      "step": 2075
    },
    {
      "epoch": 0.9432638050588136,
      "grad_norm": 0.5559055209159851,
      "learning_rate": 0.0009715743103870615,
      "loss": 3.2566,
      "step": 2100
    },
    {
      "epoch": 0.9544931360714185,
      "grad_norm": 0.478614866733551,
      "learning_rate": 0.0009708283569132341,
      "loss": 3.2076,
      "step": 2125
    },
    {
      "epoch": 0.9657224670840234,
      "grad_norm": 0.44457143545150757,
      "learning_rate": 0.000970073035984846,
      "loss": 3.2052,
      "step": 2150
    },
    {
      "epoch": 0.9769517980966284,
      "grad_norm": 0.5057160258293152,
      "learning_rate": 0.0009693083626295274,
      "loss": 3.1944,
      "step": 2175
    },
    {
      "epoch": 0.9881811291092333,
      "grad_norm": 0.487543523311615,
      "learning_rate": 0.0009685343520609816,
      "loss": 3.2862,
      "step": 2200
    },
    {
      "epoch": 0.9994104601218382,
      "grad_norm": 0.5547086000442505,
      "learning_rate": 0.0009677510196786822,
      "loss": 3.2249,
      "step": 2225
    },
    {
      "epoch": 1.0,
      "eval_loss": 3.2925968170166016,
      "eval_runtime": 230.4828,
      "eval_samples_per_second": 54.598,
      "eval_steps_per_second": 54.598,
      "step": 2227
    },
    {
      "epoch": 1.0103309845315964,
      "grad_norm": 0.5509684085845947,
      "learning_rate": 0.0009669583810675666,
      "loss": 3.0297,
      "step": 2250
    },
    {
      "epoch": 1.0215603155442015,
      "grad_norm": 0.5036989450454712,
      "learning_rate": 0.0009661564519977263,
      "loss": 2.9815,
      "step": 2275
    },
    {
      "epoch": 1.0327896465568063,
      "grad_norm": 0.5602796673774719,
      "learning_rate": 0.0009653452484240923,
      "loss": 2.994,
      "step": 2300
    },
    {
      "epoch": 1.0440189775694113,
      "grad_norm": 0.5729214549064636,
      "learning_rate": 0.0009645247864861191,
      "loss": 2.9956,
      "step": 2325
    },
    {
      "epoch": 1.0552483085820161,
      "grad_norm": 0.5456846356391907,
      "learning_rate": 0.0009636950825074618,
      "loss": 2.985,
      "step": 2350
    },
    {
      "epoch": 1.0664776395946212,
      "grad_norm": 0.544643223285675,
      "learning_rate": 0.0009628561529956529,
      "loss": 2.9973,
      "step": 2375
    },
    {
      "epoch": 1.077706970607226,
      "grad_norm": 0.5306060314178467,
      "learning_rate": 0.0009620080146417731,
      "loss": 3.0053,
      "step": 2400
    },
    {
      "epoch": 1.088936301619831,
      "grad_norm": 0.5001072883605957,
      "learning_rate": 0.0009611506843201193,
      "loss": 3.0244,
      "step": 2425
    },
    {
      "epoch": 1.1001656326324358,
      "grad_norm": 0.52583909034729,
      "learning_rate": 0.0009602841790878688,
      "loss": 3.0266,
      "step": 2450
    },
    {
      "epoch": 1.1113949636450409,
      "grad_norm": 0.536445677280426,
      "learning_rate": 0.0009594085161847405,
      "loss": 3.0124,
      "step": 2475
    },
    {
      "epoch": 1.1226242946576457,
      "grad_norm": 0.5341405272483826,
      "learning_rate": 0.0009585237130326508,
      "loss": 3.0272,
      "step": 2500
    },
    {
      "epoch": 1.1338536256702507,
      "grad_norm": 0.5340954661369324,
      "learning_rate": 0.0009576297872353686,
      "loss": 3.0152,
      "step": 2525
    },
    {
      "epoch": 1.1450829566828555,
      "grad_norm": 0.4479193687438965,
      "learning_rate": 0.0009567267565781628,
      "loss": 3.0202,
      "step": 2550
    },
    {
      "epoch": 1.1563122876954606,
      "grad_norm": 0.5316035747528076,
      "learning_rate": 0.0009558146390274512,
      "loss": 3.015,
      "step": 2575
    },
    {
      "epoch": 1.1675416187080654,
      "grad_norm": 0.5239371061325073,
      "learning_rate": 0.0009548934527304407,
      "loss": 3.0618,
      "step": 2600
    },
    {
      "epoch": 1.1787709497206704,
      "grad_norm": 0.6486944556236267,
      "learning_rate": 0.0009539632160147672,
      "loss": 3.0004,
      "step": 2625
    },
    {
      "epoch": 1.1900002807332752,
      "grad_norm": 0.5308857560157776,
      "learning_rate": 0.0009530239473881313,
      "loss": 3.0425,
      "step": 2650
    },
    {
      "epoch": 1.2012296117458803,
      "grad_norm": 0.5612149834632874,
      "learning_rate": 0.0009520756655379293,
      "loss": 3.0447,
      "step": 2675
    },
    {
      "epoch": 1.212458942758485,
      "grad_norm": 0.5429418683052063,
      "learning_rate": 0.0009511183893308821,
      "loss": 2.9887,
      "step": 2700
    },
    {
      "epoch": 1.2236882737710901,
      "grad_norm": 0.5688816905021667,
      "learning_rate": 0.0009501521378126594,
      "loss": 2.9961,
      "step": 2725
    },
    {
      "epoch": 1.234917604783695,
      "grad_norm": 0.5409512519836426,
      "learning_rate": 0.0009491769302075008,
      "loss": 3.0,
      "step": 2750
    },
    {
      "epoch": 1.2461469357963,
      "grad_norm": 0.5384955406188965,
      "learning_rate": 0.0009481927859178337,
      "loss": 3.0271,
      "step": 2775
    },
    {
      "epoch": 1.2573762668089048,
      "grad_norm": 0.5857961177825928,
      "learning_rate": 0.0009471997245238865,
      "loss": 2.9983,
      "step": 2800
    },
    {
      "epoch": 1.2686055978215098,
      "grad_norm": 0.5337027907371521,
      "learning_rate": 0.0009461977657833003,
      "loss": 3.0552,
      "step": 2825
    },
    {
      "epoch": 1.2798349288341146,
      "grad_norm": 0.5078946352005005,
      "learning_rate": 0.0009451869296307341,
      "loss": 3.0191,
      "step": 2850
    },
    {
      "epoch": 1.2910642598467197,
      "grad_norm": 0.5108660459518433,
      "learning_rate": 0.00094416723617747,
      "loss": 3.0234,
      "step": 2875
    },
    {
      "epoch": 1.3022935908593245,
      "grad_norm": 0.5631129741668701,
      "learning_rate": 0.0009431387057110118,
      "loss": 3.0319,
      "step": 2900
    },
    {
      "epoch": 1.3135229218719295,
      "grad_norm": 0.5249589085578918,
      "learning_rate": 0.0009421013586946816,
      "loss": 2.9866,
      "step": 2925
    },
    {
      "epoch": 1.3247522528845344,
      "grad_norm": 0.4992469251155853,
      "learning_rate": 0.000941055215767213,
      "loss": 3.0144,
      "step": 2950
    },
    {
      "epoch": 1.3359815838971394,
      "grad_norm": 0.4509263336658478,
      "learning_rate": 0.0009400002977423405,
      "loss": 3.0092,
      "step": 2975
    },
    {
      "epoch": 1.3472109149097442,
      "grad_norm": 0.515438973903656,
      "learning_rate": 0.0009389366256083849,
      "loss": 2.9993,
      "step": 3000
    },
    {
      "epoch": 1.3584402459223492,
      "grad_norm": 0.5087840557098389,
      "learning_rate": 0.0009378642205278363,
      "loss": 3.0242,
      "step": 3025
    },
    {
      "epoch": 1.369669576934954,
      "grad_norm": 0.5046051144599915,
      "learning_rate": 0.0009367831038369326,
      "loss": 2.9971,
      "step": 3050
    },
    {
      "epoch": 1.380898907947559,
      "grad_norm": 0.5728681087493896,
      "learning_rate": 0.0009356932970452353,
      "loss": 3.0292,
      "step": 3075
    },
    {
      "epoch": 1.392128238960164,
      "grad_norm": 0.5724380016326904,
      "learning_rate": 0.0009345948218352014,
      "loss": 3.0098,
      "step": 3100
    },
    {
      "epoch": 1.403357569972769,
      "grad_norm": 0.5322164297103882,
      "learning_rate": 0.0009334877000617518,
      "loss": 2.9968,
      "step": 3125
    },
    {
      "epoch": 1.4145869009853738,
      "grad_norm": 0.558423638343811,
      "learning_rate": 0.0009323719537518374,
      "loss": 3.0334,
      "step": 3150
    },
    {
      "epoch": 1.4258162319979788,
      "grad_norm": 0.5415078997612,
      "learning_rate": 0.0009312476051039994,
      "loss": 3.0313,
      "step": 3175
    },
    {
      "epoch": 1.4370455630105836,
      "grad_norm": 0.46919873356819153,
      "learning_rate": 0.0009301146764879292,
      "loss": 2.9992,
      "step": 3200
    },
    {
      "epoch": 1.4482748940231884,
      "grad_norm": 0.5965465903282166,
      "learning_rate": 0.0009289731904440217,
      "loss": 3.0071,
      "step": 3225
    },
    {
      "epoch": 1.4595042250357935,
      "grad_norm": 0.4882059693336487,
      "learning_rate": 0.0009278231696829288,
      "loss": 2.968,
      "step": 3250
    },
    {
      "epoch": 1.4707335560483985,
      "grad_norm": 0.6297493577003479,
      "learning_rate": 0.0009266646370851055,
      "loss": 3.0411,
      "step": 3275
    },
    {
      "epoch": 1.4819628870610033,
      "grad_norm": 0.5603842735290527,
      "learning_rate": 0.0009254976157003563,
      "loss": 3.0203,
      "step": 3300
    },
    {
      "epoch": 1.4931922180736081,
      "grad_norm": 0.49509698152542114,
      "learning_rate": 0.0009243221287473755,
      "loss": 3.0176,
      "step": 3325
    },
    {
      "epoch": 1.5044215490862132,
      "grad_norm": 0.48536983132362366,
      "learning_rate": 0.0009231381996132862,
      "loss": 2.9547,
      "step": 3350
    },
    {
      "epoch": 1.5156508800988182,
      "grad_norm": 0.47351208329200745,
      "learning_rate": 0.0009219458518531739,
      "loss": 2.9666,
      "step": 3375
    },
    {
      "epoch": 1.526880211111423,
      "grad_norm": 0.5615521669387817,
      "learning_rate": 0.0009207451091896191,
      "loss": 3.0295,
      "step": 3400
    },
    {
      "epoch": 1.5381095421240278,
      "grad_norm": 0.5138916969299316,
      "learning_rate": 0.0009195359955122244,
      "loss": 3.0146,
      "step": 3425
    },
    {
      "epoch": 1.5493388731366329,
      "grad_norm": 0.5883649587631226,
      "learning_rate": 0.0009183185348771392,
      "loss": 3.0151,
      "step": 3450
    },
    {
      "epoch": 1.560568204149238,
      "grad_norm": 0.5921751260757446,
      "learning_rate": 0.0009170927515065821,
      "loss": 3.0314,
      "step": 3475
    },
    {
      "epoch": 1.5717975351618427,
      "grad_norm": 0.5592530965805054,
      "learning_rate": 0.0009158586697883576,
      "loss": 2.9921,
      "step": 3500
    },
    {
      "epoch": 1.5830268661744475,
      "grad_norm": 0.5621814727783203,
      "learning_rate": 0.0009146163142753716,
      "loss": 2.9987,
      "step": 3525
    },
    {
      "epoch": 1.5942561971870526,
      "grad_norm": 0.5482603311538696,
      "learning_rate": 0.0009133657096851431,
      "loss": 2.9802,
      "step": 3550
    },
    {
      "epoch": 1.6054855281996576,
      "grad_norm": 0.5254377722740173,
      "learning_rate": 0.0009121068808993124,
      "loss": 3.0121,
      "step": 3575
    },
    {
      "epoch": 1.6167148592122624,
      "grad_norm": 0.47623664140701294,
      "learning_rate": 0.0009108398529631451,
      "loss": 3.0068,
      "step": 3600
    },
    {
      "epoch": 1.6279441902248672,
      "grad_norm": 0.49733710289001465,
      "learning_rate": 0.0009095646510850351,
      "loss": 3.0104,
      "step": 3625
    },
    {
      "epoch": 1.6391735212374723,
      "grad_norm": 0.5388875603675842,
      "learning_rate": 0.0009082813006360026,
      "loss": 2.9823,
      "step": 3650
    },
    {
      "epoch": 1.6504028522500773,
      "grad_norm": 0.5329872965812683,
      "learning_rate": 0.0009069898271491887,
      "loss": 2.9945,
      "step": 3675
    },
    {
      "epoch": 1.6616321832626821,
      "grad_norm": 0.5175071358680725,
      "learning_rate": 0.0009056902563193486,
      "loss": 2.9875,
      "step": 3700
    },
    {
      "epoch": 1.672861514275287,
      "grad_norm": 0.514216423034668,
      "learning_rate": 0.0009043826140023388,
      "loss": 3.016,
      "step": 3725
    },
    {
      "epoch": 1.684090845287892,
      "grad_norm": 0.5547803640365601,
      "learning_rate": 0.0009030669262146046,
      "loss": 2.9906,
      "step": 3750
    },
    {
      "epoch": 1.695320176300497,
      "grad_norm": 0.5035697817802429,
      "learning_rate": 0.0009017432191326611,
      "loss": 2.9795,
      "step": 3775
    },
    {
      "epoch": 1.7065495073131018,
      "grad_norm": 0.4960135519504547,
      "learning_rate": 0.0009004115190925724,
      "loss": 2.986,
      "step": 3800
    },
    {
      "epoch": 1.7177788383257067,
      "grad_norm": 0.5573786497116089,
      "learning_rate": 0.0008990718525894286,
      "loss": 2.9981,
      "step": 3825
    },
    {
      "epoch": 1.7290081693383117,
      "grad_norm": 0.558542788028717,
      "learning_rate": 0.0008977242462768177,
      "loss": 3.0122,
      "step": 3850
    },
    {
      "epoch": 1.7402375003509167,
      "grad_norm": 0.48205050826072693,
      "learning_rate": 0.0008963687269662957,
      "loss": 2.9558,
      "step": 3875
    },
    {
      "epoch": 1.7514668313635215,
      "grad_norm": 0.48525846004486084,
      "learning_rate": 0.0008950053216268534,
      "loss": 3.0034,
      "step": 3900
    },
    {
      "epoch": 1.7626961623761264,
      "grad_norm": 0.5863490700721741,
      "learning_rate": 0.0008936340573843795,
      "loss": 3.0222,
      "step": 3925
    },
    {
      "epoch": 1.7739254933887314,
      "grad_norm": 0.54740309715271,
      "learning_rate": 0.0008922549615211206,
      "loss": 2.9785,
      "step": 3950
    },
    {
      "epoch": 1.7851548244013364,
      "grad_norm": 0.5275555849075317,
      "learning_rate": 0.0008908680614751392,
      "loss": 2.982,
      "step": 3975
    },
    {
      "epoch": 1.7963841554139413,
      "grad_norm": 0.5472078919410706,
      "learning_rate": 0.0008894733848397674,
      "loss": 3.0128,
      "step": 4000
    },
    {
      "epoch": 1.807613486426546,
      "grad_norm": 0.5604407787322998,
      "learning_rate": 0.0008880709593630578,
      "loss": 3.0119,
      "step": 4025
    },
    {
      "epoch": 1.818842817439151,
      "grad_norm": 0.5137823224067688,
      "learning_rate": 0.0008866608129472313,
      "loss": 2.9858,
      "step": 4050
    },
    {
      "epoch": 1.8300721484517561,
      "grad_norm": 0.5707024931907654,
      "learning_rate": 0.0008852429736481227,
      "loss": 3.013,
      "step": 4075
    },
    {
      "epoch": 1.841301479464361,
      "grad_norm": 0.5344915986061096,
      "learning_rate": 0.0008838174696746215,
      "loss": 2.9899,
      "step": 4100
    },
    {
      "epoch": 1.8525308104769658,
      "grad_norm": 0.504295289516449,
      "learning_rate": 0.0008823843293881117,
      "loss": 3.0095,
      "step": 4125
    },
    {
      "epoch": 1.8637601414895708,
      "grad_norm": 0.5654752254486084,
      "learning_rate": 0.0008809435813019065,
      "loss": 2.988,
      "step": 4150
    },
    {
      "epoch": 1.8749894725021758,
      "grad_norm": 0.5086371302604675,
      "learning_rate": 0.0008794952540806817,
      "loss": 3.0304,
      "step": 4175
    },
    {
      "epoch": 1.8862188035147807,
      "grad_norm": 0.5218560099601746,
      "learning_rate": 0.0008780393765399055,
      "loss": 2.9817,
      "step": 4200
    },
    {
      "epoch": 1.8974481345273855,
      "grad_norm": 0.48831528425216675,
      "learning_rate": 0.0008765759776452646,
      "loss": 3.0245,
      "step": 4225
    },
    {
      "epoch": 1.9086774655399905,
      "grad_norm": 0.5015767812728882,
      "learning_rate": 0.0008751050865120882,
      "loss": 3.0238,
      "step": 4250
    },
    {
      "epoch": 1.9199067965525953,
      "grad_norm": 0.5325757265090942,
      "learning_rate": 0.000873626732404769,
      "loss": 2.9993,
      "step": 4275
    },
    {
      "epoch": 1.9311361275652001,
      "grad_norm": 0.48629334568977356,
      "learning_rate": 0.0008721409447361803,
      "loss": 2.9634,
      "step": 4300
    },
    {
      "epoch": 1.9423654585778052,
      "grad_norm": 0.49022358655929565,
      "learning_rate": 0.0008706477530670917,
      "loss": 2.9736,
      "step": 4325
    },
    {
      "epoch": 1.9535947895904102,
      "grad_norm": 0.5039647221565247,
      "learning_rate": 0.0008691471871055801,
      "loss": 2.9802,
      "step": 4350
    },
    {
      "epoch": 1.964824120603015,
      "grad_norm": 0.5223824977874756,
      "learning_rate": 0.0008676392767064391,
      "loss": 3.0397,
      "step": 4375
    },
    {
      "epoch": 1.9760534516156198,
      "grad_norm": 0.5172558426856995,
      "learning_rate": 0.0008661240518705854,
      "loss": 2.9756,
      "step": 4400
    },
    {
      "epoch": 1.9872827826282249,
      "grad_norm": 0.4955403208732605,
      "learning_rate": 0.0008646015427444609,
      "loss": 2.9748,
      "step": 4425
    },
    {
      "epoch": 1.99851211364083,
      "grad_norm": 0.553287923336029,
      "learning_rate": 0.0008630717796194337,
      "loss": 2.9501,
      "step": 4450
    },
    {
      "epoch": 2.0,
      "eval_loss": 3.1917288303375244,
      "eval_runtime": 227.8959,
      "eval_samples_per_second": 55.218,
      "eval_steps_per_second": 55.218,
      "step": 4454
    },
    {
      "epoch": 2.0094326380505882,
      "grad_norm": 0.5545716285705566,
      "learning_rate": 0.0008615347929311949,
      "loss": 2.7426,
      "step": 4475
    },
    {
      "epoch": 2.020661969063193,
      "grad_norm": 0.5116350650787354,
      "learning_rate": 0.0008599906132591541,
      "loss": 2.6669,
      "step": 4500
    },
    {
      "epoch": 2.031891300075798,
      "grad_norm": 0.6248686909675598,
      "learning_rate": 0.0008584392713258295,
      "loss": 2.6597,
      "step": 4525
    },
    {
      "epoch": 2.043120631088403,
      "grad_norm": 0.5305931568145752,
      "learning_rate": 0.0008568807979962379,
      "loss": 2.6635,
      "step": 4550
    },
    {
      "epoch": 2.054349962101008,
      "grad_norm": 0.5366395711898804,
      "learning_rate": 0.0008553152242772798,
      "loss": 2.668,
      "step": 4575
    },
    {
      "epoch": 2.0655792931136125,
      "grad_norm": 0.6074578762054443,
      "learning_rate": 0.0008537425813171232,
      "loss": 2.7031,
      "step": 4600
    },
    {
      "epoch": 2.0768086241262176,
      "grad_norm": 0.6074210405349731,
      "learning_rate": 0.0008521629004045832,
      "loss": 2.6721,
      "step": 4625
    },
    {
      "epoch": 2.0880379551388226,
      "grad_norm": 0.6065968871116638,
      "learning_rate": 0.0008505762129685002,
      "loss": 2.6774,
      "step": 4650
    },
    {
      "epoch": 2.0992672861514277,
      "grad_norm": 0.5635676383972168,
      "learning_rate": 0.0008489825505771136,
      "loss": 2.6537,
      "step": 4675
    },
    {
      "epoch": 2.1104966171640323,
      "grad_norm": 0.5447210669517517,
      "learning_rate": 0.000847381944937435,
      "loss": 2.6964,
      "step": 4700
    },
    {
      "epoch": 2.1217259481766373,
      "grad_norm": 0.6450474858283997,
      "learning_rate": 0.0008457744278946162,
      "loss": 2.6591,
      "step": 4725
    },
    {
      "epoch": 2.1329552791892423,
      "grad_norm": 0.5620629787445068,
      "learning_rate": 0.0008441600314313165,
      "loss": 2.6787,
      "step": 4750
    },
    {
      "epoch": 2.1441846102018474,
      "grad_norm": 0.5661433935165405,
      "learning_rate": 0.0008425387876670658,
      "loss": 2.7193,
      "step": 4775
    },
    {
      "epoch": 2.155413941214452,
      "grad_norm": 0.6069077849388123,
      "learning_rate": 0.0008409107288576259,
      "loss": 2.6947,
      "step": 4800
    },
    {
      "epoch": 2.166643272227057,
      "grad_norm": 0.6271758675575256,
      "learning_rate": 0.0008392758873943484,
      "loss": 2.6952,
      "step": 4825
    },
    {
      "epoch": 2.177872603239662,
      "grad_norm": 0.5473480820655823,
      "learning_rate": 0.0008376342958035308,
      "loss": 2.6981,
      "step": 4850
    },
    {
      "epoch": 2.189101934252267,
      "grad_norm": 0.631519079208374,
      "learning_rate": 0.0008359859867457686,
      "loss": 2.6921,
      "step": 4875
    },
    {
      "epoch": 2.2003312652648717,
      "grad_norm": 0.6639063954353333,
      "learning_rate": 0.0008343309930153064,
      "loss": 2.6837,
      "step": 4900
    },
    {
      "epoch": 2.2115605962774767,
      "grad_norm": 0.6233927011489868,
      "learning_rate": 0.0008326693475393846,
      "loss": 2.7112,
      "step": 4925
    },
    {
      "epoch": 2.2227899272900817,
      "grad_norm": 0.636464536190033,
      "learning_rate": 0.0008310010833775849,
      "loss": 2.7213,
      "step": 4950
    },
    {
      "epoch": 2.2340192583026868,
      "grad_norm": 0.5854448676109314,
      "learning_rate": 0.0008293262337211723,
      "loss": 2.7131,
      "step": 4975
    },
    {
      "epoch": 2.2452485893152914,
      "grad_norm": 0.6958891749382019,
      "learning_rate": 0.0008276448318924346,
      "loss": 2.6883,
      "step": 5000
    },
    {
      "epoch": 2.2564779203278964,
      "grad_norm": 0.5899659991264343,
      "learning_rate": 0.0008259569113440198,
      "loss": 2.6872,
      "step": 5025
    },
    {
      "epoch": 2.2677072513405014,
      "grad_norm": 0.6791245937347412,
      "learning_rate": 0.0008242625056582698,
      "loss": 2.7202,
      "step": 5050
    },
    {
      "epoch": 2.2789365823531065,
      "grad_norm": 0.5778390169143677,
      "learning_rate": 0.0008225616485465535,
      "loss": 2.7153,
      "step": 5075
    },
    {
      "epoch": 2.290165913365711,
      "grad_norm": 0.5727918148040771,
      "learning_rate": 0.000820854373848595,
      "loss": 2.7314,
      "step": 5100
    },
    {
      "epoch": 2.301395244378316,
      "grad_norm": 0.6461373567581177,
      "learning_rate": 0.0008191407155318007,
      "loss": 2.6973,
      "step": 5125
    },
    {
      "epoch": 2.312624575390921,
      "grad_norm": 0.6795935034751892,
      "learning_rate": 0.0008174207076905835,
      "loss": 2.6605,
      "step": 5150
    },
    {
      "epoch": 2.323853906403526,
      "grad_norm": 0.6217265725135803,
      "learning_rate": 0.0008156943845456843,
      "loss": 2.6715,
      "step": 5175
    },
    {
      "epoch": 2.3350832374161308,
      "grad_norm": 0.6071234941482544,
      "learning_rate": 0.0008139617804434918,
      "loss": 2.6806,
      "step": 5200
    },
    {
      "epoch": 2.346312568428736,
      "grad_norm": 0.6218218207359314,
      "learning_rate": 0.0008122229298553583,
      "loss": 2.7077,
      "step": 5225
    },
    {
      "epoch": 2.357541899441341,
      "grad_norm": 0.5912306904792786,
      "learning_rate": 0.0008104778673769142,
      "loss": 2.7314,
      "step": 5250
    },
    {
      "epoch": 2.368771230453946,
      "grad_norm": 0.5841456651687622,
      "learning_rate": 0.0008087266277273799,
      "loss": 2.6645,
      "step": 5275
    },
    {
      "epoch": 2.3800005614665505,
      "grad_norm": 0.6491414904594421,
      "learning_rate": 0.0008069692457488749,
      "loss": 2.7115,
      "step": 5300
    },
    {
      "epoch": 2.3912298924791555,
      "grad_norm": 0.6534895896911621,
      "learning_rate": 0.0008052057564057244,
      "loss": 2.7057,
      "step": 5325
    },
    {
      "epoch": 2.4024592234917606,
      "grad_norm": 0.5900655388832092,
      "learning_rate": 0.000803436194783764,
      "loss": 2.7302,
      "step": 5350
    },
    {
      "epoch": 2.4136885545043656,
      "grad_norm": 0.5586231350898743,
      "learning_rate": 0.0008016605960896412,
      "loss": 2.7339,
      "step": 5375
    },
    {
      "epoch": 2.42491788551697,
      "grad_norm": 0.705515444278717,
      "learning_rate": 0.0007998789956501159,
      "loss": 2.7323,
      "step": 5400
    },
    {
      "epoch": 2.436147216529575,
      "grad_norm": 0.5936200022697449,
      "learning_rate": 0.0007980914289113558,
      "loss": 2.7116,
      "step": 5425
    },
    {
      "epoch": 2.4473765475421803,
      "grad_norm": 0.6085701584815979,
      "learning_rate": 0.000796297931438233,
      "loss": 2.7406,
      "step": 5450
    },
    {
      "epoch": 2.458605878554785,
      "grad_norm": 0.5549573302268982,
      "learning_rate": 0.0007944985389136157,
      "loss": 2.7408,
      "step": 5475
    },
    {
      "epoch": 2.46983520956739,
      "grad_norm": 0.5694999694824219,
      "learning_rate": 0.0007926932871376575,
      "loss": 2.7216,
      "step": 5500
    },
    {
      "epoch": 2.481064540579995,
      "grad_norm": 0.5795106887817383,
      "learning_rate": 0.0007908822120270867,
      "loss": 2.6724,
      "step": 5525
    },
    {
      "epoch": 2.4922938715926,
      "grad_norm": 0.5619019865989685,
      "learning_rate": 0.0007890653496144902,
      "loss": 2.6867,
      "step": 5550
    },
    {
      "epoch": 2.503523202605205,
      "grad_norm": 0.5836601257324219,
      "learning_rate": 0.0007872427360475974,
      "loss": 2.7091,
      "step": 5575
    },
    {
      "epoch": 2.5147525336178096,
      "grad_norm": 0.6521953344345093,
      "learning_rate": 0.0007854144075885614,
      "loss": 2.7138,
      "step": 5600
    },
    {
      "epoch": 2.5259818646304146,
      "grad_norm": 0.6129563450813293,
      "learning_rate": 0.0007835804006132364,
      "loss": 2.6796,
      "step": 5625
    },
    {
      "epoch": 2.5372111956430197,
      "grad_norm": 0.5933245420455933,
      "learning_rate": 0.0007817407516104547,
      "loss": 2.6541,
      "step": 5650
    },
    {
      "epoch": 2.5484405266556243,
      "grad_norm": 0.5935245156288147,
      "learning_rate": 0.0007798954971813009,
      "loss": 2.6849,
      "step": 5675
    },
    {
      "epoch": 2.5596698576682293,
      "grad_norm": 0.7134594321250916,
      "learning_rate": 0.0007780446740383829,
      "loss": 2.7141,
      "step": 5700
    },
    {
      "epoch": 2.5708991886808343,
      "grad_norm": 0.6013050675392151,
      "learning_rate": 0.0007761883190051029,
      "loss": 2.7276,
      "step": 5725
    },
    {
      "epoch": 2.5821285196934394,
      "grad_norm": 0.6081655025482178,
      "learning_rate": 0.000774326469014923,
      "loss": 2.7205,
      "step": 5750
    },
    {
      "epoch": 2.5933578507060444,
      "grad_norm": 0.6464730501174927,
      "learning_rate": 0.0007724591611106315,
      "loss": 2.6872,
      "step": 5775
    },
    {
      "epoch": 2.604587181718649,
      "grad_norm": 0.578700840473175,
      "learning_rate": 0.0007705864324436059,
      "loss": 2.7152,
      "step": 5800
    },
    {
      "epoch": 2.615816512731254,
      "grad_norm": 0.5782270431518555,
      "learning_rate": 0.000768708320273073,
      "loss": 2.7233,
      "step": 5825
    },
    {
      "epoch": 2.627045843743859,
      "grad_norm": 0.5789017081260681,
      "learning_rate": 0.000766824861965369,
      "loss": 2.7474,
      "step": 5850
    },
    {
      "epoch": 2.6382751747564637,
      "grad_norm": 0.6152642369270325,
      "learning_rate": 0.0007649360949931941,
      "loss": 2.7071,
      "step": 5875
    },
    {
      "epoch": 2.6495045057690687,
      "grad_norm": 0.6417413353919983,
      "learning_rate": 0.0007630420569348688,
      "loss": 2.694,
      "step": 5900
    },
    {
      "epoch": 2.6607338367816737,
      "grad_norm": 0.5956742167472839,
      "learning_rate": 0.0007611427854735855,
      "loss": 2.7318,
      "step": 5925
    },
    {
      "epoch": 2.671963167794279,
      "grad_norm": 0.6362649202346802,
      "learning_rate": 0.0007592383183966581,
      "loss": 2.6966,
      "step": 5950
    },
    {
      "epoch": 2.683192498806884,
      "grad_norm": 0.5551230311393738,
      "learning_rate": 0.0007573286935947715,
      "loss": 2.6876,
      "step": 5975
    },
    {
      "epoch": 2.6944218298194884,
      "grad_norm": 0.6288260817527771,
      "learning_rate": 0.0007554139490612269,
      "loss": 2.7336,
      "step": 6000
    },
    {
      "epoch": 2.7056511608320934,
      "grad_norm": 0.5820605158805847,
      "learning_rate": 0.0007534941228911856,
      "loss": 2.683,
      "step": 6025
    },
    {
      "epoch": 2.7168804918446985,
      "grad_norm": 0.6264435648918152,
      "learning_rate": 0.0007515692532809126,
      "loss": 2.7461,
      "step": 6050
    },
    {
      "epoch": 2.728109822857303,
      "grad_norm": 0.6948567628860474,
      "learning_rate": 0.0007496393785270148,
      "loss": 2.7297,
      "step": 6075
    },
    {
      "epoch": 2.739339153869908,
      "grad_norm": 0.5976940393447876,
      "learning_rate": 0.0007477045370256802,
      "loss": 2.7419,
      "step": 6100
    },
    {
      "epoch": 2.750568484882513,
      "grad_norm": 0.5582289099693298,
      "learning_rate": 0.0007457647672719133,
      "loss": 2.7238,
      "step": 6125
    },
    {
      "epoch": 2.761797815895118,
      "grad_norm": 0.6097133755683899,
      "learning_rate": 0.00074389798763174,
      "loss": 2.7345,
      "step": 6150
    },
    {
      "epoch": 2.7730271469077232,
      "grad_norm": 0.576604425907135,
      "learning_rate": 0.0007419486705442532,
      "loss": 2.7075,
      "step": 6175
    },
    {
      "epoch": 2.784256477920328,
      "grad_norm": 0.6071319580078125,
      "learning_rate": 0.0007399945397212636,
      "loss": 2.7122,
      "step": 6200
    },
    {
      "epoch": 2.795485808932933,
      "grad_norm": 0.6164671182632446,
      "learning_rate": 0.0007380356340415503,
      "loss": 2.698,
      "step": 6225
    },
    {
      "epoch": 2.806715139945538,
      "grad_norm": 0.5970659255981445,
      "learning_rate": 0.0007360719924788919,
      "loss": 2.7429,
      "step": 6250
    },
    {
      "epoch": 2.8179444709581425,
      "grad_norm": 0.5380481481552124,
      "learning_rate": 0.0007341036541012898,
      "loss": 2.6655,
      "step": 6275
    },
    {
      "epoch": 2.8291738019707475,
      "grad_norm": 0.5859966278076172,
      "learning_rate": 0.0007321306580701923,
      "loss": 2.7115,
      "step": 6300
    },
    {
      "epoch": 2.8404031329833526,
      "grad_norm": 0.5644718408584595,
      "learning_rate": 0.0007301530436397148,
      "loss": 2.6945,
      "step": 6325
    },
    {
      "epoch": 2.8516324639959576,
      "grad_norm": 0.5971605777740479,
      "learning_rate": 0.0007281708501558591,
      "loss": 2.7082,
      "step": 6350
    },
    {
      "epoch": 2.8628617950085626,
      "grad_norm": 0.6274561882019043,
      "learning_rate": 0.0007261841170557303,
      "loss": 2.7207,
      "step": 6375
    },
    {
      "epoch": 2.8740911260211672,
      "grad_norm": 0.611348032951355,
      "learning_rate": 0.0007241928838667522,
      "loss": 2.7155,
      "step": 6400
    },
    {
      "epoch": 2.8853204570337723,
      "grad_norm": 0.5650250911712646,
      "learning_rate": 0.000722197190205881,
      "loss": 2.7063,
      "step": 6425
    },
    {
      "epoch": 2.896549788046377,
      "grad_norm": 0.6043295860290527,
      "learning_rate": 0.0007201970757788173,
      "loss": 2.6909,
      "step": 6450
    },
    {
      "epoch": 2.907779119058982,
      "grad_norm": 0.582062304019928,
      "learning_rate": 0.0007181925803792153,
      "loss": 2.7262,
      "step": 6475
    },
    {
      "epoch": 2.919008450071587,
      "grad_norm": 0.6272075772285461,
      "learning_rate": 0.0007161837438878926,
      "loss": 2.7224,
      "step": 6500
    },
    {
      "epoch": 2.930237781084192,
      "grad_norm": 0.6399256587028503,
      "learning_rate": 0.0007141706062720349,
      "loss": 2.7202,
      "step": 6525
    },
    {
      "epoch": 2.941467112096797,
      "grad_norm": 0.637417197227478,
      "learning_rate": 0.0007121532075844023,
      "loss": 2.6624,
      "step": 6550
    },
    {
      "epoch": 2.9526964431094016,
      "grad_norm": 0.7301665544509888,
      "learning_rate": 0.0007101315879625315,
      "loss": 2.7103,
      "step": 6575
    },
    {
      "epoch": 2.9639257741220066,
      "grad_norm": 0.6282750964164734,
      "learning_rate": 0.000708105787627938,
      "loss": 2.6985,
      "step": 6600
    },
    {
      "epoch": 2.9751551051346117,
      "grad_norm": 0.7559499144554138,
      "learning_rate": 0.0007060758468853153,
      "loss": 2.6989,
      "step": 6625
    },
    {
      "epoch": 2.9863844361472163,
      "grad_norm": 0.6338511109352112,
      "learning_rate": 0.0007040418061217324,
      "loss": 2.7278,
      "step": 6650
    },
    {
      "epoch": 2.9976137671598213,
      "grad_norm": 0.6456329226493835,
      "learning_rate": 0.0007020037058058326,
      "loss": 2.6851,
      "step": 6675
    },
    {
      "epoch": 3.0,
      "eval_loss": 3.168698787689209,
      "eval_runtime": 228.8127,
      "eval_samples_per_second": 54.997,
      "eval_steps_per_second": 54.997,
      "step": 6681
    }
  ],
  "logging_steps": 25,
  "max_steps": 17808,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3044200408064e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}