{
"best_metric": 2.5859532356262207,
"best_model_checkpoint": "./FT_models/[LDH]0224_docs_chatml/checkpoint-500",
"epoch": 52.23880597014925,
"eval_steps": 500,
"global_step": 3500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.14925373134328357,
"grad_norm": 0.7784017324447632,
"learning_rate": 0.0001999995055317446,
"loss": 2.1851,
"step": 10
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.5991038084030151,
"learning_rate": 0.0001999955498150411,
"loss": 1.5526,
"step": 20
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.6780698895454407,
"learning_rate": 0.00019998763853811184,
"loss": 1.405,
"step": 30
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.7235689759254456,
"learning_rate": 0.00019997577201390606,
"loss": 1.3309,
"step": 40
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.7904847860336304,
"learning_rate": 0.0001999599507118322,
"loss": 1.1353,
"step": 50
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.7685543894767761,
"learning_rate": 0.00019994017525773913,
"loss": 1.0295,
"step": 60
},
{
"epoch": 1.044776119402985,
"grad_norm": 0.963201642036438,
"learning_rate": 0.0001999164464338918,
"loss": 0.9815,
"step": 70
},
{
"epoch": 1.1940298507462686,
"grad_norm": 1.0725153684616089,
"learning_rate": 0.0001998887651789398,
"loss": 0.7019,
"step": 80
},
{
"epoch": 1.3432835820895521,
"grad_norm": 1.2167037725448608,
"learning_rate": 0.0001998571325878806,
"loss": 0.639,
"step": 90
},
{
"epoch": 1.4925373134328357,
"grad_norm": 1.256081461906433,
"learning_rate": 0.00019982154991201608,
"loss": 0.5614,
"step": 100
},
{
"epoch": 1.6417910447761193,
"grad_norm": 0.7639100551605225,
"learning_rate": 0.00019978201855890308,
"loss": 0.5479,
"step": 110
},
{
"epoch": 1.7910447761194028,
"grad_norm": 1.0223500728607178,
"learning_rate": 0.00019973854009229763,
"loss": 0.3933,
"step": 120
},
{
"epoch": 1.9402985074626866,
"grad_norm": 1.0072954893112183,
"learning_rate": 0.00019969111623209323,
"loss": 0.4029,
"step": 130
},
{
"epoch": 2.08955223880597,
"grad_norm": 1.0809686183929443,
"learning_rate": 0.00019963974885425266,
"loss": 0.3913,
"step": 140
},
{
"epoch": 2.2388059701492535,
"grad_norm": 0.4724240005016327,
"learning_rate": 0.00019958443999073397,
"loss": 0.2301,
"step": 150
},
{
"epoch": 2.388059701492537,
"grad_norm": 0.6505578756332397,
"learning_rate": 0.00019952519182940993,
"loss": 0.2533,
"step": 160
},
{
"epoch": 2.5373134328358207,
"grad_norm": 0.39939674735069275,
"learning_rate": 0.0001994620067139815,
"loss": 0.1551,
"step": 170
},
{
"epoch": 2.6865671641791042,
"grad_norm": 0.26736512780189514,
"learning_rate": 0.00019939488714388524,
"loss": 0.2489,
"step": 180
},
{
"epoch": 2.835820895522388,
"grad_norm": 0.7464708089828491,
"learning_rate": 0.00019932383577419432,
"loss": 0.2118,
"step": 190
},
{
"epoch": 2.9850746268656714,
"grad_norm": 0.348090261220932,
"learning_rate": 0.0001992488554155135,
"loss": 0.2277,
"step": 200
},
{
"epoch": 3.1343283582089554,
"grad_norm": 0.4606758654117584,
"learning_rate": 0.0001991699490338681,
"loss": 0.1216,
"step": 210
},
{
"epoch": 3.283582089552239,
"grad_norm": 0.5326898694038391,
"learning_rate": 0.00019908711975058637,
"loss": 0.0946,
"step": 220
},
{
"epoch": 3.4328358208955225,
"grad_norm": 0.5780682563781738,
"learning_rate": 0.00019900037084217637,
"loss": 0.0982,
"step": 230
},
{
"epoch": 3.582089552238806,
"grad_norm": 0.5874966382980347,
"learning_rate": 0.00019890970574019617,
"loss": 0.1244,
"step": 240
},
{
"epoch": 3.7313432835820897,
"grad_norm": 0.5044897198677063,
"learning_rate": 0.00019881512803111796,
"loss": 0.1127,
"step": 250
},
{
"epoch": 3.8805970149253732,
"grad_norm": 0.45550045371055603,
"learning_rate": 0.00019871664145618657,
"loss": 0.1211,
"step": 260
},
{
"epoch": 4.029850746268656,
"grad_norm": 0.3092655539512634,
"learning_rate": 0.00019861424991127115,
"loss": 0.1156,
"step": 270
},
{
"epoch": 4.17910447761194,
"grad_norm": 0.4061109721660614,
"learning_rate": 0.00019850795744671116,
"loss": 0.0695,
"step": 280
},
{
"epoch": 4.3283582089552235,
"grad_norm": 0.36566805839538574,
"learning_rate": 0.00019839776826715614,
"loss": 0.0677,
"step": 290
},
{
"epoch": 4.477611940298507,
"grad_norm": 0.29778343439102173,
"learning_rate": 0.00019828368673139947,
"loss": 0.0445,
"step": 300
},
{
"epoch": 4.6268656716417915,
"grad_norm": 0.18671129643917084,
"learning_rate": 0.00019816571735220583,
"loss": 0.0644,
"step": 310
},
{
"epoch": 4.776119402985074,
"grad_norm": 0.3877570629119873,
"learning_rate": 0.0001980438647961327,
"loss": 0.0912,
"step": 320
},
{
"epoch": 4.925373134328359,
"grad_norm": 0.5179444551467896,
"learning_rate": 0.00019791813388334581,
"loss": 0.0607,
"step": 330
},
{
"epoch": 5.074626865671641,
"grad_norm": 0.15815891325473785,
"learning_rate": 0.00019778852958742853,
"loss": 0.0542,
"step": 340
},
{
"epoch": 5.223880597014926,
"grad_norm": 0.42946234345436096,
"learning_rate": 0.00019765505703518496,
"loss": 0.0563,
"step": 350
},
{
"epoch": 5.373134328358209,
"grad_norm": 0.5582062005996704,
"learning_rate": 0.00019751772150643722,
"loss": 0.0408,
"step": 360
},
{
"epoch": 5.522388059701493,
"grad_norm": 0.2707679569721222,
"learning_rate": 0.0001973765284338167,
"loss": 0.0395,
"step": 370
},
{
"epoch": 5.6716417910447765,
"grad_norm": 0.5061952471733093,
"learning_rate": 0.00019723148340254892,
"loss": 0.0501,
"step": 380
},
{
"epoch": 5.82089552238806,
"grad_norm": 0.31659457087516785,
"learning_rate": 0.0001970825921502328,
"loss": 0.049,
"step": 390
},
{
"epoch": 5.970149253731344,
"grad_norm": 0.2836756110191345,
"learning_rate": 0.00019692986056661356,
"loss": 0.0483,
"step": 400
},
{
"epoch": 6.119402985074627,
"grad_norm": 0.1509529948234558,
"learning_rate": 0.0001967732946933499,
"loss": 0.0394,
"step": 410
},
{
"epoch": 6.268656716417911,
"grad_norm": 0.2609387934207916,
"learning_rate": 0.00019661290072377482,
"loss": 0.0388,
"step": 420
},
{
"epoch": 6.417910447761194,
"grad_norm": 0.5987827777862549,
"learning_rate": 0.0001964486850026507,
"loss": 0.0383,
"step": 430
},
{
"epoch": 6.567164179104478,
"grad_norm": 0.4549473822116852,
"learning_rate": 0.00019628065402591845,
"loss": 0.038,
"step": 440
},
{
"epoch": 6.7164179104477615,
"grad_norm": 0.2026955932378769,
"learning_rate": 0.0001961088144404403,
"loss": 0.0384,
"step": 450
},
{
"epoch": 6.865671641791045,
"grad_norm": 0.21201670169830322,
"learning_rate": 0.00019593317304373705,
"loss": 0.0347,
"step": 460
},
{
"epoch": 7.014925373134329,
"grad_norm": 0.0839223638176918,
"learning_rate": 0.00019575373678371909,
"loss": 0.0458,
"step": 470
},
{
"epoch": 7.164179104477612,
"grad_norm": 0.38603273034095764,
"learning_rate": 0.0001955705127584117,
"loss": 0.0366,
"step": 480
},
{
"epoch": 7.313432835820896,
"grad_norm": 0.22731441259384155,
"learning_rate": 0.00019538350821567404,
"loss": 0.0324,
"step": 490
},
{
"epoch": 7.462686567164179,
"grad_norm": 0.3230607509613037,
"learning_rate": 0.00019519273055291266,
"loss": 0.0321,
"step": 500
},
{
"epoch": 7.462686567164179,
"eval_loss": 2.5859532356262207,
"eval_runtime": 34.8624,
"eval_samples_per_second": 9.093,
"eval_steps_per_second": 4.561,
"step": 500
},
{
"epoch": 7.611940298507463,
"grad_norm": 0.1323220282793045,
"learning_rate": 0.00019499818731678873,
"loss": 0.0271,
"step": 510
},
{
"epoch": 7.7611940298507465,
"grad_norm": 0.17675843834877014,
"learning_rate": 0.00019479988620291956,
"loss": 0.0356,
"step": 520
},
{
"epoch": 7.91044776119403,
"grad_norm": 0.5695576071739197,
"learning_rate": 0.00019459783505557424,
"loss": 0.0377,
"step": 530
},
{
"epoch": 8.059701492537313,
"grad_norm": 0.14659564197063446,
"learning_rate": 0.0001943920418673633,
"loss": 0.0295,
"step": 540
},
{
"epoch": 8.208955223880597,
"grad_norm": 0.3027053475379944,
"learning_rate": 0.0001941825147789225,
"loss": 0.0283,
"step": 550
},
{
"epoch": 8.35820895522388,
"grad_norm": 0.3131660521030426,
"learning_rate": 0.00019396926207859084,
"loss": 0.0354,
"step": 560
},
{
"epoch": 8.507462686567164,
"grad_norm": 0.2541915476322174,
"learning_rate": 0.00019375229220208276,
"loss": 0.0305,
"step": 570
},
{
"epoch": 8.656716417910447,
"grad_norm": 0.15617048740386963,
"learning_rate": 0.0001935316137321543,
"loss": 0.0374,
"step": 580
},
{
"epoch": 8.805970149253731,
"grad_norm": 0.8228221535682678,
"learning_rate": 0.00019330723539826375,
"loss": 0.0332,
"step": 590
},
{
"epoch": 8.955223880597014,
"grad_norm": 0.29615482687950134,
"learning_rate": 0.0001930791660762262,
"loss": 0.0319,
"step": 600
},
{
"epoch": 9.104477611940299,
"grad_norm": 0.222768172621727,
"learning_rate": 0.0001928474147878626,
"loss": 0.0279,
"step": 610
},
{
"epoch": 9.253731343283581,
"grad_norm": 0.1895366758108139,
"learning_rate": 0.0001926119907006426,
"loss": 0.0337,
"step": 620
},
{
"epoch": 9.402985074626866,
"grad_norm": 0.19570475816726685,
"learning_rate": 0.00019237290312732226,
"loss": 0.0276,
"step": 630
},
{
"epoch": 9.552238805970148,
"grad_norm": 0.5487334728240967,
"learning_rate": 0.0001921301615255754,
"loss": 0.0346,
"step": 640
},
{
"epoch": 9.701492537313433,
"grad_norm": 0.17226798832416534,
"learning_rate": 0.00019188377549761963,
"loss": 0.0256,
"step": 650
},
{
"epoch": 9.850746268656717,
"grad_norm": 0.2766994833946228,
"learning_rate": 0.00019163375478983632,
"loss": 0.0296,
"step": 660
},
{
"epoch": 10.0,
"grad_norm": 0.25644323229789734,
"learning_rate": 0.00019138010929238534,
"loss": 0.0326,
"step": 670
},
{
"epoch": 10.149253731343283,
"grad_norm": 0.3244101405143738,
"learning_rate": 0.0001911228490388136,
"loss": 0.0262,
"step": 680
},
{
"epoch": 10.298507462686567,
"grad_norm": 0.24429234862327576,
"learning_rate": 0.00019086198420565823,
"loss": 0.0321,
"step": 690
},
{
"epoch": 10.447761194029852,
"grad_norm": 0.15597452223300934,
"learning_rate": 0.000190597525112044,
"loss": 0.0314,
"step": 700
},
{
"epoch": 10.597014925373134,
"grad_norm": 0.22565604746341705,
"learning_rate": 0.00019032948221927524,
"loss": 0.028,
"step": 710
},
{
"epoch": 10.746268656716419,
"grad_norm": 0.1876748502254486,
"learning_rate": 0.00019005786613042185,
"loss": 0.0292,
"step": 720
},
{
"epoch": 10.895522388059701,
"grad_norm": 0.33821797370910645,
"learning_rate": 0.00018978268758989991,
"loss": 0.0289,
"step": 730
},
{
"epoch": 11.044776119402986,
"grad_norm": 0.1624440997838974,
"learning_rate": 0.00018950395748304678,
"loss": 0.0306,
"step": 740
},
{
"epoch": 11.194029850746269,
"grad_norm": 0.17152149975299835,
"learning_rate": 0.0001892216868356904,
"loss": 0.0293,
"step": 750
},
{
"epoch": 11.343283582089553,
"grad_norm": 0.22814153134822845,
"learning_rate": 0.00018893588681371303,
"loss": 0.0294,
"step": 760
},
{
"epoch": 11.492537313432836,
"grad_norm": 0.3319993317127228,
"learning_rate": 0.00018864656872260985,
"loss": 0.0282,
"step": 770
},
{
"epoch": 11.64179104477612,
"grad_norm": 0.1329302042722702,
"learning_rate": 0.00018835374400704154,
"loss": 0.029,
"step": 780
},
{
"epoch": 11.791044776119403,
"grad_norm": 0.2312331646680832,
"learning_rate": 0.00018805742425038145,
"loss": 0.0313,
"step": 790
},
{
"epoch": 11.940298507462687,
"grad_norm": 0.11133890599012375,
"learning_rate": 0.00018775762117425777,
"loss": 0.0291,
"step": 800
},
{
"epoch": 12.08955223880597,
"grad_norm": 0.42977747321128845,
"learning_rate": 0.00018745434663808942,
"loss": 0.0293,
"step": 810
},
{
"epoch": 12.238805970149254,
"grad_norm": 0.4400336742401123,
"learning_rate": 0.00018714761263861728,
"loss": 0.0258,
"step": 820
},
{
"epoch": 12.388059701492537,
"grad_norm": 0.3886387348175049,
"learning_rate": 0.00018683743130942928,
"loss": 0.03,
"step": 830
},
{
"epoch": 12.537313432835822,
"grad_norm": 0.1791338473558426,
"learning_rate": 0.00018652381492048083,
"loss": 0.0259,
"step": 840
},
{
"epoch": 12.686567164179104,
"grad_norm": 0.17430442571640015,
"learning_rate": 0.00018620677587760916,
"loss": 0.0282,
"step": 850
},
{
"epoch": 12.835820895522389,
"grad_norm": 0.3342001140117645,
"learning_rate": 0.00018588632672204264,
"loss": 0.0301,
"step": 860
},
{
"epoch": 12.985074626865671,
"grad_norm": 0.2671709358692169,
"learning_rate": 0.00018556248012990468,
"loss": 0.0269,
"step": 870
},
{
"epoch": 13.134328358208956,
"grad_norm": 0.27415236830711365,
"learning_rate": 0.0001852352489117124,
"loss": 0.026,
"step": 880
},
{
"epoch": 13.283582089552239,
"grad_norm": 0.13104292750358582,
"learning_rate": 0.0001849046460118698,
"loss": 0.0228,
"step": 890
},
{
"epoch": 13.432835820895523,
"grad_norm": 0.0972188338637352,
"learning_rate": 0.00018457068450815562,
"loss": 0.0224,
"step": 900
},
{
"epoch": 13.582089552238806,
"grad_norm": 0.36645978689193726,
"learning_rate": 0.00018423337761120618,
"loss": 0.023,
"step": 910
},
{
"epoch": 13.73134328358209,
"grad_norm": 0.09311423450708389,
"learning_rate": 0.00018389273866399275,
"loss": 0.0249,
"step": 920
},
{
"epoch": 13.880597014925373,
"grad_norm": 0.26524588465690613,
"learning_rate": 0.00018354878114129367,
"loss": 0.0283,
"step": 930
},
{
"epoch": 14.029850746268657,
"grad_norm": 0.09386450052261353,
"learning_rate": 0.00018320151864916135,
"loss": 0.0248,
"step": 940
},
{
"epoch": 14.17910447761194,
"grad_norm": 0.15628236532211304,
"learning_rate": 0.00018285096492438424,
"loss": 0.0242,
"step": 950
},
{
"epoch": 14.328358208955224,
"grad_norm": 0.09542356431484222,
"learning_rate": 0.00018249713383394303,
"loss": 0.0227,
"step": 960
},
{
"epoch": 14.477611940298507,
"grad_norm": 0.07725339382886887,
"learning_rate": 0.00018214003937446253,
"loss": 0.0238,
"step": 970
},
{
"epoch": 14.626865671641792,
"grad_norm": 0.10331985354423523,
"learning_rate": 0.0001817796956716578,
"loss": 0.0269,
"step": 980
},
{
"epoch": 14.776119402985074,
"grad_norm": 0.15463414788246155,
"learning_rate": 0.00018141611697977529,
"loss": 0.0217,
"step": 990
},
{
"epoch": 14.925373134328359,
"grad_norm": 0.14817017316818237,
"learning_rate": 0.0001810493176810292,
"loss": 0.025,
"step": 1000
},
{
"epoch": 14.925373134328359,
"eval_loss": 2.6268842220306396,
"eval_runtime": 34.7235,
"eval_samples_per_second": 9.129,
"eval_steps_per_second": 4.579,
"step": 1000
},
{
"epoch": 15.074626865671641,
"grad_norm": 0.12652496993541718,
"learning_rate": 0.00018067931228503246,
"loss": 0.0209,
"step": 1010
},
{
"epoch": 15.223880597014926,
"grad_norm": 0.10728372633457184,
"learning_rate": 0.00018030611542822257,
"loss": 0.0222,
"step": 1020
},
{
"epoch": 15.373134328358208,
"grad_norm": 0.06331785768270493,
"learning_rate": 0.00017992974187328305,
"loss": 0.0217,
"step": 1030
},
{
"epoch": 15.522388059701493,
"grad_norm": 0.21516793966293335,
"learning_rate": 0.000179550206508559,
"loss": 0.022,
"step": 1040
},
{
"epoch": 15.671641791044776,
"grad_norm": 0.06852179765701294,
"learning_rate": 0.00017916752434746856,
"loss": 0.0213,
"step": 1050
},
{
"epoch": 15.82089552238806,
"grad_norm": 0.07877414673566818,
"learning_rate": 0.00017878171052790868,
"loss": 0.0244,
"step": 1060
},
{
"epoch": 15.970149253731343,
"grad_norm": 0.28540125489234924,
"learning_rate": 0.00017839278031165658,
"loss": 0.0244,
"step": 1070
},
{
"epoch": 16.119402985074625,
"grad_norm": 0.060050830245018005,
"learning_rate": 0.00017800074908376584,
"loss": 0.0197,
"step": 1080
},
{
"epoch": 16.26865671641791,
"grad_norm": 0.14403647184371948,
"learning_rate": 0.0001776056323519579,
"loss": 0.0197,
"step": 1090
},
{
"epoch": 16.417910447761194,
"grad_norm": 0.14129780232906342,
"learning_rate": 0.00017720744574600863,
"loss": 0.0235,
"step": 1100
},
{
"epoch": 16.567164179104477,
"grad_norm": 0.14378951489925385,
"learning_rate": 0.00017680620501712996,
"loss": 0.0213,
"step": 1110
},
{
"epoch": 16.71641791044776,
"grad_norm": 0.09062501788139343,
"learning_rate": 0.00017640192603734692,
"loss": 0.0247,
"step": 1120
},
{
"epoch": 16.865671641791046,
"grad_norm": 0.33645904064178467,
"learning_rate": 0.00017599462479886974,
"loss": 0.0236,
"step": 1130
},
{
"epoch": 17.01492537313433,
"grad_norm": 0.10567416250705719,
"learning_rate": 0.00017558431741346122,
"loss": 0.0227,
"step": 1140
},
{
"epoch": 17.16417910447761,
"grad_norm": 0.3064155578613281,
"learning_rate": 0.00017517102011179933,
"loss": 0.0238,
"step": 1150
},
{
"epoch": 17.313432835820894,
"grad_norm": 0.256085067987442,
"learning_rate": 0.00017475474924283536,
"loss": 0.021,
"step": 1160
},
{
"epoch": 17.46268656716418,
"grad_norm": 0.8420864343643188,
"learning_rate": 0.000174335521273147,
"loss": 0.0241,
"step": 1170
},
{
"epoch": 17.611940298507463,
"grad_norm": 0.21292446553707123,
"learning_rate": 0.00017391335278628712,
"loss": 0.0251,
"step": 1180
},
{
"epoch": 17.761194029850746,
"grad_norm": 0.14111167192459106,
"learning_rate": 0.0001734882604821276,
"loss": 0.0251,
"step": 1190
},
{
"epoch": 17.91044776119403,
"grad_norm": 0.2401103377342224,
"learning_rate": 0.00017306026117619889,
"loss": 0.0294,
"step": 1200
},
{
"epoch": 18.059701492537314,
"grad_norm": 0.19608861207962036,
"learning_rate": 0.00017262937179902472,
"loss": 0.025,
"step": 1210
},
{
"epoch": 18.208955223880597,
"grad_norm": 0.10760471224784851,
"learning_rate": 0.00017219560939545246,
"loss": 0.0252,
"step": 1220
},
{
"epoch": 18.35820895522388,
"grad_norm": 0.1591477394104004,
"learning_rate": 0.0001717589911239788,
"loss": 0.0225,
"step": 1230
},
{
"epoch": 18.507462686567163,
"grad_norm": 0.21945133805274963,
"learning_rate": 0.00017131953425607104,
"loss": 0.0234,
"step": 1240
},
{
"epoch": 18.65671641791045,
"grad_norm": 0.147572860121727,
"learning_rate": 0.00017087725617548385,
"loss": 0.0216,
"step": 1250
},
{
"epoch": 18.80597014925373,
"grad_norm": 0.19251006841659546,
"learning_rate": 0.00017043217437757164,
"loss": 0.0258,
"step": 1260
},
{
"epoch": 18.955223880597014,
"grad_norm": 0.3675619959831238,
"learning_rate": 0.00016998430646859654,
"loss": 0.0263,
"step": 1270
},
{
"epoch": 19.104477611940297,
"grad_norm": 0.2582189440727234,
"learning_rate": 0.00016953367016503182,
"loss": 0.0258,
"step": 1280
},
{
"epoch": 19.253731343283583,
"grad_norm": 0.22689688205718994,
"learning_rate": 0.00016908028329286112,
"loss": 0.0226,
"step": 1290
},
{
"epoch": 19.402985074626866,
"grad_norm": 0.24671746790409088,
"learning_rate": 0.0001686241637868734,
"loss": 0.0224,
"step": 1300
},
{
"epoch": 19.55223880597015,
"grad_norm": 0.20800915360450745,
"learning_rate": 0.00016816532968995328,
"loss": 0.0276,
"step": 1310
},
{
"epoch": 19.701492537313435,
"grad_norm": 0.27109411358833313,
"learning_rate": 0.00016770379915236766,
"loss": 0.0345,
"step": 1320
},
{
"epoch": 19.850746268656717,
"grad_norm": 0.14396627247333527,
"learning_rate": 0.00016723959043104728,
"loss": 0.0244,
"step": 1330
},
{
"epoch": 20.0,
"grad_norm": 0.3481923043727875,
"learning_rate": 0.00016677272188886483,
"loss": 0.0256,
"step": 1340
},
{
"epoch": 20.149253731343283,
"grad_norm": 0.4062567353248596,
"learning_rate": 0.00016630321199390867,
"loss": 0.0195,
"step": 1350
},
{
"epoch": 20.298507462686565,
"grad_norm": 0.41222649812698364,
"learning_rate": 0.00016583107931875192,
"loss": 0.023,
"step": 1360
},
{
"epoch": 20.44776119402985,
"grad_norm": 0.15970966219902039,
"learning_rate": 0.00016535634253971794,
"loss": 0.024,
"step": 1370
},
{
"epoch": 20.597014925373134,
"grad_norm": 0.2472175806760788,
"learning_rate": 0.00016487902043614173,
"loss": 0.0259,
"step": 1380
},
{
"epoch": 20.746268656716417,
"grad_norm": 0.30030208826065063,
"learning_rate": 0.00016439913188962685,
"loss": 0.0287,
"step": 1390
},
{
"epoch": 20.895522388059703,
"grad_norm": 0.17114506661891937,
"learning_rate": 0.0001639166958832985,
"loss": 0.0279,
"step": 1400
},
{
"epoch": 21.044776119402986,
"grad_norm": 0.22200174629688263,
"learning_rate": 0.00016343173150105278,
"loss": 0.0254,
"step": 1410
},
{
"epoch": 21.19402985074627,
"grad_norm": 0.3260650336742401,
"learning_rate": 0.0001629442579268016,
"loss": 0.0252,
"step": 1420
},
{
"epoch": 21.34328358208955,
"grad_norm": 0.2409895658493042,
"learning_rate": 0.0001624542944437139,
"loss": 0.0255,
"step": 1430
},
{
"epoch": 21.492537313432837,
"grad_norm": 0.37012478709220886,
"learning_rate": 0.00016196186043345288,
"loss": 0.0293,
"step": 1440
},
{
"epoch": 21.64179104477612,
"grad_norm": 0.1385307013988495,
"learning_rate": 0.00016146697537540924,
"loss": 0.0273,
"step": 1450
},
{
"epoch": 21.791044776119403,
"grad_norm": 0.33346429467201233,
"learning_rate": 0.0001609696588459307,
"loss": 0.0294,
"step": 1460
},
{
"epoch": 21.940298507462686,
"grad_norm": 0.14864549040794373,
"learning_rate": 0.00016046993051754756,
"loss": 0.0269,
"step": 1470
},
{
"epoch": 22.08955223880597,
"grad_norm": 0.11181981861591339,
"learning_rate": 0.0001599678101581945,
"loss": 0.022,
"step": 1480
},
{
"epoch": 22.238805970149254,
"grad_norm": 0.18521477282047272,
"learning_rate": 0.00015946331763042867,
"loss": 0.0189,
"step": 1490
},
{
"epoch": 22.388059701492537,
"grad_norm": 0.07250799983739853,
"learning_rate": 0.00015895647289064396,
"loss": 0.0191,
"step": 1500
},
{
"epoch": 22.388059701492537,
"eval_loss": 2.901611566543579,
"eval_runtime": 34.7441,
"eval_samples_per_second": 9.124,
"eval_steps_per_second": 4.576,
"step": 1500
},
{
"epoch": 22.53731343283582,
"grad_norm": 0.11993291229009628,
"learning_rate": 0.0001584472959882815,
"loss": 0.0232,
"step": 1510
},
{
"epoch": 22.686567164179106,
"grad_norm": 0.08708442002534866,
"learning_rate": 0.0001579358070650367,
"loss": 0.0203,
"step": 1520
},
{
"epoch": 22.83582089552239,
"grad_norm": 0.17798146605491638,
"learning_rate": 0.00015742202635406235,
"loss": 0.0278,
"step": 1530
},
{
"epoch": 22.98507462686567,
"grad_norm": 0.33858776092529297,
"learning_rate": 0.0001569059741791684,
"loss": 0.0266,
"step": 1540
},
{
"epoch": 23.134328358208954,
"grad_norm": 0.14069178700447083,
"learning_rate": 0.0001563876709540178,
"loss": 0.0235,
"step": 1550
},
{
"epoch": 23.28358208955224,
"grad_norm": 0.23493413627147675,
"learning_rate": 0.00015586713718131922,
"loss": 0.0225,
"step": 1560
},
{
"epoch": 23.432835820895523,
"grad_norm": 0.23869164288043976,
"learning_rate": 0.0001553443934520159,
"loss": 0.0207,
"step": 1570
},
{
"epoch": 23.582089552238806,
"grad_norm": 0.04802766814827919,
"learning_rate": 0.00015481946044447099,
"loss": 0.0208,
"step": 1580
},
{
"epoch": 23.73134328358209,
"grad_norm": 0.10397443175315857,
"learning_rate": 0.00015429235892364994,
"loss": 0.0218,
"step": 1590
},
{
"epoch": 23.880597014925375,
"grad_norm": 0.06602863222360611,
"learning_rate": 0.00015376310974029873,
"loss": 0.0191,
"step": 1600
},
{
"epoch": 24.029850746268657,
"grad_norm": 0.12586766481399536,
"learning_rate": 0.0001532317338301192,
"loss": 0.0222,
"step": 1610
},
{
"epoch": 24.17910447761194,
"grad_norm": 0.058740176260471344,
"learning_rate": 0.00015269825221294098,
"loss": 0.0194,
"step": 1620
},
{
"epoch": 24.328358208955223,
"grad_norm": 0.046583324670791626,
"learning_rate": 0.0001521626859918898,
"loss": 0.0192,
"step": 1630
},
{
"epoch": 24.47761194029851,
"grad_norm": 0.07122834771871567,
"learning_rate": 0.00015162505635255287,
"loss": 0.0191,
"step": 1640
},
{
"epoch": 24.62686567164179,
"grad_norm": 0.17352773249149323,
"learning_rate": 0.0001510853845621409,
"loss": 0.0215,
"step": 1650
},
{
"epoch": 24.776119402985074,
"grad_norm": 0.039071228355169296,
"learning_rate": 0.00015054369196864644,
"loss": 0.0194,
"step": 1660
},
{
"epoch": 24.925373134328357,
"grad_norm": 0.0495145358145237,
"learning_rate": 0.00015000000000000001,
"loss": 0.0196,
"step": 1670
},
{
"epoch": 25.074626865671643,
"grad_norm": 0.06830393522977829,
"learning_rate": 0.0001494543301632219,
"loss": 0.0196,
"step": 1680
},
{
"epoch": 25.223880597014926,
"grad_norm": 0.046369269490242004,
"learning_rate": 0.0001489067040435717,
"loss": 0.0203,
"step": 1690
},
{
"epoch": 25.37313432835821,
"grad_norm": 0.08401994407176971,
"learning_rate": 0.00014835714330369446,
"loss": 0.0177,
"step": 1700
},
{
"epoch": 25.52238805970149,
"grad_norm": 0.15754491090774536,
"learning_rate": 0.0001478056696827636,
"loss": 0.0183,
"step": 1710
},
{
"epoch": 25.671641791044777,
"grad_norm": 0.03660280629992485,
"learning_rate": 0.00014725230499562119,
"loss": 0.0187,
"step": 1720
},
{
"epoch": 25.82089552238806,
"grad_norm": 0.03743986785411835,
"learning_rate": 0.00014669707113191483,
"loss": 0.0177,
"step": 1730
},
{
"epoch": 25.970149253731343,
"grad_norm": 0.061036206781864166,
"learning_rate": 0.00014613999005523174,
"loss": 0.0216,
"step": 1740
},
{
"epoch": 26.119402985074625,
"grad_norm": 0.052410393953323364,
"learning_rate": 0.00014558108380223012,
"loss": 0.0179,
"step": 1750
},
{
"epoch": 26.26865671641791,
"grad_norm": 0.04502878338098526,
"learning_rate": 0.00014502037448176734,
"loss": 0.0173,
"step": 1760
},
{
"epoch": 26.417910447761194,
"grad_norm": 0.09054642915725708,
"learning_rate": 0.00014445788427402528,
"loss": 0.0175,
"step": 1770
},
{
"epoch": 26.567164179104477,
"grad_norm": 0.04439815133810043,
"learning_rate": 0.00014389363542963306,
"loss": 0.0166,
"step": 1780
},
{
"epoch": 26.71641791044776,
"grad_norm": 0.057578206062316895,
"learning_rate": 0.00014332765026878687,
"loss": 0.0204,
"step": 1790
},
{
"epoch": 26.865671641791046,
"grad_norm": 0.04316519573330879,
"learning_rate": 0.00014275995118036693,
"loss": 0.0204,
"step": 1800
},
{
"epoch": 27.01492537313433,
"grad_norm": 0.044371455907821655,
"learning_rate": 0.00014219056062105193,
"loss": 0.0188,
"step": 1810
},
{
"epoch": 27.16417910447761,
"grad_norm": 0.0417649932205677,
"learning_rate": 0.00014161950111443077,
"loss": 0.0167,
"step": 1820
},
{
"epoch": 27.313432835820894,
"grad_norm": 0.08106902241706848,
"learning_rate": 0.0001410467952501114,
"loss": 0.0196,
"step": 1830
},
{
"epoch": 27.46268656716418,
"grad_norm": 0.1356726437807083,
"learning_rate": 0.00014047246568282736,
"loss": 0.0214,
"step": 1840
},
{
"epoch": 27.611940298507463,
"grad_norm": 0.11032121628522873,
"learning_rate": 0.00013989653513154165,
"loss": 0.0188,
"step": 1850
},
{
"epoch": 27.761194029850746,
"grad_norm": 0.1375930905342102,
"learning_rate": 0.0001393190263785479,
"loss": 0.0217,
"step": 1860
},
{
"epoch": 27.91044776119403,
"grad_norm": 0.12069140374660492,
"learning_rate": 0.00013873996226856933,
"loss": 0.0198,
"step": 1870
},
{
"epoch": 28.059701492537314,
"grad_norm": 0.043185122311115265,
"learning_rate": 0.00013815936570785487,
"loss": 0.0182,
"step": 1880
},
{
"epoch": 28.208955223880597,
"grad_norm": 0.14993228018283844,
"learning_rate": 0.00013757725966327322,
"loss": 0.0167,
"step": 1890
},
{
"epoch": 28.35820895522388,
"grad_norm": 0.0337139368057251,
"learning_rate": 0.00013699366716140435,
"loss": 0.0186,
"step": 1900
},
{
"epoch": 28.507462686567163,
"grad_norm": 0.26397907733917236,
"learning_rate": 0.0001364086112876284,
"loss": 0.0186,
"step": 1910
},
{
"epoch": 28.65671641791045,
"grad_norm": 0.06083005666732788,
"learning_rate": 0.00013582211518521273,
"loss": 0.0195,
"step": 1920
},
{
"epoch": 28.80597014925373,
"grad_norm": 0.10358510911464691,
"learning_rate": 0.00013523420205439646,
"loss": 0.0186,
"step": 1930
},
{
"epoch": 28.955223880597014,
"grad_norm": 0.06939724832773209,
"learning_rate": 0.00013464489515147238,
"loss": 0.0201,
"step": 1940
},
{
"epoch": 29.104477611940297,
"grad_norm": 0.038348495960235596,
"learning_rate": 0.00013405421778786737,
"loss": 0.0198,
"step": 1950
},
{
"epoch": 29.253731343283583,
"grad_norm": 0.045545510947704315,
"learning_rate": 0.00013346219332922016,
"loss": 0.0199,
"step": 1960
},
{
"epoch": 29.402985074626866,
"grad_norm": 0.03619709983468056,
"learning_rate": 0.0001328688451944569,
"loss": 0.018,
"step": 1970
},
{
"epoch": 29.55223880597015,
"grad_norm": 0.03463749587535858,
"learning_rate": 0.00013227419685486492,
"loss": 0.0192,
"step": 1980
},
{
"epoch": 29.701492537313435,
"grad_norm": 0.16788923740386963,
"learning_rate": 0.0001316782718331643,
"loss": 0.0209,
"step": 1990
},
{
"epoch": 29.850746268656717,
"grad_norm": 0.04790572449564934,
"learning_rate": 0.00013108109370257712,
"loss": 0.0211,
"step": 2000
},
{
"epoch": 29.850746268656717,
"eval_loss": 2.8224031925201416,
"eval_runtime": 34.7119,
"eval_samples_per_second": 9.132,
"eval_steps_per_second": 4.581,
"step": 2000
},
{
"epoch": 30.0,
"grad_norm": 0.33791643381118774,
"learning_rate": 0.00013048268608589533,
"loss": 0.0198,
"step": 2010
},
{
"epoch": 30.149253731343283,
"grad_norm": 0.04068003594875336,
"learning_rate": 0.00012988307265454597,
"loss": 0.0192,
"step": 2020
},
{
"epoch": 30.298507462686565,
"grad_norm": 0.05943215265870094,
"learning_rate": 0.00012928227712765504,
"loss": 0.0181,
"step": 2030
},
{
"epoch": 30.44776119402985,
"grad_norm": 0.18460267782211304,
"learning_rate": 0.00012868032327110904,
"loss": 0.0208,
"step": 2040
},
{
"epoch": 30.597014925373134,
"grad_norm": 0.061664972454309464,
"learning_rate": 0.00012807723489661495,
"loss": 0.0205,
"step": 2050
},
{
"epoch": 30.746268656716417,
"grad_norm": 0.36015012860298157,
"learning_rate": 0.0001274730358607583,
"loss": 0.0184,
"step": 2060
},
{
"epoch": 30.895522388059703,
"grad_norm": 0.1974068284034729,
"learning_rate": 0.00012686775006405946,
"loss": 0.0196,
"step": 2070
},
{
"epoch": 31.044776119402986,
"grad_norm": 0.12781082093715668,
"learning_rate": 0.0001262614014500282,
"loss": 0.021,
"step": 2080
},
{
"epoch": 31.19402985074627,
"grad_norm": 0.22529159486293793,
"learning_rate": 0.00012565401400421651,
"loss": 0.018,
"step": 2090
},
{
"epoch": 31.34328358208955,
"grad_norm": 0.0831318125128746,
"learning_rate": 0.00012504561175326985,
"loss": 0.0194,
"step": 2100
},
{
"epoch": 31.492537313432837,
"grad_norm": 0.07306008040904999,
"learning_rate": 0.0001244362187639767,
"loss": 0.0183,
"step": 2110
},
{
"epoch": 31.64179104477612,
"grad_norm": 0.08519799262285233,
"learning_rate": 0.0001238258591423165,
"loss": 0.0184,
"step": 2120
},
{
"epoch": 31.791044776119403,
"grad_norm": 0.061566926538944244,
"learning_rate": 0.00012321455703250616,
"loss": 0.0198,
"step": 2130
},
{
"epoch": 31.940298507462686,
"grad_norm": 0.04921621084213257,
"learning_rate": 0.0001226023366160449,
"loss": 0.0192,
"step": 2140
},
{
"epoch": 32.08955223880597,
"grad_norm": 0.0568234883248806,
"learning_rate": 0.00012198922211075778,
"loss": 0.0186,
"step": 2150
},
{
"epoch": 32.23880597014925,
"grad_norm": 0.09815705567598343,
"learning_rate": 0.00012137523776983757,
"loss": 0.0175,
"step": 2160
},
{
"epoch": 32.38805970149254,
"grad_norm": 0.18607860803604126,
"learning_rate": 0.00012076040788088554,
"loss": 0.0178,
"step": 2170
},
{
"epoch": 32.53731343283582,
"grad_norm": 0.1101093739271164,
"learning_rate": 0.00012014475676495052,
"loss": 0.0179,
"step": 2180
},
{
"epoch": 32.6865671641791,
"grad_norm": 0.03449343517422676,
"learning_rate": 0.000119528308775567,
"loss": 0.0206,
"step": 2190
},
{
"epoch": 32.83582089552239,
"grad_norm": 0.042853228747844696,
"learning_rate": 0.00011891108829779165,
"loss": 0.0191,
"step": 2200
},
{
"epoch": 32.985074626865675,
"grad_norm": 0.048137255012989044,
"learning_rate": 0.00011829311974723867,
"loss": 0.0182,
"step": 2210
},
{
"epoch": 33.134328358208954,
"grad_norm": 0.19318771362304688,
"learning_rate": 0.00011767442756911417,
"loss": 0.0178,
"step": 2220
},
{
"epoch": 33.28358208955224,
"grad_norm": 0.08087719976902008,
"learning_rate": 0.00011705503623724898,
"loss": 0.0177,
"step": 2230
},
{
"epoch": 33.43283582089552,
"grad_norm": 0.04939524829387665,
"learning_rate": 0.00011643497025313061,
"loss": 0.0194,
"step": 2240
},
{
"epoch": 33.582089552238806,
"grad_norm": 0.04750213399529457,
"learning_rate": 0.0001158142541449341,
"loss": 0.0183,
"step": 2250
},
{
"epoch": 33.73134328358209,
"grad_norm": 0.04823266714811325,
"learning_rate": 0.0001151929124665516,
"loss": 0.0172,
"step": 2260
},
{
"epoch": 33.88059701492537,
"grad_norm": 0.03645576909184456,
"learning_rate": 0.00011457096979662114,
"loss": 0.018,
"step": 2270
},
{
"epoch": 34.02985074626866,
"grad_norm": 0.025920415297150612,
"learning_rate": 0.00011394845073755455,
"loss": 0.0178,
"step": 2280
},
{
"epoch": 34.17910447761194,
"grad_norm": 0.038914065808057785,
"learning_rate": 0.00011332537991456398,
"loss": 0.0168,
"step": 2290
},
{
"epoch": 34.32835820895522,
"grad_norm": 0.07641326636075974,
"learning_rate": 0.00011270178197468789,
"loss": 0.0194,
"step": 2300
},
{
"epoch": 34.47761194029851,
"grad_norm": 0.08722823858261108,
"learning_rate": 0.00011207768158581613,
"loss": 0.0199,
"step": 2310
},
{
"epoch": 34.62686567164179,
"grad_norm": 0.4223034679889679,
"learning_rate": 0.00011145310343571411,
"loss": 0.02,
"step": 2320
},
{
"epoch": 34.776119402985074,
"grad_norm": 0.12001374363899231,
"learning_rate": 0.0001108280722310462,
"loss": 0.0173,
"step": 2330
},
{
"epoch": 34.92537313432836,
"grad_norm": 0.14997516572475433,
"learning_rate": 0.00011020261269639842,
"loss": 0.0195,
"step": 2340
},
{
"epoch": 35.07462686567164,
"grad_norm": 0.052674125880002975,
"learning_rate": 0.00010957674957330042,
"loss": 0.018,
"step": 2350
},
{
"epoch": 35.223880597014926,
"grad_norm": 0.06538953632116318,
"learning_rate": 0.00010895050761924668,
"loss": 0.0168,
"step": 2360
},
{
"epoch": 35.37313432835821,
"grad_norm": 0.08561014384031296,
"learning_rate": 0.00010832391160671729,
"loss": 0.0177,
"step": 2370
},
{
"epoch": 35.52238805970149,
"grad_norm": 0.057601574808359146,
"learning_rate": 0.00010769698632219794,
"loss": 0.0176,
"step": 2380
},
{
"epoch": 35.67164179104478,
"grad_norm": 0.051563702523708344,
"learning_rate": 0.00010706975656519946,
"loss": 0.0196,
"step": 2390
},
{
"epoch": 35.82089552238806,
"grad_norm": 0.034905366599559784,
"learning_rate": 0.00010644224714727681,
"loss": 0.0183,
"step": 2400
},
{
"epoch": 35.97014925373134,
"grad_norm": 0.053671374917030334,
"learning_rate": 0.00010581448289104758,
"loss": 0.0179,
"step": 2410
},
{
"epoch": 36.11940298507463,
"grad_norm": 0.06900997459888458,
"learning_rate": 0.00010518648862921012,
"loss": 0.0187,
"step": 2420
},
{
"epoch": 36.26865671641791,
"grad_norm": 0.12105145305395126,
"learning_rate": 0.00010455828920356115,
"loss": 0.0164,
"step": 2430
},
{
"epoch": 36.417910447761194,
"grad_norm": 0.04216486215591431,
"learning_rate": 0.00010392990946401313,
"loss": 0.0187,
"step": 2440
},
{
"epoch": 36.56716417910448,
"grad_norm": 0.03389362245798111,
"learning_rate": 0.00010330137426761135,
"loss": 0.02,
"step": 2450
},
{
"epoch": 36.71641791044776,
"grad_norm": 0.06512041389942169,
"learning_rate": 0.00010267270847755048,
"loss": 0.0181,
"step": 2460
},
{
"epoch": 36.865671641791046,
"grad_norm": 0.052388746291399,
"learning_rate": 0.00010204393696219117,
"loss": 0.0171,
"step": 2470
},
{
"epoch": 37.014925373134325,
"grad_norm": 0.16947126388549805,
"learning_rate": 0.00010141508459407623,
"loss": 0.0205,
"step": 2480
},
{
"epoch": 37.16417910447761,
"grad_norm": 0.03826919198036194,
"learning_rate": 0.00010078617624894684,
"loss": 0.0168,
"step": 2490
},
{
"epoch": 37.3134328358209,
"grad_norm": 0.033569592982530594,
"learning_rate": 0.00010015723680475846,
"loss": 0.0162,
"step": 2500
},
{
"epoch": 37.3134328358209,
"eval_loss": 2.9721479415893555,
"eval_runtime": 34.6726,
"eval_samples_per_second": 9.143,
"eval_steps_per_second": 4.586,
"step": 2500
},
{
"epoch": 37.46268656716418,
"grad_norm": 0.03183290734887123,
"learning_rate": 9.95282911406968e-05,
"loss": 0.0175,
"step": 2510
},
{
"epoch": 37.61194029850746,
"grad_norm": 0.03882099688053131,
"learning_rate": 9.889936413619356e-05,
"loss": 0.0162,
"step": 2520
},
{
"epoch": 37.76119402985075,
"grad_norm": 0.08923082053661346,
"learning_rate": 9.827048066994225e-05,
"loss": 0.0206,
"step": 2530
},
{
"epoch": 37.91044776119403,
"grad_norm": 0.07512518018484116,
"learning_rate": 9.764166561891432e-05,
"loss": 0.018,
"step": 2540
},
{
"epoch": 38.059701492537314,
"grad_norm": 0.07171567529439926,
"learning_rate": 9.70129438573747e-05,
"loss": 0.0189,
"step": 2550
},
{
"epoch": 38.208955223880594,
"grad_norm": 0.041823286563158035,
"learning_rate": 9.63843402558981e-05,
"loss": 0.0182,
"step": 2560
},
{
"epoch": 38.35820895522388,
"grad_norm": 0.042579639703035355,
"learning_rate": 9.57558796803852e-05,
"loss": 0.015,
"step": 2570
},
{
"epoch": 38.507462686567166,
"grad_norm": 0.032053157687187195,
"learning_rate": 9.512758699107879e-05,
"loss": 0.0192,
"step": 2580
},
{
"epoch": 38.656716417910445,
"grad_norm": 0.03747331723570824,
"learning_rate": 9.449948704158071e-05,
"loss": 0.0185,
"step": 2590
},
{
"epoch": 38.80597014925373,
"grad_norm": 0.03378698602318764,
"learning_rate": 9.38716046778684e-05,
"loss": 0.0177,
"step": 2600
},
{
"epoch": 38.95522388059702,
"grad_norm": 0.0920565128326416,
"learning_rate": 9.324396473731217e-05,
"loss": 0.0186,
"step": 2610
},
{
"epoch": 39.1044776119403,
"grad_norm": 0.10099617391824722,
"learning_rate": 9.261659204769284e-05,
"loss": 0.017,
"step": 2620
},
{
"epoch": 39.25373134328358,
"grad_norm": 0.040173519402742386,
"learning_rate": 9.198951142621929e-05,
"loss": 0.0175,
"step": 2630
},
{
"epoch": 39.40298507462686,
"grad_norm": 0.04502606391906738,
"learning_rate": 9.136274767854716e-05,
"loss": 0.0181,
"step": 2640
},
{
"epoch": 39.55223880597015,
"grad_norm": 0.039172179996967316,
"learning_rate": 9.07363255977973e-05,
"loss": 0.018,
"step": 2650
},
{
"epoch": 39.701492537313435,
"grad_norm": 0.05952875688672066,
"learning_rate": 9.011026996357503e-05,
"loss": 0.0171,
"step": 2660
},
{
"epoch": 39.850746268656714,
"grad_norm": 0.03593125194311142,
"learning_rate": 8.948460554099018e-05,
"loss": 0.017,
"step": 2670
},
{
"epoch": 40.0,
"grad_norm": 0.041005708277225494,
"learning_rate": 8.885935707967716e-05,
"loss": 0.0164,
"step": 2680
},
{
"epoch": 40.149253731343286,
"grad_norm": 0.03663647174835205,
"learning_rate": 8.823454931281616e-05,
"loss": 0.0177,
"step": 2690
},
{
"epoch": 40.298507462686565,
"grad_norm": 0.034003522247076035,
"learning_rate": 8.76102069561545e-05,
"loss": 0.0166,
"step": 2700
},
{
"epoch": 40.44776119402985,
"grad_norm": 0.06096246466040611,
"learning_rate": 8.698635470702923e-05,
"loss": 0.0166,
"step": 2710
},
{
"epoch": 40.59701492537313,
"grad_norm": 0.03656260296702385,
"learning_rate": 8.636301724339004e-05,
"loss": 0.0162,
"step": 2720
},
{
"epoch": 40.74626865671642,
"grad_norm": 0.03990943357348442,
"learning_rate": 8.574021922282292e-05,
"loss": 0.018,
"step": 2730
},
{
"epoch": 40.8955223880597,
"grad_norm": 0.03584331274032593,
"learning_rate": 8.511798528157512e-05,
"loss": 0.0184,
"step": 2740
},
{
"epoch": 41.04477611940298,
"grad_norm": 0.027507085353136063,
"learning_rate": 8.449634003358022e-05,
"loss": 0.0163,
"step": 2750
},
{
"epoch": 41.19402985074627,
"grad_norm": 0.03266240283846855,
"learning_rate": 8.387530806948476e-05,
"loss": 0.0175,
"step": 2760
},
{
"epoch": 41.343283582089555,
"grad_norm": 0.041966021060943604,
"learning_rate": 8.325491395567541e-05,
"loss": 0.0175,
"step": 2770
},
{
"epoch": 41.492537313432834,
"grad_norm": 0.03868953138589859,
"learning_rate": 8.263518223330697e-05,
"loss": 0.0179,
"step": 2780
},
{
"epoch": 41.64179104477612,
"grad_norm": 0.03475317731499672,
"learning_rate": 8.201613741733203e-05,
"loss": 0.0159,
"step": 2790
},
{
"epoch": 41.791044776119406,
"grad_norm": 0.04897564649581909,
"learning_rate": 8.13978039955308e-05,
"loss": 0.0183,
"step": 2800
},
{
"epoch": 41.940298507462686,
"grad_norm": 0.03372865915298462,
"learning_rate": 8.078020642754274e-05,
"loss": 0.0169,
"step": 2810
},
{
"epoch": 42.08955223880597,
"grad_norm": 0.046989791095256805,
"learning_rate": 8.016336914389874e-05,
"loss": 0.0174,
"step": 2820
},
{
"epoch": 42.23880597014925,
"grad_norm": 0.03214934468269348,
"learning_rate": 7.954731654505491e-05,
"loss": 0.0146,
"step": 2830
},
{
"epoch": 42.38805970149254,
"grad_norm": 0.05004828795790672,
"learning_rate": 7.89320730004274e-05,
"loss": 0.0182,
"step": 2840
},
{
"epoch": 42.53731343283582,
"grad_norm": 0.031027644872665405,
"learning_rate": 7.831766284742807e-05,
"loss": 0.0156,
"step": 2850
},
{
"epoch": 42.6865671641791,
"grad_norm": 0.04323369264602661,
"learning_rate": 7.77041103905023e-05,
"loss": 0.0177,
"step": 2860
},
{
"epoch": 42.83582089552239,
"grad_norm": 0.03035310097038746,
"learning_rate": 7.709143990016702e-05,
"loss": 0.0164,
"step": 2870
},
{
"epoch": 42.985074626865675,
"grad_norm": 0.03815029561519623,
"learning_rate": 7.6479675612051e-05,
"loss": 0.0205,
"step": 2880
},
{
"epoch": 43.134328358208954,
"grad_norm": 0.039352428168058395,
"learning_rate": 7.586884172593609e-05,
"loss": 0.0169,
"step": 2890
},
{
"epoch": 43.28358208955224,
"grad_norm": 0.031735971570014954,
"learning_rate": 7.525896240479976e-05,
"loss": 0.0159,
"step": 2900
},
{
"epoch": 43.43283582089552,
"grad_norm": 0.03945886343717575,
"learning_rate": 7.465006177385953e-05,
"loss": 0.0166,
"step": 2910
},
{
"epoch": 43.582089552238806,
"grad_norm": 0.030156582593917847,
"learning_rate": 7.404216391961847e-05,
"loss": 0.0176,
"step": 2920
},
{
"epoch": 43.73134328358209,
"grad_norm": 0.03693369776010513,
"learning_rate": 7.343529288891239e-05,
"loss": 0.0159,
"step": 2930
},
{
"epoch": 43.88059701492537,
"grad_norm": 0.03338786959648132,
"learning_rate": 7.282947268795877e-05,
"loss": 0.0171,
"step": 2940
},
{
"epoch": 44.02985074626866,
"grad_norm": 0.021122202277183533,
"learning_rate": 7.222472728140695e-05,
"loss": 0.0178,
"step": 2950
},
{
"epoch": 44.17910447761194,
"grad_norm": 0.02877069264650345,
"learning_rate": 7.162108059139032e-05,
"loss": 0.0186,
"step": 2960
},
{
"epoch": 44.32835820895522,
"grad_norm": 0.07340731471776962,
"learning_rate": 7.101855649657991e-05,
"loss": 0.0187,
"step": 2970
},
{
"epoch": 44.47761194029851,
"grad_norm": 0.04397398233413696,
"learning_rate": 7.041717883123977e-05,
"loss": 0.0169,
"step": 2980
},
{
"epoch": 44.62686567164179,
"grad_norm": 0.03213610127568245,
"learning_rate": 6.981697138428434e-05,
"loss": 0.017,
"step": 2990
},
{
"epoch": 44.776119402985074,
"grad_norm": 0.024713682010769844,
"learning_rate": 6.921795789833723e-05,
"loss": 0.016,
"step": 3000
},
{
"epoch": 44.776119402985074,
"eval_loss": 3.2444493770599365,
"eval_runtime": 34.6924,
"eval_samples_per_second": 9.137,
"eval_steps_per_second": 4.583,
"step": 3000
},
{
"epoch": 44.92537313432836,
"grad_norm": 0.028246300294995308,
"learning_rate": 6.862016206879216e-05,
"loss": 0.017,
"step": 3010
},
{
"epoch": 45.07462686567164,
"grad_norm": 0.03457006812095642,
"learning_rate": 6.802360754287547e-05,
"loss": 0.0162,
"step": 3020
},
{
"epoch": 45.223880597014926,
"grad_norm": 0.04756162688136101,
"learning_rate": 6.742831791871096e-05,
"loss": 0.0176,
"step": 3030
},
{
"epoch": 45.37313432835821,
"grad_norm": 0.03616981953382492,
"learning_rate": 6.683431674438612e-05,
"loss": 0.0153,
"step": 3040
},
{
"epoch": 45.52238805970149,
"grad_norm": 0.03674984350800514,
"learning_rate": 6.624162751702076e-05,
"loss": 0.0155,
"step": 3050
},
{
"epoch": 45.67164179104478,
"grad_norm": 0.026007067412137985,
"learning_rate": 6.565027368183769e-05,
"loss": 0.0167,
"step": 3060
},
{
"epoch": 45.82089552238806,
"grad_norm": 0.027320370078086853,
"learning_rate": 6.506027863123492e-05,
"loss": 0.0183,
"step": 3070
},
{
"epoch": 45.97014925373134,
"grad_norm": 0.037796761840581894,
"learning_rate": 6.447166570386063e-05,
"loss": 0.0184,
"step": 3080
},
{
"epoch": 46.11940298507463,
"grad_norm": 0.03350226208567619,
"learning_rate": 6.388445818368991e-05,
"loss": 0.0156,
"step": 3090
},
{
"epoch": 46.26865671641791,
"grad_norm": 0.026382336392998695,
"learning_rate": 6.329867929910347e-05,
"loss": 0.0156,
"step": 3100
},
{
"epoch": 46.417910447761194,
"grad_norm": 0.03873557224869728,
"learning_rate": 6.271435222196916e-05,
"loss": 0.021,
"step": 3110
},
{
"epoch": 46.56716417910448,
"grad_norm": 0.039839617908000946,
"learning_rate": 6.213150006672499e-05,
"loss": 0.0172,
"step": 3120
},
{
"epoch": 46.71641791044776,
"grad_norm": 0.03648209199309349,
"learning_rate": 6.15501458894651e-05,
"loss": 0.0161,
"step": 3130
},
{
"epoch": 46.865671641791046,
"grad_norm": 0.04052448272705078,
"learning_rate": 6.097031268702746e-05,
"loss": 0.0178,
"step": 3140
},
{
"epoch": 47.014925373134325,
"grad_norm": 0.03359508886933327,
"learning_rate": 6.039202339608432e-05,
"loss": 0.0172,
"step": 3150
},
{
"epoch": 47.16417910447761,
"grad_norm": 0.03347504511475563,
"learning_rate": 5.981530089223489e-05,
"loss": 0.0162,
"step": 3160
},
{
"epoch": 47.3134328358209,
"grad_norm": 0.03658764436841011,
"learning_rate": 5.924016798910037e-05,
"loss": 0.0164,
"step": 3170
},
{
"epoch": 47.46268656716418,
"grad_norm": 0.02819441817700863,
"learning_rate": 5.866664743742162e-05,
"loss": 0.0169,
"step": 3180
},
{
"epoch": 47.61194029850746,
"grad_norm": 0.03421681374311447,
"learning_rate": 5.809476192415905e-05,
"loss": 0.0185,
"step": 3190
},
{
"epoch": 47.76119402985075,
"grad_norm": 0.050075121223926544,
"learning_rate": 5.752453407159522e-05,
"loss": 0.0178,
"step": 3200
},
{
"epoch": 47.91044776119403,
"grad_norm": 0.030316824093461037,
"learning_rate": 5.69559864364402e-05,
"loss": 0.0151,
"step": 3210
},
{
"epoch": 48.059701492537314,
"grad_norm": 0.053834062069654465,
"learning_rate": 5.6389141508938903e-05,
"loss": 0.0173,
"step": 3220
},
{
"epoch": 48.208955223880594,
"grad_norm": 0.036416202783584595,
"learning_rate": 5.5824021711981686e-05,
"loss": 0.0158,
"step": 3230
},
{
"epoch": 48.35820895522388,
"grad_norm": 0.030989298596978188,
"learning_rate": 5.5260649400217326e-05,
"loss": 0.017,
"step": 3240
},
{
"epoch": 48.507462686567166,
"grad_norm": 0.023621903732419014,
"learning_rate": 5.469904685916861e-05,
"loss": 0.0202,
"step": 3250
},
{
"epoch": 48.656716417910445,
"grad_norm": 0.03500565141439438,
"learning_rate": 5.4139236304350935e-05,
"loss": 0.0178,
"step": 3260
},
{
"epoch": 48.80597014925373,
"grad_norm": 0.040891725569963455,
"learning_rate": 5.3581239880393375e-05,
"loss": 0.0179,
"step": 3270
},
{
"epoch": 48.95522388059702,
"grad_norm": 0.027273844927549362,
"learning_rate": 5.302507966016295e-05,
"loss": 0.014,
"step": 3280
},
{
"epoch": 49.1044776119403,
"grad_norm": 0.02311500906944275,
"learning_rate": 5.247077764389099e-05,
"loss": 0.0167,
"step": 3290
},
{
"epoch": 49.25373134328358,
"grad_norm": 0.052333466708660126,
"learning_rate": 5.191835575830352e-05,
"loss": 0.0177,
"step": 3300
},
{
"epoch": 49.40298507462686,
"grad_norm": 0.042159441858530045,
"learning_rate": 5.136783585575336e-05,
"loss": 0.0162,
"step": 3310
},
{
"epoch": 49.55223880597015,
"grad_norm": 0.025089124217629433,
"learning_rate": 5.081923971335582e-05,
"loss": 0.0166,
"step": 3320
},
{
"epoch": 49.701492537313435,
"grad_norm": 0.03574312478303909,
"learning_rate": 5.0272589032127594e-05,
"loss": 0.0168,
"step": 3330
},
{
"epoch": 49.850746268656714,
"grad_norm": 0.035489972680807114,
"learning_rate": 4.972790543612783e-05,
"loss": 0.0149,
"step": 3340
},
{
"epoch": 50.0,
"grad_norm": 0.04449532926082611,
"learning_rate": 4.918521047160308e-05,
"loss": 0.0185,
"step": 3350
},
{
"epoch": 50.149253731343286,
"grad_norm": 0.037168972194194794,
"learning_rate": 4.864452560613485e-05,
"loss": 0.0159,
"step": 3360
},
{
"epoch": 50.298507462686565,
"grad_norm": 0.032931167632341385,
"learning_rate": 4.810587222779043e-05,
"loss": 0.0165,
"step": 3370
},
{
"epoch": 50.44776119402985,
"grad_norm": 0.08401083201169968,
"learning_rate": 4.756927164427685e-05,
"loss": 0.0186,
"step": 3380
},
{
"epoch": 50.59701492537313,
"grad_norm": 0.03464508801698685,
"learning_rate": 4.703474508209793e-05,
"loss": 0.0156,
"step": 3390
},
{
"epoch": 50.74626865671642,
"grad_norm": 0.03466491773724556,
"learning_rate": 4.650231368571486e-05,
"loss": 0.0173,
"step": 3400
},
{
"epoch": 50.8955223880597,
"grad_norm": 0.030173856765031815,
"learning_rate": 4.597199851670932e-05,
"loss": 0.0161,
"step": 3410
},
{
"epoch": 51.04477611940298,
"grad_norm": 0.029572507366538048,
"learning_rate": 4.54438205529508e-05,
"loss": 0.0162,
"step": 3420
},
{
"epoch": 51.19402985074627,
"grad_norm": 0.032448384910821915,
"learning_rate": 4.491780068776663e-05,
"loss": 0.0173,
"step": 3430
},
{
"epoch": 51.343283582089555,
"grad_norm": 0.029228707775473595,
"learning_rate": 4.4393959729115244e-05,
"loss": 0.0182,
"step": 3440
},
{
"epoch": 51.492537313432834,
"grad_norm": 0.03576023131608963,
"learning_rate": 4.387231839876349e-05,
"loss": 0.0156,
"step": 3450
},
{
"epoch": 51.64179104477612,
"grad_norm": 0.02814898081123829,
"learning_rate": 4.335289733146665e-05,
"loss": 0.0163,
"step": 3460
},
{
"epoch": 51.791044776119406,
"grad_norm": 0.05810336023569107,
"learning_rate": 4.283571707415214e-05,
"loss": 0.0172,
"step": 3470
},
{
"epoch": 51.940298507462686,
"grad_norm": 0.03793029487133026,
"learning_rate": 4.2320798085107036e-05,
"loss": 0.0177,
"step": 3480
},
{
"epoch": 52.08955223880597,
"grad_norm": 0.02555203065276146,
"learning_rate": 4.18081607331685e-05,
"loss": 0.0168,
"step": 3490
},
{
"epoch": 52.23880597014925,
"grad_norm": 0.028642071411013603,
"learning_rate": 4.129782529691815e-05,
"loss": 0.0167,
"step": 3500
},
{
"epoch": 52.23880597014925,
"eval_loss": 3.4037022590637207,
"eval_runtime": 34.709,
"eval_samples_per_second": 9.133,
"eval_steps_per_second": 4.581,
"step": 3500
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 75,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.873632111370404e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}