Model_1 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998899122967965,
"eval_steps": 500,
"global_step": 3406,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0029356720854280577,
"grad_norm": 2.3989222049713135,
"learning_rate": 1.9999583060217186e-05,
"loss": 1.4046,
"step": 10
},
{
"epoch": 0.005871344170856115,
"grad_norm": 2.4013407230377197,
"learning_rate": 1.9998315387870395e-05,
"loss": 1.2805,
"step": 20
},
{
"epoch": 0.008807016256284174,
"grad_norm": 2.5987584590911865,
"learning_rate": 1.9996197048273697e-05,
"loss": 1.2627,
"step": 30
},
{
"epoch": 0.01174268834171223,
"grad_norm": 2.5626957416534424,
"learning_rate": 1.999322822165767e-05,
"loss": 1.34,
"step": 40
},
{
"epoch": 0.014678360427140288,
"grad_norm": 2.4768199920654297,
"learning_rate": 1.998940916061322e-05,
"loss": 1.2963,
"step": 50
},
{
"epoch": 0.017614032512568347,
"grad_norm": 2.1327669620513916,
"learning_rate": 1.9984740190070102e-05,
"loss": 1.3513,
"step": 60
},
{
"epoch": 0.020549704597996404,
"grad_norm": 2.7165908813476562,
"learning_rate": 1.9979221707269273e-05,
"loss": 1.2629,
"step": 70
},
{
"epoch": 0.02348537668342446,
"grad_norm": 2.5773496627807617,
"learning_rate": 1.997285418172908e-05,
"loss": 1.2504,
"step": 80
},
{
"epoch": 0.02642104876885252,
"grad_norm": 2.7524304389953613,
"learning_rate": 1.9965638155205335e-05,
"loss": 1.2854,
"step": 90
},
{
"epoch": 0.029356720854280576,
"grad_norm": 2.542572498321533,
"learning_rate": 1.995757424164521e-05,
"loss": 1.2583,
"step": 100
},
{
"epoch": 0.03229239293970863,
"grad_norm": 3.595125198364258,
"learning_rate": 1.9948663127135003e-05,
"loss": 1.2612,
"step": 110
},
{
"epoch": 0.035228065025136694,
"grad_norm": 2.8669538497924805,
"learning_rate": 1.9938905569841754e-05,
"loss": 1.237,
"step": 120
},
{
"epoch": 0.03816373711056475,
"grad_norm": 3.1492984294891357,
"learning_rate": 1.9928302399948767e-05,
"loss": 1.2394,
"step": 130
},
{
"epoch": 0.04109940919599281,
"grad_norm": 3.0048630237579346,
"learning_rate": 1.991685451958495e-05,
"loss": 1.1899,
"step": 140
},
{
"epoch": 0.04403508128142086,
"grad_norm": 2.9907774925231934,
"learning_rate": 1.990456290274808e-05,
"loss": 1.1939,
"step": 150
},
{
"epoch": 0.04697075336684892,
"grad_norm": 3.010820150375366,
"learning_rate": 1.9891428595221914e-05,
"loss": 1.2212,
"step": 160
},
{
"epoch": 0.049906425452276984,
"grad_norm": 2.486607551574707,
"learning_rate": 1.9877452714487232e-05,
"loss": 1.1824,
"step": 170
},
{
"epoch": 0.05284209753770504,
"grad_norm": 2.491534471511841,
"learning_rate": 1.9862636449626752e-05,
"loss": 1.2118,
"step": 180
},
{
"epoch": 0.0557777696231331,
"grad_norm": 2.5148768424987793,
"learning_rate": 1.9846981061223958e-05,
"loss": 1.2377,
"step": 190
},
{
"epoch": 0.05871344170856115,
"grad_norm": 2.640007734298706,
"learning_rate": 1.9830487881255864e-05,
"loss": 1.1995,
"step": 200
},
{
"epoch": 0.06164911379398921,
"grad_norm": 2.6175191402435303,
"learning_rate": 1.981315831297966e-05,
"loss": 1.1114,
"step": 210
},
{
"epoch": 0.06458478587941727,
"grad_norm": 2.8480780124664307,
"learning_rate": 1.9794993830813358e-05,
"loss": 1.1897,
"step": 220
},
{
"epoch": 0.06752045796484532,
"grad_norm": 2.635657787322998,
"learning_rate": 1.9775995980210306e-05,
"loss": 1.1592,
"step": 230
},
{
"epoch": 0.07045613005027339,
"grad_norm": 2.8263065814971924,
"learning_rate": 1.9756166377527734e-05,
"loss": 1.1712,
"step": 240
},
{
"epoch": 0.07339180213570144,
"grad_norm": 3.2777886390686035,
"learning_rate": 1.9735506709889213e-05,
"loss": 1.1724,
"step": 250
},
{
"epoch": 0.0763274742211295,
"grad_norm": 2.587409019470215,
"learning_rate": 1.9714018735041125e-05,
"loss": 1.1767,
"step": 260
},
{
"epoch": 0.07926314630655756,
"grad_norm": 2.9432199001312256,
"learning_rate": 1.9691704281203098e-05,
"loss": 1.1553,
"step": 270
},
{
"epoch": 0.08219881839198562,
"grad_norm": 3.1995699405670166,
"learning_rate": 1.966856524691247e-05,
"loss": 1.1577,
"step": 280
},
{
"epoch": 0.08513449047741367,
"grad_norm": 2.805522918701172,
"learning_rate": 1.9644603600862753e-05,
"loss": 1.1627,
"step": 290
},
{
"epoch": 0.08807016256284173,
"grad_norm": 2.6327457427978516,
"learning_rate": 1.961982138173615e-05,
"loss": 1.1483,
"step": 300
},
{
"epoch": 0.09100583464826979,
"grad_norm": 4.197879314422607,
"learning_rate": 1.959422069803007e-05,
"loss": 1.2015,
"step": 310
},
{
"epoch": 0.09394150673369785,
"grad_norm": 3.2353358268737793,
"learning_rate": 1.956780372787777e-05,
"loss": 1.1068,
"step": 320
},
{
"epoch": 0.0968771788191259,
"grad_norm": 2.8246076107025146,
"learning_rate": 1.9540572718863012e-05,
"loss": 1.1424,
"step": 330
},
{
"epoch": 0.09981285090455397,
"grad_norm": 4.32204532623291,
"learning_rate": 1.9512529987828853e-05,
"loss": 1.1508,
"step": 340
},
{
"epoch": 0.10274852298998202,
"grad_norm": 2.8142335414886475,
"learning_rate": 1.9483677920680512e-05,
"loss": 1.1623,
"step": 350
},
{
"epoch": 0.10568419507541008,
"grad_norm": 2.9336957931518555,
"learning_rate": 1.9454018972182383e-05,
"loss": 1.161,
"step": 360
},
{
"epoch": 0.10861986716083813,
"grad_norm": 2.9903533458709717,
"learning_rate": 1.9423555665749182e-05,
"loss": 1.1444,
"step": 370
},
{
"epoch": 0.1115555392462662,
"grad_norm": 2.8897149562835693,
"learning_rate": 1.939229059323124e-05,
"loss": 1.171,
"step": 380
},
{
"epoch": 0.11449121133169425,
"grad_norm": 3.222294807434082,
"learning_rate": 1.9360226414694008e-05,
"loss": 1.1365,
"step": 390
},
{
"epoch": 0.1174268834171223,
"grad_norm": 2.9000742435455322,
"learning_rate": 1.932736585819171e-05,
"loss": 1.1342,
"step": 400
},
{
"epoch": 0.12036255550255036,
"grad_norm": 2.682969093322754,
"learning_rate": 1.929371171953526e-05,
"loss": 1.0428,
"step": 410
},
{
"epoch": 0.12329822758797843,
"grad_norm": 2.8611629009246826,
"learning_rate": 1.9259266862054366e-05,
"loss": 1.1135,
"step": 420
},
{
"epoch": 0.12623389967340648,
"grad_norm": 2.9910166263580322,
"learning_rate": 1.9224034216353947e-05,
"loss": 1.1154,
"step": 430
},
{
"epoch": 0.12916957175883453,
"grad_norm": 3.245227336883545,
"learning_rate": 1.9188016780064768e-05,
"loss": 1.2029,
"step": 440
},
{
"epoch": 0.1321052438442626,
"grad_norm": 3.2252321243286133,
"learning_rate": 1.9151217617588412e-05,
"loss": 1.1272,
"step": 450
},
{
"epoch": 0.13504091592969064,
"grad_norm": 3.23498272895813,
"learning_rate": 1.9113639859836544e-05,
"loss": 1.1421,
"step": 460
},
{
"epoch": 0.13797658801511872,
"grad_norm": 3.0721843242645264,
"learning_rate": 1.9075286703964554e-05,
"loss": 1.14,
"step": 470
},
{
"epoch": 0.14091226010054678,
"grad_norm": 3.130610704421997,
"learning_rate": 1.9036161413099512e-05,
"loss": 1.1699,
"step": 480
},
{
"epoch": 0.14384793218597483,
"grad_norm": 3.054914712905884,
"learning_rate": 1.899626731606255e-05,
"loss": 1.0919,
"step": 490
},
{
"epoch": 0.14678360427140288,
"grad_norm": 3.3167009353637695,
"learning_rate": 1.895560780708565e-05,
"loss": 1.0625,
"step": 500
},
{
"epoch": 0.14971927635683094,
"grad_norm": 3.075392484664917,
"learning_rate": 1.8914186345522846e-05,
"loss": 1.0899,
"step": 510
},
{
"epoch": 0.152654948442259,
"grad_norm": 3.1269266605377197,
"learning_rate": 1.8872006455555906e-05,
"loss": 1.1148,
"step": 520
},
{
"epoch": 0.15559062052768705,
"grad_norm": 3.867361068725586,
"learning_rate": 1.8829071725894483e-05,
"loss": 1.02,
"step": 530
},
{
"epoch": 0.15852629261311513,
"grad_norm": 3.529639720916748,
"learning_rate": 1.87853858094708e-05,
"loss": 1.1167,
"step": 540
},
{
"epoch": 0.16146196469854318,
"grad_norm": 3.098249673843384,
"learning_rate": 1.8740952423128842e-05,
"loss": 1.0181,
"step": 550
},
{
"epoch": 0.16439763678397123,
"grad_norm": 3.1614904403686523,
"learning_rate": 1.869577534730812e-05,
"loss": 1.1118,
"step": 560
},
{
"epoch": 0.1673333088693993,
"grad_norm": 3.054616928100586,
"learning_rate": 1.8649858425722033e-05,
"loss": 1.0666,
"step": 570
},
{
"epoch": 0.17026898095482734,
"grad_norm": 3.479527711868286,
"learning_rate": 1.8603205565030846e-05,
"loss": 1.108,
"step": 580
},
{
"epoch": 0.1732046530402554,
"grad_norm": 2.9523024559020996,
"learning_rate": 1.8555820734509297e-05,
"loss": 1.0833,
"step": 590
},
{
"epoch": 0.17614032512568345,
"grad_norm": 2.9907584190368652,
"learning_rate": 1.8507707965708892e-05,
"loss": 1.0283,
"step": 600
},
{
"epoch": 0.17907599721111153,
"grad_norm": 3.2911376953125,
"learning_rate": 1.8458871352114894e-05,
"loss": 1.0747,
"step": 610
},
{
"epoch": 0.18201166929653959,
"grad_norm": 3.1361849308013916,
"learning_rate": 1.840931504879806e-05,
"loss": 1.11,
"step": 620
},
{
"epoch": 0.18494734138196764,
"grad_norm": 3.527332067489624,
"learning_rate": 1.8359043272061086e-05,
"loss": 1.0424,
"step": 630
},
{
"epoch": 0.1878830134673957,
"grad_norm": 3.5494275093078613,
"learning_rate": 1.8308060299079926e-05,
"loss": 1.0818,
"step": 640
},
{
"epoch": 0.19081868555282375,
"grad_norm": 3.4427106380462646,
"learning_rate": 1.8256370467539847e-05,
"loss": 1.0883,
"step": 650
},
{
"epoch": 0.1937543576382518,
"grad_norm": 3.092515230178833,
"learning_rate": 1.82039781752664e-05,
"loss": 1.0285,
"step": 660
},
{
"epoch": 0.19669002972367985,
"grad_norm": 2.667904853820801,
"learning_rate": 1.815088787985124e-05,
"loss": 0.9751,
"step": 670
},
{
"epoch": 0.19962570180910794,
"grad_norm": 3.5892174243927,
"learning_rate": 1.809710409827285e-05,
"loss": 1.0603,
"step": 680
},
{
"epoch": 0.202561373894536,
"grad_norm": 3.122434616088867,
"learning_rate": 1.804263140651227e-05,
"loss": 1.0919,
"step": 690
},
{
"epoch": 0.20549704597996404,
"grad_norm": 2.9182698726654053,
"learning_rate": 1.798747443916374e-05,
"loss": 1.0553,
"step": 700
},
{
"epoch": 0.2084327180653921,
"grad_norm": 3.260917901992798,
"learning_rate": 1.793163788904038e-05,
"loss": 1.0711,
"step": 710
},
{
"epoch": 0.21136839015082015,
"grad_norm": 3.4142649173736572,
"learning_rate": 1.7875126506774956e-05,
"loss": 1.0423,
"step": 720
},
{
"epoch": 0.2143040622362482,
"grad_norm": 3.0127294063568115,
"learning_rate": 1.781794510041564e-05,
"loss": 1.0679,
"step": 730
},
{
"epoch": 0.21723973432167626,
"grad_norm": 3.398015022277832,
"learning_rate": 1.776009853501698e-05,
"loss": 1.0558,
"step": 740
},
{
"epoch": 0.2201754064071043,
"grad_norm": 3.6017568111419678,
"learning_rate": 1.770159173222595e-05,
"loss": 1.0198,
"step": 750
},
{
"epoch": 0.2231110784925324,
"grad_norm": 3.5204339027404785,
"learning_rate": 1.7642429669863225e-05,
"loss": 0.9951,
"step": 760
},
{
"epoch": 0.22604675057796045,
"grad_norm": 3.3134777545928955,
"learning_rate": 1.7582617381499655e-05,
"loss": 0.9906,
"step": 770
},
{
"epoch": 0.2289824226633885,
"grad_norm": 3.5017244815826416,
"learning_rate": 1.7522159956028003e-05,
"loss": 1.0711,
"step": 780
},
{
"epoch": 0.23191809474881656,
"grad_norm": 3.1634137630462646,
"learning_rate": 1.7461062537229987e-05,
"loss": 0.9909,
"step": 790
},
{
"epoch": 0.2348537668342446,
"grad_norm": 3.368623971939087,
"learning_rate": 1.739933032333863e-05,
"loss": 0.9815,
"step": 800
},
{
"epoch": 0.23778943891967266,
"grad_norm": 3.1064817905426025,
"learning_rate": 1.733696856659599e-05,
"loss": 1.0191,
"step": 810
},
{
"epoch": 0.24072511100510072,
"grad_norm": 3.2074899673461914,
"learning_rate": 1.7273982572806303e-05,
"loss": 1.0314,
"step": 820
},
{
"epoch": 0.2436607830905288,
"grad_norm": 2.5882649421691895,
"learning_rate": 1.721037770088455e-05,
"loss": 0.958,
"step": 830
},
{
"epoch": 0.24659645517595685,
"grad_norm": 3.730363130569458,
"learning_rate": 1.7146159362400515e-05,
"loss": 1.0272,
"step": 840
},
{
"epoch": 0.2495321272613849,
"grad_norm": 2.940425395965576,
"learning_rate": 1.708133302111837e-05,
"loss": 1.0437,
"step": 850
},
{
"epoch": 0.25246779934681296,
"grad_norm": 4.833681106567383,
"learning_rate": 1.7015904192531814e-05,
"loss": 1.0393,
"step": 860
},
{
"epoch": 0.255403471432241,
"grad_norm": 3.417707681655884,
"learning_rate": 1.694987844339479e-05,
"loss": 1.0602,
"step": 870
},
{
"epoch": 0.25833914351766907,
"grad_norm": 3.239388942718506,
"learning_rate": 1.6883261391247888e-05,
"loss": 0.9515,
"step": 880
},
{
"epoch": 0.2612748156030971,
"grad_norm": 3.1867291927337646,
"learning_rate": 1.6816058703940366e-05,
"loss": 0.9961,
"step": 890
},
{
"epoch": 0.2642104876885252,
"grad_norm": 3.193343162536621,
"learning_rate": 1.6748276099147952e-05,
"loss": 1.0066,
"step": 900
},
{
"epoch": 0.26714615977395323,
"grad_norm": 3.1413753032684326,
"learning_rate": 1.6679919343886376e-05,
"loss": 0.9714,
"step": 910
},
{
"epoch": 0.2700818318593813,
"grad_norm": 3.0826566219329834,
"learning_rate": 1.661099425402067e-05,
"loss": 0.9689,
"step": 920
},
{
"epoch": 0.2730175039448094,
"grad_norm": 3.5959160327911377,
"learning_rate": 1.6541506693770403e-05,
"loss": 0.9867,
"step": 930
},
{
"epoch": 0.27595317603023745,
"grad_norm": 3.8435122966766357,
"learning_rate": 1.647146257521071e-05,
"loss": 1.0281,
"step": 940
},
{
"epoch": 0.2788888481156655,
"grad_norm": 3.396488904953003,
"learning_rate": 1.6400867857769287e-05,
"loss": 0.975,
"step": 950
},
{
"epoch": 0.28182452020109355,
"grad_norm": 3.2766590118408203,
"learning_rate": 1.6329728547719375e-05,
"loss": 0.9373,
"step": 960
},
{
"epoch": 0.2847601922865216,
"grad_norm": 3.673755645751953,
"learning_rate": 1.625805069766873e-05,
"loss": 0.9651,
"step": 970
},
{
"epoch": 0.28769586437194966,
"grad_norm": 3.8751864433288574,
"learning_rate": 1.6185840406044657e-05,
"loss": 0.9262,
"step": 980
},
{
"epoch": 0.2906315364573777,
"grad_norm": 3.708500623703003,
"learning_rate": 1.611310381657515e-05,
"loss": 0.9972,
"step": 990
},
{
"epoch": 0.29356720854280577,
"grad_norm": 3.4258711338043213,
"learning_rate": 1.60398471177662e-05,
"loss": 0.9331,
"step": 1000
},
{
"epoch": 0.2965028806282338,
"grad_norm": 3.4662258625030518,
"learning_rate": 1.596607654237522e-05,
"loss": 0.9592,
"step": 1010
},
{
"epoch": 0.2994385527136619,
"grad_norm": 2.938396453857422,
"learning_rate": 1.589179836688081e-05,
"loss": 0.9568,
"step": 1020
},
{
"epoch": 0.30237422479908993,
"grad_norm": 3.248762845993042,
"learning_rate": 1.5817018910948712e-05,
"loss": 0.9928,
"step": 1030
},
{
"epoch": 0.305309896884518,
"grad_norm": 3.423213243484497,
"learning_rate": 1.574174453689415e-05,
"loss": 0.9387,
"step": 1040
},
{
"epoch": 0.30824556896994604,
"grad_norm": 3.249216318130493,
"learning_rate": 1.566598164914049e-05,
"loss": 0.8925,
"step": 1050
},
{
"epoch": 0.3111812410553741,
"grad_norm": 3.6318016052246094,
"learning_rate": 1.5589736693674372e-05,
"loss": 1.0153,
"step": 1060
},
{
"epoch": 0.31411691314080215,
"grad_norm": 3.9752533435821533,
"learning_rate": 1.551301615749726e-05,
"loss": 0.9323,
"step": 1070
},
{
"epoch": 0.31705258522623025,
"grad_norm": 3.46864914894104,
"learning_rate": 1.5435826568073532e-05,
"loss": 0.8901,
"step": 1080
},
{
"epoch": 0.3199882573116583,
"grad_norm": 4.399304389953613,
"learning_rate": 1.535817449277511e-05,
"loss": 0.9118,
"step": 1090
},
{
"epoch": 0.32292392939708636,
"grad_norm": 3.2890026569366455,
"learning_rate": 1.5280066538322703e-05,
"loss": 0.8655,
"step": 1100
},
{
"epoch": 0.3258596014825144,
"grad_norm": 3.491983652114868,
"learning_rate": 1.5201509350223708e-05,
"loss": 0.9217,
"step": 1110
},
{
"epoch": 0.32879527356794247,
"grad_norm": 5.0014824867248535,
"learning_rate": 1.5122509612206785e-05,
"loss": 0.9362,
"step": 1120
},
{
"epoch": 0.3317309456533705,
"grad_norm": 4.092339038848877,
"learning_rate": 1.5043074045653215e-05,
"loss": 0.9262,
"step": 1130
},
{
"epoch": 0.3346666177387986,
"grad_norm": 3.286433219909668,
"learning_rate": 1.496320940902503e-05,
"loss": 0.8891,
"step": 1140
},
{
"epoch": 0.33760228982422663,
"grad_norm": 3.6521873474121094,
"learning_rate": 1.4882922497290007e-05,
"loss": 0.9281,
"step": 1150
},
{
"epoch": 0.3405379619096547,
"grad_norm": 3.8015809059143066,
"learning_rate": 1.4802220141343516e-05,
"loss": 0.8949,
"step": 1160
},
{
"epoch": 0.34347363399508274,
"grad_norm": 4.149661064147949,
"learning_rate": 1.472110920742738e-05,
"loss": 0.8889,
"step": 1170
},
{
"epoch": 0.3464093060805108,
"grad_norm": 3.5252785682678223,
"learning_rate": 1.4639596596545656e-05,
"loss": 0.8397,
"step": 1180
},
{
"epoch": 0.34934497816593885,
"grad_norm": 3.6884541511535645,
"learning_rate": 1.4557689243877507e-05,
"loss": 0.9142,
"step": 1190
},
{
"epoch": 0.3522806502513669,
"grad_norm": 3.9577550888061523,
"learning_rate": 1.4475394118187146e-05,
"loss": 0.9809,
"step": 1200
},
{
"epoch": 0.35521632233679495,
"grad_norm": 3.6897339820861816,
"learning_rate": 1.4392718221230917e-05,
"loss": 0.9141,
"step": 1210
},
{
"epoch": 0.35815199442222306,
"grad_norm": 3.061516046524048,
"learning_rate": 1.4309668587161596e-05,
"loss": 0.8669,
"step": 1220
},
{
"epoch": 0.3610876665076511,
"grad_norm": 3.1191623210906982,
"learning_rate": 1.4226252281929902e-05,
"loss": 0.8384,
"step": 1230
},
{
"epoch": 0.36402333859307917,
"grad_norm": 4.198310852050781,
"learning_rate": 1.4142476402683327e-05,
"loss": 0.8971,
"step": 1240
},
{
"epoch": 0.3669590106785072,
"grad_norm": 3.8184337615966797,
"learning_rate": 1.4058348077162301e-05,
"loss": 0.8783,
"step": 1250
},
{
"epoch": 0.3698946827639353,
"grad_norm": 3.842637777328491,
"learning_rate": 1.3973874463093747e-05,
"loss": 0.9623,
"step": 1260
},
{
"epoch": 0.37283035484936333,
"grad_norm": 3.5173559188842773,
"learning_rate": 1.3889062747582118e-05,
"loss": 0.8092,
"step": 1270
},
{
"epoch": 0.3757660269347914,
"grad_norm": 3.8953890800476074,
"learning_rate": 1.3803920146497887e-05,
"loss": 0.8762,
"step": 1280
},
{
"epoch": 0.37870169902021944,
"grad_norm": 3.0550928115844727,
"learning_rate": 1.3718453903863616e-05,
"loss": 0.8321,
"step": 1290
},
{
"epoch": 0.3816373711056475,
"grad_norm": 3.9677445888519287,
"learning_rate": 1.3632671291237645e-05,
"loss": 0.8566,
"step": 1300
},
{
"epoch": 0.38457304319107555,
"grad_norm": 3.887268304824829,
"learning_rate": 1.35465796070954e-05,
"loss": 0.8944,
"step": 1310
},
{
"epoch": 0.3875087152765036,
"grad_norm": 3.1006393432617188,
"learning_rate": 1.3460186176208439e-05,
"loss": 0.7583,
"step": 1320
},
{
"epoch": 0.39044438736193166,
"grad_norm": 3.7594895362854004,
"learning_rate": 1.337349834902125e-05,
"loss": 0.814,
"step": 1330
},
{
"epoch": 0.3933800594473597,
"grad_norm": 4.34951114654541,
"learning_rate": 1.328652350102588e-05,
"loss": 0.8006,
"step": 1340
},
{
"epoch": 0.39631573153278776,
"grad_norm": 2.9645636081695557,
"learning_rate": 1.3199269032134395e-05,
"loss": 0.8129,
"step": 1350
},
{
"epoch": 0.39925140361821587,
"grad_norm": 3.858602285385132,
"learning_rate": 1.3111742366049317e-05,
"loss": 0.8366,
"step": 1360
},
{
"epoch": 0.4021870757036439,
"grad_norm": 3.2778103351593018,
"learning_rate": 1.3023950949631979e-05,
"loss": 0.8551,
"step": 1370
},
{
"epoch": 0.405122747789072,
"grad_norm": 3.2402875423431396,
"learning_rate": 1.2935902252268965e-05,
"loss": 0.8398,
"step": 1380
},
{
"epoch": 0.40805841987450003,
"grad_norm": 4.325957775115967,
"learning_rate": 1.2847603765236589e-05,
"loss": 0.836,
"step": 1390
},
{
"epoch": 0.4109940919599281,
"grad_norm": 3.5310022830963135,
"learning_rate": 1.2759063001063531e-05,
"loss": 0.8369,
"step": 1400
},
{
"epoch": 0.41392976404535614,
"grad_norm": 3.5352087020874023,
"learning_rate": 1.2670287492891675e-05,
"loss": 0.8988,
"step": 1410
},
{
"epoch": 0.4168654361307842,
"grad_norm": 3.190788745880127,
"learning_rate": 1.258128479383516e-05,
"loss": 0.8352,
"step": 1420
},
{
"epoch": 0.41980110821621225,
"grad_norm": 3.459728240966797,
"learning_rate": 1.249206247633778e-05,
"loss": 0.8295,
"step": 1430
},
{
"epoch": 0.4227367803016403,
"grad_norm": 3.5794529914855957,
"learning_rate": 1.2402628131528686e-05,
"loss": 0.8103,
"step": 1440
},
{
"epoch": 0.42567245238706836,
"grad_norm": 4.169612407684326,
"learning_rate": 1.2312989368576547e-05,
"loss": 0.7757,
"step": 1450
},
{
"epoch": 0.4286081244724964,
"grad_norm": 3.301011562347412,
"learning_rate": 1.2223153814042137e-05,
"loss": 0.7871,
"step": 1460
},
{
"epoch": 0.43154379655792446,
"grad_norm": 4.524185657501221,
"learning_rate": 1.2133129111229466e-05,
"loss": 0.851,
"step": 1470
},
{
"epoch": 0.4344794686433525,
"grad_norm": 3.72041392326355,
"learning_rate": 1.2042922919535484e-05,
"loss": 0.803,
"step": 1480
},
{
"epoch": 0.43741514072878057,
"grad_norm": 3.926424503326416,
"learning_rate": 1.1952542913798406e-05,
"loss": 0.761,
"step": 1490
},
{
"epoch": 0.4403508128142086,
"grad_norm": 3.5725414752960205,
"learning_rate": 1.1861996783644727e-05,
"loss": 0.8086,
"step": 1500
},
{
"epoch": 0.44328648489963673,
"grad_norm": 4.109748363494873,
"learning_rate": 1.1771292232834983e-05,
"loss": 0.8483,
"step": 1510
},
{
"epoch": 0.4462221569850648,
"grad_norm": 3.673794984817505,
"learning_rate": 1.1680436978608314e-05,
"loss": 0.738,
"step": 1520
},
{
"epoch": 0.44915782907049284,
"grad_norm": 3.831571102142334,
"learning_rate": 1.1589438751025852e-05,
"loss": 0.7462,
"step": 1530
},
{
"epoch": 0.4520935011559209,
"grad_norm": 4.181507587432861,
"learning_rate": 1.149830529231307e-05,
"loss": 0.7707,
"step": 1540
},
{
"epoch": 0.45502917324134895,
"grad_norm": 3.3295936584472656,
"learning_rate": 1.140704435620104e-05,
"loss": 0.7832,
"step": 1550
},
{
"epoch": 0.457964845326777,
"grad_norm": 4.025683403015137,
"learning_rate": 1.1315663707266742e-05,
"loss": 0.74,
"step": 1560
},
{
"epoch": 0.46090051741220506,
"grad_norm": 3.792701244354248,
"learning_rate": 1.1224171120272455e-05,
"loss": 0.6698,
"step": 1570
},
{
"epoch": 0.4638361894976331,
"grad_norm": 3.7220959663391113,
"learning_rate": 1.1132574379504269e-05,
"loss": 0.7604,
"step": 1580
},
{
"epoch": 0.46677186158306117,
"grad_norm": 4.423033714294434,
"learning_rate": 1.1040881278109784e-05,
"loss": 0.7466,
"step": 1590
},
{
"epoch": 0.4697075336684892,
"grad_norm": 3.633347272872925,
"learning_rate": 1.0949099617435062e-05,
"loss": 0.7452,
"step": 1600
},
{
"epoch": 0.4726432057539173,
"grad_norm": 3.661238431930542,
"learning_rate": 1.0857237206360885e-05,
"loss": 0.7637,
"step": 1610
},
{
"epoch": 0.4755788778393453,
"grad_norm": 4.33590030670166,
"learning_rate": 1.0765301860638364e-05,
"loss": 0.7364,
"step": 1620
},
{
"epoch": 0.4785145499247734,
"grad_norm": 3.7030036449432373,
"learning_rate": 1.0673301402223964e-05,
"loss": 0.7356,
"step": 1630
},
{
"epoch": 0.48145022201020143,
"grad_norm": 4.784999847412109,
"learning_rate": 1.0581243658614013e-05,
"loss": 0.765,
"step": 1640
},
{
"epoch": 0.48438589409562954,
"grad_norm": 3.2158679962158203,
"learning_rate": 1.0489136462178718e-05,
"loss": 0.75,
"step": 1650
},
{
"epoch": 0.4873215661810576,
"grad_norm": 4.584315299987793,
"learning_rate": 1.039698764949579e-05,
"loss": 0.7347,
"step": 1660
},
{
"epoch": 0.49025723826648565,
"grad_norm": 3.4453585147857666,
"learning_rate": 1.0304805060683692e-05,
"loss": 0.7887,
"step": 1670
},
{
"epoch": 0.4931929103519137,
"grad_norm": 3.9263744354248047,
"learning_rate": 1.021259653873459e-05,
"loss": 0.7492,
"step": 1680
},
{
"epoch": 0.49612858243734176,
"grad_norm": 4.6535539627075195,
"learning_rate": 1.012036992884708e-05,
"loss": 0.7676,
"step": 1690
},
{
"epoch": 0.4990642545227698,
"grad_norm": 4.22018575668335,
"learning_rate": 1.0028133077758688e-05,
"loss": 0.7088,
"step": 1700
},
{
"epoch": 0.5019999266081978,
"grad_norm": 4.408539295196533,
"learning_rate": 9.935893833078284e-06,
"loss": 0.7646,
"step": 1710
},
{
"epoch": 0.5049355986936259,
"grad_norm": 5.264422416687012,
"learning_rate": 9.843660042618372e-06,
"loss": 0.8147,
"step": 1720
},
{
"epoch": 0.507871270779054,
"grad_norm": 4.2693047523498535,
"learning_rate": 9.75143955372742e-06,
"loss": 0.7104,
"step": 1730
},
{
"epoch": 0.510806942864482,
"grad_norm": 4.856871128082275,
"learning_rate": 9.659240212622175e-06,
"loss": 0.7367,
"step": 1740
},
{
"epoch": 0.5137426149499101,
"grad_norm": 2.8976457118988037,
"learning_rate": 9.567069863720113e-06,
"loss": 0.7564,
"step": 1750
},
{
"epoch": 0.5166782870353381,
"grad_norm": 5.992892742156982,
"learning_rate": 9.474936348972021e-06,
"loss": 0.7735,
"step": 1760
},
{
"epoch": 0.5196139591207662,
"grad_norm": 3.6526339054107666,
"learning_rate": 9.382847507194797e-06,
"loss": 0.7035,
"step": 1770
},
{
"epoch": 0.5225496312061942,
"grad_norm": 4.040701389312744,
"learning_rate": 9.290811173404513e-06,
"loss": 0.6347,
"step": 1780
},
{
"epoch": 0.5254853032916224,
"grad_norm": 3.848483085632324,
"learning_rate": 9.198835178149807e-06,
"loss": 0.6359,
"step": 1790
},
{
"epoch": 0.5284209753770504,
"grad_norm": 3.2821764945983887,
"learning_rate": 9.106927346845663e-06,
"loss": 0.7137,
"step": 1800
},
{
"epoch": 0.5313566474624785,
"grad_norm": 4.672881603240967,
"learning_rate": 9.015095499107578e-06,
"loss": 0.7085,
"step": 1810
},
{
"epoch": 0.5342923195479065,
"grad_norm": 3.976231098175049,
"learning_rate": 8.923347448086311e-06,
"loss": 0.6501,
"step": 1820
},
{
"epoch": 0.5372279916333346,
"grad_norm": 4.726049423217773,
"learning_rate": 8.831690999803101e-06,
"loss": 0.8129,
"step": 1830
},
{
"epoch": 0.5401636637187626,
"grad_norm": 6.278385162353516,
"learning_rate": 8.740133952485515e-06,
"loss": 0.6732,
"step": 1840
},
{
"epoch": 0.5430993358041907,
"grad_norm": 4.620763301849365,
"learning_rate": 8.648684095904001e-06,
"loss": 0.6872,
"step": 1850
},
{
"epoch": 0.5460350078896188,
"grad_norm": 4.494777679443359,
"learning_rate": 8.557349210709098e-06,
"loss": 0.6686,
"step": 1860
},
{
"epoch": 0.5489706799750468,
"grad_norm": 4.2295637130737305,
"learning_rate": 8.46613706776945e-06,
"loss": 0.6853,
"step": 1870
},
{
"epoch": 0.5519063520604749,
"grad_norm": 3.5783040523529053,
"learning_rate": 8.375055427510673e-06,
"loss": 0.6923,
"step": 1880
},
{
"epoch": 0.5548420241459029,
"grad_norm": 3.5585546493530273,
"learning_rate": 8.284112039255071e-06,
"loss": 0.6744,
"step": 1890
},
{
"epoch": 0.557777696231331,
"grad_norm": 3.939253330230713,
"learning_rate": 8.193314640562315e-06,
"loss": 0.627,
"step": 1900
},
{
"epoch": 0.560713368316759,
"grad_norm": 3.630519390106201,
"learning_rate": 8.102670956571139e-06,
"loss": 0.6627,
"step": 1910
},
{
"epoch": 0.5636490404021871,
"grad_norm": 11.943046569824219,
"learning_rate": 8.012188699342072e-06,
"loss": 0.6476,
"step": 1920
},
{
"epoch": 0.5665847124876151,
"grad_norm": 5.358550071716309,
"learning_rate": 7.92187556720126e-06,
"loss": 0.6968,
"step": 1930
},
{
"epoch": 0.5695203845730432,
"grad_norm": 3.8031585216522217,
"learning_rate": 7.831739244085534e-06,
"loss": 0.6811,
"step": 1940
},
{
"epoch": 0.5724560566584712,
"grad_norm": 3.1659951210021973,
"learning_rate": 7.741787398888617e-06,
"loss": 0.6501,
"step": 1950
},
{
"epoch": 0.5753917287438993,
"grad_norm": 3.7877001762390137,
"learning_rate": 7.652027684808644e-06,
"loss": 0.6496,
"step": 1960
},
{
"epoch": 0.5783274008293273,
"grad_norm": 4.701345920562744,
"learning_rate": 7.56246773869705e-06,
"loss": 0.659,
"step": 1970
},
{
"epoch": 0.5812630729147554,
"grad_norm": 4.617175579071045,
"learning_rate": 7.47311518040879e-06,
"loss": 0.6429,
"step": 1980
},
{
"epoch": 0.5841987450001834,
"grad_norm": 5.269269943237305,
"learning_rate": 7.3839776121540385e-06,
"loss": 0.6845,
"step": 1990
},
{
"epoch": 0.5871344170856115,
"grad_norm": 3.911558151245117,
"learning_rate": 7.2950626178514e-06,
"loss": 0.6536,
"step": 2000
},
{
"epoch": 0.5900700891710396,
"grad_norm": NaN,
"learning_rate": 7.215235676567183e-06,
"loss": 0.6691,
"step": 2010
},
{
"epoch": 0.5930057612564676,
"grad_norm": 5.29760217666626,
"learning_rate": 7.126764398128368e-06,
"loss": 0.6483,
"step": 2020
},
{
"epoch": 0.5959414333418958,
"grad_norm": 3.4294636249542236,
"learning_rate": 7.038537577614009e-06,
"loss": 0.5965,
"step": 2030
},
{
"epoch": 0.5988771054273238,
"grad_norm": 3.6569931507110596,
"learning_rate": 6.950562721455325e-06,
"loss": 0.5782,
"step": 2040
},
{
"epoch": 0.6018127775127519,
"grad_norm": 3.845431089401245,
"learning_rate": 6.86284731464614e-06,
"loss": 0.6419,
"step": 2050
},
{
"epoch": 0.6047484495981799,
"grad_norm": 3.8947107791900635,
"learning_rate": 6.775398820106065e-06,
"loss": 0.5942,
"step": 2060
},
{
"epoch": 0.607684121683608,
"grad_norm": 5.501591682434082,
"learning_rate": 6.688224678045507e-06,
"loss": 0.5874,
"step": 2070
},
{
"epoch": 0.610619793769036,
"grad_norm": 4.684408187866211,
"learning_rate": 6.6013323053327065e-06,
"loss": 0.6178,
"step": 2080
},
{
"epoch": 0.6135554658544641,
"grad_norm": 4.132544040679932,
"learning_rate": 6.5147290948626365e-06,
"loss": 0.5972,
"step": 2090
},
{
"epoch": 0.6164911379398921,
"grad_norm": 3.2844135761260986,
"learning_rate": 6.428422414928066e-06,
"loss": 0.5808,
"step": 2100
},
{
"epoch": 0.6194268100253202,
"grad_norm": 4.8152289390563965,
"learning_rate": 6.342419608592626e-06,
"loss": 0.6407,
"step": 2110
},
{
"epoch": 0.6223624821107482,
"grad_norm": 4.975841999053955,
"learning_rate": 6.25672799306605e-06,
"loss": 0.5792,
"step": 2120
},
{
"epoch": 0.6252981541961763,
"grad_norm": 3.772268772125244,
"learning_rate": 6.171354859081639e-06,
"loss": 0.7062,
"step": 2130
},
{
"epoch": 0.6282338262816043,
"grad_norm": 3.6091275215148926,
"learning_rate": 6.086307470275947e-06,
"loss": 0.6015,
"step": 2140
},
{
"epoch": 0.6311694983670324,
"grad_norm": 3.9650683403015137,
"learning_rate": 6.001593062570776e-06,
"loss": 0.699,
"step": 2150
},
{
"epoch": 0.6341051704524605,
"grad_norm": 3.4142041206359863,
"learning_rate": 5.917218843557551e-06,
"loss": 0.5912,
"step": 2160
},
{
"epoch": 0.6370408425378885,
"grad_norm": 3.6262595653533936,
"learning_rate": 5.8415788415375744e-06,
"loss": 0.6029,
"step": 2170
},
{
"epoch": 0.6399765146233166,
"grad_norm": 3.612025737762451,
"learning_rate": 5.757870733799642e-06,
"loss": 0.6054,
"step": 2180
},
{
"epoch": 0.6429121867087446,
"grad_norm": 3.7721731662750244,
"learning_rate": 5.6745235509072135e-06,
"loss": 0.5703,
"step": 2190
},
{
"epoch": 0.6458478587941727,
"grad_norm": 4.44386100769043,
"learning_rate": 5.591544384126769e-06,
"loss": 0.6101,
"step": 2200
},
{
"epoch": 0.6487835308796007,
"grad_norm": 3.6553893089294434,
"learning_rate": 5.508940293413603e-06,
"loss": 0.6131,
"step": 2210
},
{
"epoch": 0.6517192029650288,
"grad_norm": 4.550465106964111,
"learning_rate": 5.426718306811134e-06,
"loss": 0.5761,
"step": 2220
},
{
"epoch": 0.6546548750504568,
"grad_norm": 3.433598279953003,
"learning_rate": 5.344885419852961e-06,
"loss": 0.6456,
"step": 2230
},
{
"epoch": 0.6575905471358849,
"grad_norm": 5.087676048278809,
"learning_rate": 5.263448594967673e-06,
"loss": 0.657,
"step": 2240
},
{
"epoch": 0.6605262192213129,
"grad_norm": 4.578396797180176,
"learning_rate": 5.182414760886484e-06,
"loss": 0.6083,
"step": 2250
},
{
"epoch": 0.663461891306741,
"grad_norm": 6.043960094451904,
"learning_rate": 5.1017908120537105e-06,
"loss": 0.5721,
"step": 2260
},
{
"epoch": 0.666397563392169,
"grad_norm": 3.624394178390503,
"learning_rate": 5.021583608040208e-06,
"loss": 0.5952,
"step": 2270
},
{
"epoch": 0.6693332354775972,
"grad_norm": 2.965820074081421,
"learning_rate": 4.941799972959752e-06,
"loss": 0.5074,
"step": 2280
},
{
"epoch": 0.6722689075630253,
"grad_norm": 5.590756416320801,
"learning_rate": 4.862446694888403e-06,
"loss": 0.5274,
"step": 2290
},
{
"epoch": 0.6752045796484533,
"grad_norm": 4.188043594360352,
"learning_rate": 4.783530525287006e-06,
"loss": 0.5694,
"step": 2300
},
{
"epoch": 0.6781402517338814,
"grad_norm": 3.925184488296509,
"learning_rate": 4.705058178426753e-06,
"loss": 0.55,
"step": 2310
},
{
"epoch": 0.6810759238193094,
"grad_norm": 4.226954936981201,
"learning_rate": 4.627036330817926e-06,
"loss": 0.5432,
"step": 2320
},
{
"epoch": 0.6840115959047375,
"grad_norm": 3.9109609127044678,
"learning_rate": 4.5494716206418555e-06,
"loss": 0.5332,
"step": 2330
},
{
"epoch": 0.6869472679901655,
"grad_norm": 4.698592662811279,
"learning_rate": 4.4723706471861385e-06,
"loss": 0.5744,
"step": 2340
},
{
"epoch": 0.6898829400755936,
"grad_norm": 4.461889266967773,
"learning_rate": 4.3957399702831505e-06,
"loss": 0.5314,
"step": 2350
},
{
"epoch": 0.6928186121610216,
"grad_norm": 4.412221908569336,
"learning_rate": 4.31958610975195e-06,
"loss": 0.554,
"step": 2360
},
{
"epoch": 0.6957542842464497,
"grad_norm": 3.776421308517456,
"learning_rate": 4.243915544843549e-06,
"loss": 0.4857,
"step": 2370
},
{
"epoch": 0.6986899563318777,
"grad_norm": 4.851159572601318,
"learning_rate": 4.168734713689658e-06,
"loss": 0.5484,
"step": 2380
},
{
"epoch": 0.7016256284173058,
"grad_norm": 3.8917558193206787,
"learning_rate": 4.094050012754925e-06,
"loss": 0.4888,
"step": 2390
},
{
"epoch": 0.7045613005027338,
"grad_norm": 4.396358966827393,
"learning_rate": 4.019867796292709e-06,
"loss": 0.5125,
"step": 2400
},
{
"epoch": 0.7074969725881619,
"grad_norm": 4.374291896820068,
"learning_rate": 3.946194375804452e-06,
"loss": 0.5262,
"step": 2410
},
{
"epoch": 0.7104326446735899,
"grad_norm": 4.330350875854492,
"learning_rate": 3.873036019502716e-06,
"loss": 0.5581,
"step": 2420
},
{
"epoch": 0.713368316759018,
"grad_norm": 4.86287784576416,
"learning_rate": 3.800398951777845e-06,
"loss": 0.5687,
"step": 2430
},
{
"epoch": 0.7163039888444461,
"grad_norm": 5.004453659057617,
"learning_rate": 3.7282893526683914e-06,
"loss": 0.5136,
"step": 2440
},
{
"epoch": 0.7192396609298741,
"grad_norm": 3.035261631011963,
"learning_rate": 3.656713357335334e-06,
"loss": 0.5358,
"step": 2450
},
{
"epoch": 0.7221753330153022,
"grad_norm": 3.3477425575256348,
"learning_rate": 3.585677055540072e-06,
"loss": 0.5214,
"step": 2460
},
{
"epoch": 0.7251110051007302,
"grad_norm": 3.495814323425293,
"learning_rate": 3.5151864911263066e-06,
"loss": 0.5048,
"step": 2470
},
{
"epoch": 0.7280466771861583,
"grad_norm": 3.35532283782959,
"learning_rate": 3.4452476615058316e-06,
"loss": 0.509,
"step": 2480
},
{
"epoch": 0.7309823492715863,
"grad_norm": 3.0357613563537598,
"learning_rate": 3.3758665171482474e-06,
"loss": 0.5361,
"step": 2490
},
{
"epoch": 0.7339180213570144,
"grad_norm": 4.02761173248291,
"learning_rate": 3.3070489610747146e-06,
"loss": 0.5033,
"step": 2500
},
{
"epoch": 0.7368536934424424,
"grad_norm": 4.085331916809082,
"learning_rate": 3.238800848355702e-06,
"loss": 0.526,
"step": 2510
},
{
"epoch": 0.7397893655278706,
"grad_norm": 4.161253929138184,
"learning_rate": 3.1711279856128387e-06,
"loss": 0.5014,
"step": 2520
},
{
"epoch": 0.7427250376132986,
"grad_norm": 3.7220897674560547,
"learning_rate": 3.10403613052487e-06,
"loss": 0.4514,
"step": 2530
},
{
"epoch": 0.7456607096987267,
"grad_norm": 4.337230682373047,
"learning_rate": 3.037530991337807e-06,
"loss": 0.5645,
"step": 2540
},
{
"epoch": 0.7485963817841547,
"grad_norm": 4.30481481552124,
"learning_rate": 2.9716182263792314e-06,
"loss": 0.5026,
"step": 2550
},
{
"epoch": 0.7515320538695828,
"grad_norm": 3.3447349071502686,
"learning_rate": 2.9063034435769242e-06,
"loss": 0.5318,
"step": 2560
},
{
"epoch": 0.7544677259550108,
"grad_norm": 3.936032295227051,
"learning_rate": 2.8415921999816966e-06,
"loss": 0.5106,
"step": 2570
},
{
"epoch": 0.7574033980404389,
"grad_norm": 3.9542150497436523,
"learning_rate": 2.7774900012946037e-06,
"loss": 0.5299,
"step": 2580
},
{
"epoch": 0.760339070125867,
"grad_norm": 4.351448059082031,
"learning_rate": 2.714002301398524e-06,
"loss": 0.5211,
"step": 2590
},
{
"epoch": 0.763274742211295,
"grad_norm": 4.167295932769775,
"learning_rate": 2.6511345018941225e-06,
"loss": 0.5071,
"step": 2600
},
{
"epoch": 0.7662104142967231,
"grad_norm": 5.125722408294678,
"learning_rate": 2.588891951640288e-06,
"loss": 0.5199,
"step": 2610
},
{
"epoch": 0.7691460863821511,
"grad_norm": 4.25960111618042,
"learning_rate": 2.527279946299037e-06,
"loss": 0.4537,
"step": 2620
},
{
"epoch": 0.7720817584675792,
"grad_norm": 3.875459909439087,
"learning_rate": 2.4663037278849665e-06,
"loss": 0.4993,
"step": 2630
},
{
"epoch": 0.7750174305530072,
"grad_norm": 4.285188674926758,
"learning_rate": 2.405968484319231e-06,
"loss": 0.4596,
"step": 2640
},
{
"epoch": 0.7779531026384353,
"grad_norm": 3.827913284301758,
"learning_rate": 2.3462793489881884e-06,
"loss": 0.5141,
"step": 2650
},
{
"epoch": 0.7808887747238633,
"grad_norm": 4.174901485443115,
"learning_rate": 2.2872414003066146e-06,
"loss": 0.4483,
"step": 2660
},
{
"epoch": 0.7838244468092914,
"grad_norm": 3.4712812900543213,
"learning_rate": 2.2288596612856306e-06,
"loss": 0.4834,
"step": 2670
},
{
"epoch": 0.7867601188947194,
"grad_norm": 4.7577972412109375,
"learning_rate": 2.1711390991053547e-06,
"loss": 0.4913,
"step": 2680
},
{
"epoch": 0.7896957909801475,
"grad_norm": 3.8983209133148193,
"learning_rate": 2.1140846246922774e-06,
"loss": 0.4748,
"step": 2690
},
{
"epoch": 0.7926314630655755,
"grad_norm": 3.3365228176116943,
"learning_rate": 2.0577010923014353e-06,
"loss": 0.5014,
"step": 2700
},
{
"epoch": 0.7955671351510036,
"grad_norm": 3.7394635677337646,
"learning_rate": 2.001993299103411e-06,
"loss": 0.4524,
"step": 2710
},
{
"epoch": 0.7985028072364317,
"grad_norm": 3.325190544128418,
"learning_rate": 1.946965984776181e-06,
"loss": 0.486,
"step": 2720
},
{
"epoch": 0.8014384793218597,
"grad_norm": 3.337636947631836,
"learning_rate": 1.8926238311018551e-06,
"loss": 0.4112,
"step": 2730
},
{
"epoch": 0.8043741514072879,
"grad_norm": 4.279343128204346,
"learning_rate": 1.8443055276768218e-06,
"loss": 0.4758,
"step": 2740
},
{
"epoch": 0.8073098234927159,
"grad_norm": 3.3319621086120605,
"learning_rate": 1.7912778684550137e-06,
"loss": 0.4209,
"step": 2750
},
{
"epoch": 0.810245495578144,
"grad_norm": 4.332451343536377,
"learning_rate": 1.7389486159957436e-06,
"loss": 0.4913,
"step": 2760
},
{
"epoch": 0.813181167663572,
"grad_norm": 5.22000789642334,
"learning_rate": 1.6873222225271656e-06,
"loss": 0.4523,
"step": 2770
},
{
"epoch": 0.8161168397490001,
"grad_norm": 4.080671787261963,
"learning_rate": 1.63640308047745e-06,
"loss": 0.4545,
"step": 2780
},
{
"epoch": 0.8190525118344281,
"grad_norm": 5.591613292694092,
"learning_rate": 1.5861955221010671e-06,
"loss": 0.5272,
"step": 2790
},
{
"epoch": 0.8219881839198562,
"grad_norm": 3.4515106678009033,
"learning_rate": 1.536703819110198e-06,
"loss": 0.4166,
"step": 2800
},
{
"epoch": 0.8249238560052842,
"grad_norm": 3.1613569259643555,
"learning_rate": 1.4879321823112802e-06,
"loss": 0.5194,
"step": 2810
},
{
"epoch": 0.8278595280907123,
"grad_norm": 4.931222915649414,
"learning_rate": 1.4398847612467736e-06,
"loss": 0.4626,
"step": 2820
},
{
"epoch": 0.8307952001761403,
"grad_norm": 3.088315963745117,
"learning_rate": 1.3925656438420876e-06,
"loss": 0.4246,
"step": 2830
},
{
"epoch": 0.8337308722615684,
"grad_norm": 3.7036452293395996,
"learning_rate": 1.3459788560577847e-06,
"loss": 0.431,
"step": 2840
},
{
"epoch": 0.8366665443469964,
"grad_norm": 4.452617168426514,
"learning_rate": 1.3001283615470517e-06,
"loss": 0.4478,
"step": 2850
},
{
"epoch": 0.8396022164324245,
"grad_norm": 3.2161977291107178,
"learning_rate": 1.255018061318467e-06,
"loss": 0.4432,
"step": 2860
},
{
"epoch": 0.8425378885178526,
"grad_norm": 4.302596092224121,
"learning_rate": 1.2106517934040917e-06,
"loss": 0.4598,
"step": 2870
},
{
"epoch": 0.8454735606032806,
"grad_norm": 4.297342300415039,
"learning_rate": 1.1670333325329353e-06,
"loss": 0.4908,
"step": 2880
},
{
"epoch": 0.8484092326887087,
"grad_norm": 3.9199209213256836,
"learning_rate": 1.1241663898097865e-06,
"loss": 0.4239,
"step": 2890
},
{
"epoch": 0.8513449047741367,
"grad_norm": 4.693470001220703,
"learning_rate": 1.08205461239948e-06,
"loss": 0.4636,
"step": 2900
},
{
"epoch": 0.8542805768595648,
"grad_norm": 4.2040510177612305,
"learning_rate": 1.04070158321659e-06,
"loss": 0.4595,
"step": 2910
},
{
"epoch": 0.8572162489449928,
"grad_norm": 4.8676252365112305,
"learning_rate": 1.00011082062058e-06,
"loss": 0.4699,
"step": 2920
},
{
"epoch": 0.8601519210304209,
"grad_norm": 3.176576852798462,
"learning_rate": 9.602857781164721e-07,
"loss": 0.4599,
"step": 2930
},
{
"epoch": 0.8630875931158489,
"grad_norm": 4.111423969268799,
"learning_rate": 9.212298440610101e-07,
"loss": 0.4601,
"step": 2940
},
{
"epoch": 0.866023265201277,
"grad_norm": 3.106792449951172,
"learning_rate": 8.829463413743811e-07,
"loss": 0.453,
"step": 2950
},
{
"epoch": 0.868958937286705,
"grad_norm": 4.940300941467285,
"learning_rate": 8.454385272574906e-07,
"loss": 0.4298,
"step": 2960
},
{
"epoch": 0.8718946093721331,
"grad_norm": 4.7473249435424805,
"learning_rate": 8.087095929148436e-07,
"loss": 0.457,
"step": 2970
},
{
"epoch": 0.8748302814575611,
"grad_norm": 4.263439655303955,
"learning_rate": 7.727626632830221e-07,
"loss": 0.4194,
"step": 2980
},
{
"epoch": 0.8777659535429893,
"grad_norm": 4.69775390625,
"learning_rate": 7.376007967648302e-07,
"loss": 0.4457,
"step": 2990
},
{
"epoch": 0.8807016256284173,
"grad_norm": 4.177097320556641,
"learning_rate": 7.032269849690654e-07,
"loss": 0.4532,
"step": 3000
},
{
"epoch": 0.8836372977138454,
"grad_norm": 4.312076568603516,
"learning_rate": 6.696441524559983e-07,
"loss": 0.4772,
"step": 3010
},
{
"epoch": 0.8865729697992735,
"grad_norm": 4.328220367431641,
"learning_rate": 6.368551564885439e-07,
"loss": 0.4239,
"step": 3020
},
{
"epoch": 0.8895086418847015,
"grad_norm": 3.3847310543060303,
"learning_rate": 6.048627867891665e-07,
"loss": 0.4564,
"step": 3030
},
{
"epoch": 0.8924443139701296,
"grad_norm": 3.408613681793213,
"learning_rate": 5.736697653025192e-07,
"loss": 0.4206,
"step": 3040
},
{
"epoch": 0.8953799860555576,
"grad_norm": 4.045165061950684,
"learning_rate": 5.432787459638722e-07,
"loss": 0.4751,
"step": 3050
},
{
"epoch": 0.8983156581409857,
"grad_norm": 3.965830087661743,
"learning_rate": 5.136923144732997e-07,
"loss": 0.4273,
"step": 3060
},
{
"epoch": 0.9012513302264137,
"grad_norm": 3.462986707687378,
"learning_rate": 4.849129880756886e-07,
"loss": 0.472,
"step": 3070
},
{
"epoch": 0.9041870023118418,
"grad_norm": 5.114270210266113,
"learning_rate": 4.569432153465736e-07,
"loss": 0.5233,
"step": 3080
},
{
"epoch": 0.9071226743972698,
"grad_norm": 4.655681610107422,
"learning_rate": 4.297853759838055e-07,
"loss": 0.4543,
"step": 3090
},
{
"epoch": 0.9100583464826979,
"grad_norm": 4.586308002471924,
"learning_rate": 4.034417806050872e-07,
"loss": 0.4383,
"step": 3100
},
{
"epoch": 0.9129940185681259,
"grad_norm": 3.405298948287964,
"learning_rate": 3.779146705513814e-07,
"loss": 0.4644,
"step": 3110
},
{
"epoch": 0.915929690653554,
"grad_norm": 3.8995399475097656,
"learning_rate": 3.532062176962159e-07,
"loss": 0.4591,
"step": 3120
},
{
"epoch": 0.918865362738982,
"grad_norm": 3.6494014263153076,
"learning_rate": 3.293185242608954e-07,
"loss": 0.4354,
"step": 3130
},
{
"epoch": 0.9218010348244101,
"grad_norm": 4.192446708679199,
"learning_rate": 3.062536226356472e-07,
"loss": 0.4466,
"step": 3140
},
{
"epoch": 0.9247367069098382,
"grad_norm": 3.4682350158691406,
"learning_rate": 2.8401347520670253e-07,
"loss": 0.4629,
"step": 3150
},
{
"epoch": 0.9276723789952662,
"grad_norm": 3.987903594970703,
"learning_rate": 2.625999741893304e-07,
"loss": 0.5156,
"step": 3160
},
{
"epoch": 0.9306080510806943,
"grad_norm": 4.3802103996276855,
"learning_rate": 2.420149414668493e-07,
"loss": 0.42,
"step": 3170
},
{
"epoch": 0.9335437231661223,
"grad_norm": 4.355963230133057,
"learning_rate": 2.222601284356185e-07,
"loss": 0.4408,
"step": 3180
},
{
"epoch": 0.9364793952515504,
"grad_norm": 5.095834255218506,
"learning_rate": 2.0333721585602984e-07,
"loss": 0.4558,
"step": 3190
},
{
"epoch": 0.9394150673369784,
"grad_norm": 3.3932993412017822,
"learning_rate": 1.8524781370950262e-07,
"loss": 0.4475,
"step": 3200
},
{
"epoch": 0.9423507394224065,
"grad_norm": 5.2142014503479,
"learning_rate": 1.679934610615064e-07,
"loss": 0.4351,
"step": 3210
},
{
"epoch": 0.9452864115078345,
"grad_norm": 4.271505832672119,
"learning_rate": 1.515756259306178e-07,
"loss": 0.4431,
"step": 3220
},
{
"epoch": 0.9482220835932627,
"grad_norm": 4.888089656829834,
"learning_rate": 1.3599570516361737e-07,
"loss": 0.4256,
"step": 3230
},
{
"epoch": 0.9511577556786907,
"grad_norm": 4.216527938842773,
"learning_rate": 1.212550243166455e-07,
"loss": 0.4811,
"step": 3240
},
{
"epoch": 0.9540934277641188,
"grad_norm": 4.079187393188477,
"learning_rate": 1.0735483754242049e-07,
"loss": 0.4435,
"step": 3250
},
{
"epoch": 0.9570290998495468,
"grad_norm": 4.337707042694092,
"learning_rate": 9.429632748354068e-08,
"loss": 0.4152,
"step": 3260
},
{
"epoch": 0.9599647719349749,
"grad_norm": 3.4174439907073975,
"learning_rate": 8.208060517185146e-08,
"loss": 0.4579,
"step": 3270
},
{
"epoch": 0.9629004440204029,
"grad_norm": 4.021118640899658,
"learning_rate": 7.070870993393209e-08,
"loss": 0.4531,
"step": 3280
},
{
"epoch": 0.965836116105831,
"grad_norm": 3.2883527278900146,
"learning_rate": 6.01816093026586e-08,
"loss": 0.4384,
"step": 3290
},
{
"epoch": 0.9687717881912591,
"grad_norm": 3.5450029373168945,
"learning_rate": 5.0500198934889665e-08,
"loss": 0.4028,
"step": 3300
},
{
"epoch": 0.9717074602766871,
"grad_norm": 3.9169414043426514,
"learning_rate": 4.16653025352598e-08,
"loss": 0.4894,
"step": 3310
},
{
"epoch": 0.9746431323621152,
"grad_norm": 4.527153015136719,
"learning_rate": 3.367767178609982e-08,
"loss": 0.4403,
"step": 3320
},
{
"epoch": 0.9775788044475432,
"grad_norm": 4.728188514709473,
"learning_rate": 2.6537986283485805e-08,
"loss": 0.4123,
"step": 3330
},
{
"epoch": 0.9805144765329713,
"grad_norm": 4.7194037437438965,
"learning_rate": 2.024685347941202e-08,
"loss": 0.4456,
"step": 3340
},
{
"epoch": 0.9834501486183993,
"grad_norm": 4.34934139251709,
"learning_rate": 1.4804808630112244e-08,
"loss": 0.4383,
"step": 3350
},
{
"epoch": 0.9863858207038274,
"grad_norm": 4.218658924102783,
"learning_rate": 1.0212314750518426e-08,
"loss": 0.4368,
"step": 3360
},
{
"epoch": 0.9893214927892554,
"grad_norm": 4.213326454162598,
"learning_rate": 6.469762574868866e-09,
"loss": 0.4769,
"step": 3370
},
{
"epoch": 0.9922571648746835,
"grad_norm": 3.9344699382781982,
"learning_rate": 3.5774705234625783e-09,
"loss": 0.4412,
"step": 3380
},
{
"epoch": 0.9951928369601115,
"grad_norm": 3.622770071029663,
"learning_rate": 1.5356846755654187e-09,
"loss": 0.5059,
"step": 3390
},
{
"epoch": 0.9981285090455396,
"grad_norm": 5.872491359710693,
"learning_rate": 3.4457874847793063e-10,
"loss": 0.4697,
"step": 3400
},
{
"epoch": 0.9998899122967965,
"step": 3406,
"total_flos": 3.2891568428128666e+18,
"train_loss": 0.7668657067657847,
"train_runtime": 24228.5265,
"train_samples_per_second": 4.499,
"train_steps_per_second": 0.141
}
],
"logging_steps": 10,
"max_steps": 3406,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.2891568428128666e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
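
A minimal sketch (not part of the original trainer_state.json) of how the log_history above could be loaded and inspected with standard Python tooling. The local filename "trainer_state.json" and the use of matplotlib are assumptions for illustration, not anything this file specifies.

# Sketch: plot the logged training loss and learning-rate schedule
# from a locally saved copy of this trainer_state.json.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step log entries; the final entry in log_history holds
# run-level summary fields (train_loss, train_runtime, ...) rather than "loss".
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
plt.tight_layout()
plt.show()

The filter on "loss" is there because the last log_history entry is a summary record (train_loss, train_runtime, train_samples_per_second) and lacks the per-step metrics plotted above.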