|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9998899122967965, |
|
"eval_steps": 500, |
|
"global_step": 3406, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0029356720854280577, |
|
"grad_norm": 2.3989222049713135, |
|
"learning_rate": 1.9999583060217186e-05, |
|
"loss": 1.4046, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005871344170856115, |
|
"grad_norm": 2.4013407230377197, |
|
"learning_rate": 1.9998315387870395e-05, |
|
"loss": 1.2805, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008807016256284174, |
|
"grad_norm": 2.5987584590911865, |
|
"learning_rate": 1.9996197048273697e-05, |
|
"loss": 1.2627, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01174268834171223, |
|
"grad_norm": 2.5626957416534424, |
|
"learning_rate": 1.999322822165767e-05, |
|
"loss": 1.34, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.014678360427140288, |
|
"grad_norm": 2.4768199920654297, |
|
"learning_rate": 1.998940916061322e-05, |
|
"loss": 1.2963, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.017614032512568347, |
|
"grad_norm": 2.1327669620513916, |
|
"learning_rate": 1.9984740190070102e-05, |
|
"loss": 1.3513, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.020549704597996404, |
|
"grad_norm": 2.7165908813476562, |
|
"learning_rate": 1.9979221707269273e-05, |
|
"loss": 1.2629, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02348537668342446, |
|
"grad_norm": 2.5773496627807617, |
|
"learning_rate": 1.997285418172908e-05, |
|
"loss": 1.2504, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02642104876885252, |
|
"grad_norm": 2.7524304389953613, |
|
"learning_rate": 1.9965638155205335e-05, |
|
"loss": 1.2854, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.029356720854280576, |
|
"grad_norm": 2.542572498321533, |
|
"learning_rate": 1.995757424164521e-05, |
|
"loss": 1.2583, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03229239293970863, |
|
"grad_norm": 3.595125198364258, |
|
"learning_rate": 1.9948663127135003e-05, |
|
"loss": 1.2612, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.035228065025136694, |
|
"grad_norm": 2.8669538497924805, |
|
"learning_rate": 1.9938905569841754e-05, |
|
"loss": 1.237, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03816373711056475, |
|
"grad_norm": 3.1492984294891357, |
|
"learning_rate": 1.9928302399948767e-05, |
|
"loss": 1.2394, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.04109940919599281, |
|
"grad_norm": 3.0048630237579346, |
|
"learning_rate": 1.991685451958495e-05, |
|
"loss": 1.1899, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.04403508128142086, |
|
"grad_norm": 2.9907774925231934, |
|
"learning_rate": 1.990456290274808e-05, |
|
"loss": 1.1939, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04697075336684892, |
|
"grad_norm": 3.010820150375366, |
|
"learning_rate": 1.9891428595221914e-05, |
|
"loss": 1.2212, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.049906425452276984, |
|
"grad_norm": 2.486607551574707, |
|
"learning_rate": 1.9877452714487232e-05, |
|
"loss": 1.1824, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.05284209753770504, |
|
"grad_norm": 2.491534471511841, |
|
"learning_rate": 1.9862636449626752e-05, |
|
"loss": 1.2118, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0557777696231331, |
|
"grad_norm": 2.5148768424987793, |
|
"learning_rate": 1.9846981061223958e-05, |
|
"loss": 1.2377, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05871344170856115, |
|
"grad_norm": 2.640007734298706, |
|
"learning_rate": 1.9830487881255864e-05, |
|
"loss": 1.1995, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06164911379398921, |
|
"grad_norm": 2.6175191402435303, |
|
"learning_rate": 1.981315831297966e-05, |
|
"loss": 1.1114, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.06458478587941727, |
|
"grad_norm": 2.8480780124664307, |
|
"learning_rate": 1.9794993830813358e-05, |
|
"loss": 1.1897, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.06752045796484532, |
|
"grad_norm": 2.635657787322998, |
|
"learning_rate": 1.9775995980210306e-05, |
|
"loss": 1.1592, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.07045613005027339, |
|
"grad_norm": 2.8263065814971924, |
|
"learning_rate": 1.9756166377527734e-05, |
|
"loss": 1.1712, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.07339180213570144, |
|
"grad_norm": 3.2777886390686035, |
|
"learning_rate": 1.9735506709889213e-05, |
|
"loss": 1.1724, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0763274742211295, |
|
"grad_norm": 2.587409019470215, |
|
"learning_rate": 1.9714018735041125e-05, |
|
"loss": 1.1767, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.07926314630655756, |
|
"grad_norm": 2.9432199001312256, |
|
"learning_rate": 1.9691704281203098e-05, |
|
"loss": 1.1553, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.08219881839198562, |
|
"grad_norm": 3.1995699405670166, |
|
"learning_rate": 1.966856524691247e-05, |
|
"loss": 1.1577, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.08513449047741367, |
|
"grad_norm": 2.805522918701172, |
|
"learning_rate": 1.9644603600862753e-05, |
|
"loss": 1.1627, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.08807016256284173, |
|
"grad_norm": 2.6327457427978516, |
|
"learning_rate": 1.961982138173615e-05, |
|
"loss": 1.1483, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.09100583464826979, |
|
"grad_norm": 4.197879314422607, |
|
"learning_rate": 1.959422069803007e-05, |
|
"loss": 1.2015, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.09394150673369785, |
|
"grad_norm": 3.2353358268737793, |
|
"learning_rate": 1.956780372787777e-05, |
|
"loss": 1.1068, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0968771788191259, |
|
"grad_norm": 2.8246076107025146, |
|
"learning_rate": 1.9540572718863012e-05, |
|
"loss": 1.1424, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.09981285090455397, |
|
"grad_norm": 4.32204532623291, |
|
"learning_rate": 1.9512529987828853e-05, |
|
"loss": 1.1508, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.10274852298998202, |
|
"grad_norm": 2.8142335414886475, |
|
"learning_rate": 1.9483677920680512e-05, |
|
"loss": 1.1623, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.10568419507541008, |
|
"grad_norm": 2.9336957931518555, |
|
"learning_rate": 1.9454018972182383e-05, |
|
"loss": 1.161, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.10861986716083813, |
|
"grad_norm": 2.9903533458709717, |
|
"learning_rate": 1.9423555665749182e-05, |
|
"loss": 1.1444, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1115555392462662, |
|
"grad_norm": 2.8897149562835693, |
|
"learning_rate": 1.939229059323124e-05, |
|
"loss": 1.171, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.11449121133169425, |
|
"grad_norm": 3.222294807434082, |
|
"learning_rate": 1.9360226414694008e-05, |
|
"loss": 1.1365, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1174268834171223, |
|
"grad_norm": 2.9000742435455322, |
|
"learning_rate": 1.932736585819171e-05, |
|
"loss": 1.1342, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.12036255550255036, |
|
"grad_norm": 2.682969093322754, |
|
"learning_rate": 1.929371171953526e-05, |
|
"loss": 1.0428, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.12329822758797843, |
|
"grad_norm": 2.8611629009246826, |
|
"learning_rate": 1.9259266862054366e-05, |
|
"loss": 1.1135, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.12623389967340648, |
|
"grad_norm": 2.9910166263580322, |
|
"learning_rate": 1.9224034216353947e-05, |
|
"loss": 1.1154, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.12916957175883453, |
|
"grad_norm": 3.245227336883545, |
|
"learning_rate": 1.9188016780064768e-05, |
|
"loss": 1.2029, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.1321052438442626, |
|
"grad_norm": 3.2252321243286133, |
|
"learning_rate": 1.9151217617588412e-05, |
|
"loss": 1.1272, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.13504091592969064, |
|
"grad_norm": 3.23498272895813, |
|
"learning_rate": 1.9113639859836544e-05, |
|
"loss": 1.1421, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.13797658801511872, |
|
"grad_norm": 3.0721843242645264, |
|
"learning_rate": 1.9075286703964554e-05, |
|
"loss": 1.14, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.14091226010054678, |
|
"grad_norm": 3.130610704421997, |
|
"learning_rate": 1.9036161413099512e-05, |
|
"loss": 1.1699, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.14384793218597483, |
|
"grad_norm": 3.054914712905884, |
|
"learning_rate": 1.899626731606255e-05, |
|
"loss": 1.0919, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.14678360427140288, |
|
"grad_norm": 3.3167009353637695, |
|
"learning_rate": 1.895560780708565e-05, |
|
"loss": 1.0625, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14971927635683094, |
|
"grad_norm": 3.075392484664917, |
|
"learning_rate": 1.8914186345522846e-05, |
|
"loss": 1.0899, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.152654948442259, |
|
"grad_norm": 3.1269266605377197, |
|
"learning_rate": 1.8872006455555906e-05, |
|
"loss": 1.1148, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.15559062052768705, |
|
"grad_norm": 3.867361068725586, |
|
"learning_rate": 1.8829071725894483e-05, |
|
"loss": 1.02, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.15852629261311513, |
|
"grad_norm": 3.529639720916748, |
|
"learning_rate": 1.87853858094708e-05, |
|
"loss": 1.1167, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.16146196469854318, |
|
"grad_norm": 3.098249673843384, |
|
"learning_rate": 1.8740952423128842e-05, |
|
"loss": 1.0181, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.16439763678397123, |
|
"grad_norm": 3.1614904403686523, |
|
"learning_rate": 1.869577534730812e-05, |
|
"loss": 1.1118, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1673333088693993, |
|
"grad_norm": 3.054616928100586, |
|
"learning_rate": 1.8649858425722033e-05, |
|
"loss": 1.0666, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.17026898095482734, |
|
"grad_norm": 3.479527711868286, |
|
"learning_rate": 1.8603205565030846e-05, |
|
"loss": 1.108, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1732046530402554, |
|
"grad_norm": 2.9523024559020996, |
|
"learning_rate": 1.8555820734509297e-05, |
|
"loss": 1.0833, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.17614032512568345, |
|
"grad_norm": 2.9907584190368652, |
|
"learning_rate": 1.8507707965708892e-05, |
|
"loss": 1.0283, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.17907599721111153, |
|
"grad_norm": 3.2911376953125, |
|
"learning_rate": 1.8458871352114894e-05, |
|
"loss": 1.0747, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.18201166929653959, |
|
"grad_norm": 3.1361849308013916, |
|
"learning_rate": 1.840931504879806e-05, |
|
"loss": 1.11, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.18494734138196764, |
|
"grad_norm": 3.527332067489624, |
|
"learning_rate": 1.8359043272061086e-05, |
|
"loss": 1.0424, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.1878830134673957, |
|
"grad_norm": 3.5494275093078613, |
|
"learning_rate": 1.8308060299079926e-05, |
|
"loss": 1.0818, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.19081868555282375, |
|
"grad_norm": 3.4427106380462646, |
|
"learning_rate": 1.8256370467539847e-05, |
|
"loss": 1.0883, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1937543576382518, |
|
"grad_norm": 3.092515230178833, |
|
"learning_rate": 1.82039781752664e-05, |
|
"loss": 1.0285, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.19669002972367985, |
|
"grad_norm": 2.667904853820801, |
|
"learning_rate": 1.815088787985124e-05, |
|
"loss": 0.9751, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.19962570180910794, |
|
"grad_norm": 3.5892174243927, |
|
"learning_rate": 1.809710409827285e-05, |
|
"loss": 1.0603, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.202561373894536, |
|
"grad_norm": 3.122434616088867, |
|
"learning_rate": 1.804263140651227e-05, |
|
"loss": 1.0919, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.20549704597996404, |
|
"grad_norm": 2.9182698726654053, |
|
"learning_rate": 1.798747443916374e-05, |
|
"loss": 1.0553, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2084327180653921, |
|
"grad_norm": 3.260917901992798, |
|
"learning_rate": 1.793163788904038e-05, |
|
"loss": 1.0711, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.21136839015082015, |
|
"grad_norm": 3.4142649173736572, |
|
"learning_rate": 1.7875126506774956e-05, |
|
"loss": 1.0423, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.2143040622362482, |
|
"grad_norm": 3.0127294063568115, |
|
"learning_rate": 1.781794510041564e-05, |
|
"loss": 1.0679, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.21723973432167626, |
|
"grad_norm": 3.398015022277832, |
|
"learning_rate": 1.776009853501698e-05, |
|
"loss": 1.0558, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.2201754064071043, |
|
"grad_norm": 3.6017568111419678, |
|
"learning_rate": 1.770159173222595e-05, |
|
"loss": 1.0198, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2231110784925324, |
|
"grad_norm": 3.5204339027404785, |
|
"learning_rate": 1.7642429669863225e-05, |
|
"loss": 0.9951, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.22604675057796045, |
|
"grad_norm": 3.3134777545928955, |
|
"learning_rate": 1.7582617381499655e-05, |
|
"loss": 0.9906, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.2289824226633885, |
|
"grad_norm": 3.5017244815826416, |
|
"learning_rate": 1.7522159956028003e-05, |
|
"loss": 1.0711, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.23191809474881656, |
|
"grad_norm": 3.1634137630462646, |
|
"learning_rate": 1.7461062537229987e-05, |
|
"loss": 0.9909, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2348537668342446, |
|
"grad_norm": 3.368623971939087, |
|
"learning_rate": 1.739933032333863e-05, |
|
"loss": 0.9815, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.23778943891967266, |
|
"grad_norm": 3.1064817905426025, |
|
"learning_rate": 1.733696856659599e-05, |
|
"loss": 1.0191, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.24072511100510072, |
|
"grad_norm": 3.2074899673461914, |
|
"learning_rate": 1.7273982572806303e-05, |
|
"loss": 1.0314, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.2436607830905288, |
|
"grad_norm": 2.5882649421691895, |
|
"learning_rate": 1.721037770088455e-05, |
|
"loss": 0.958, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.24659645517595685, |
|
"grad_norm": 3.730363130569458, |
|
"learning_rate": 1.7146159362400515e-05, |
|
"loss": 1.0272, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2495321272613849, |
|
"grad_norm": 2.940425395965576, |
|
"learning_rate": 1.708133302111837e-05, |
|
"loss": 1.0437, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.25246779934681296, |
|
"grad_norm": 4.833681106567383, |
|
"learning_rate": 1.7015904192531814e-05, |
|
"loss": 1.0393, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.255403471432241, |
|
"grad_norm": 3.417707681655884, |
|
"learning_rate": 1.694987844339479e-05, |
|
"loss": 1.0602, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.25833914351766907, |
|
"grad_norm": 3.239388942718506, |
|
"learning_rate": 1.6883261391247888e-05, |
|
"loss": 0.9515, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2612748156030971, |
|
"grad_norm": 3.1867291927337646, |
|
"learning_rate": 1.6816058703940366e-05, |
|
"loss": 0.9961, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.2642104876885252, |
|
"grad_norm": 3.193343162536621, |
|
"learning_rate": 1.6748276099147952e-05, |
|
"loss": 1.0066, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.26714615977395323, |
|
"grad_norm": 3.1413753032684326, |
|
"learning_rate": 1.6679919343886376e-05, |
|
"loss": 0.9714, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.2700818318593813, |
|
"grad_norm": 3.0826566219329834, |
|
"learning_rate": 1.661099425402067e-05, |
|
"loss": 0.9689, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.2730175039448094, |
|
"grad_norm": 3.5959160327911377, |
|
"learning_rate": 1.6541506693770403e-05, |
|
"loss": 0.9867, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.27595317603023745, |
|
"grad_norm": 3.8435122966766357, |
|
"learning_rate": 1.647146257521071e-05, |
|
"loss": 1.0281, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.2788888481156655, |
|
"grad_norm": 3.396488904953003, |
|
"learning_rate": 1.6400867857769287e-05, |
|
"loss": 0.975, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.28182452020109355, |
|
"grad_norm": 3.2766590118408203, |
|
"learning_rate": 1.6329728547719375e-05, |
|
"loss": 0.9373, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2847601922865216, |
|
"grad_norm": 3.673755645751953, |
|
"learning_rate": 1.625805069766873e-05, |
|
"loss": 0.9651, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.28769586437194966, |
|
"grad_norm": 3.8751864433288574, |
|
"learning_rate": 1.6185840406044657e-05, |
|
"loss": 0.9262, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.2906315364573777, |
|
"grad_norm": 3.708500623703003, |
|
"learning_rate": 1.611310381657515e-05, |
|
"loss": 0.9972, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.29356720854280577, |
|
"grad_norm": 3.4258711338043213, |
|
"learning_rate": 1.60398471177662e-05, |
|
"loss": 0.9331, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2965028806282338, |
|
"grad_norm": 3.4662258625030518, |
|
"learning_rate": 1.596607654237522e-05, |
|
"loss": 0.9592, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.2994385527136619, |
|
"grad_norm": 2.938396453857422, |
|
"learning_rate": 1.589179836688081e-05, |
|
"loss": 0.9568, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.30237422479908993, |
|
"grad_norm": 3.248762845993042, |
|
"learning_rate": 1.5817018910948712e-05, |
|
"loss": 0.9928, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.305309896884518, |
|
"grad_norm": 3.423213243484497, |
|
"learning_rate": 1.574174453689415e-05, |
|
"loss": 0.9387, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.30824556896994604, |
|
"grad_norm": 3.249216318130493, |
|
"learning_rate": 1.566598164914049e-05, |
|
"loss": 0.8925, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3111812410553741, |
|
"grad_norm": 3.6318016052246094, |
|
"learning_rate": 1.5589736693674372e-05, |
|
"loss": 1.0153, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.31411691314080215, |
|
"grad_norm": 3.9752533435821533, |
|
"learning_rate": 1.551301615749726e-05, |
|
"loss": 0.9323, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.31705258522623025, |
|
"grad_norm": 3.46864914894104, |
|
"learning_rate": 1.5435826568073532e-05, |
|
"loss": 0.8901, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.3199882573116583, |
|
"grad_norm": 4.399304389953613, |
|
"learning_rate": 1.535817449277511e-05, |
|
"loss": 0.9118, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.32292392939708636, |
|
"grad_norm": 3.2890026569366455, |
|
"learning_rate": 1.5280066538322703e-05, |
|
"loss": 0.8655, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3258596014825144, |
|
"grad_norm": 3.491983652114868, |
|
"learning_rate": 1.5201509350223708e-05, |
|
"loss": 0.9217, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.32879527356794247, |
|
"grad_norm": 5.0014824867248535, |
|
"learning_rate": 1.5122509612206785e-05, |
|
"loss": 0.9362, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.3317309456533705, |
|
"grad_norm": 4.092339038848877, |
|
"learning_rate": 1.5043074045653215e-05, |
|
"loss": 0.9262, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.3346666177387986, |
|
"grad_norm": 3.286433219909668, |
|
"learning_rate": 1.496320940902503e-05, |
|
"loss": 0.8891, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.33760228982422663, |
|
"grad_norm": 3.6521873474121094, |
|
"learning_rate": 1.4882922497290007e-05, |
|
"loss": 0.9281, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.3405379619096547, |
|
"grad_norm": 3.8015809059143066, |
|
"learning_rate": 1.4802220141343516e-05, |
|
"loss": 0.8949, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.34347363399508274, |
|
"grad_norm": 4.149661064147949, |
|
"learning_rate": 1.472110920742738e-05, |
|
"loss": 0.8889, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.3464093060805108, |
|
"grad_norm": 3.5252785682678223, |
|
"learning_rate": 1.4639596596545656e-05, |
|
"loss": 0.8397, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.34934497816593885, |
|
"grad_norm": 3.6884541511535645, |
|
"learning_rate": 1.4557689243877507e-05, |
|
"loss": 0.9142, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.3522806502513669, |
|
"grad_norm": 3.9577550888061523, |
|
"learning_rate": 1.4475394118187146e-05, |
|
"loss": 0.9809, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.35521632233679495, |
|
"grad_norm": 3.6897339820861816, |
|
"learning_rate": 1.4392718221230917e-05, |
|
"loss": 0.9141, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.35815199442222306, |
|
"grad_norm": 3.061516046524048, |
|
"learning_rate": 1.4309668587161596e-05, |
|
"loss": 0.8669, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3610876665076511, |
|
"grad_norm": 3.1191623210906982, |
|
"learning_rate": 1.4226252281929902e-05, |
|
"loss": 0.8384, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.36402333859307917, |
|
"grad_norm": 4.198310852050781, |
|
"learning_rate": 1.4142476402683327e-05, |
|
"loss": 0.8971, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.3669590106785072, |
|
"grad_norm": 3.8184337615966797, |
|
"learning_rate": 1.4058348077162301e-05, |
|
"loss": 0.8783, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.3698946827639353, |
|
"grad_norm": 3.842637777328491, |
|
"learning_rate": 1.3973874463093747e-05, |
|
"loss": 0.9623, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.37283035484936333, |
|
"grad_norm": 3.5173559188842773, |
|
"learning_rate": 1.3889062747582118e-05, |
|
"loss": 0.8092, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.3757660269347914, |
|
"grad_norm": 3.8953890800476074, |
|
"learning_rate": 1.3803920146497887e-05, |
|
"loss": 0.8762, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.37870169902021944, |
|
"grad_norm": 3.0550928115844727, |
|
"learning_rate": 1.3718453903863616e-05, |
|
"loss": 0.8321, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.3816373711056475, |
|
"grad_norm": 3.9677445888519287, |
|
"learning_rate": 1.3632671291237645e-05, |
|
"loss": 0.8566, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.38457304319107555, |
|
"grad_norm": 3.887268304824829, |
|
"learning_rate": 1.35465796070954e-05, |
|
"loss": 0.8944, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.3875087152765036, |
|
"grad_norm": 3.1006393432617188, |
|
"learning_rate": 1.3460186176208439e-05, |
|
"loss": 0.7583, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.39044438736193166, |
|
"grad_norm": 3.7594895362854004, |
|
"learning_rate": 1.337349834902125e-05, |
|
"loss": 0.814, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.3933800594473597, |
|
"grad_norm": 4.34951114654541, |
|
"learning_rate": 1.328652350102588e-05, |
|
"loss": 0.8006, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.39631573153278776, |
|
"grad_norm": 2.9645636081695557, |
|
"learning_rate": 1.3199269032134395e-05, |
|
"loss": 0.8129, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.39925140361821587, |
|
"grad_norm": 3.858602285385132, |
|
"learning_rate": 1.3111742366049317e-05, |
|
"loss": 0.8366, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.4021870757036439, |
|
"grad_norm": 3.2778103351593018, |
|
"learning_rate": 1.3023950949631979e-05, |
|
"loss": 0.8551, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.405122747789072, |
|
"grad_norm": 3.2402875423431396, |
|
"learning_rate": 1.2935902252268965e-05, |
|
"loss": 0.8398, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.40805841987450003, |
|
"grad_norm": 4.325957775115967, |
|
"learning_rate": 1.2847603765236589e-05, |
|
"loss": 0.836, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.4109940919599281, |
|
"grad_norm": 3.5310022830963135, |
|
"learning_rate": 1.2759063001063531e-05, |
|
"loss": 0.8369, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.41392976404535614, |
|
"grad_norm": 3.5352087020874023, |
|
"learning_rate": 1.2670287492891675e-05, |
|
"loss": 0.8988, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.4168654361307842, |
|
"grad_norm": 3.190788745880127, |
|
"learning_rate": 1.258128479383516e-05, |
|
"loss": 0.8352, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.41980110821621225, |
|
"grad_norm": 3.459728240966797, |
|
"learning_rate": 1.249206247633778e-05, |
|
"loss": 0.8295, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.4227367803016403, |
|
"grad_norm": 3.5794529914855957, |
|
"learning_rate": 1.2402628131528686e-05, |
|
"loss": 0.8103, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.42567245238706836, |
|
"grad_norm": 4.169612407684326, |
|
"learning_rate": 1.2312989368576547e-05, |
|
"loss": 0.7757, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.4286081244724964, |
|
"grad_norm": 3.301011562347412, |
|
"learning_rate": 1.2223153814042137e-05, |
|
"loss": 0.7871, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.43154379655792446, |
|
"grad_norm": 4.524185657501221, |
|
"learning_rate": 1.2133129111229466e-05, |
|
"loss": 0.851, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.4344794686433525, |
|
"grad_norm": 3.72041392326355, |
|
"learning_rate": 1.2042922919535484e-05, |
|
"loss": 0.803, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.43741514072878057, |
|
"grad_norm": 3.926424503326416, |
|
"learning_rate": 1.1952542913798406e-05, |
|
"loss": 0.761, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.4403508128142086, |
|
"grad_norm": 3.5725414752960205, |
|
"learning_rate": 1.1861996783644727e-05, |
|
"loss": 0.8086, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.44328648489963673, |
|
"grad_norm": 4.109748363494873, |
|
"learning_rate": 1.1771292232834983e-05, |
|
"loss": 0.8483, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.4462221569850648, |
|
"grad_norm": 3.673794984817505, |
|
"learning_rate": 1.1680436978608314e-05, |
|
"loss": 0.738, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.44915782907049284, |
|
"grad_norm": 3.831571102142334, |
|
"learning_rate": 1.1589438751025852e-05, |
|
"loss": 0.7462, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.4520935011559209, |
|
"grad_norm": 4.181507587432861, |
|
"learning_rate": 1.149830529231307e-05, |
|
"loss": 0.7707, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.45502917324134895, |
|
"grad_norm": 3.3295936584472656, |
|
"learning_rate": 1.140704435620104e-05, |
|
"loss": 0.7832, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.457964845326777, |
|
"grad_norm": 4.025683403015137, |
|
"learning_rate": 1.1315663707266742e-05, |
|
"loss": 0.74, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.46090051741220506, |
|
"grad_norm": 3.792701244354248, |
|
"learning_rate": 1.1224171120272455e-05, |
|
"loss": 0.6698, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.4638361894976331, |
|
"grad_norm": 3.7220959663391113, |
|
"learning_rate": 1.1132574379504269e-05, |
|
"loss": 0.7604, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.46677186158306117, |
|
"grad_norm": 4.423033714294434, |
|
"learning_rate": 1.1040881278109784e-05, |
|
"loss": 0.7466, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.4697075336684892, |
|
"grad_norm": 3.633347272872925, |
|
"learning_rate": 1.0949099617435062e-05, |
|
"loss": 0.7452, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4726432057539173, |
|
"grad_norm": 3.661238431930542, |
|
"learning_rate": 1.0857237206360885e-05, |
|
"loss": 0.7637, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.4755788778393453, |
|
"grad_norm": 4.33590030670166, |
|
"learning_rate": 1.0765301860638364e-05, |
|
"loss": 0.7364, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.4785145499247734, |
|
"grad_norm": 3.7030036449432373, |
|
"learning_rate": 1.0673301402223964e-05, |
|
"loss": 0.7356, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.48145022201020143, |
|
"grad_norm": 4.784999847412109, |
|
"learning_rate": 1.0581243658614013e-05, |
|
"loss": 0.765, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.48438589409562954, |
|
"grad_norm": 3.2158679962158203, |
|
"learning_rate": 1.0489136462178718e-05, |
|
"loss": 0.75, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.4873215661810576, |
|
"grad_norm": 4.584315299987793, |
|
"learning_rate": 1.039698764949579e-05, |
|
"loss": 0.7347, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.49025723826648565, |
|
"grad_norm": 3.4453585147857666, |
|
"learning_rate": 1.0304805060683692e-05, |
|
"loss": 0.7887, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.4931929103519137, |
|
"grad_norm": 3.9263744354248047, |
|
"learning_rate": 1.021259653873459e-05, |
|
"loss": 0.7492, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.49612858243734176, |
|
"grad_norm": 4.6535539627075195, |
|
"learning_rate": 1.012036992884708e-05, |
|
"loss": 0.7676, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.4990642545227698, |
|
"grad_norm": 4.22018575668335, |
|
"learning_rate": 1.0028133077758688e-05, |
|
"loss": 0.7088, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5019999266081978, |
|
"grad_norm": 4.408539295196533, |
|
"learning_rate": 9.935893833078284e-06, |
|
"loss": 0.7646, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.5049355986936259, |
|
"grad_norm": 5.264422416687012, |
|
"learning_rate": 9.843660042618372e-06, |
|
"loss": 0.8147, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.507871270779054, |
|
"grad_norm": 4.2693047523498535, |
|
"learning_rate": 9.75143955372742e-06, |
|
"loss": 0.7104, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.510806942864482, |
|
"grad_norm": 4.856871128082275, |
|
"learning_rate": 9.659240212622175e-06, |
|
"loss": 0.7367, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.5137426149499101, |
|
"grad_norm": 2.8976457118988037, |
|
"learning_rate": 9.567069863720113e-06, |
|
"loss": 0.7564, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.5166782870353381, |
|
"grad_norm": 5.992892742156982, |
|
"learning_rate": 9.474936348972021e-06, |
|
"loss": 0.7735, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.5196139591207662, |
|
"grad_norm": 3.6526339054107666, |
|
"learning_rate": 9.382847507194797e-06, |
|
"loss": 0.7035, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.5225496312061942, |
|
"grad_norm": 4.040701389312744, |
|
"learning_rate": 9.290811173404513e-06, |
|
"loss": 0.6347, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.5254853032916224, |
|
"grad_norm": 3.848483085632324, |
|
"learning_rate": 9.198835178149807e-06, |
|
"loss": 0.6359, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.5284209753770504, |
|
"grad_norm": 3.2821764945983887, |
|
"learning_rate": 9.106927346845663e-06, |
|
"loss": 0.7137, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5313566474624785, |
|
"grad_norm": 4.672881603240967, |
|
"learning_rate": 9.015095499107578e-06, |
|
"loss": 0.7085, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.5342923195479065, |
|
"grad_norm": 3.976231098175049, |
|
"learning_rate": 8.923347448086311e-06, |
|
"loss": 0.6501, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.5372279916333346, |
|
"grad_norm": 4.726049423217773, |
|
"learning_rate": 8.831690999803101e-06, |
|
"loss": 0.8129, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.5401636637187626, |
|
"grad_norm": 6.278385162353516, |
|
"learning_rate": 8.740133952485515e-06, |
|
"loss": 0.6732, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.5430993358041907, |
|
"grad_norm": 4.620763301849365, |
|
"learning_rate": 8.648684095904001e-06, |
|
"loss": 0.6872, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5460350078896188, |
|
"grad_norm": 4.494777679443359, |
|
"learning_rate": 8.557349210709098e-06, |
|
"loss": 0.6686, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.5489706799750468, |
|
"grad_norm": 4.2295637130737305, |
|
"learning_rate": 8.46613706776945e-06, |
|
"loss": 0.6853, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.5519063520604749, |
|
"grad_norm": 3.5783040523529053, |
|
"learning_rate": 8.375055427510673e-06, |
|
"loss": 0.6923, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.5548420241459029, |
|
"grad_norm": 3.5585546493530273, |
|
"learning_rate": 8.284112039255071e-06, |
|
"loss": 0.6744, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.557777696231331, |
|
"grad_norm": 3.939253330230713, |
|
"learning_rate": 8.193314640562315e-06, |
|
"loss": 0.627, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.560713368316759, |
|
"grad_norm": 3.630519390106201, |
|
"learning_rate": 8.102670956571139e-06, |
|
"loss": 0.6627, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.5636490404021871, |
|
"grad_norm": 11.943046569824219, |
|
"learning_rate": 8.012188699342072e-06, |
|
"loss": 0.6476, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.5665847124876151, |
|
"grad_norm": 5.358550071716309, |
|
"learning_rate": 7.92187556720126e-06, |
|
"loss": 0.6968, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.5695203845730432, |
|
"grad_norm": 3.8031585216522217, |
|
"learning_rate": 7.831739244085534e-06, |
|
"loss": 0.6811, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.5724560566584712, |
|
"grad_norm": 3.1659951210021973, |
|
"learning_rate": 7.741787398888617e-06, |
|
"loss": 0.6501, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5753917287438993, |
|
"grad_norm": 3.7877001762390137, |
|
"learning_rate": 7.652027684808644e-06, |
|
"loss": 0.6496, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.5783274008293273, |
|
"grad_norm": 4.701345920562744, |
|
"learning_rate": 7.56246773869705e-06, |
|
"loss": 0.659, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.5812630729147554, |
|
"grad_norm": 4.617175579071045, |
|
"learning_rate": 7.47311518040879e-06, |
|
"loss": 0.6429, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5841987450001834, |
|
"grad_norm": 5.269269943237305, |
|
"learning_rate": 7.3839776121540385e-06, |
|
"loss": 0.6845, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5871344170856115, |
|
"grad_norm": 3.911558151245117, |
|
"learning_rate": 7.2950626178514e-06, |
|
"loss": 0.6536, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5900700891710396, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.215235676567183e-06, |
|
"loss": 0.6691, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.5930057612564676, |
|
"grad_norm": 5.29760217666626, |
|
"learning_rate": 7.126764398128368e-06, |
|
"loss": 0.6483, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5959414333418958, |
|
"grad_norm": 3.4294636249542236, |
|
"learning_rate": 7.038537577614009e-06, |
|
"loss": 0.5965, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.5988771054273238, |
|
"grad_norm": 3.6569931507110596, |
|
"learning_rate": 6.950562721455325e-06, |
|
"loss": 0.5782, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.6018127775127519, |
|
"grad_norm": 3.845431089401245, |
|
"learning_rate": 6.86284731464614e-06, |
|
"loss": 0.6419, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.6047484495981799, |
|
"grad_norm": 3.8947107791900635, |
|
"learning_rate": 6.775398820106065e-06, |
|
"loss": 0.5942, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.607684121683608, |
|
"grad_norm": 5.501591682434082, |
|
"learning_rate": 6.688224678045507e-06, |
|
"loss": 0.5874, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.610619793769036, |
|
"grad_norm": 4.684408187866211, |
|
"learning_rate": 6.6013323053327065e-06, |
|
"loss": 0.6178, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.6135554658544641, |
|
"grad_norm": 4.132544040679932, |
|
"learning_rate": 6.5147290948626365e-06, |
|
"loss": 0.5972, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.6164911379398921, |
|
"grad_norm": 3.2844135761260986, |
|
"learning_rate": 6.428422414928066e-06, |
|
"loss": 0.5808, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6194268100253202, |
|
"grad_norm": 4.8152289390563965, |
|
"learning_rate": 6.342419608592626e-06, |
|
"loss": 0.6407, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.6223624821107482, |
|
"grad_norm": 4.975841999053955, |
|
"learning_rate": 6.25672799306605e-06, |
|
"loss": 0.5792, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.6252981541961763, |
|
"grad_norm": 3.772268772125244, |
|
"learning_rate": 6.171354859081639e-06, |
|
"loss": 0.7062, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.6282338262816043, |
|
"grad_norm": 3.6091275215148926, |
|
"learning_rate": 6.086307470275947e-06, |
|
"loss": 0.6015, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.6311694983670324, |
|
"grad_norm": 3.9650683403015137, |
|
"learning_rate": 6.001593062570776e-06, |
|
"loss": 0.699, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.6341051704524605, |
|
"grad_norm": 3.4142041206359863, |
|
"learning_rate": 5.917218843557551e-06, |
|
"loss": 0.5912, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.6370408425378885, |
|
"grad_norm": 3.6262595653533936, |
|
"learning_rate": 5.8415788415375744e-06, |
|
"loss": 0.6029, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.6399765146233166, |
|
"grad_norm": 3.612025737762451, |
|
"learning_rate": 5.757870733799642e-06, |
|
"loss": 0.6054, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.6429121867087446, |
|
"grad_norm": 3.7721731662750244, |
|
"learning_rate": 5.6745235509072135e-06, |
|
"loss": 0.5703, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.6458478587941727, |
|
"grad_norm": 4.44386100769043, |
|
"learning_rate": 5.591544384126769e-06, |
|
"loss": 0.6101, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6487835308796007, |
|
"grad_norm": 3.6553893089294434, |
|
"learning_rate": 5.508940293413603e-06, |
|
"loss": 0.6131, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.6517192029650288, |
|
"grad_norm": 4.550465106964111, |
|
"learning_rate": 5.426718306811134e-06, |
|
"loss": 0.5761, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.6546548750504568, |
|
"grad_norm": 3.433598279953003, |
|
"learning_rate": 5.344885419852961e-06, |
|
"loss": 0.6456, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.6575905471358849, |
|
"grad_norm": 5.087676048278809, |
|
"learning_rate": 5.263448594967673e-06, |
|
"loss": 0.657, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.6605262192213129, |
|
"grad_norm": 4.578396797180176, |
|
"learning_rate": 5.182414760886484e-06, |
|
"loss": 0.6083, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.663461891306741, |
|
"grad_norm": 6.043960094451904, |
|
"learning_rate": 5.1017908120537105e-06, |
|
"loss": 0.5721, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.666397563392169, |
|
"grad_norm": 3.624394178390503, |
|
"learning_rate": 5.021583608040208e-06, |
|
"loss": 0.5952, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.6693332354775972, |
|
"grad_norm": 2.965820074081421, |
|
"learning_rate": 4.941799972959752e-06, |
|
"loss": 0.5074, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.6722689075630253, |
|
"grad_norm": 5.590756416320801, |
|
"learning_rate": 4.862446694888403e-06, |
|
"loss": 0.5274, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.6752045796484533, |
|
"grad_norm": 4.188043594360352, |
|
"learning_rate": 4.783530525287006e-06, |
|
"loss": 0.5694, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6781402517338814, |
|
"grad_norm": 3.925184488296509, |
|
"learning_rate": 4.705058178426753e-06, |
|
"loss": 0.55, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.6810759238193094, |
|
"grad_norm": 4.226954936981201, |
|
"learning_rate": 4.627036330817926e-06, |
|
"loss": 0.5432, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.6840115959047375, |
|
"grad_norm": 3.9109609127044678, |
|
"learning_rate": 4.5494716206418555e-06, |
|
"loss": 0.5332, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.6869472679901655, |
|
"grad_norm": 4.698592662811279, |
|
"learning_rate": 4.4723706471861385e-06, |
|
"loss": 0.5744, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.6898829400755936, |
|
"grad_norm": 4.461889266967773, |
|
"learning_rate": 4.3957399702831505e-06, |
|
"loss": 0.5314, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.6928186121610216, |
|
"grad_norm": 4.412221908569336, |
|
"learning_rate": 4.31958610975195e-06, |
|
"loss": 0.554, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.6957542842464497, |
|
"grad_norm": 3.776421308517456, |
|
"learning_rate": 4.243915544843549e-06, |
|
"loss": 0.4857, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.6986899563318777, |
|
"grad_norm": 4.851159572601318, |
|
"learning_rate": 4.168734713689658e-06, |
|
"loss": 0.5484, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.7016256284173058, |
|
"grad_norm": 3.8917558193206787, |
|
"learning_rate": 4.094050012754925e-06, |
|
"loss": 0.4888, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.7045613005027338, |
|
"grad_norm": 4.396358966827393, |
|
"learning_rate": 4.019867796292709e-06, |
|
"loss": 0.5125, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7074969725881619, |
|
"grad_norm": 4.374291896820068, |
|
"learning_rate": 3.946194375804452e-06, |
|
"loss": 0.5262, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.7104326446735899, |
|
"grad_norm": 4.330350875854492, |
|
"learning_rate": 3.873036019502716e-06, |
|
"loss": 0.5581, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.713368316759018, |
|
"grad_norm": 4.86287784576416, |
|
"learning_rate": 3.800398951777845e-06, |
|
"loss": 0.5687, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.7163039888444461, |
|
"grad_norm": 5.004453659057617, |
|
"learning_rate": 3.7282893526683914e-06, |
|
"loss": 0.5136, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.7192396609298741, |
|
"grad_norm": 3.035261631011963, |
|
"learning_rate": 3.656713357335334e-06, |
|
"loss": 0.5358, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.7221753330153022, |
|
"grad_norm": 3.3477425575256348, |
|
"learning_rate": 3.585677055540072e-06, |
|
"loss": 0.5214, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.7251110051007302, |
|
"grad_norm": 3.495814323425293, |
|
"learning_rate": 3.5151864911263066e-06, |
|
"loss": 0.5048, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.7280466771861583, |
|
"grad_norm": 3.35532283782959, |
|
"learning_rate": 3.4452476615058316e-06, |
|
"loss": 0.509, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.7309823492715863, |
|
"grad_norm": 3.0357613563537598, |
|
"learning_rate": 3.3758665171482474e-06, |
|
"loss": 0.5361, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.7339180213570144, |
|
"grad_norm": 4.02761173248291, |
|
"learning_rate": 3.3070489610747146e-06, |
|
"loss": 0.5033, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7368536934424424, |
|
"grad_norm": 4.085331916809082, |
|
"learning_rate": 3.238800848355702e-06, |
|
"loss": 0.526, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.7397893655278706, |
|
"grad_norm": 4.161253929138184, |
|
"learning_rate": 3.1711279856128387e-06, |
|
"loss": 0.5014, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.7427250376132986, |
|
"grad_norm": 3.7220897674560547, |
|
"learning_rate": 3.10403613052487e-06, |
|
"loss": 0.4514, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.7456607096987267, |
|
"grad_norm": 4.337230682373047, |
|
"learning_rate": 3.037530991337807e-06, |
|
"loss": 0.5645, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.7485963817841547, |
|
"grad_norm": 4.30481481552124, |
|
"learning_rate": 2.9716182263792314e-06, |
|
"loss": 0.5026, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.7515320538695828, |
|
"grad_norm": 3.3447349071502686, |
|
"learning_rate": 2.9063034435769242e-06, |
|
"loss": 0.5318, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.7544677259550108, |
|
"grad_norm": 3.936032295227051, |
|
"learning_rate": 2.8415921999816966e-06, |
|
"loss": 0.5106, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.7574033980404389, |
|
"grad_norm": 3.9542150497436523, |
|
"learning_rate": 2.7774900012946037e-06, |
|
"loss": 0.5299, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.760339070125867, |
|
"grad_norm": 4.351448059082031, |
|
"learning_rate": 2.714002301398524e-06, |
|
"loss": 0.5211, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.763274742211295, |
|
"grad_norm": 4.167295932769775, |
|
"learning_rate": 2.6511345018941225e-06, |
|
"loss": 0.5071, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7662104142967231, |
|
"grad_norm": 5.125722408294678, |
|
"learning_rate": 2.588891951640288e-06, |
|
"loss": 0.5199, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.7691460863821511, |
|
"grad_norm": 4.25960111618042, |
|
"learning_rate": 2.527279946299037e-06, |
|
"loss": 0.4537, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.7720817584675792, |
|
"grad_norm": 3.875459909439087, |
|
"learning_rate": 2.4663037278849665e-06, |
|
"loss": 0.4993, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.7750174305530072, |
|
"grad_norm": 4.285188674926758, |
|
"learning_rate": 2.405968484319231e-06, |
|
"loss": 0.4596, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.7779531026384353, |
|
"grad_norm": 3.827913284301758, |
|
"learning_rate": 2.3462793489881884e-06, |
|
"loss": 0.5141, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.7808887747238633, |
|
"grad_norm": 4.174901485443115, |
|
"learning_rate": 2.2872414003066146e-06, |
|
"loss": 0.4483, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.7838244468092914, |
|
"grad_norm": 3.4712812900543213, |
|
"learning_rate": 2.2288596612856306e-06, |
|
"loss": 0.4834, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.7867601188947194, |
|
"grad_norm": 4.7577972412109375, |
|
"learning_rate": 2.1711390991053547e-06, |
|
"loss": 0.4913, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.7896957909801475, |
|
"grad_norm": 3.8983209133148193, |
|
"learning_rate": 2.1140846246922774e-06, |
|
"loss": 0.4748, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.7926314630655755, |
|
"grad_norm": 3.3365228176116943, |
|
"learning_rate": 2.0577010923014353e-06, |
|
"loss": 0.5014, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7955671351510036, |
|
"grad_norm": 3.7394635677337646, |
|
"learning_rate": 2.001993299103411e-06, |
|
"loss": 0.4524, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.7985028072364317, |
|
"grad_norm": 3.325190544128418, |
|
"learning_rate": 1.946965984776181e-06, |
|
"loss": 0.486, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.8014384793218597, |
|
"grad_norm": 3.337636947631836, |
|
"learning_rate": 1.8926238311018551e-06, |
|
"loss": 0.4112, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.8043741514072879, |
|
"grad_norm": 4.279343128204346, |
|
"learning_rate": 1.8443055276768218e-06, |
|
"loss": 0.4758, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.8073098234927159, |
|
"grad_norm": 3.3319621086120605, |
|
"learning_rate": 1.7912778684550137e-06, |
|
"loss": 0.4209, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.810245495578144, |
|
"grad_norm": 4.332451343536377, |
|
"learning_rate": 1.7389486159957436e-06, |
|
"loss": 0.4913, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.813181167663572, |
|
"grad_norm": 5.22000789642334, |
|
"learning_rate": 1.6873222225271656e-06, |
|
"loss": 0.4523, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.8161168397490001, |
|
"grad_norm": 4.080671787261963, |
|
"learning_rate": 1.63640308047745e-06, |
|
"loss": 0.4545, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.8190525118344281, |
|
"grad_norm": 5.591613292694092, |
|
"learning_rate": 1.5861955221010671e-06, |
|
"loss": 0.5272, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.8219881839198562, |
|
"grad_norm": 3.4515106678009033, |
|
"learning_rate": 1.536703819110198e-06, |
|
"loss": 0.4166, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8249238560052842, |
|
"grad_norm": 3.1613569259643555, |
|
"learning_rate": 1.4879321823112802e-06, |
|
"loss": 0.5194, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.8278595280907123, |
|
"grad_norm": 4.931222915649414, |
|
"learning_rate": 1.4398847612467736e-06, |
|
"loss": 0.4626, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.8307952001761403, |
|
"grad_norm": 3.088315963745117, |
|
"learning_rate": 1.3925656438420876e-06, |
|
"loss": 0.4246, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.8337308722615684, |
|
"grad_norm": 3.7036452293395996, |
|
"learning_rate": 1.3459788560577847e-06, |
|
"loss": 0.431, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.8366665443469964, |
|
"grad_norm": 4.452617168426514, |
|
"learning_rate": 1.3001283615470517e-06, |
|
"loss": 0.4478, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.8396022164324245, |
|
"grad_norm": 3.2161977291107178, |
|
"learning_rate": 1.255018061318467e-06, |
|
"loss": 0.4432, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.8425378885178526, |
|
"grad_norm": 4.302596092224121, |
|
"learning_rate": 1.2106517934040917e-06, |
|
"loss": 0.4598, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.8454735606032806, |
|
"grad_norm": 4.297342300415039, |
|
"learning_rate": 1.1670333325329353e-06, |
|
"loss": 0.4908, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.8484092326887087, |
|
"grad_norm": 3.9199209213256836, |
|
"learning_rate": 1.1241663898097865e-06, |
|
"loss": 0.4239, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.8513449047741367, |
|
"grad_norm": 4.693470001220703, |
|
"learning_rate": 1.08205461239948e-06, |
|
"loss": 0.4636, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8542805768595648, |
|
"grad_norm": 4.2040510177612305, |
|
"learning_rate": 1.04070158321659e-06, |
|
"loss": 0.4595, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.8572162489449928, |
|
"grad_norm": 4.8676252365112305, |
|
"learning_rate": 1.00011082062058e-06, |
|
"loss": 0.4699, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.8601519210304209, |
|
"grad_norm": 3.176576852798462, |
|
"learning_rate": 9.602857781164721e-07, |
|
"loss": 0.4599, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.8630875931158489, |
|
"grad_norm": 4.111423969268799, |
|
"learning_rate": 9.212298440610101e-07, |
|
"loss": 0.4601, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.866023265201277, |
|
"grad_norm": 3.106792449951172, |
|
"learning_rate": 8.829463413743811e-07, |
|
"loss": 0.453, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.868958937286705, |
|
"grad_norm": 4.940300941467285, |
|
"learning_rate": 8.454385272574906e-07, |
|
"loss": 0.4298, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.8718946093721331, |
|
"grad_norm": 4.7473249435424805, |
|
"learning_rate": 8.087095929148436e-07, |
|
"loss": 0.457, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.8748302814575611, |
|
"grad_norm": 4.263439655303955, |
|
"learning_rate": 7.727626632830221e-07, |
|
"loss": 0.4194, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.8777659535429893, |
|
"grad_norm": 4.69775390625, |
|
"learning_rate": 7.376007967648302e-07, |
|
"loss": 0.4457, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.8807016256284173, |
|
"grad_norm": 4.177097320556641, |
|
"learning_rate": 7.032269849690654e-07, |
|
"loss": 0.4532, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8836372977138454, |
|
"grad_norm": 4.312076568603516, |
|
"learning_rate": 6.696441524559983e-07, |
|
"loss": 0.4772, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.8865729697992735, |
|
"grad_norm": 4.328220367431641, |
|
"learning_rate": 6.368551564885439e-07, |
|
"loss": 0.4239, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.8895086418847015, |
|
"grad_norm": 3.3847310543060303, |
|
"learning_rate": 6.048627867891665e-07, |
|
"loss": 0.4564, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.8924443139701296, |
|
"grad_norm": 3.408613681793213, |
|
"learning_rate": 5.736697653025192e-07, |
|
"loss": 0.4206, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.8953799860555576, |
|
"grad_norm": 4.045165061950684, |
|
"learning_rate": 5.432787459638722e-07, |
|
"loss": 0.4751, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.8983156581409857, |
|
"grad_norm": 3.965830087661743, |
|
"learning_rate": 5.136923144732997e-07, |
|
"loss": 0.4273, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.9012513302264137, |
|
"grad_norm": 3.462986707687378, |
|
"learning_rate": 4.849129880756886e-07, |
|
"loss": 0.472, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.9041870023118418, |
|
"grad_norm": 5.114270210266113, |
|
"learning_rate": 4.569432153465736e-07, |
|
"loss": 0.5233, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.9071226743972698, |
|
"grad_norm": 4.655681610107422, |
|
"learning_rate": 4.297853759838055e-07, |
|
"loss": 0.4543, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.9100583464826979, |
|
"grad_norm": 4.586308002471924, |
|
"learning_rate": 4.034417806050872e-07, |
|
"loss": 0.4383, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.9129940185681259, |
|
"grad_norm": 3.405298948287964, |
|
"learning_rate": 3.779146705513814e-07, |
|
"loss": 0.4644, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.915929690653554, |
|
"grad_norm": 3.8995399475097656, |
|
"learning_rate": 3.532062176962159e-07, |
|
"loss": 0.4591, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.918865362738982, |
|
"grad_norm": 3.6494014263153076, |
|
"learning_rate": 3.293185242608954e-07, |
|
"loss": 0.4354, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.9218010348244101, |
|
"grad_norm": 4.192446708679199, |
|
"learning_rate": 3.062536226356472e-07, |
|
"loss": 0.4466, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.9247367069098382, |
|
"grad_norm": 3.4682350158691406, |
|
"learning_rate": 2.8401347520670253e-07, |
|
"loss": 0.4629, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.9276723789952662, |
|
"grad_norm": 3.987903594970703, |
|
"learning_rate": 2.625999741893304e-07, |
|
"loss": 0.5156, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.9306080510806943, |
|
"grad_norm": 4.3802103996276855, |
|
"learning_rate": 2.420149414668493e-07, |
|
"loss": 0.42, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.9335437231661223, |
|
"grad_norm": 4.355963230133057, |
|
"learning_rate": 2.222601284356185e-07, |
|
"loss": 0.4408, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.9364793952515504, |
|
"grad_norm": 5.095834255218506, |
|
"learning_rate": 2.0333721585602984e-07, |
|
"loss": 0.4558, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.9394150673369784, |
|
"grad_norm": 3.3932993412017822, |
|
"learning_rate": 1.8524781370950262e-07, |
|
"loss": 0.4475, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9423507394224065, |
|
"grad_norm": 5.2142014503479, |
|
"learning_rate": 1.679934610615064e-07, |
|
"loss": 0.4351, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.9452864115078345, |
|
"grad_norm": 4.271505832672119, |
|
"learning_rate": 1.515756259306178e-07, |
|
"loss": 0.4431, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.9482220835932627, |
|
"grad_norm": 4.888089656829834, |
|
"learning_rate": 1.3599570516361737e-07, |
|
"loss": 0.4256, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.9511577556786907, |
|
"grad_norm": 4.216527938842773, |
|
"learning_rate": 1.212550243166455e-07, |
|
"loss": 0.4811, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.9540934277641188, |
|
"grad_norm": 4.079187393188477, |
|
"learning_rate": 1.0735483754242049e-07, |
|
"loss": 0.4435, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.9570290998495468, |
|
"grad_norm": 4.337707042694092, |
|
"learning_rate": 9.429632748354068e-08, |
|
"loss": 0.4152, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.9599647719349749, |
|
"grad_norm": 3.4174439907073975, |
|
"learning_rate": 8.208060517185146e-08, |
|
"loss": 0.4579, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.9629004440204029, |
|
"grad_norm": 4.021118640899658, |
|
"learning_rate": 7.070870993393209e-08, |
|
"loss": 0.4531, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.965836116105831, |
|
"grad_norm": 3.2883527278900146, |
|
"learning_rate": 6.01816093026586e-08, |
|
"loss": 0.4384, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.9687717881912591, |
|
"grad_norm": 3.5450029373168945, |
|
"learning_rate": 5.0500198934889665e-08, |
|
"loss": 0.4028, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9717074602766871, |
|
"grad_norm": 3.9169414043426514, |
|
"learning_rate": 4.16653025352598e-08, |
|
"loss": 0.4894, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.9746431323621152, |
|
"grad_norm": 4.527153015136719, |
|
"learning_rate": 3.367767178609982e-08, |
|
"loss": 0.4403, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.9775788044475432, |
|
"grad_norm": 4.728188514709473, |
|
"learning_rate": 2.6537986283485805e-08, |
|
"loss": 0.4123, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.9805144765329713, |
|
"grad_norm": 4.7194037437438965, |
|
"learning_rate": 2.024685347941202e-08, |
|
"loss": 0.4456, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.9834501486183993, |
|
"grad_norm": 4.34934139251709, |
|
"learning_rate": 1.4804808630112244e-08, |
|
"loss": 0.4383, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.9863858207038274, |
|
"grad_norm": 4.218658924102783, |
|
"learning_rate": 1.0212314750518426e-08, |
|
"loss": 0.4368, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.9893214927892554, |
|
"grad_norm": 4.213326454162598, |
|
"learning_rate": 6.469762574868866e-09, |
|
"loss": 0.4769, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.9922571648746835, |
|
"grad_norm": 3.9344699382781982, |
|
"learning_rate": 3.5774705234625783e-09, |
|
"loss": 0.4412, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.9951928369601115, |
|
"grad_norm": 3.622770071029663, |
|
"learning_rate": 1.5356846755654187e-09, |
|
"loss": 0.5059, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.9981285090455396, |
|
"grad_norm": 5.872491359710693, |
|
"learning_rate": 3.4457874847793063e-10, |
|
"loss": 0.4697, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.9998899122967965, |
|
"step": 3406, |
|
"total_flos": 3.2891568428128666e+18, |
|
"train_loss": 0.7668657067657847, |
|
"train_runtime": 24228.5265, |
|
"train_samples_per_second": 4.499, |
|
"train_steps_per_second": 0.141 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3406, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.2891568428128666e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|