{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2036,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004911591355599214,
"grad_norm": 4.0625,
"learning_rate": 4.977897838899804e-05,
"loss": 2.9953,
"step": 10
},
{
"epoch": 0.009823182711198428,
"grad_norm": 1.6328125,
"learning_rate": 4.953339882121808e-05,
"loss": 0.0935,
"step": 20
},
{
"epoch": 0.014734774066797643,
"grad_norm": 1.578125,
"learning_rate": 4.928781925343812e-05,
"loss": 0.073,
"step": 30
},
{
"epoch": 0.019646365422396856,
"grad_norm": 1.2890625,
"learning_rate": 4.904223968565815e-05,
"loss": 0.0679,
"step": 40
},
{
"epoch": 0.02455795677799607,
"grad_norm": 1.5078125,
"learning_rate": 4.87966601178782e-05,
"loss": 0.055,
"step": 50
},
{
"epoch": 0.029469548133595286,
"grad_norm": 1.234375,
"learning_rate": 4.855108055009823e-05,
"loss": 0.0542,
"step": 60
},
{
"epoch": 0.0343811394891945,
"grad_norm": 0.984375,
"learning_rate": 4.830550098231827e-05,
"loss": 0.0458,
"step": 70
},
{
"epoch": 0.03929273084479371,
"grad_norm": 0.95703125,
"learning_rate": 4.805992141453831e-05,
"loss": 0.0485,
"step": 80
},
{
"epoch": 0.04420432220039293,
"grad_norm": 1.21875,
"learning_rate": 4.781434184675835e-05,
"loss": 0.0563,
"step": 90
},
{
"epoch": 0.04911591355599214,
"grad_norm": 0.85546875,
"learning_rate": 4.756876227897839e-05,
"loss": 0.0525,
"step": 100
},
{
"epoch": 0.054027504911591355,
"grad_norm": 1.09375,
"learning_rate": 4.732318271119843e-05,
"loss": 0.0501,
"step": 110
},
{
"epoch": 0.05893909626719057,
"grad_norm": 0.94921875,
"learning_rate": 4.7077603143418466e-05,
"loss": 0.0466,
"step": 120
},
{
"epoch": 0.06385068762278978,
"grad_norm": 0.90625,
"learning_rate": 4.683202357563851e-05,
"loss": 0.0472,
"step": 130
},
{
"epoch": 0.068762278978389,
"grad_norm": 1.1015625,
"learning_rate": 4.658644400785855e-05,
"loss": 0.0432,
"step": 140
},
{
"epoch": 0.07367387033398821,
"grad_norm": 1.1015625,
"learning_rate": 4.634086444007859e-05,
"loss": 0.0497,
"step": 150
},
{
"epoch": 0.07858546168958742,
"grad_norm": 1.3125,
"learning_rate": 4.609528487229863e-05,
"loss": 0.0406,
"step": 160
},
{
"epoch": 0.08349705304518663,
"grad_norm": 1.1015625,
"learning_rate": 4.584970530451866e-05,
"loss": 0.048,
"step": 170
},
{
"epoch": 0.08840864440078586,
"grad_norm": 0.875,
"learning_rate": 4.560412573673871e-05,
"loss": 0.0437,
"step": 180
},
{
"epoch": 0.09332023575638507,
"grad_norm": 0.8515625,
"learning_rate": 4.535854616895874e-05,
"loss": 0.0478,
"step": 190
},
{
"epoch": 0.09823182711198428,
"grad_norm": 0.8359375,
"learning_rate": 4.511296660117879e-05,
"loss": 0.0373,
"step": 200
},
{
"epoch": 0.1031434184675835,
"grad_norm": 0.6171875,
"learning_rate": 4.486738703339882e-05,
"loss": 0.0446,
"step": 210
},
{
"epoch": 0.10805500982318271,
"grad_norm": 0.81640625,
"learning_rate": 4.462180746561886e-05,
"loss": 0.0414,
"step": 220
},
{
"epoch": 0.11296660117878192,
"grad_norm": 0.70703125,
"learning_rate": 4.43762278978389e-05,
"loss": 0.0429,
"step": 230
},
{
"epoch": 0.11787819253438114,
"grad_norm": 1.046875,
"learning_rate": 4.413064833005894e-05,
"loss": 0.0386,
"step": 240
},
{
"epoch": 0.12278978388998035,
"grad_norm": 1.015625,
"learning_rate": 4.388506876227898e-05,
"loss": 0.0404,
"step": 250
},
{
"epoch": 0.12770137524557956,
"grad_norm": 1.0390625,
"learning_rate": 4.3639489194499023e-05,
"loss": 0.037,
"step": 260
},
{
"epoch": 0.13261296660117877,
"grad_norm": 0.9140625,
"learning_rate": 4.339390962671906e-05,
"loss": 0.0389,
"step": 270
},
{
"epoch": 0.137524557956778,
"grad_norm": 0.97265625,
"learning_rate": 4.31483300589391e-05,
"loss": 0.0416,
"step": 280
},
{
"epoch": 0.14243614931237722,
"grad_norm": 1.7421875,
"learning_rate": 4.290275049115914e-05,
"loss": 0.0377,
"step": 290
},
{
"epoch": 0.14734774066797643,
"grad_norm": 0.6015625,
"learning_rate": 4.265717092337918e-05,
"loss": 0.0327,
"step": 300
},
{
"epoch": 0.15225933202357564,
"grad_norm": 1.015625,
"learning_rate": 4.241159135559922e-05,
"loss": 0.0377,
"step": 310
},
{
"epoch": 0.15717092337917485,
"grad_norm": 0.87109375,
"learning_rate": 4.216601178781925e-05,
"loss": 0.038,
"step": 320
},
{
"epoch": 0.16208251473477406,
"grad_norm": 0.65625,
"learning_rate": 4.19204322200393e-05,
"loss": 0.0381,
"step": 330
},
{
"epoch": 0.16699410609037327,
"grad_norm": 0.6953125,
"learning_rate": 4.167485265225933e-05,
"loss": 0.0331,
"step": 340
},
{
"epoch": 0.1719056974459725,
"grad_norm": 0.76171875,
"learning_rate": 4.142927308447937e-05,
"loss": 0.0374,
"step": 350
},
{
"epoch": 0.17681728880157171,
"grad_norm": 0.90234375,
"learning_rate": 4.118369351669941e-05,
"loss": 0.0389,
"step": 360
},
{
"epoch": 0.18172888015717092,
"grad_norm": 1.1015625,
"learning_rate": 4.093811394891945e-05,
"loss": 0.0443,
"step": 370
},
{
"epoch": 0.18664047151277013,
"grad_norm": 0.6484375,
"learning_rate": 4.069253438113949e-05,
"loss": 0.0374,
"step": 380
},
{
"epoch": 0.19155206286836934,
"grad_norm": 0.47265625,
"learning_rate": 4.044695481335953e-05,
"loss": 0.0232,
"step": 390
},
{
"epoch": 0.19646365422396855,
"grad_norm": 0.92578125,
"learning_rate": 4.020137524557957e-05,
"loss": 0.043,
"step": 400
},
{
"epoch": 0.2013752455795678,
"grad_norm": 0.8125,
"learning_rate": 3.995579567779961e-05,
"loss": 0.0344,
"step": 410
},
{
"epoch": 0.206286836935167,
"grad_norm": 0.828125,
"learning_rate": 3.971021611001965e-05,
"loss": 0.0271,
"step": 420
},
{
"epoch": 0.2111984282907662,
"grad_norm": 0.953125,
"learning_rate": 3.946463654223969e-05,
"loss": 0.0378,
"step": 430
},
{
"epoch": 0.21611001964636542,
"grad_norm": 1.0,
"learning_rate": 3.921905697445973e-05,
"loss": 0.0356,
"step": 440
},
{
"epoch": 0.22102161100196463,
"grad_norm": 0.6953125,
"learning_rate": 3.897347740667976e-05,
"loss": 0.0336,
"step": 450
},
{
"epoch": 0.22593320235756384,
"grad_norm": 0.89453125,
"learning_rate": 3.872789783889981e-05,
"loss": 0.0334,
"step": 460
},
{
"epoch": 0.23084479371316308,
"grad_norm": 0.5390625,
"learning_rate": 3.848231827111984e-05,
"loss": 0.0316,
"step": 470
},
{
"epoch": 0.2357563850687623,
"grad_norm": 0.7421875,
"learning_rate": 3.823673870333989e-05,
"loss": 0.037,
"step": 480
},
{
"epoch": 0.2406679764243615,
"grad_norm": 0.65234375,
"learning_rate": 3.799115913555992e-05,
"loss": 0.0336,
"step": 490
},
{
"epoch": 0.2455795677799607,
"grad_norm": 0.67578125,
"learning_rate": 3.774557956777996e-05,
"loss": 0.0316,
"step": 500
},
{
"epoch": 0.2504911591355599,
"grad_norm": 0.89453125,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0287,
"step": 510
},
{
"epoch": 0.2554027504911591,
"grad_norm": 0.58984375,
"learning_rate": 3.725442043222004e-05,
"loss": 0.0353,
"step": 520
},
{
"epoch": 0.26031434184675833,
"grad_norm": 0.828125,
"learning_rate": 3.7008840864440084e-05,
"loss": 0.0323,
"step": 530
},
{
"epoch": 0.26522593320235754,
"grad_norm": 0.640625,
"learning_rate": 3.676326129666012e-05,
"loss": 0.0372,
"step": 540
},
{
"epoch": 0.27013752455795675,
"grad_norm": 0.8671875,
"learning_rate": 3.651768172888016e-05,
"loss": 0.0267,
"step": 550
},
{
"epoch": 0.275049115913556,
"grad_norm": 1.0859375,
"learning_rate": 3.62721021611002e-05,
"loss": 0.0304,
"step": 560
},
{
"epoch": 0.27996070726915523,
"grad_norm": 0.5859375,
"learning_rate": 3.602652259332024e-05,
"loss": 0.0326,
"step": 570
},
{
"epoch": 0.28487229862475444,
"grad_norm": 0.62890625,
"learning_rate": 3.578094302554028e-05,
"loss": 0.0403,
"step": 580
},
{
"epoch": 0.28978388998035365,
"grad_norm": 0.8515625,
"learning_rate": 3.553536345776032e-05,
"loss": 0.0306,
"step": 590
},
{
"epoch": 0.29469548133595286,
"grad_norm": 0.78515625,
"learning_rate": 3.528978388998035e-05,
"loss": 0.028,
"step": 600
},
{
"epoch": 0.29960707269155207,
"grad_norm": 0.578125,
"learning_rate": 3.50442043222004e-05,
"loss": 0.029,
"step": 610
},
{
"epoch": 0.3045186640471513,
"grad_norm": 0.6875,
"learning_rate": 3.479862475442043e-05,
"loss": 0.0299,
"step": 620
},
{
"epoch": 0.3094302554027505,
"grad_norm": 0.94140625,
"learning_rate": 3.455304518664047e-05,
"loss": 0.0293,
"step": 630
},
{
"epoch": 0.3143418467583497,
"grad_norm": 0.84375,
"learning_rate": 3.4307465618860514e-05,
"loss": 0.0288,
"step": 640
},
{
"epoch": 0.3192534381139489,
"grad_norm": 0.462890625,
"learning_rate": 3.406188605108055e-05,
"loss": 0.0272,
"step": 650
},
{
"epoch": 0.3241650294695481,
"grad_norm": 1.484375,
"learning_rate": 3.3816306483300594e-05,
"loss": 0.0392,
"step": 660
},
{
"epoch": 0.3290766208251473,
"grad_norm": 0.6640625,
"learning_rate": 3.357072691552063e-05,
"loss": 0.032,
"step": 670
},
{
"epoch": 0.33398821218074654,
"grad_norm": 1.109375,
"learning_rate": 3.332514734774067e-05,
"loss": 0.0388,
"step": 680
},
{
"epoch": 0.3388998035363458,
"grad_norm": 1.390625,
"learning_rate": 3.307956777996071e-05,
"loss": 0.03,
"step": 690
},
{
"epoch": 0.343811394891945,
"grad_norm": 0.80859375,
"learning_rate": 3.283398821218075e-05,
"loss": 0.0316,
"step": 700
},
{
"epoch": 0.3487229862475442,
"grad_norm": 0.6796875,
"learning_rate": 3.258840864440079e-05,
"loss": 0.037,
"step": 710
},
{
"epoch": 0.35363457760314343,
"grad_norm": 0.78125,
"learning_rate": 3.234282907662083e-05,
"loss": 0.0302,
"step": 720
},
{
"epoch": 0.35854616895874264,
"grad_norm": 0.5546875,
"learning_rate": 3.209724950884086e-05,
"loss": 0.0294,
"step": 730
},
{
"epoch": 0.36345776031434185,
"grad_norm": 0.60546875,
"learning_rate": 3.185166994106091e-05,
"loss": 0.0303,
"step": 740
},
{
"epoch": 0.36836935166994106,
"grad_norm": 1.125,
"learning_rate": 3.160609037328094e-05,
"loss": 0.0304,
"step": 750
},
{
"epoch": 0.37328094302554027,
"grad_norm": 0.609375,
"learning_rate": 3.1360510805500984e-05,
"loss": 0.0303,
"step": 760
},
{
"epoch": 0.3781925343811395,
"grad_norm": 0.9609375,
"learning_rate": 3.1114931237721024e-05,
"loss": 0.0323,
"step": 770
},
{
"epoch": 0.3831041257367387,
"grad_norm": 0.796875,
"learning_rate": 3.086935166994106e-05,
"loss": 0.0324,
"step": 780
},
{
"epoch": 0.3880157170923379,
"grad_norm": 0.7421875,
"learning_rate": 3.0623772102161104e-05,
"loss": 0.0318,
"step": 790
},
{
"epoch": 0.3929273084479371,
"grad_norm": 0.6484375,
"learning_rate": 3.0378192534381138e-05,
"loss": 0.0295,
"step": 800
},
{
"epoch": 0.39783889980353637,
"grad_norm": 0.6484375,
"learning_rate": 3.013261296660118e-05,
"loss": 0.0267,
"step": 810
},
{
"epoch": 0.4027504911591356,
"grad_norm": 1.0234375,
"learning_rate": 2.988703339882122e-05,
"loss": 0.0324,
"step": 820
},
{
"epoch": 0.4076620825147348,
"grad_norm": 0.81640625,
"learning_rate": 2.964145383104126e-05,
"loss": 0.027,
"step": 830
},
{
"epoch": 0.412573673870334,
"grad_norm": 0.609375,
"learning_rate": 2.9395874263261296e-05,
"loss": 0.0294,
"step": 840
},
{
"epoch": 0.4174852652259332,
"grad_norm": 0.625,
"learning_rate": 2.915029469548134e-05,
"loss": 0.0265,
"step": 850
},
{
"epoch": 0.4223968565815324,
"grad_norm": 0.6640625,
"learning_rate": 2.8904715127701376e-05,
"loss": 0.027,
"step": 860
},
{
"epoch": 0.42730844793713163,
"grad_norm": 0.4921875,
"learning_rate": 2.865913555992142e-05,
"loss": 0.0308,
"step": 870
},
{
"epoch": 0.43222003929273084,
"grad_norm": 0.57421875,
"learning_rate": 2.8413555992141457e-05,
"loss": 0.0312,
"step": 880
},
{
"epoch": 0.43713163064833005,
"grad_norm": 0.8984375,
"learning_rate": 2.816797642436149e-05,
"loss": 0.0297,
"step": 890
},
{
"epoch": 0.44204322200392926,
"grad_norm": 0.5703125,
"learning_rate": 2.7922396856581534e-05,
"loss": 0.0308,
"step": 900
},
{
"epoch": 0.44695481335952847,
"grad_norm": 0.8203125,
"learning_rate": 2.767681728880157e-05,
"loss": 0.0307,
"step": 910
},
{
"epoch": 0.4518664047151277,
"grad_norm": 0.6015625,
"learning_rate": 2.7431237721021615e-05,
"loss": 0.031,
"step": 920
},
{
"epoch": 0.4567779960707269,
"grad_norm": 0.609375,
"learning_rate": 2.718565815324165e-05,
"loss": 0.0388,
"step": 930
},
{
"epoch": 0.46168958742632615,
"grad_norm": 0.578125,
"learning_rate": 2.6940078585461692e-05,
"loss": 0.0293,
"step": 940
},
{
"epoch": 0.46660117878192536,
"grad_norm": 0.90234375,
"learning_rate": 2.669449901768173e-05,
"loss": 0.0339,
"step": 950
},
{
"epoch": 0.4715127701375246,
"grad_norm": 0.5625,
"learning_rate": 2.6448919449901772e-05,
"loss": 0.0322,
"step": 960
},
{
"epoch": 0.4764243614931238,
"grad_norm": 0.5234375,
"learning_rate": 2.620333988212181e-05,
"loss": 0.0316,
"step": 970
},
{
"epoch": 0.481335952848723,
"grad_norm": 0.7734375,
"learning_rate": 2.595776031434185e-05,
"loss": 0.0329,
"step": 980
},
{
"epoch": 0.4862475442043222,
"grad_norm": 0.58984375,
"learning_rate": 2.5712180746561886e-05,
"loss": 0.0251,
"step": 990
},
{
"epoch": 0.4911591355599214,
"grad_norm": 0.9921875,
"learning_rate": 2.546660117878193e-05,
"loss": 0.0256,
"step": 1000
},
{
"epoch": 0.4960707269155206,
"grad_norm": 1.1015625,
"learning_rate": 2.5221021611001967e-05,
"loss": 0.0294,
"step": 1010
},
{
"epoch": 0.5009823182711198,
"grad_norm": 0.6484375,
"learning_rate": 2.4975442043222004e-05,
"loss": 0.0251,
"step": 1020
},
{
"epoch": 0.5058939096267191,
"grad_norm": 0.55859375,
"learning_rate": 2.4729862475442044e-05,
"loss": 0.0275,
"step": 1030
},
{
"epoch": 0.5108055009823183,
"grad_norm": 0.5546875,
"learning_rate": 2.4484282907662084e-05,
"loss": 0.0261,
"step": 1040
},
{
"epoch": 0.5157170923379175,
"grad_norm": 0.72265625,
"learning_rate": 2.4238703339882125e-05,
"loss": 0.0292,
"step": 1050
},
{
"epoch": 0.5206286836935167,
"grad_norm": 0.640625,
"learning_rate": 2.3993123772102165e-05,
"loss": 0.0252,
"step": 1060
},
{
"epoch": 0.5255402750491159,
"grad_norm": 0.671875,
"learning_rate": 2.37475442043222e-05,
"loss": 0.0305,
"step": 1070
},
{
"epoch": 0.5304518664047151,
"grad_norm": 0.74609375,
"learning_rate": 2.350196463654224e-05,
"loss": 0.029,
"step": 1080
},
{
"epoch": 0.5353634577603144,
"grad_norm": 0.62109375,
"learning_rate": 2.325638506876228e-05,
"loss": 0.0306,
"step": 1090
},
{
"epoch": 0.5402750491159135,
"grad_norm": 0.5078125,
"learning_rate": 2.301080550098232e-05,
"loss": 0.0295,
"step": 1100
},
{
"epoch": 0.5451866404715128,
"grad_norm": 0.55859375,
"learning_rate": 2.276522593320236e-05,
"loss": 0.0244,
"step": 1110
},
{
"epoch": 0.550098231827112,
"grad_norm": 0.7890625,
"learning_rate": 2.2519646365422397e-05,
"loss": 0.0261,
"step": 1120
},
{
"epoch": 0.5550098231827112,
"grad_norm": 0.59375,
"learning_rate": 2.2274066797642437e-05,
"loss": 0.0227,
"step": 1130
},
{
"epoch": 0.5599214145383105,
"grad_norm": 0.44140625,
"learning_rate": 2.2028487229862477e-05,
"loss": 0.0244,
"step": 1140
},
{
"epoch": 0.5648330058939096,
"grad_norm": 0.63671875,
"learning_rate": 2.1782907662082517e-05,
"loss": 0.0235,
"step": 1150
},
{
"epoch": 0.5697445972495089,
"grad_norm": 0.60546875,
"learning_rate": 2.1537328094302554e-05,
"loss": 0.0297,
"step": 1160
},
{
"epoch": 0.574656188605108,
"grad_norm": 0.70703125,
"learning_rate": 2.1291748526522595e-05,
"loss": 0.0267,
"step": 1170
},
{
"epoch": 0.5795677799607073,
"grad_norm": 1.03125,
"learning_rate": 2.1046168958742635e-05,
"loss": 0.0276,
"step": 1180
},
{
"epoch": 0.5844793713163065,
"grad_norm": 0.5390625,
"learning_rate": 2.0800589390962675e-05,
"loss": 0.0281,
"step": 1190
},
{
"epoch": 0.5893909626719057,
"grad_norm": 0.6796875,
"learning_rate": 2.0555009823182712e-05,
"loss": 0.0306,
"step": 1200
},
{
"epoch": 0.5943025540275049,
"grad_norm": 0.46875,
"learning_rate": 2.030943025540275e-05,
"loss": 0.0253,
"step": 1210
},
{
"epoch": 0.5992141453831041,
"grad_norm": 0.625,
"learning_rate": 2.006385068762279e-05,
"loss": 0.0258,
"step": 1220
},
{
"epoch": 0.6041257367387033,
"grad_norm": 0.67578125,
"learning_rate": 1.981827111984283e-05,
"loss": 0.0275,
"step": 1230
},
{
"epoch": 0.6090373280943026,
"grad_norm": 0.78515625,
"learning_rate": 1.957269155206287e-05,
"loss": 0.0275,
"step": 1240
},
{
"epoch": 0.6139489194499018,
"grad_norm": 0.8125,
"learning_rate": 1.932711198428291e-05,
"loss": 0.0304,
"step": 1250
},
{
"epoch": 0.618860510805501,
"grad_norm": 0.703125,
"learning_rate": 1.9081532416502947e-05,
"loss": 0.0234,
"step": 1260
},
{
"epoch": 0.6237721021611002,
"grad_norm": 0.77734375,
"learning_rate": 1.8835952848722987e-05,
"loss": 0.0315,
"step": 1270
},
{
"epoch": 0.6286836935166994,
"grad_norm": 0.82421875,
"learning_rate": 1.8590373280943028e-05,
"loss": 0.0293,
"step": 1280
},
{
"epoch": 0.6335952848722987,
"grad_norm": 0.4609375,
"learning_rate": 1.8344793713163068e-05,
"loss": 0.0261,
"step": 1290
},
{
"epoch": 0.6385068762278978,
"grad_norm": 0.85546875,
"learning_rate": 1.8099214145383105e-05,
"loss": 0.0264,
"step": 1300
},
{
"epoch": 0.6434184675834971,
"grad_norm": 0.73828125,
"learning_rate": 1.7853634577603145e-05,
"loss": 0.0283,
"step": 1310
},
{
"epoch": 0.6483300589390962,
"grad_norm": 1.0625,
"learning_rate": 1.7608055009823182e-05,
"loss": 0.0244,
"step": 1320
},
{
"epoch": 0.6532416502946955,
"grad_norm": 0.8203125,
"learning_rate": 1.7362475442043222e-05,
"loss": 0.0249,
"step": 1330
},
{
"epoch": 0.6581532416502947,
"grad_norm": 0.765625,
"learning_rate": 1.7116895874263263e-05,
"loss": 0.0246,
"step": 1340
},
{
"epoch": 0.6630648330058939,
"grad_norm": 0.77734375,
"learning_rate": 1.68713163064833e-05,
"loss": 0.0268,
"step": 1350
},
{
"epoch": 0.6679764243614931,
"grad_norm": 0.5390625,
"learning_rate": 1.662573673870334e-05,
"loss": 0.025,
"step": 1360
},
{
"epoch": 0.6728880157170923,
"grad_norm": 0.82421875,
"learning_rate": 1.638015717092338e-05,
"loss": 0.0209,
"step": 1370
},
{
"epoch": 0.6777996070726916,
"grad_norm": 0.5546875,
"learning_rate": 1.613457760314342e-05,
"loss": 0.0264,
"step": 1380
},
{
"epoch": 0.6827111984282908,
"grad_norm": 1.0703125,
"learning_rate": 1.5888998035363457e-05,
"loss": 0.0227,
"step": 1390
},
{
"epoch": 0.68762278978389,
"grad_norm": 0.66015625,
"learning_rate": 1.5643418467583497e-05,
"loss": 0.0313,
"step": 1400
},
{
"epoch": 0.6925343811394892,
"grad_norm": 0.7578125,
"learning_rate": 1.5397838899803538e-05,
"loss": 0.0242,
"step": 1410
},
{
"epoch": 0.6974459724950884,
"grad_norm": 0.6796875,
"learning_rate": 1.5152259332023578e-05,
"loss": 0.0274,
"step": 1420
},
{
"epoch": 0.7023575638506876,
"grad_norm": 0.80078125,
"learning_rate": 1.4906679764243617e-05,
"loss": 0.0235,
"step": 1430
},
{
"epoch": 0.7072691552062869,
"grad_norm": 0.66015625,
"learning_rate": 1.4661100196463657e-05,
"loss": 0.0259,
"step": 1440
},
{
"epoch": 0.712180746561886,
"grad_norm": 0.453125,
"learning_rate": 1.4415520628683694e-05,
"loss": 0.0246,
"step": 1450
},
{
"epoch": 0.7170923379174853,
"grad_norm": 0.71484375,
"learning_rate": 1.4169941060903732e-05,
"loss": 0.0275,
"step": 1460
},
{
"epoch": 0.7220039292730844,
"grad_norm": 1.1015625,
"learning_rate": 1.3924361493123773e-05,
"loss": 0.0308,
"step": 1470
},
{
"epoch": 0.7269155206286837,
"grad_norm": 0.9765625,
"learning_rate": 1.3678781925343811e-05,
"loss": 0.029,
"step": 1480
},
{
"epoch": 0.731827111984283,
"grad_norm": 1.7578125,
"learning_rate": 1.3433202357563852e-05,
"loss": 0.0284,
"step": 1490
},
{
"epoch": 0.7367387033398821,
"grad_norm": 0.7265625,
"learning_rate": 1.318762278978389e-05,
"loss": 0.0253,
"step": 1500
},
{
"epoch": 0.7416502946954814,
"grad_norm": 0.8515625,
"learning_rate": 1.294204322200393e-05,
"loss": 0.028,
"step": 1510
},
{
"epoch": 0.7465618860510805,
"grad_norm": 0.53515625,
"learning_rate": 1.2696463654223969e-05,
"loss": 0.0292,
"step": 1520
},
{
"epoch": 0.7514734774066798,
"grad_norm": 0.62890625,
"learning_rate": 1.245088408644401e-05,
"loss": 0.0259,
"step": 1530
},
{
"epoch": 0.756385068762279,
"grad_norm": 0.6171875,
"learning_rate": 1.2205304518664048e-05,
"loss": 0.0263,
"step": 1540
},
{
"epoch": 0.7612966601178782,
"grad_norm": 0.72265625,
"learning_rate": 1.1959724950884087e-05,
"loss": 0.0279,
"step": 1550
},
{
"epoch": 0.7662082514734774,
"grad_norm": 0.70703125,
"learning_rate": 1.1714145383104127e-05,
"loss": 0.0276,
"step": 1560
},
{
"epoch": 0.7711198428290766,
"grad_norm": 0.98828125,
"learning_rate": 1.1468565815324165e-05,
"loss": 0.0314,
"step": 1570
},
{
"epoch": 0.7760314341846758,
"grad_norm": 0.6640625,
"learning_rate": 1.1222986247544206e-05,
"loss": 0.0283,
"step": 1580
},
{
"epoch": 0.7809430255402751,
"grad_norm": 0.703125,
"learning_rate": 1.0977406679764244e-05,
"loss": 0.049,
"step": 1590
},
{
"epoch": 0.7858546168958742,
"grad_norm": 0.73828125,
"learning_rate": 1.0731827111984283e-05,
"loss": 0.0294,
"step": 1600
},
{
"epoch": 0.7907662082514735,
"grad_norm": 0.78125,
"learning_rate": 1.0486247544204323e-05,
"loss": 0.0257,
"step": 1610
},
{
"epoch": 0.7956777996070727,
"grad_norm": 0.69140625,
"learning_rate": 1.0240667976424362e-05,
"loss": 0.0316,
"step": 1620
},
{
"epoch": 0.8005893909626719,
"grad_norm": 1.0234375,
"learning_rate": 9.995088408644402e-06,
"loss": 0.033,
"step": 1630
},
{
"epoch": 0.8055009823182712,
"grad_norm": 0.77734375,
"learning_rate": 9.74950884086444e-06,
"loss": 0.0263,
"step": 1640
},
{
"epoch": 0.8104125736738703,
"grad_norm": 0.69140625,
"learning_rate": 9.503929273084481e-06,
"loss": 0.0248,
"step": 1650
},
{
"epoch": 0.8153241650294696,
"grad_norm": 0.80859375,
"learning_rate": 9.258349705304518e-06,
"loss": 0.0256,
"step": 1660
},
{
"epoch": 0.8202357563850687,
"grad_norm": 0.498046875,
"learning_rate": 9.012770137524558e-06,
"loss": 0.0258,
"step": 1670
},
{
"epoch": 0.825147347740668,
"grad_norm": 0.64453125,
"learning_rate": 8.767190569744597e-06,
"loss": 0.0271,
"step": 1680
},
{
"epoch": 0.8300589390962672,
"grad_norm": 0.7890625,
"learning_rate": 8.521611001964637e-06,
"loss": 0.0334,
"step": 1690
},
{
"epoch": 0.8349705304518664,
"grad_norm": 1.0078125,
"learning_rate": 8.276031434184677e-06,
"loss": 0.0245,
"step": 1700
},
{
"epoch": 0.8398821218074656,
"grad_norm": 0.63671875,
"learning_rate": 8.030451866404716e-06,
"loss": 0.0262,
"step": 1710
},
{
"epoch": 0.8447937131630648,
"grad_norm": 0.5390625,
"learning_rate": 7.784872298624756e-06,
"loss": 0.027,
"step": 1720
},
{
"epoch": 0.849705304518664,
"grad_norm": 0.6953125,
"learning_rate": 7.539292730844794e-06,
"loss": 0.0253,
"step": 1730
},
{
"epoch": 0.8546168958742633,
"grad_norm": 0.62109375,
"learning_rate": 7.293713163064833e-06,
"loss": 0.0247,
"step": 1740
},
{
"epoch": 0.8595284872298625,
"grad_norm": 1.140625,
"learning_rate": 7.048133595284873e-06,
"loss": 0.0281,
"step": 1750
},
{
"epoch": 0.8644400785854617,
"grad_norm": 0.59375,
"learning_rate": 6.802554027504912e-06,
"loss": 0.0269,
"step": 1760
},
{
"epoch": 0.869351669941061,
"grad_norm": 1.2421875,
"learning_rate": 6.556974459724952e-06,
"loss": 0.0255,
"step": 1770
},
{
"epoch": 0.8742632612966601,
"grad_norm": 0.7578125,
"learning_rate": 6.311394891944991e-06,
"loss": 0.0253,
"step": 1780
},
{
"epoch": 0.8791748526522594,
"grad_norm": 0.73828125,
"learning_rate": 6.06581532416503e-06,
"loss": 0.024,
"step": 1790
},
{
"epoch": 0.8840864440078585,
"grad_norm": 0.60546875,
"learning_rate": 5.820235756385069e-06,
"loss": 0.03,
"step": 1800
},
{
"epoch": 0.8889980353634578,
"grad_norm": 0.5546875,
"learning_rate": 5.5746561886051085e-06,
"loss": 0.0246,
"step": 1810
},
{
"epoch": 0.8939096267190569,
"grad_norm": 0.8671875,
"learning_rate": 5.329076620825148e-06,
"loss": 0.0249,
"step": 1820
},
{
"epoch": 0.8988212180746562,
"grad_norm": 0.55859375,
"learning_rate": 5.0834970530451866e-06,
"loss": 0.0267,
"step": 1830
},
{
"epoch": 0.9037328094302554,
"grad_norm": 0.5703125,
"learning_rate": 4.837917485265226e-06,
"loss": 0.0232,
"step": 1840
},
{
"epoch": 0.9086444007858546,
"grad_norm": 0.6953125,
"learning_rate": 4.5923379174852655e-06,
"loss": 0.0242,
"step": 1850
},
{
"epoch": 0.9135559921414538,
"grad_norm": 0.71484375,
"learning_rate": 4.346758349705305e-06,
"loss": 0.0256,
"step": 1860
},
{
"epoch": 0.918467583497053,
"grad_norm": 1.125,
"learning_rate": 4.1011787819253435e-06,
"loss": 0.0318,
"step": 1870
},
{
"epoch": 0.9233791748526523,
"grad_norm": 0.69921875,
"learning_rate": 3.855599214145384e-06,
"loss": 0.0268,
"step": 1880
},
{
"epoch": 0.9282907662082515,
"grad_norm": 0.58203125,
"learning_rate": 3.6100196463654228e-06,
"loss": 0.0249,
"step": 1890
},
{
"epoch": 0.9332023575638507,
"grad_norm": 0.66796875,
"learning_rate": 3.364440078585462e-06,
"loss": 0.0306,
"step": 1900
},
{
"epoch": 0.9381139489194499,
"grad_norm": 0.59765625,
"learning_rate": 3.1188605108055012e-06,
"loss": 0.0301,
"step": 1910
},
{
"epoch": 0.9430255402750491,
"grad_norm": 0.65625,
"learning_rate": 2.8732809430255403e-06,
"loss": 0.0248,
"step": 1920
},
{
"epoch": 0.9479371316306483,
"grad_norm": 0.76953125,
"learning_rate": 2.6277013752455797e-06,
"loss": 0.0267,
"step": 1930
},
{
"epoch": 0.9528487229862476,
"grad_norm": 0.61328125,
"learning_rate": 2.382121807465619e-06,
"loss": 0.0296,
"step": 1940
},
{
"epoch": 0.9577603143418467,
"grad_norm": 0.59375,
"learning_rate": 2.136542239685658e-06,
"loss": 0.0264,
"step": 1950
},
{
"epoch": 0.962671905697446,
"grad_norm": 0.51953125,
"learning_rate": 1.8909626719056976e-06,
"loss": 0.024,
"step": 1960
},
{
"epoch": 0.9675834970530451,
"grad_norm": 0.56640625,
"learning_rate": 1.6453831041257368e-06,
"loss": 0.0315,
"step": 1970
},
{
"epoch": 0.9724950884086444,
"grad_norm": 0.859375,
"learning_rate": 1.3998035363457763e-06,
"loss": 0.024,
"step": 1980
},
{
"epoch": 0.9774066797642437,
"grad_norm": 0.8125,
"learning_rate": 1.1542239685658153e-06,
"loss": 0.0282,
"step": 1990
},
{
"epoch": 0.9823182711198428,
"grad_norm": 0.6328125,
"learning_rate": 9.086444007858547e-07,
"loss": 0.0231,
"step": 2000
},
{
"epoch": 0.9872298624754421,
"grad_norm": 0.75390625,
"learning_rate": 6.630648330058939e-07,
"loss": 0.0247,
"step": 2010
},
{
"epoch": 0.9921414538310412,
"grad_norm": 0.5546875,
"learning_rate": 4.174852652259332e-07,
"loss": 0.0233,
"step": 2020
},
{
"epoch": 0.9970530451866405,
"grad_norm": 0.85546875,
"learning_rate": 1.719056974459725e-07,
"loss": 0.0366,
"step": 2030
}
],
"logging_steps": 10,
"max_steps": 2036,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.9084808495824896e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}