|
{
  "best_global_step": 1800,
  "best_metric": 2.285778284072876,
  "best_model_checkpoint": "models/mini_stage2/checkpoint-1800",
  "epoch": 0.8933002481389578,
  "eval_steps": 300,
  "global_step": 1800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004962779156327543,
      "grad_norm": 7.886381149291992,
      "learning_rate": 1.3366336633663365e-06,
      "loss": 4.6656,
      "step": 10
    },
    {
      "epoch": 0.009925558312655087,
      "grad_norm": 5.6866021156311035,
      "learning_rate": 2.821782178217822e-06,
      "loss": 4.4733,
      "step": 20
    },
    {
      "epoch": 0.01488833746898263,
      "grad_norm": 4.5250091552734375,
      "learning_rate": 4.306930693069306e-06,
      "loss": 4.0093,
      "step": 30
    },
    {
      "epoch": 0.019851116625310174,
      "grad_norm": 3.3117995262145996,
      "learning_rate": 5.792079207920792e-06,
      "loss": 3.9259,
      "step": 40
    },
    {
      "epoch": 0.02481389578163772,
      "grad_norm": 3.5785279273986816,
      "learning_rate": 7.277227722772277e-06,
      "loss": 3.8315,
      "step": 50
    },
    {
      "epoch": 0.02977667493796526,
      "grad_norm": 5.32008695602417,
      "learning_rate": 8.762376237623762e-06,
      "loss": 3.673,
      "step": 60
    },
    {
      "epoch": 0.034739454094292806,
      "grad_norm": 4.776355266571045,
      "learning_rate": 1.0247524752475248e-05,
      "loss": 3.5076,
      "step": 70
    },
    {
      "epoch": 0.03970223325062035,
      "grad_norm": 4.523674488067627,
      "learning_rate": 1.1732673267326734e-05,
      "loss": 3.4416,
      "step": 80
    },
    {
      "epoch": 0.04466501240694789,
      "grad_norm": 4.444321155548096,
      "learning_rate": 1.3217821782178218e-05,
      "loss": 3.4362,
      "step": 90
    },
    {
      "epoch": 0.04962779156327544,
      "grad_norm": 4.364451885223389,
      "learning_rate": 1.4702970297029704e-05,
      "loss": 3.3934,
      "step": 100
    },
    {
      "epoch": 0.05459057071960298,
      "grad_norm": 5.098319053649902,
      "learning_rate": 1.618811881188119e-05,
      "loss": 3.3343,
      "step": 110
    },
    {
      "epoch": 0.05955334987593052,
      "grad_norm": 4.762153148651123,
      "learning_rate": 1.7673267326732672e-05,
      "loss": 3.3018,
      "step": 120
    },
    {
      "epoch": 0.06451612903225806,
      "grad_norm": 4.8034563064575195,
      "learning_rate": 1.915841584158416e-05,
      "loss": 3.2882,
      "step": 130
    },
    {
      "epoch": 0.06947890818858561,
      "grad_norm": 4.955429553985596,
      "learning_rate": 2.0643564356435643e-05,
      "loss": 3.3027,
      "step": 140
    },
    {
      "epoch": 0.07444168734491315,
      "grad_norm": 4.338839530944824,
      "learning_rate": 2.212871287128713e-05,
      "loss": 3.2177,
      "step": 150
    },
    {
      "epoch": 0.0794044665012407,
      "grad_norm": 4.854152202606201,
      "learning_rate": 2.3613861386138615e-05,
      "loss": 3.2708,
      "step": 160
    },
    {
      "epoch": 0.08436724565756824,
      "grad_norm": 4.855740070343018,
      "learning_rate": 2.5099009900990097e-05,
      "loss": 3.2645,
      "step": 170
    },
    {
      "epoch": 0.08933002481389578,
      "grad_norm": 5.82074499130249,
      "learning_rate": 2.6584158415841586e-05,
      "loss": 3.1939,
      "step": 180
    },
    {
      "epoch": 0.09429280397022333,
      "grad_norm": 5.284363746643066,
      "learning_rate": 2.8069306930693072e-05,
      "loss": 3.0575,
      "step": 190
    },
    {
      "epoch": 0.09925558312655088,
      "grad_norm": 5.663958549499512,
      "learning_rate": 2.9554455445544555e-05,
      "loss": 3.0799,
      "step": 200
    },
    {
      "epoch": 0.10421836228287841,
      "grad_norm": 5.67520809173584,
      "learning_rate": 2.9884169884169887e-05,
      "loss": 3.0824,
      "step": 210
    },
    {
      "epoch": 0.10918114143920596,
      "grad_norm": 5.432056903839111,
      "learning_rate": 2.9718698290126862e-05,
      "loss": 3.0693,
      "step": 220
    },
    {
      "epoch": 0.1141439205955335,
      "grad_norm": 5.387454032897949,
      "learning_rate": 2.9553226696083837e-05,
      "loss": 3.1014,
      "step": 230
    },
    {
      "epoch": 0.11910669975186104,
      "grad_norm": 5.515647888183594,
      "learning_rate": 2.9387755102040816e-05,
      "loss": 3.0458,
      "step": 240
    },
    {
      "epoch": 0.12406947890818859,
      "grad_norm": 5.962894916534424,
      "learning_rate": 2.9222283507997795e-05,
      "loss": 3.04,
      "step": 250
    },
    {
      "epoch": 0.12903225806451613,
      "grad_norm": 5.549206733703613,
      "learning_rate": 2.9056811913954774e-05,
      "loss": 3.0311,
      "step": 260
    },
    {
      "epoch": 0.13399503722084366,
      "grad_norm": 4.67368221282959,
      "learning_rate": 2.889134031991175e-05,
      "loss": 2.9778,
      "step": 270
    },
    {
      "epoch": 0.13895781637717122,
      "grad_norm": 5.445274829864502,
      "learning_rate": 2.8725868725868724e-05,
      "loss": 3.0701,
      "step": 280
    },
    {
      "epoch": 0.14392059553349876,
      "grad_norm": 5.643553256988525,
      "learning_rate": 2.8560397131825703e-05,
      "loss": 2.9039,
      "step": 290
    },
    {
      "epoch": 0.1488833746898263,
      "grad_norm": 5.453239440917969,
      "learning_rate": 2.8394925537782682e-05,
      "loss": 3.0449,
      "step": 300
    },
    {
      "epoch": 0.1488833746898263,
      "eval_loss": 2.5684616565704346,
      "eval_runtime": 119.1131,
      "eval_samples_per_second": 142.251,
      "eval_steps_per_second": 4.45,
      "step": 300
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 5.800920009613037,
      "learning_rate": 2.822945394373966e-05,
      "loss": 2.8896,
      "step": 310
    },
    {
      "epoch": 0.1588089330024814,
      "grad_norm": 4.8736724853515625,
      "learning_rate": 2.8063982349696636e-05,
      "loss": 3.0527,
      "step": 320
    },
    {
      "epoch": 0.16377171215880892,
      "grad_norm": 4.498941898345947,
      "learning_rate": 2.7898510755653615e-05,
      "loss": 3.0153,
      "step": 330
    },
    {
      "epoch": 0.1687344913151365,
      "grad_norm": 5.518968105316162,
      "learning_rate": 2.773303916161059e-05,
      "loss": 2.869,
      "step": 340
    },
    {
      "epoch": 0.17369727047146402,
      "grad_norm": 5.431845188140869,
      "learning_rate": 2.756756756756757e-05,
      "loss": 2.9678,
      "step": 350
    },
    {
      "epoch": 0.17866004962779156,
      "grad_norm": 5.1342973709106445,
      "learning_rate": 2.7402095973524544e-05,
      "loss": 2.9756,
      "step": 360
    },
    {
      "epoch": 0.18362282878411912,
      "grad_norm": 4.8498101234436035,
      "learning_rate": 2.7236624379481523e-05,
      "loss": 2.9348,
      "step": 370
    },
    {
      "epoch": 0.18858560794044665,
      "grad_norm": 5.223404407501221,
      "learning_rate": 2.70711527854385e-05,
      "loss": 2.9967,
      "step": 380
    },
    {
      "epoch": 0.1935483870967742,
      "grad_norm": 5.078680038452148,
      "learning_rate": 2.690568119139548e-05,
      "loss": 2.8953,
      "step": 390
    },
    {
      "epoch": 0.19851116625310175,
      "grad_norm": 4.336000442504883,
      "learning_rate": 2.6740209597352456e-05,
      "loss": 2.9546,
      "step": 400
    },
    {
      "epoch": 0.20347394540942929,
      "grad_norm": 5.194223880767822,
      "learning_rate": 2.657473800330943e-05,
      "loss": 2.9919,
      "step": 410
    },
    {
      "epoch": 0.20843672456575682,
      "grad_norm": 4.870718002319336,
      "learning_rate": 2.640926640926641e-05,
      "loss": 2.8487,
      "step": 420
    },
    {
      "epoch": 0.21339950372208435,
      "grad_norm": 5.526561260223389,
      "learning_rate": 2.624379481522339e-05,
      "loss": 2.7609,
      "step": 430
    },
    {
      "epoch": 0.21836228287841192,
      "grad_norm": 5.372407913208008,
      "learning_rate": 2.6078323221180364e-05,
      "loss": 2.9126,
      "step": 440
    },
    {
      "epoch": 0.22332506203473945,
      "grad_norm": 4.934298992156982,
      "learning_rate": 2.5912851627137343e-05,
      "loss": 2.8991,
      "step": 450
    },
    {
      "epoch": 0.228287841191067,
      "grad_norm": 4.916905403137207,
      "learning_rate": 2.574738003309432e-05,
      "loss": 2.9272,
      "step": 460
    },
    {
      "epoch": 0.23325062034739455,
      "grad_norm": 5.660480976104736,
      "learning_rate": 2.5581908439051297e-05,
      "loss": 2.9084,
      "step": 470
    },
    {
      "epoch": 0.23821339950372208,
      "grad_norm": 4.987634658813477,
      "learning_rate": 2.5416436845008272e-05,
      "loss": 2.7963,
      "step": 480
    },
    {
      "epoch": 0.24317617866004962,
      "grad_norm": 6.746623992919922,
      "learning_rate": 2.525096525096525e-05,
      "loss": 2.822,
      "step": 490
    },
    {
      "epoch": 0.24813895781637718,
      "grad_norm": 4.658477783203125,
      "learning_rate": 2.508549365692223e-05,
      "loss": 2.9376,
      "step": 500
    },
    {
      "epoch": 0.2531017369727047,
      "grad_norm": 5.2974162101745605,
      "learning_rate": 2.492002206287921e-05,
      "loss": 2.8969,
      "step": 510
    },
    {
      "epoch": 0.25806451612903225,
      "grad_norm": 5.682817459106445,
      "learning_rate": 2.4754550468836184e-05,
      "loss": 2.7745,
      "step": 520
    },
    {
      "epoch": 0.2630272952853598,
      "grad_norm": 5.85469388961792,
      "learning_rate": 2.458907887479316e-05,
      "loss": 2.8103,
      "step": 530
    },
    {
      "epoch": 0.2679900744416873,
      "grad_norm": 4.600194454193115,
      "learning_rate": 2.4423607280750138e-05,
      "loss": 2.8189,
      "step": 540
    },
    {
      "epoch": 0.2729528535980149,
      "grad_norm": 6.168639659881592,
      "learning_rate": 2.4258135686707117e-05,
      "loss": 2.8322,
      "step": 550
    },
    {
      "epoch": 0.27791563275434245,
      "grad_norm": 4.763863563537598,
      "learning_rate": 2.4092664092664092e-05,
      "loss": 2.7627,
      "step": 560
    },
    {
      "epoch": 0.28287841191067,
      "grad_norm": 5.0977091789245605,
      "learning_rate": 2.392719249862107e-05,
      "loss": 2.7796,
      "step": 570
    },
    {
      "epoch": 0.2878411910669975,
      "grad_norm": 4.786524772644043,
      "learning_rate": 2.376172090457805e-05,
      "loss": 2.8515,
      "step": 580
    },
    {
      "epoch": 0.29280397022332505,
      "grad_norm": 5.386171817779541,
      "learning_rate": 2.3596249310535025e-05,
      "loss": 2.8758,
      "step": 590
    },
    {
      "epoch": 0.2977667493796526,
      "grad_norm": 4.96185827255249,
      "learning_rate": 2.3430777716492e-05,
      "loss": 2.7963,
      "step": 600
    },
    {
      "epoch": 0.2977667493796526,
      "eval_loss": 2.4141688346862793,
      "eval_runtime": 112.2744,
      "eval_samples_per_second": 150.916,
      "eval_steps_per_second": 4.721,
      "step": 600
    },
    {
      "epoch": 0.3027295285359802,
      "grad_norm": 7.280220985412598,
      "learning_rate": 2.326530612244898e-05,
      "loss": 2.8259,
      "step": 610
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 5.671541690826416,
      "learning_rate": 2.3099834528405958e-05,
      "loss": 2.829,
      "step": 620
    },
    {
      "epoch": 0.31265508684863524,
      "grad_norm": 5.268543720245361,
      "learning_rate": 2.2934362934362936e-05,
      "loss": 2.7699,
      "step": 630
    },
    {
      "epoch": 0.3176178660049628,
      "grad_norm": 4.945257186889648,
      "learning_rate": 2.2768891340319915e-05,
      "loss": 2.7311,
      "step": 640
    },
    {
      "epoch": 0.3225806451612903,
      "grad_norm": 5.198265075683594,
      "learning_rate": 2.2603419746276887e-05,
      "loss": 2.735,
      "step": 650
    },
    {
      "epoch": 0.32754342431761785,
      "grad_norm": 4.624203205108643,
      "learning_rate": 2.2437948152233866e-05,
      "loss": 2.7306,
      "step": 660
    },
    {
      "epoch": 0.3325062034739454,
      "grad_norm": 5.143148899078369,
      "learning_rate": 2.2272476558190845e-05,
      "loss": 2.7467,
      "step": 670
    },
    {
      "epoch": 0.337468982630273,
      "grad_norm": 4.643360614776611,
      "learning_rate": 2.2107004964147823e-05,
      "loss": 2.7494,
      "step": 680
    },
    {
      "epoch": 0.3424317617866005,
      "grad_norm": 5.404439926147461,
      "learning_rate": 2.19415333701048e-05,
      "loss": 2.7386,
      "step": 690
    },
    {
      "epoch": 0.34739454094292804,
      "grad_norm": 4.590571403503418,
      "learning_rate": 2.1776061776061778e-05,
      "loss": 2.8513,
      "step": 700
    },
    {
      "epoch": 0.3523573200992556,
      "grad_norm": 5.185126781463623,
      "learning_rate": 2.1610590182018756e-05,
      "loss": 2.673,
      "step": 710
    },
    {
      "epoch": 0.3573200992555831,
      "grad_norm": 5.4743547439575195,
      "learning_rate": 2.144511858797573e-05,
      "loss": 2.8101,
      "step": 720
    },
    {
      "epoch": 0.36228287841191065,
      "grad_norm": 5.463687896728516,
      "learning_rate": 2.1279646993932707e-05,
      "loss": 2.7527,
      "step": 730
    },
    {
      "epoch": 0.36724565756823824,
      "grad_norm": 6.102997303009033,
      "learning_rate": 2.1114175399889686e-05,
      "loss": 2.7213,
      "step": 740
    },
    {
      "epoch": 0.37220843672456577,
      "grad_norm": 5.1998724937438965,
      "learning_rate": 2.0948703805846664e-05,
      "loss": 2.753,
      "step": 750
    },
    {
      "epoch": 0.3771712158808933,
      "grad_norm": 5.181605815887451,
      "learning_rate": 2.0783232211803643e-05,
      "loss": 2.8034,
      "step": 760
    },
    {
      "epoch": 0.38213399503722084,
      "grad_norm": 5.865420818328857,
      "learning_rate": 2.061776061776062e-05,
      "loss": 2.8288,
      "step": 770
    },
    {
      "epoch": 0.3870967741935484,
      "grad_norm": 4.965085029602051,
      "learning_rate": 2.0452289023717594e-05,
      "loss": 2.613,
      "step": 780
    },
    {
      "epoch": 0.3920595533498759,
      "grad_norm": 5.534601211547852,
      "learning_rate": 2.0286817429674573e-05,
      "loss": 2.7315,
      "step": 790
    },
    {
      "epoch": 0.3970223325062035,
      "grad_norm": 5.220632076263428,
      "learning_rate": 2.012134583563155e-05,
      "loss": 2.8077,
      "step": 800
    },
    {
      "epoch": 0.40198511166253104,
      "grad_norm": 5.059537410736084,
      "learning_rate": 1.9955874241588527e-05,
      "loss": 2.7442,
      "step": 810
    },
    {
      "epoch": 0.40694789081885857,
      "grad_norm": 5.325795650482178,
      "learning_rate": 1.9790402647545506e-05,
      "loss": 2.7351,
      "step": 820
    },
    {
      "epoch": 0.4119106699751861,
      "grad_norm": 8.455648422241211,
      "learning_rate": 1.9624931053502484e-05,
      "loss": 2.7643,
      "step": 830
    },
    {
      "epoch": 0.41687344913151364,
      "grad_norm": 5.344241619110107,
      "learning_rate": 1.945945945945946e-05,
      "loss": 2.8984,
      "step": 840
    },
    {
      "epoch": 0.4218362282878412,
      "grad_norm": 4.816617965698242,
      "learning_rate": 1.9293987865416435e-05,
      "loss": 2.7377,
      "step": 850
    },
    {
      "epoch": 0.4267990074441687,
      "grad_norm": 5.334592819213867,
      "learning_rate": 1.9128516271373414e-05,
      "loss": 2.7021,
      "step": 860
    },
    {
      "epoch": 0.4317617866004963,
      "grad_norm": 5.0712385177612305,
      "learning_rate": 1.8963044677330393e-05,
      "loss": 2.6756,
      "step": 870
    },
    {
      "epoch": 0.43672456575682383,
      "grad_norm": 5.225603103637695,
      "learning_rate": 1.879757308328737e-05,
      "loss": 2.7852,
      "step": 880
    },
    {
      "epoch": 0.44168734491315137,
      "grad_norm": 5.172723770141602,
      "learning_rate": 1.8632101489244347e-05,
      "loss": 2.7531,
      "step": 890
    },
    {
      "epoch": 0.4466501240694789,
      "grad_norm": 4.887333869934082,
      "learning_rate": 1.8466629895201322e-05,
      "loss": 2.6636,
      "step": 900
    },
    {
      "epoch": 0.4466501240694789,
      "eval_loss": 2.3456013202667236,
      "eval_runtime": 113.8402,
      "eval_samples_per_second": 148.84,
      "eval_steps_per_second": 4.656,
      "step": 900
    },
    {
      "epoch": 0.45161290322580644,
      "grad_norm": 5.249894142150879,
      "learning_rate": 1.83011583011583e-05,
      "loss": 2.7089,
      "step": 910
    },
    {
      "epoch": 0.456575682382134,
      "grad_norm": 4.984340667724609,
      "learning_rate": 1.813568670711528e-05,
      "loss": 2.8029,
      "step": 920
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 5.297105312347412,
      "learning_rate": 1.7970215113072255e-05,
      "loss": 2.721,
      "step": 930
    },
    {
      "epoch": 0.4665012406947891,
      "grad_norm": 5.074361801147461,
      "learning_rate": 1.7804743519029234e-05,
      "loss": 2.5606,
      "step": 940
    },
    {
      "epoch": 0.47146401985111663,
      "grad_norm": 5.639915943145752,
      "learning_rate": 1.7639271924986212e-05,
      "loss": 2.6397,
      "step": 950
    },
    {
      "epoch": 0.47642679900744417,
      "grad_norm": 5.445567607879639,
      "learning_rate": 1.7473800330943188e-05,
      "loss": 2.6563,
      "step": 960
    },
    {
      "epoch": 0.4813895781637717,
      "grad_norm": 4.462475299835205,
      "learning_rate": 1.7308328736900166e-05,
      "loss": 2.7163,
      "step": 970
    },
    {
      "epoch": 0.48635235732009924,
      "grad_norm": 5.833073139190674,
      "learning_rate": 1.7142857142857142e-05,
      "loss": 2.6225,
      "step": 980
    },
    {
      "epoch": 0.4913151364764268,
      "grad_norm": 5.428512096405029,
      "learning_rate": 1.697738554881412e-05,
      "loss": 2.645,
      "step": 990
    },
    {
      "epoch": 0.49627791563275436,
      "grad_norm": 4.768510818481445,
      "learning_rate": 1.68119139547711e-05,
      "loss": 2.6576,
      "step": 1000
    },
    {
      "epoch": 0.5012406947890818,
      "grad_norm": 4.692178726196289,
      "learning_rate": 1.6646442360728078e-05,
      "loss": 2.7019,
      "step": 1010
    },
    {
      "epoch": 0.5062034739454094,
      "grad_norm": 5.51165771484375,
      "learning_rate": 1.6480970766685053e-05,
      "loss": 2.7195,
      "step": 1020
    },
    {
      "epoch": 0.511166253101737,
      "grad_norm": 4.9710516929626465,
      "learning_rate": 1.631549917264203e-05,
      "loss": 2.7242,
      "step": 1030
    },
    {
      "epoch": 0.5161290322580645,
      "grad_norm": 5.166830062866211,
      "learning_rate": 1.6150027578599008e-05,
      "loss": 2.6729,
      "step": 1040
    },
    {
      "epoch": 0.5210918114143921,
      "grad_norm": 4.945868015289307,
      "learning_rate": 1.5984555984555986e-05,
      "loss": 2.7637,
      "step": 1050
    },
    {
      "epoch": 0.5260545905707196,
      "grad_norm": 5.7991943359375,
      "learning_rate": 1.581908439051296e-05,
      "loss": 2.677,
      "step": 1060
    },
    {
      "epoch": 0.5310173697270472,
      "grad_norm": 5.255971431732178,
      "learning_rate": 1.565361279646994e-05,
      "loss": 2.7018,
      "step": 1070
    },
    {
      "epoch": 0.5359801488833746,
      "grad_norm": 5.910732746124268,
      "learning_rate": 1.548814120242692e-05,
      "loss": 2.6469,
      "step": 1080
    },
    {
      "epoch": 0.5409429280397022,
      "grad_norm": 4.5612616539001465,
      "learning_rate": 1.5322669608383895e-05,
      "loss": 2.7186,
      "step": 1090
    },
    {
      "epoch": 0.5459057071960298,
      "grad_norm": 5.111081600189209,
      "learning_rate": 1.515719801434087e-05,
      "loss": 2.6728,
      "step": 1100
    },
    {
      "epoch": 0.5508684863523573,
      "grad_norm": 5.450465202331543,
      "learning_rate": 1.4991726420297849e-05,
      "loss": 2.6694,
      "step": 1110
    },
    {
      "epoch": 0.5558312655086849,
      "grad_norm": 4.850940704345703,
      "learning_rate": 1.4826254826254827e-05,
      "loss": 2.7839,
      "step": 1120
    },
    {
      "epoch": 0.5607940446650124,
      "grad_norm": 5.036434173583984,
      "learning_rate": 1.4660783232211803e-05,
      "loss": 2.5834,
      "step": 1130
    },
    {
      "epoch": 0.56575682382134,
      "grad_norm": 40.256370544433594,
      "learning_rate": 1.4495311638168781e-05,
      "loss": 2.6905,
      "step": 1140
    },
    {
      "epoch": 0.5707196029776674,
      "grad_norm": 5.939204692840576,
      "learning_rate": 1.4329840044125759e-05,
      "loss": 2.7223,
      "step": 1150
    },
    {
      "epoch": 0.575682382133995,
      "grad_norm": 5.288106441497803,
      "learning_rate": 1.4164368450082736e-05,
      "loss": 2.7235,
      "step": 1160
    },
    {
      "epoch": 0.5806451612903226,
      "grad_norm": 5.540660858154297,
      "learning_rate": 1.3998896856039713e-05,
      "loss": 2.636,
      "step": 1170
    },
    {
      "epoch": 0.5856079404466501,
      "grad_norm": 5.182808876037598,
      "learning_rate": 1.3833425261996691e-05,
      "loss": 2.6314,
      "step": 1180
    },
    {
      "epoch": 0.5905707196029777,
      "grad_norm": 4.9914140701293945,
      "learning_rate": 1.3667953667953668e-05,
      "loss": 2.5941,
      "step": 1190
    },
    {
      "epoch": 0.5955334987593052,
      "grad_norm": 4.8229475021362305,
      "learning_rate": 1.3502482073910646e-05,
      "loss": 2.7827,
      "step": 1200
    },
    {
      "epoch": 0.5955334987593052,
      "eval_loss": 2.2910733222961426,
      "eval_runtime": 115.8769,
      "eval_samples_per_second": 146.224,
      "eval_steps_per_second": 4.574,
      "step": 1200
    },
    {
      "epoch": 0.6004962779156328,
      "grad_norm": 5.556408882141113,
      "learning_rate": 1.3337010479867623e-05,
      "loss": 2.6104,
      "step": 1210
    },
    {
      "epoch": 0.6054590570719603,
      "grad_norm": 6.0620551109313965,
      "learning_rate": 1.3171538885824601e-05,
      "loss": 2.6148,
      "step": 1220
    },
    {
      "epoch": 0.6104218362282878,
      "grad_norm": 5.285867214202881,
      "learning_rate": 1.3006067291781577e-05,
      "loss": 2.6355,
      "step": 1230
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 5.499397277832031,
      "learning_rate": 1.2840595697738555e-05,
      "loss": 2.6269,
      "step": 1240
    },
    {
      "epoch": 0.6203473945409429,
      "grad_norm": 4.932219505310059,
      "learning_rate": 1.2675124103695532e-05,
      "loss": 2.6003,
      "step": 1250
    },
    {
      "epoch": 0.6253101736972705,
      "grad_norm": 5.04417610168457,
      "learning_rate": 1.250965250965251e-05,
      "loss": 2.6256,
      "step": 1260
    },
    {
      "epoch": 0.630272952853598,
      "grad_norm": 5.0753679275512695,
      "learning_rate": 1.2344180915609488e-05,
      "loss": 2.6326,
      "step": 1270
    },
    {
      "epoch": 0.6352357320099256,
      "grad_norm": 5.003329753875732,
      "learning_rate": 1.2178709321566465e-05,
      "loss": 2.681,
      "step": 1280
    },
    {
      "epoch": 0.6401985111662531,
      "grad_norm": 5.094631671905518,
      "learning_rate": 1.2013237727523442e-05,
      "loss": 2.5776,
      "step": 1290
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 4.915186405181885,
      "learning_rate": 1.184776613348042e-05,
      "loss": 2.7528,
      "step": 1300
    },
    {
      "epoch": 0.6501240694789082,
      "grad_norm": 5.3234076499938965,
      "learning_rate": 1.1682294539437398e-05,
      "loss": 2.6076,
      "step": 1310
    },
    {
      "epoch": 0.6550868486352357,
      "grad_norm": 6.5297064781188965,
      "learning_rate": 1.1516822945394374e-05,
      "loss": 2.5784,
      "step": 1320
    },
    {
      "epoch": 0.6600496277915633,
      "grad_norm": 5.549312591552734,
      "learning_rate": 1.1351351351351352e-05,
      "loss": 2.6064,
      "step": 1330
    },
    {
      "epoch": 0.6650124069478908,
      "grad_norm": 4.589717388153076,
      "learning_rate": 1.118587975730833e-05,
      "loss": 2.5757,
      "step": 1340
    },
    {
      "epoch": 0.6699751861042184,
      "grad_norm": 6.217405796051025,
      "learning_rate": 1.1020408163265306e-05,
      "loss": 2.5851,
      "step": 1350
    },
    {
      "epoch": 0.674937965260546,
      "grad_norm": 5.602436542510986,
      "learning_rate": 1.0854936569222283e-05,
      "loss": 2.6007,
      "step": 1360
    },
    {
      "epoch": 0.6799007444168734,
      "grad_norm": 4.934968948364258,
      "learning_rate": 1.0689464975179262e-05,
      "loss": 2.5674,
      "step": 1370
    },
    {
      "epoch": 0.684863523573201,
      "grad_norm": 5.8763322830200195,
      "learning_rate": 1.0523993381136238e-05,
      "loss": 2.6984,
      "step": 1380
    },
    {
      "epoch": 0.6898263027295285,
      "grad_norm": 5.077223777770996,
      "learning_rate": 1.0358521787093216e-05,
      "loss": 2.6202,
      "step": 1390
    },
    {
      "epoch": 0.6947890818858561,
      "grad_norm": 6.423431873321533,
      "learning_rate": 1.0193050193050193e-05,
      "loss": 2.6729,
      "step": 1400
    },
    {
      "epoch": 0.6997518610421837,
      "grad_norm": 5.683679103851318,
      "learning_rate": 1.002757859900717e-05,
      "loss": 2.6683,
      "step": 1410
    },
    {
      "epoch": 0.7047146401985112,
      "grad_norm": 5.217991828918457,
      "learning_rate": 9.862107004964148e-06,
      "loss": 2.6355,
      "step": 1420
    },
    {
      "epoch": 0.7096774193548387,
      "grad_norm": 5.0821332931518555,
      "learning_rate": 9.696635410921126e-06,
      "loss": 2.6033,
      "step": 1430
    },
    {
      "epoch": 0.7146401985111662,
      "grad_norm": 5.337713718414307,
      "learning_rate": 9.531163816878102e-06,
      "loss": 2.6834,
      "step": 1440
    },
    {
      "epoch": 0.7196029776674938,
      "grad_norm": 5.425897598266602,
      "learning_rate": 9.36569222283508e-06,
      "loss": 2.6597,
      "step": 1450
    },
    {
      "epoch": 0.7245657568238213,
      "grad_norm": 4.878344535827637,
      "learning_rate": 9.200220628792057e-06,
      "loss": 2.6298,
      "step": 1460
    },
    {
      "epoch": 0.7295285359801489,
      "grad_norm": 5.754893779754639,
      "learning_rate": 9.034749034749034e-06,
      "loss": 2.6232,
      "step": 1470
    },
    {
      "epoch": 0.7344913151364765,
      "grad_norm": 4.991476535797119,
      "learning_rate": 8.869277440706012e-06,
      "loss": 2.5672,
      "step": 1480
    },
    {
      "epoch": 0.739454094292804,
      "grad_norm": 6.236905097961426,
      "learning_rate": 8.70380584666299e-06,
      "loss": 2.5139,
      "step": 1490
    },
    {
      "epoch": 0.7444168734491315,
      "grad_norm": 4.892019748687744,
      "learning_rate": 8.538334252619966e-06,
      "loss": 2.6248,
      "step": 1500
    },
    {
      "epoch": 0.7444168734491315,
      "eval_loss": 2.3090243339538574,
      "eval_runtime": 114.529,
      "eval_samples_per_second": 147.945,
      "eval_steps_per_second": 4.628,
      "step": 1500
    },
    {
      "epoch": 0.749379652605459,
      "grad_norm": 5.293705463409424,
      "learning_rate": 8.372862658576944e-06,
      "loss": 2.6417,
      "step": 1510
    },
    {
      "epoch": 0.7543424317617866,
      "grad_norm": 6.500164031982422,
      "learning_rate": 8.207391064533921e-06,
      "loss": 2.6197,
      "step": 1520
    },
    {
      "epoch": 0.7593052109181141,
      "grad_norm": 5.456136226654053,
      "learning_rate": 8.0419194704909e-06,
      "loss": 2.6911,
      "step": 1530
    },
    {
      "epoch": 0.7642679900744417,
      "grad_norm": 5.395550727844238,
      "learning_rate": 7.876447876447876e-06,
      "loss": 2.5542,
      "step": 1540
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 5.446476936340332,
      "learning_rate": 7.710976282404854e-06,
      "loss": 2.6584,
      "step": 1550
    },
    {
      "epoch": 0.7741935483870968,
      "grad_norm": 5.699354648590088,
      "learning_rate": 7.5455046883618305e-06,
      "loss": 2.6182,
      "step": 1560
    },
    {
      "epoch": 0.7791563275434243,
      "grad_norm": 5.835291862487793,
      "learning_rate": 7.3800330943188084e-06,
      "loss": 2.6301,
      "step": 1570
    },
    {
      "epoch": 0.7841191066997518,
      "grad_norm": 5.066339015960693,
      "learning_rate": 7.214561500275786e-06,
      "loss": 2.5629,
      "step": 1580
    },
    {
      "epoch": 0.7890818858560794,
      "grad_norm": 6.113178253173828,
      "learning_rate": 7.049089906232763e-06,
      "loss": 2.5965,
      "step": 1590
    },
    {
      "epoch": 0.794044665012407,
      "grad_norm": 5.544576168060303,
      "learning_rate": 6.8836183121897404e-06,
      "loss": 2.5722,
      "step": 1600
    },
    {
      "epoch": 0.7990074441687345,
      "grad_norm": 5.560300350189209,
      "learning_rate": 6.718146718146718e-06,
      "loss": 2.5835,
      "step": 1610
    },
    {
      "epoch": 0.8039702233250621,
      "grad_norm": 5.354392051696777,
      "learning_rate": 6.552675124103695e-06,
      "loss": 2.5901,
      "step": 1620
    },
    {
      "epoch": 0.8089330024813896,
      "grad_norm": 5.256809234619141,
      "learning_rate": 6.387203530060673e-06,
      "loss": 2.6055,
      "step": 1630
    },
    {
      "epoch": 0.8138957816377171,
      "grad_norm": 6.089339256286621,
      "learning_rate": 6.22173193601765e-06,
      "loss": 2.6019,
      "step": 1640
    },
    {
      "epoch": 0.8188585607940446,
      "grad_norm": 4.713355541229248,
      "learning_rate": 6.056260341974627e-06,
      "loss": 2.6421,
      "step": 1650
    },
    {
      "epoch": 0.8238213399503722,
      "grad_norm": 6.513923168182373,
      "learning_rate": 5.890788747931605e-06,
      "loss": 2.6049,
      "step": 1660
    },
    {
      "epoch": 0.8287841191066998,
      "grad_norm": 5.37587833404541,
      "learning_rate": 5.725317153888582e-06,
      "loss": 2.5351,
      "step": 1670
    },
    {
      "epoch": 0.8337468982630273,
      "grad_norm": 6.346861362457275,
      "learning_rate": 5.5598455598455594e-06,
      "loss": 2.6158,
      "step": 1680
    },
    {
      "epoch": 0.8387096774193549,
      "grad_norm": 5.206553936004639,
      "learning_rate": 5.394373965802537e-06,
      "loss": 2.5994,
      "step": 1690
    },
    {
      "epoch": 0.8436724565756824,
      "grad_norm": 6.567471504211426,
      "learning_rate": 5.228902371759514e-06,
      "loss": 2.5816,
      "step": 1700
    },
    {
      "epoch": 0.8486352357320099,
      "grad_norm": 5.689151287078857,
      "learning_rate": 5.0634307777164914e-06,
      "loss": 2.5848,
      "step": 1710
    },
    {
      "epoch": 0.8535980148883374,
      "grad_norm": 5.288684844970703,
      "learning_rate": 4.897959183673469e-06,
      "loss": 2.6138,
      "step": 1720
    },
    {
      "epoch": 0.858560794044665,
      "grad_norm": 5.238705635070801,
      "learning_rate": 4.732487589630447e-06,
      "loss": 2.5811,
      "step": 1730
    },
    {
      "epoch": 0.8635235732009926,
      "grad_norm": 5.114046573638916,
      "learning_rate": 4.567015995587424e-06,
      "loss": 2.5933,
      "step": 1740
    },
    {
      "epoch": 0.8684863523573201,
      "grad_norm": 6.13997220993042,
      "learning_rate": 4.401544401544402e-06,
      "loss": 2.5869,
      "step": 1750
    },
    {
      "epoch": 0.8734491315136477,
      "grad_norm": 5.258871555328369,
      "learning_rate": 4.236072807501379e-06,
      "loss": 2.5464,
      "step": 1760
    },
    {
      "epoch": 0.8784119106699751,
      "grad_norm": 5.823874473571777,
      "learning_rate": 4.070601213458356e-06,
      "loss": 2.6842,
      "step": 1770
    },
    {
      "epoch": 0.8833746898263027,
      "grad_norm": 5.351442337036133,
      "learning_rate": 3.905129619415334e-06,
      "loss": 2.6312,
      "step": 1780
    },
    {
      "epoch": 0.8883374689826302,
      "grad_norm": 5.9153947830200195,
      "learning_rate": 3.739658025372311e-06,
      "loss": 2.5621,
      "step": 1790
    },
    {
      "epoch": 0.8933002481389578,
      "grad_norm": 5.032203197479248,
      "learning_rate": 3.5741864313292883e-06,
      "loss": 2.6103,
      "step": 1800
    },
    {
      "epoch": 0.8933002481389578,
      "eval_loss": 2.285778284072876,
      "eval_runtime": 108.9864,
      "eval_samples_per_second": 155.469,
      "eval_steps_per_second": 4.863,
      "step": 1800
    }
  ],
  "logging_steps": 10,
  "max_steps": 2015,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 300,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}