|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 300000000000000000, |
|
"global_step": 4230, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02364066193853428, |
|
"grad_norm": 4265.77734375, |
|
"learning_rate": 2.695035460992908e-06, |
|
"loss": 19.7865, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04728132387706856, |
|
    "grad_norm": null,
|
"learning_rate": 6.2411347517730495e-06, |
|
"loss": 9.5661, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07092198581560284, |
|
"grad_norm": 1082.762939453125, |
|
"learning_rate": 9.645390070921986e-06, |
|
"loss": 2.3602, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09456264775413711, |
|
"grad_norm": 249.59776306152344, |
|
"learning_rate": 1.3191489361702129e-05, |
|
"loss": 1.7304, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1182033096926714, |
|
"grad_norm": 171.15611267089844, |
|
"learning_rate": 1.673758865248227e-05, |
|
"loss": 1.6707, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.14184397163120568, |
|
"grad_norm": 64.97687530517578, |
|
"learning_rate": 2.028368794326241e-05, |
|
"loss": 1.5563, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.16548463356973994, |
|
"grad_norm": 22.22820472717285, |
|
"learning_rate": 2.3829787234042553e-05, |
|
"loss": 1.4809, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.18912529550827423, |
|
"grad_norm": 17.94223403930664, |
|
"learning_rate": 2.7375886524822697e-05, |
|
"loss": 1.3877, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"grad_norm": 6.133880615234375, |
|
"learning_rate": 2.999913686685402e-05, |
|
"loss": 1.2877, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2364066193853428, |
|
"grad_norm": 5.359964370727539, |
|
"learning_rate": 2.9979733510221677e-05, |
|
"loss": 1.1985, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.26004728132387706, |
|
"grad_norm": 8.637283325195312, |
|
"learning_rate": 2.9934831545542617e-05, |
|
"loss": 1.1513, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.28368794326241137, |
|
"grad_norm": 13.667312622070312, |
|
"learning_rate": 2.986450740525784e-05, |
|
"loss": 1.0896, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3073286052009456, |
|
"grad_norm": 6.358010292053223, |
|
"learning_rate": 2.9768880795615002e-05, |
|
"loss": 1.0343, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.3309692671394799, |
|
"grad_norm": 4.579405784606934, |
|
"learning_rate": 2.9648114492903583e-05, |
|
"loss": 0.9778, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3546099290780142, |
|
"grad_norm": 4.688839435577393, |
|
"learning_rate": 2.950241406637593e-05, |
|
"loss": 0.9688, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.37825059101654845, |
|
"grad_norm": 5.001623153686523, |
|
"learning_rate": 2.9332027528325834e-05, |
|
"loss": 0.9246, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.40189125295508277, |
|
"grad_norm": 5.845919609069824, |
|
"learning_rate": 2.9137244911920255e-05, |
|
"loss": 0.9075, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 6.357454299926758, |
|
"learning_rate": 2.891839777750281e-05, |
|
"loss": 0.9032, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4491725768321513, |
|
"grad_norm": 4.689474105834961, |
|
"learning_rate": 2.8675858648209442e-05, |
|
"loss": 0.8833, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.4728132387706856, |
|
"grad_norm": 5.220682621002197, |
|
"learning_rate": 2.841004037585688e-05, |
|
"loss": 0.8512, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.49645390070921985, |
|
"grad_norm": 6.682931423187256, |
|
"learning_rate": 2.8121395438183372e-05, |
|
"loss": 0.8526, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.5200945626477541, |
|
"grad_norm": 5.913801670074463, |
|
"learning_rate": 2.7810415168637912e-05, |
|
"loss": 0.8187, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5437352245862884, |
|
"grad_norm": 5.719775676727295, |
|
"learning_rate": 2.7477628920028935e-05, |
|
"loss": 0.8174, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5673758865248227, |
|
"grad_norm": 7.033783912658691, |
|
"learning_rate": 2.712360316345627e-05, |
|
"loss": 0.8378, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5910165484633569, |
|
"grad_norm": 10.351150512695312, |
|
"learning_rate": 2.6748940524060027e-05, |
|
"loss": 0.81, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6146572104018913, |
|
"grad_norm": 6.334929466247559, |
|
"learning_rate": 2.6354278755227802e-05, |
|
"loss": 0.8019, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"grad_norm": 4.533697605133057, |
|
"learning_rate": 2.5940289653006427e-05, |
|
"loss": 0.7874, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.6619385342789598, |
|
"grad_norm": 5.18574857711792, |
|
"learning_rate": 2.550767791256593e-05, |
|
"loss": 0.7758, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6855791962174941, |
|
"grad_norm": 5.3152923583984375, |
|
"learning_rate": 2.5057179928662506e-05, |
|
"loss": 0.7811, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7092198581560284, |
|
"grad_norm": 6.021467685699463, |
|
"learning_rate": 2.458956254214211e-05, |
|
"loss": 0.7849, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7328605200945626, |
|
"grad_norm": 9.204541206359863, |
|
"learning_rate": 2.4105621734618613e-05, |
|
"loss": 0.7653, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.7565011820330969, |
|
"grad_norm": 7.0010504722595215, |
|
"learning_rate": 2.3606181273548253e-05, |
|
"loss": 0.7578, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7801418439716312, |
|
"grad_norm": 4.713606357574463, |
|
"learning_rate": 2.309209131000687e-05, |
|
"loss": 0.743, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8037825059101655, |
|
"grad_norm": 8.350544929504395, |
|
"learning_rate": 2.256422693155675e-05, |
|
"loss": 0.7327, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8274231678486997, |
|
"grad_norm": 5.2699713706970215, |
|
"learning_rate": 2.2023486672666385e-05, |
|
"loss": 0.7252, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 5.1293625831604, |
|
"learning_rate": 2.1470790985218804e-05, |
|
"loss": 0.7169, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8747044917257684, |
|
"grad_norm": 5.4763407707214355, |
|
"learning_rate": 2.0907080671711832e-05, |
|
"loss": 0.7208, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.8983451536643026, |
|
"grad_norm": 13.955887794494629, |
|
"learning_rate": 2.0333315283817486e-05, |
|
"loss": 0.7309, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9219858156028369, |
|
"grad_norm": 8.627326011657715, |
|
"learning_rate": 1.975047148902632e-05, |
|
"loss": 0.7155, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.9456264775413712, |
|
"grad_norm": 8.94743537902832, |
|
"learning_rate": 1.9159541408157158e-05, |
|
"loss": 0.7068, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9692671394799054, |
|
"grad_norm": 5.135090351104736, |
|
"learning_rate": 1.8561530926562023e-05, |
|
"loss": 0.6918, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.9929078014184397, |
|
"grad_norm": 38.368629455566406, |
|
"learning_rate": 1.795745798190099e-05, |
|
"loss": 0.6818, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.016548463356974, |
|
"grad_norm": 25.27336311340332, |
|
"learning_rate": 1.734835083140153e-05, |
|
"loss": 0.685, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.0401891252955082, |
|
"grad_norm": 7.319797992706299, |
|
"learning_rate": 1.6735246301551825e-05, |
|
"loss": 0.6739, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0638297872340425, |
|
"grad_norm": 10.93574333190918, |
|
"learning_rate": 1.6119188023207348e-05, |
|
"loss": 0.6828, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.0874704491725768, |
|
"grad_norm": 51.116207122802734, |
|
"learning_rate": 1.5501224655115118e-05, |
|
"loss": 0.6514, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 17.643468856811523, |
|
"learning_rate": 1.4882408098879367e-05, |
|
"loss": 0.6708, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.1347517730496455, |
|
"grad_norm": 24.42500114440918, |
|
"learning_rate": 1.426379170840718e-05, |
|
"loss": 0.6595, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.1583924349881798, |
|
"grad_norm": 7.433150291442871, |
|
"learning_rate": 1.364642849688209e-05, |
|
"loss": 0.6545, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.1820330969267139, |
|
"grad_norm": 12.09028148651123, |
|
"learning_rate": 1.3031369344317569e-05, |
|
"loss": 0.6533, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.2056737588652482, |
|
"grad_norm": 8.16505241394043, |
|
"learning_rate": 1.2419661208741687e-05, |
|
"loss": 0.6466, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.2293144208037825, |
|
"grad_norm": 8.545854568481445, |
|
"learning_rate": 1.181234534405775e-05, |
|
"loss": 0.6311, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.2529550827423168, |
|
"grad_norm": 7.951985836029053, |
|
"learning_rate": 1.1210455527614574e-05, |
|
"loss": 0.6338, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"grad_norm": 17.269718170166016, |
|
"learning_rate": 1.061501630050338e-05, |
|
"loss": 0.6214, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.3002364066193852, |
|
"grad_norm": 14.052486419677734, |
|
"learning_rate": 1.0027041223576735e-05, |
|
"loss": 0.6307, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.3238770685579198, |
|
"grad_norm": 7.13311767578125, |
|
"learning_rate": 9.447531152158089e-06, |
|
"loss": 0.6273, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.3475177304964538, |
|
"grad_norm": 9.32107162475586, |
|
"learning_rate": 8.877472532378836e-06, |
|
"loss": 0.6218, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.3711583924349882, |
|
"grad_norm": 6.4954118728637695, |
|
"learning_rate": 8.317835722042693e-06, |
|
"loss": 0.6085, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.3947990543735225, |
|
"grad_norm": 7.570118427276611, |
|
"learning_rate": 7.769573338875851e-06, |
|
"loss": 0.6007, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.4184397163120568, |
|
"grad_norm": 18.045612335205078, |
|
"learning_rate": 7.2336186389743095e-06, |
|
"loss": 0.6177, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.442080378250591, |
|
"grad_norm": 10.485590934753418, |
|
"learning_rate": 6.710883928208835e-06, |
|
"loss": 0.5859, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.4657210401891252, |
|
"grad_norm": 13.521890640258789, |
|
"learning_rate": 6.202259009291401e-06, |
|
"loss": 0.6117, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.4893617021276595, |
|
"grad_norm": 10.763672828674316, |
|
"learning_rate": 5.708609667146788e-06, |
|
"loss": 0.5647, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.5130023640661938, |
|
"grad_norm": 13.315629959106445, |
|
"learning_rate": 5.2307761951673425e-06, |
|
"loss": 0.6115, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.5366430260047281, |
|
"grad_norm": 13.952415466308594, |
|
"learning_rate": 4.769571964859664e-06, |
|
"loss": 0.594, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.5602836879432624, |
|
"grad_norm": 9.341636657714844, |
|
"learning_rate": 4.325782041317874e-06, |
|
"loss": 0.5803, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.5839243498817965, |
|
"grad_norm": 23.52338218688965, |
|
"learning_rate": 3.900161846880281e-06, |
|
"loss": 0.5648, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.607565011820331, |
|
"grad_norm": 22.669052124023438, |
|
"learning_rate": 3.4934358752441315e-06, |
|
"loss": 0.5588, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.6312056737588652, |
|
"grad_norm": 50.38961410522461, |
|
"learning_rate": 3.106296458227363e-06, |
|
"loss": 0.5777, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.6548463356973995, |
|
"grad_norm": 6.503214359283447, |
|
"learning_rate": 2.7394025872764556e-06, |
|
"loss": 0.5688, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.6784869976359338, |
|
"grad_norm": 81.57908630371094, |
|
"learning_rate": 2.39337879172658e-06, |
|
"loss": 0.5849, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.702127659574468, |
|
"grad_norm": 10.680161476135254, |
|
"learning_rate": 2.0688140757233428e-06, |
|
"loss": 0.5761, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.7257683215130024, |
|
"grad_norm": 6.659136772155762, |
|
"learning_rate": 1.7662609156157749e-06, |
|
"loss": 0.566, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.7494089834515365, |
|
"grad_norm": 13.552983283996582, |
|
"learning_rate": 1.486234319527186e-06, |
|
"loss": 0.5625, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.773049645390071, |
|
"grad_norm": 22.759199142456055, |
|
"learning_rate": 1.2292109507047273e-06, |
|
"loss": 0.557, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.7966903073286051, |
|
"grad_norm": 36.262474060058594, |
|
"learning_rate": 9.956283161398172e-07, |
|
"loss": 0.5561, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.8203309692671394, |
|
"grad_norm": 8.722681999206543, |
|
"learning_rate": 7.858840218406921e-07, |
|
"loss": 0.5624, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.8439716312056738, |
|
"grad_norm": 6.087028980255127, |
|
"learning_rate": 6.00335096024619e-07, |
|
"loss": 0.5747, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.867612293144208, |
|
"grad_norm": 9.753291130065918, |
|
"learning_rate": 4.3929738138196787e-07, |
|
"loss": 0.5629, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.8912529550827424, |
|
"grad_norm": 13.233976364135742, |
|
"learning_rate": 3.030449974465471e-07, |
|
"loss": 0.5588, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.9148936170212765, |
|
"grad_norm": 23.338430404663086, |
|
"learning_rate": 1.9180987398740358e-07, |
|
"loss": 0.5632, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.938534278959811, |
|
"grad_norm": 30.731971740722656, |
|
"learning_rate": 1.0578135621633178e-07, |
|
"loss": 0.5566, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.962174940898345, |
|
"grad_norm": 12.004512786865234, |
|
"learning_rate": 4.5105882483119643e-08, |
|
"loss": 0.5446, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.9858156028368794, |
|
"grad_norm": 18.55535125732422, |
|
"learning_rate": 9.886735007152425e-09, |
|
"loss": 0.5539, |
|
"step": 4200 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 4230, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.480992965002527e+17, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|