|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.384, |
|
"eval_steps": 500, |
|
"global_step": 60, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 6.651687052805929, |
|
"learning_rate": 3.125e-07, |
|
"loss": 0.4684, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 6.265915305107489, |
|
"learning_rate": 6.25e-07, |
|
"loss": 0.4563, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 6.401273380714525, |
|
"learning_rate": 9.375000000000001e-07, |
|
"loss": 0.4547, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 6.56315483941447, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.4494, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 6.59719209633345, |
|
"learning_rate": 1.5625e-06, |
|
"loss": 0.4575, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 7.53537793152979, |
|
"learning_rate": 1.8750000000000003e-06, |
|
"loss": 0.4427, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 9.571011007558802, |
|
"learning_rate": 2.1875000000000002e-06, |
|
"loss": 0.4378, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 10.148947574276539, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.4369, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 5.2264536848502035, |
|
"learning_rate": 2.8125e-06, |
|
"loss": 0.4002, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 4.788413945220771, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.3976, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 4.514781509921602, |
|
"learning_rate": 3.4375e-06, |
|
"loss": 0.3942, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 4.538614240888282, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.3761, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 3.0503754653246355, |
|
"learning_rate": 4.0625000000000005e-06, |
|
"loss": 0.3664, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 3.0554055571037093, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 0.3585, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 1.9763698617399021, |
|
"learning_rate": 4.6875000000000004e-06, |
|
"loss": 0.3561, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 1.5108193822789298, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3205, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1088, |
|
"grad_norm": 1.3652592836546582, |
|
"learning_rate": 4.999370587356267e-06, |
|
"loss": 0.2957, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 1.8791692065602286, |
|
"learning_rate": 4.997482666353287e-06, |
|
"loss": 0.2974, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1216, |
|
"grad_norm": 1.6499778792766742, |
|
"learning_rate": 4.99433718761614e-06, |
|
"loss": 0.2882, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.7803281647114307, |
|
"learning_rate": 4.989935734988098e-06, |
|
"loss": 0.2822, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 0.6228676679830198, |
|
"learning_rate": 4.984280524733107e-06, |
|
"loss": 0.2708, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"grad_norm": 0.6503139687960523, |
|
"learning_rate": 4.977374404419838e-06, |
|
"loss": 0.2624, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1472, |
|
"grad_norm": 0.6050613097065756, |
|
"learning_rate": 4.9692208514878445e-06, |
|
"loss": 0.2595, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 0.513863899627231, |
|
"learning_rate": 4.959823971496575e-06, |
|
"loss": 0.24, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5401602458085882, |
|
"learning_rate": 4.949188496058089e-06, |
|
"loss": 0.2564, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"grad_norm": 0.4588626341682895, |
|
"learning_rate": 4.937319780454559e-06, |
|
"loss": 0.2559, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 0.5618821751922192, |
|
"learning_rate": 4.924223800941718e-06, |
|
"loss": 0.2396, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 0.47667362996506163, |
|
"learning_rate": 4.909907151739634e-06, |
|
"loss": 0.2417, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.1856, |
|
"grad_norm": 0.4186548264796007, |
|
"learning_rate": 4.894377041712327e-06, |
|
"loss": 0.2439, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.4531599370801774, |
|
"learning_rate": 4.8776412907378845e-06, |
|
"loss": 0.2415, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1984, |
|
"grad_norm": 0.3909364624586131, |
|
"learning_rate": 4.859708325770919e-06, |
|
"loss": 0.2302, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 0.31305049271337076, |
|
"learning_rate": 4.8405871765993435e-06, |
|
"loss": 0.2333, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2112, |
|
"grad_norm": 0.3219167869220544, |
|
"learning_rate": 4.820287471297598e-06, |
|
"loss": 0.2341, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2176, |
|
"grad_norm": 0.3118803828567028, |
|
"learning_rate": 4.7988194313786275e-06, |
|
"loss": 0.2189, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.29310345514929165, |
|
"learning_rate": 4.7761938666470405e-06, |
|
"loss": 0.2114, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2304, |
|
"grad_norm": 0.3347074091997152, |
|
"learning_rate": 4.752422169756048e-06, |
|
"loss": 0.2249, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.2368, |
|
"grad_norm": 0.29649389449972713, |
|
"learning_rate": 4.72751631047092e-06, |
|
"loss": 0.2206, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2432, |
|
"grad_norm": 0.2837151778804135, |
|
"learning_rate": 4.701488829641845e-06, |
|
"loss": 0.2304, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2496, |
|
"grad_norm": 0.2806309744043982, |
|
"learning_rate": 4.674352832889239e-06, |
|
"loss": 0.2207, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.27783183257142374, |
|
"learning_rate": 4.646121984004666e-06, |
|
"loss": 0.2155, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2624, |
|
"grad_norm": 0.27666228945567495, |
|
"learning_rate": 4.6168104980707105e-06, |
|
"loss": 0.2127, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2688, |
|
"grad_norm": 0.275485776092337, |
|
"learning_rate": 4.586433134303257e-06, |
|
"loss": 0.2238, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2752, |
|
"grad_norm": 0.27153628095668647, |
|
"learning_rate": 4.555005188619776e-06, |
|
"loss": 0.21, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2816, |
|
"grad_norm": 0.274290824493659, |
|
"learning_rate": 4.522542485937369e-06, |
|
"loss": 0.2131, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.2694663744414641, |
|
"learning_rate": 4.4890613722044526e-06, |
|
"loss": 0.2108, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2944, |
|
"grad_norm": 0.2642992894492684, |
|
"learning_rate": 4.454578706170075e-06, |
|
"loss": 0.2069, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3008, |
|
"grad_norm": 0.26815076119929016, |
|
"learning_rate": 4.4191118508950286e-06, |
|
"loss": 0.2063, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 0.25684981864747414, |
|
"learning_rate": 4.382678665009028e-06, |
|
"loss": 0.2121, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3136, |
|
"grad_norm": 0.2536186069276322, |
|
"learning_rate": 4.345297493718352e-06, |
|
"loss": 0.2074, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.256667169168501, |
|
"learning_rate": 4.3069871595684795e-06, |
|
"loss": 0.2106, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3264, |
|
"grad_norm": 0.2631748867077966, |
|
"learning_rate": 4.267766952966369e-06, |
|
"loss": 0.2031, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3328, |
|
"grad_norm": 0.2567671139005443, |
|
"learning_rate": 4.227656622467162e-06, |
|
"loss": 0.2063, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3392, |
|
"grad_norm": 0.26280340427063503, |
|
"learning_rate": 4.186676364830187e-06, |
|
"loss": 0.212, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3456, |
|
"grad_norm": 0.2599237100252332, |
|
"learning_rate": 4.144846814849282e-06, |
|
"loss": 0.2161, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.25711041735736345, |
|
"learning_rate": 4.102189034962561e-06, |
|
"loss": 0.1959, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3584, |
|
"grad_norm": 0.26568396343464284, |
|
"learning_rate": 4.058724504646834e-06, |
|
"loss": 0.2053, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3648, |
|
"grad_norm": 0.26965052490303654, |
|
"learning_rate": 4.01447510960205e-06, |
|
"loss": 0.197, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.3712, |
|
"grad_norm": 0.2474576437319212, |
|
"learning_rate": 3.969463130731183e-06, |
|
"loss": 0.1963, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.3776, |
|
"grad_norm": 0.26050605159298984, |
|
"learning_rate": 3.92371123292113e-06, |
|
"loss": 0.1985, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.25523756004269815, |
|
"learning_rate": 3.8772424536302565e-06, |
|
"loss": 0.1957, |
|
"step": 60 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 156, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 121473953169408.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|