|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 1000, |
|
"global_step": 19818, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02522958926228681, |
|
"grad_norm": 1.6351510286331177, |
|
"learning_rate": 4.873852053688566e-05, |
|
"loss": 4.2934, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05045917852457362, |
|
"grad_norm": 2.1481850147247314, |
|
"learning_rate": 4.747704107377132e-05, |
|
"loss": 2.8305, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.05045917852457362, |
|
"eval_accuracy": 0.46500522497769375, |
|
"eval_loss": 2.379011869430542, |
|
"eval_runtime": 56.6734, |
|
"eval_samples_per_second": 112.469, |
|
"eval_steps_per_second": 3.529, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.07568876778686043, |
|
"grad_norm": 1.5696136951446533, |
|
"learning_rate": 4.6215561610656984e-05, |
|
"loss": 2.2245, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.10091835704914724, |
|
"grad_norm": 1.5901012420654297, |
|
"learning_rate": 4.4954082147542644e-05, |
|
"loss": 1.9272, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.10091835704914724, |
|
"eval_accuracy": 0.5808557246708203, |
|
"eval_loss": 1.750130534172058, |
|
"eval_runtime": 55.5938, |
|
"eval_samples_per_second": 114.653, |
|
"eval_steps_per_second": 3.598, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.12614794631143406, |
|
"grad_norm": 1.5128012895584106, |
|
"learning_rate": 4.3692602684428305e-05, |
|
"loss": 1.7748, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.15137753557372086, |
|
"grad_norm": 1.318622350692749, |
|
"learning_rate": 4.243112322131396e-05, |
|
"loss": 1.6793, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.15137753557372086, |
|
"eval_accuracy": 0.6136108291841765, |
|
"eval_loss": 1.568946123123169, |
|
"eval_runtime": 54.4039, |
|
"eval_samples_per_second": 117.161, |
|
"eval_steps_per_second": 3.676, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.17660712483600766, |
|
"grad_norm": 1.4798802137374878, |
|
"learning_rate": 4.116964375819962e-05, |
|
"loss": 1.613, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.20183671409829448, |
|
"grad_norm": 1.2613641023635864, |
|
"learning_rate": 3.990816429508528e-05, |
|
"loss": 1.5605, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.20183671409829448, |
|
"eval_accuracy": 0.6316407595495017, |
|
"eval_loss": 1.4698580503463745, |
|
"eval_runtime": 55.0048, |
|
"eval_samples_per_second": 115.881, |
|
"eval_steps_per_second": 3.636, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.22706630336058128, |
|
"grad_norm": 1.1746481657028198, |
|
"learning_rate": 3.864668483197093e-05, |
|
"loss": 1.5228, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.2522958926228681, |
|
"grad_norm": 1.2291498184204102, |
|
"learning_rate": 3.738520536885659e-05, |
|
"loss": 1.4891, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.2522958926228681, |
|
"eval_accuracy": 0.642200060669245, |
|
"eval_loss": 1.4112207889556885, |
|
"eval_runtime": 54.5565, |
|
"eval_samples_per_second": 116.833, |
|
"eval_steps_per_second": 3.666, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.2775254818851549, |
|
"grad_norm": 1.257660150527954, |
|
"learning_rate": 3.6123725905742254e-05, |
|
"loss": 1.4607, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.3027550711474417, |
|
"grad_norm": 1.1654913425445557, |
|
"learning_rate": 3.4862246442627914e-05, |
|
"loss": 1.4391, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3027550711474417, |
|
"eval_accuracy": 0.6513591843207115, |
|
"eval_loss": 1.364630103111267, |
|
"eval_runtime": 54.7084, |
|
"eval_samples_per_second": 116.509, |
|
"eval_steps_per_second": 3.656, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3279846604097285, |
|
"grad_norm": 1.112001657485962, |
|
"learning_rate": 3.3600766979513575e-05, |
|
"loss": 1.4193, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.3532142496720153, |
|
"grad_norm": 1.1872960329055786, |
|
"learning_rate": 3.2339287516399235e-05, |
|
"loss": 1.3995, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.3532142496720153, |
|
"eval_accuracy": 0.6575222655822269, |
|
"eval_loss": 1.3316864967346191, |
|
"eval_runtime": 54.512, |
|
"eval_samples_per_second": 116.928, |
|
"eval_steps_per_second": 3.669, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.37844383893430217, |
|
"grad_norm": 1.129939079284668, |
|
"learning_rate": 3.1077808053284896e-05, |
|
"loss": 1.3852, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.40367342819658897, |
|
"grad_norm": 1.1093947887420654, |
|
"learning_rate": 2.9816328590170556e-05, |
|
"loss": 1.3707, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.40367342819658897, |
|
"eval_accuracy": 0.663146746266679, |
|
"eval_loss": 1.3020933866500854, |
|
"eval_runtime": 54.3588, |
|
"eval_samples_per_second": 117.258, |
|
"eval_steps_per_second": 3.679, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.42890301745887577, |
|
"grad_norm": 1.1245774030685425, |
|
"learning_rate": 2.855484912705621e-05, |
|
"loss": 1.3574, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.45413260672116257, |
|
"grad_norm": 1.1140567064285278, |
|
"learning_rate": 2.729336966394187e-05, |
|
"loss": 1.3424, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.45413260672116257, |
|
"eval_accuracy": 0.6674909770600935, |
|
"eval_loss": 1.2805943489074707, |
|
"eval_runtime": 54.0957, |
|
"eval_samples_per_second": 117.828, |
|
"eval_steps_per_second": 3.697, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.47936219598344937, |
|
"grad_norm": 1.1032965183258057, |
|
"learning_rate": 2.603189020082753e-05, |
|
"loss": 1.3331, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.5045917852457362, |
|
"grad_norm": 1.1406043767929077, |
|
"learning_rate": 2.477041073771319e-05, |
|
"loss": 1.3242, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5045917852457362, |
|
"eval_accuracy": 0.6713588714661621, |
|
"eval_loss": 1.2613232135772705, |
|
"eval_runtime": 54.8004, |
|
"eval_samples_per_second": 116.313, |
|
"eval_steps_per_second": 3.65, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.529821374508023, |
|
"grad_norm": 1.0637890100479126, |
|
"learning_rate": 2.350893127459885e-05, |
|
"loss": 1.316, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.5550509637703098, |
|
"grad_norm": 1.0851575136184692, |
|
"learning_rate": 2.224745181148451e-05, |
|
"loss": 1.3058, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.5550509637703098, |
|
"eval_accuracy": 0.6747553370072272, |
|
"eval_loss": 1.2435107231140137, |
|
"eval_runtime": 54.5981, |
|
"eval_samples_per_second": 116.744, |
|
"eval_steps_per_second": 3.663, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.5802805530325966, |
|
"grad_norm": 1.1059494018554688, |
|
"learning_rate": 2.098597234837017e-05, |
|
"loss": 1.298, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.6055101422948834, |
|
"grad_norm": 1.1078968048095703, |
|
"learning_rate": 1.972449288525583e-05, |
|
"loss": 1.2888, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6055101422948834, |
|
"eval_accuracy": 0.6777041444946341, |
|
"eval_loss": 1.229060173034668, |
|
"eval_runtime": 55.4356, |
|
"eval_samples_per_second": 114.98, |
|
"eval_steps_per_second": 3.608, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6307397315571702, |
|
"grad_norm": 1.1068435907363892, |
|
"learning_rate": 1.846301342214149e-05, |
|
"loss": 1.2855, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.655969320819457, |
|
"grad_norm": 1.0919195413589478, |
|
"learning_rate": 1.7201533959027147e-05, |
|
"loss": 1.2748, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.655969320819457, |
|
"eval_accuracy": 0.6801306075727364, |
|
"eval_loss": 1.2177754640579224, |
|
"eval_runtime": 55.3404, |
|
"eval_samples_per_second": 115.178, |
|
"eval_steps_per_second": 3.614, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.6811989100817438, |
|
"grad_norm": 1.0631417036056519, |
|
"learning_rate": 1.5940054495912807e-05, |
|
"loss": 1.2717, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.7064284993440306, |
|
"grad_norm": 1.1203536987304688, |
|
"learning_rate": 1.4678575032798466e-05, |
|
"loss": 1.2654, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7064284993440306, |
|
"eval_accuracy": 0.6820543563309032, |
|
"eval_loss": 1.2076771259307861, |
|
"eval_runtime": 54.4302, |
|
"eval_samples_per_second": 117.104, |
|
"eval_steps_per_second": 3.674, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7316580886063175, |
|
"grad_norm": 1.0601987838745117, |
|
"learning_rate": 1.3417095569684127e-05, |
|
"loss": 1.2592, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.7568876778686043, |
|
"grad_norm": 1.0845381021499634, |
|
"learning_rate": 1.2155616106569785e-05, |
|
"loss": 1.2549, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.7568876778686043, |
|
"eval_accuracy": 0.6842664220266779, |
|
"eval_loss": 1.196437120437622, |
|
"eval_runtime": 54.7305, |
|
"eval_samples_per_second": 116.462, |
|
"eval_steps_per_second": 3.654, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.7821172671308911, |
|
"grad_norm": 1.1175463199615479, |
|
"learning_rate": 1.0894136643455446e-05, |
|
"loss": 1.2499, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.8073468563931779, |
|
"grad_norm": 1.1143847703933716, |
|
"learning_rate": 9.632657180341105e-06, |
|
"loss": 1.2459, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.8073468563931779, |
|
"eval_accuracy": 0.6860334367900387, |
|
"eval_loss": 1.1877895593643188, |
|
"eval_runtime": 54.8073, |
|
"eval_samples_per_second": 116.298, |
|
"eval_steps_per_second": 3.649, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.8325764456554647, |
|
"grad_norm": 1.0714515447616577, |
|
"learning_rate": 8.371177717226763e-06, |
|
"loss": 1.2401, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.8578060349177515, |
|
"grad_norm": 1.0937442779541016, |
|
"learning_rate": 7.109698254112424e-06, |
|
"loss": 1.2385, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.8578060349177515, |
|
"eval_accuracy": 0.6873077056382217, |
|
"eval_loss": 1.1818801164627075, |
|
"eval_runtime": 54.4299, |
|
"eval_samples_per_second": 117.105, |
|
"eval_steps_per_second": 3.674, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.8830356241800383, |
|
"grad_norm": 1.125985026359558, |
|
"learning_rate": 5.848218790998083e-06, |
|
"loss": 1.2335, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.9082652134423251, |
|
"grad_norm": 1.0988380908966064, |
|
"learning_rate": 4.586739327883742e-06, |
|
"loss": 1.2286, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.9082652134423251, |
|
"eval_accuracy": 0.6885514558318389, |
|
"eval_loss": 1.1755918264389038, |
|
"eval_runtime": 54.168, |
|
"eval_samples_per_second": 117.671, |
|
"eval_steps_per_second": 3.692, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.9334948027046119, |
|
"grad_norm": 1.0741068124771118, |
|
"learning_rate": 3.325259864769402e-06, |
|
"loss": 1.2284, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.9587243919668987, |
|
"grad_norm": 1.1670308113098145, |
|
"learning_rate": 2.063780401655061e-06, |
|
"loss": 1.2246, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.9587243919668987, |
|
"eval_accuracy": 0.6896297305064778, |
|
"eval_loss": 1.1708202362060547, |
|
"eval_runtime": 54.2124, |
|
"eval_samples_per_second": 117.575, |
|
"eval_steps_per_second": 3.689, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.9839539812291855, |
|
"grad_norm": 1.1070556640625, |
|
"learning_rate": 8.023009385407206e-07, |
|
"loss": 1.2213, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 19818, |
|
"total_flos": 3.31395116433408e+17, |
|
"train_loss": 1.5000046812714452, |
|
"train_runtime": 7707.6508, |
|
"train_samples_per_second": 82.275, |
|
"train_steps_per_second": 2.571 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 19818, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.31395116433408e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|