{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 19818, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02522958926228681, "grad_norm": 1.6351510286331177, "learning_rate": 4.873852053688566e-05, "loss": 4.2934, "step": 500 }, { "epoch": 0.05045917852457362, "grad_norm": 2.1481850147247314, "learning_rate": 4.747704107377132e-05, "loss": 2.8305, "step": 1000 }, { "epoch": 0.05045917852457362, "eval_accuracy": 0.46500522497769375, "eval_loss": 2.379011869430542, "eval_runtime": 56.6734, "eval_samples_per_second": 112.469, "eval_steps_per_second": 3.529, "step": 1000 }, { "epoch": 0.07568876778686043, "grad_norm": 1.5696136951446533, "learning_rate": 4.6215561610656984e-05, "loss": 2.2245, "step": 1500 }, { "epoch": 0.10091835704914724, "grad_norm": 1.5901012420654297, "learning_rate": 4.4954082147542644e-05, "loss": 1.9272, "step": 2000 }, { "epoch": 0.10091835704914724, "eval_accuracy": 0.5808557246708203, "eval_loss": 1.750130534172058, "eval_runtime": 55.5938, "eval_samples_per_second": 114.653, "eval_steps_per_second": 3.598, "step": 2000 }, { "epoch": 0.12614794631143406, "grad_norm": 1.5128012895584106, "learning_rate": 4.3692602684428305e-05, "loss": 1.7748, "step": 2500 }, { "epoch": 0.15137753557372086, "grad_norm": 1.318622350692749, "learning_rate": 4.243112322131396e-05, "loss": 1.6793, "step": 3000 }, { "epoch": 0.15137753557372086, "eval_accuracy": 0.6136108291841765, "eval_loss": 1.568946123123169, "eval_runtime": 54.4039, "eval_samples_per_second": 117.161, "eval_steps_per_second": 3.676, "step": 3000 }, { "epoch": 0.17660712483600766, "grad_norm": 1.4798802137374878, "learning_rate": 4.116964375819962e-05, "loss": 1.613, "step": 3500 }, { "epoch": 0.20183671409829448, "grad_norm": 1.2613641023635864, "learning_rate": 3.990816429508528e-05, "loss": 1.5605, "step": 4000 }, { "epoch": 0.20183671409829448, "eval_accuracy": 0.6316407595495017, "eval_loss": 1.4698580503463745, "eval_runtime": 55.0048, "eval_samples_per_second": 115.881, "eval_steps_per_second": 3.636, "step": 4000 }, { "epoch": 0.22706630336058128, "grad_norm": 1.1746481657028198, "learning_rate": 3.864668483197093e-05, "loss": 1.5228, "step": 4500 }, { "epoch": 0.2522958926228681, "grad_norm": 1.2291498184204102, "learning_rate": 3.738520536885659e-05, "loss": 1.4891, "step": 5000 }, { "epoch": 0.2522958926228681, "eval_accuracy": 0.642200060669245, "eval_loss": 1.4112207889556885, "eval_runtime": 54.5565, "eval_samples_per_second": 116.833, "eval_steps_per_second": 3.666, "step": 5000 }, { "epoch": 0.2775254818851549, "grad_norm": 1.257660150527954, "learning_rate": 3.6123725905742254e-05, "loss": 1.4607, "step": 5500 }, { "epoch": 0.3027550711474417, "grad_norm": 1.1654913425445557, "learning_rate": 3.4862246442627914e-05, "loss": 1.4391, "step": 6000 }, { "epoch": 0.3027550711474417, "eval_accuracy": 0.6513591843207115, "eval_loss": 1.364630103111267, "eval_runtime": 54.7084, "eval_samples_per_second": 116.509, "eval_steps_per_second": 3.656, "step": 6000 }, { "epoch": 0.3279846604097285, "grad_norm": 1.112001657485962, "learning_rate": 3.3600766979513575e-05, "loss": 1.4193, "step": 6500 }, { "epoch": 0.3532142496720153, "grad_norm": 1.1872960329055786, "learning_rate": 3.2339287516399235e-05, "loss": 1.3995, "step": 7000 }, { "epoch": 0.3532142496720153, "eval_accuracy": 0.6575222655822269, "eval_loss": 1.3316864967346191, "eval_runtime": 54.512, "eval_samples_per_second": 116.928, "eval_steps_per_second": 3.669, "step": 7000 }, { "epoch": 0.37844383893430217, "grad_norm": 1.129939079284668, "learning_rate": 3.1077808053284896e-05, "loss": 1.3852, "step": 7500 }, { "epoch": 0.40367342819658897, "grad_norm": 1.1093947887420654, "learning_rate": 2.9816328590170556e-05, "loss": 1.3707, "step": 8000 }, { "epoch": 0.40367342819658897, "eval_accuracy": 0.663146746266679, "eval_loss": 1.3020933866500854, "eval_runtime": 54.3588, "eval_samples_per_second": 117.258, "eval_steps_per_second": 3.679, "step": 8000 }, { "epoch": 0.42890301745887577, "grad_norm": 1.1245774030685425, "learning_rate": 2.855484912705621e-05, "loss": 1.3574, "step": 8500 }, { "epoch": 0.45413260672116257, "grad_norm": 1.1140567064285278, "learning_rate": 2.729336966394187e-05, "loss": 1.3424, "step": 9000 }, { "epoch": 0.45413260672116257, "eval_accuracy": 0.6674909770600935, "eval_loss": 1.2805943489074707, "eval_runtime": 54.0957, "eval_samples_per_second": 117.828, "eval_steps_per_second": 3.697, "step": 9000 }, { "epoch": 0.47936219598344937, "grad_norm": 1.1032965183258057, "learning_rate": 2.603189020082753e-05, "loss": 1.3331, "step": 9500 }, { "epoch": 0.5045917852457362, "grad_norm": 1.1406043767929077, "learning_rate": 2.477041073771319e-05, "loss": 1.3242, "step": 10000 }, { "epoch": 0.5045917852457362, "eval_accuracy": 0.6713588714661621, "eval_loss": 1.2613232135772705, "eval_runtime": 54.8004, "eval_samples_per_second": 116.313, "eval_steps_per_second": 3.65, "step": 10000 }, { "epoch": 0.529821374508023, "grad_norm": 1.0637890100479126, "learning_rate": 2.350893127459885e-05, "loss": 1.316, "step": 10500 }, { "epoch": 0.5550509637703098, "grad_norm": 1.0851575136184692, "learning_rate": 2.224745181148451e-05, "loss": 1.3058, "step": 11000 }, { "epoch": 0.5550509637703098, "eval_accuracy": 0.6747553370072272, "eval_loss": 1.2435107231140137, "eval_runtime": 54.5981, "eval_samples_per_second": 116.744, "eval_steps_per_second": 3.663, "step": 11000 }, { "epoch": 0.5802805530325966, "grad_norm": 1.1059494018554688, "learning_rate": 2.098597234837017e-05, "loss": 1.298, "step": 11500 }, { "epoch": 0.6055101422948834, "grad_norm": 1.1078968048095703, "learning_rate": 1.972449288525583e-05, "loss": 1.2888, "step": 12000 }, { "epoch": 0.6055101422948834, "eval_accuracy": 0.6777041444946341, "eval_loss": 1.229060173034668, "eval_runtime": 55.4356, "eval_samples_per_second": 114.98, "eval_steps_per_second": 3.608, "step": 12000 }, { "epoch": 0.6307397315571702, "grad_norm": 1.1068435907363892, "learning_rate": 1.846301342214149e-05, "loss": 1.2855, "step": 12500 }, { "epoch": 0.655969320819457, "grad_norm": 1.0919195413589478, "learning_rate": 1.7201533959027147e-05, "loss": 1.2748, "step": 13000 }, { "epoch": 0.655969320819457, "eval_accuracy": 0.6801306075727364, "eval_loss": 1.2177754640579224, "eval_runtime": 55.3404, "eval_samples_per_second": 115.178, "eval_steps_per_second": 3.614, "step": 13000 }, { "epoch": 0.6811989100817438, "grad_norm": 1.0631417036056519, "learning_rate": 1.5940054495912807e-05, "loss": 1.2717, "step": 13500 }, { "epoch": 0.7064284993440306, "grad_norm": 1.1203536987304688, "learning_rate": 1.4678575032798466e-05, "loss": 1.2654, "step": 14000 }, { "epoch": 0.7064284993440306, "eval_accuracy": 0.6820543563309032, "eval_loss": 1.2076771259307861, "eval_runtime": 54.4302, "eval_samples_per_second": 117.104, "eval_steps_per_second": 3.674, "step": 14000 }, { "epoch": 0.7316580886063175, "grad_norm": 1.0601987838745117, "learning_rate": 1.3417095569684127e-05, "loss": 1.2592, "step": 14500 }, { "epoch": 0.7568876778686043, "grad_norm": 1.0845381021499634, "learning_rate": 1.2155616106569785e-05, "loss": 1.2549, "step": 15000 }, { "epoch": 0.7568876778686043, "eval_accuracy": 0.6842664220266779, "eval_loss": 1.196437120437622, "eval_runtime": 54.7305, "eval_samples_per_second": 116.462, "eval_steps_per_second": 3.654, "step": 15000 }, { "epoch": 0.7821172671308911, "grad_norm": 1.1175463199615479, "learning_rate": 1.0894136643455446e-05, "loss": 1.2499, "step": 15500 }, { "epoch": 0.8073468563931779, "grad_norm": 1.1143847703933716, "learning_rate": 9.632657180341105e-06, "loss": 1.2459, "step": 16000 }, { "epoch": 0.8073468563931779, "eval_accuracy": 0.6860334367900387, "eval_loss": 1.1877895593643188, "eval_runtime": 54.8073, "eval_samples_per_second": 116.298, "eval_steps_per_second": 3.649, "step": 16000 }, { "epoch": 0.8325764456554647, "grad_norm": 1.0714515447616577, "learning_rate": 8.371177717226763e-06, "loss": 1.2401, "step": 16500 }, { "epoch": 0.8578060349177515, "grad_norm": 1.0937442779541016, "learning_rate": 7.109698254112424e-06, "loss": 1.2385, "step": 17000 }, { "epoch": 0.8578060349177515, "eval_accuracy": 0.6873077056382217, "eval_loss": 1.1818801164627075, "eval_runtime": 54.4299, "eval_samples_per_second": 117.105, "eval_steps_per_second": 3.674, "step": 17000 }, { "epoch": 0.8830356241800383, "grad_norm": 1.125985026359558, "learning_rate": 5.848218790998083e-06, "loss": 1.2335, "step": 17500 }, { "epoch": 0.9082652134423251, "grad_norm": 1.0988380908966064, "learning_rate": 4.586739327883742e-06, "loss": 1.2286, "step": 18000 }, { "epoch": 0.9082652134423251, "eval_accuracy": 0.6885514558318389, "eval_loss": 1.1755918264389038, "eval_runtime": 54.168, "eval_samples_per_second": 117.671, "eval_steps_per_second": 3.692, "step": 18000 }, { "epoch": 0.9334948027046119, "grad_norm": 1.0741068124771118, "learning_rate": 3.325259864769402e-06, "loss": 1.2284, "step": 18500 }, { "epoch": 0.9587243919668987, "grad_norm": 1.1670308113098145, "learning_rate": 2.063780401655061e-06, "loss": 1.2246, "step": 19000 }, { "epoch": 0.9587243919668987, "eval_accuracy": 0.6896297305064778, "eval_loss": 1.1708202362060547, "eval_runtime": 54.2124, "eval_samples_per_second": 117.575, "eval_steps_per_second": 3.689, "step": 19000 }, { "epoch": 0.9839539812291855, "grad_norm": 1.1070556640625, "learning_rate": 8.023009385407206e-07, "loss": 1.2213, "step": 19500 }, { "epoch": 1.0, "step": 19818, "total_flos": 3.31395116433408e+17, "train_loss": 1.5000046812714452, "train_runtime": 7707.6508, "train_samples_per_second": 82.275, "train_steps_per_second": 2.571 } ], "logging_steps": 500, "max_steps": 19818, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.31395116433408e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }