{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 34.44804382324219, "learning_rate": 1e-05, "loss": 13.0101, "mean_token_accuracy": 0.4696590006351471, "step": 1 }, { "epoch": 0.016, "grad_norm": 30.779788970947266, "learning_rate": 2e-05, "loss": 12.3851, "mean_token_accuracy": 0.47303473204374313, "step": 2 }, { "epoch": 0.024, "grad_norm": 29.67559242248535, "learning_rate": 3e-05, "loss": 12.3488, "mean_token_accuracy": 0.49709559231996536, "step": 3 }, { "epoch": 0.032, "grad_norm": 26.862010955810547, "learning_rate": 4e-05, "loss": 11.6596, "mean_token_accuracy": 0.5584611147642136, "step": 4 }, { "epoch": 0.04, "grad_norm": 22.10072135925293, "learning_rate": 5e-05, "loss": 10.1384, "mean_token_accuracy": 0.5924926251173019, "step": 5 }, { "epoch": 0.048, "grad_norm": 20.171361923217773, "learning_rate": 4.9473684210526315e-05, "loss": 9.5421, "mean_token_accuracy": 0.5888276249170303, "step": 6 }, { "epoch": 0.056, "grad_norm": 16.452842712402344, "learning_rate": 4.8947368421052635e-05, "loss": 8.4344, "mean_token_accuracy": 0.632336363196373, "step": 7 }, { "epoch": 0.064, "grad_norm": 12.61925983428955, "learning_rate": 4.842105263157895e-05, "loss": 7.7786, "mean_token_accuracy": 0.6633019298315048, "step": 8 }, { "epoch": 0.072, "grad_norm": 11.540078163146973, "learning_rate": 4.789473684210526e-05, "loss": 7.5607, "mean_token_accuracy": 0.6721473336219788, "step": 9 }, { "epoch": 0.08, "grad_norm": 9.264492988586426, "learning_rate": 4.736842105263158e-05, "loss": 7.5589, "mean_token_accuracy": 0.6924590468406677, "step": 10 }, { "epoch": 0.088, "grad_norm": 9.118257522583008, "learning_rate": 4.68421052631579e-05, "loss": 7.2038, "mean_token_accuracy": 0.6936477273702621, "step": 11 }, { "epoch": 0.096, "grad_norm": 9.140292167663574, "learning_rate": 4.6315789473684214e-05, "loss": 6.7882, "mean_token_accuracy": 0.7149988412857056, "step": 12 }, { "epoch": 0.104, "grad_norm": 8.944159507751465, "learning_rate": 4.5789473684210527e-05, "loss": 6.7822, "mean_token_accuracy": 0.7038442492485046, "step": 13 }, { "epoch": 0.112, "grad_norm": 8.406500816345215, "learning_rate": 4.5263157894736846e-05, "loss": 6.6675, "mean_token_accuracy": 0.6912005394697189, "step": 14 }, { "epoch": 0.12, "grad_norm": 7.81226921081543, "learning_rate": 4.473684210526316e-05, "loss": 6.156, "mean_token_accuracy": 0.7322177290916443, "step": 15 }, { "epoch": 0.128, "grad_norm": 10.26094913482666, "learning_rate": 4.421052631578947e-05, "loss": 6.3945, "mean_token_accuracy": 0.7067171931266785, "step": 16 }, { "epoch": 0.136, "grad_norm": 9.275789260864258, "learning_rate": 4.368421052631579e-05, "loss": 6.0611, "mean_token_accuracy": 0.7207391560077667, "step": 17 }, { "epoch": 0.144, "grad_norm": 9.354033470153809, "learning_rate": 4.3157894736842105e-05, "loss": 6.1713, "mean_token_accuracy": 0.7210505157709122, "step": 18 }, { "epoch": 0.152, "grad_norm": 7.362099647521973, "learning_rate": 4.2631578947368425e-05, "loss": 6.4072, "mean_token_accuracy": 0.7182824611663818, "step": 19 }, { "epoch": 0.16, "grad_norm": 7.947159290313721, "learning_rate": 4.210526315789474e-05, "loss": 6.2321, "mean_token_accuracy": 0.7235151827335358, "step": 20 }, { "epoch": 0.168, "grad_norm": 9.023740768432617, "learning_rate": 4.157894736842106e-05, "loss": 6.0462, "mean_token_accuracy": 0.7328377515077591, "step": 21 }, { "epoch": 0.176, "grad_norm": 7.097681999206543, "learning_rate": 4.105263157894737e-05, "loss": 5.1996, "mean_token_accuracy": 0.7640947103500366, "step": 22 }, { "epoch": 0.184, "grad_norm": 7.417032718658447, "learning_rate": 4.0526315789473684e-05, "loss": 6.2611, "mean_token_accuracy": 0.7178521305322647, "step": 23 }, { "epoch": 0.192, "grad_norm": 6.950761318206787, "learning_rate": 4e-05, "loss": 5.5961, "mean_token_accuracy": 0.7444904744625092, "step": 24 }, { "epoch": 0.2, "grad_norm": 11.129012107849121, "learning_rate": 3.9473684210526316e-05, "loss": 5.3222, "mean_token_accuracy": 0.75727978348732, "step": 25 }, { "epoch": 0.208, "grad_norm": 7.0950469970703125, "learning_rate": 3.894736842105263e-05, "loss": 5.4028, "mean_token_accuracy": 0.7472832798957825, "step": 26 }, { "epoch": 0.216, "grad_norm": 9.279951095581055, "learning_rate": 3.842105263157895e-05, "loss": 5.6822, "mean_token_accuracy": 0.7470583468675613, "step": 27 }, { "epoch": 0.224, "grad_norm": 7.986962795257568, "learning_rate": 3.789473684210527e-05, "loss": 5.5314, "mean_token_accuracy": 0.7577493786811829, "step": 28 }, { "epoch": 0.232, "grad_norm": 7.804550647735596, "learning_rate": 3.736842105263158e-05, "loss": 5.6165, "mean_token_accuracy": 0.7538002282381058, "step": 29 }, { "epoch": 0.24, "grad_norm": 7.608943939208984, "learning_rate": 3.6842105263157895e-05, "loss": 5.8104, "mean_token_accuracy": 0.7309564054012299, "step": 30 }, { "epoch": 0.248, "grad_norm": 7.622748374938965, "learning_rate": 3.6315789473684214e-05, "loss": 6.0722, "mean_token_accuracy": 0.740648090839386, "step": 31 }, { "epoch": 0.256, "grad_norm": 7.36374044418335, "learning_rate": 3.578947368421053e-05, "loss": 5.1848, "mean_token_accuracy": 0.7709122896194458, "step": 32 }, { "epoch": 0.264, "grad_norm": 6.820411682128906, "learning_rate": 3.526315789473684e-05, "loss": 5.0621, "mean_token_accuracy": 0.7692567408084869, "step": 33 }, { "epoch": 0.272, "grad_norm": 6.473299980163574, "learning_rate": 3.473684210526316e-05, "loss": 5.3879, "mean_token_accuracy": 0.7631748914718628, "step": 34 }, { "epoch": 0.28, "grad_norm": 7.249595642089844, "learning_rate": 3.421052631578947e-05, "loss": 5.7251, "mean_token_accuracy": 0.7535725235939026, "step": 35 }, { "epoch": 0.288, "grad_norm": 6.396822929382324, "learning_rate": 3.368421052631579e-05, "loss": 5.5203, "mean_token_accuracy": 0.7570009678602219, "step": 36 }, { "epoch": 0.296, "grad_norm": 5.999199390411377, "learning_rate": 3.3157894736842106e-05, "loss": 4.8803, "mean_token_accuracy": 0.7936893254518509, "step": 37 }, { "epoch": 0.304, "grad_norm": 6.992559909820557, "learning_rate": 3.2631578947368426e-05, "loss": 5.3993, "mean_token_accuracy": 0.7533106952905655, "step": 38 }, { "epoch": 0.312, "grad_norm": 7.724341869354248, "learning_rate": 3.210526315789474e-05, "loss": 5.6051, "mean_token_accuracy": 0.7366128116846085, "step": 39 }, { "epoch": 0.32, "grad_norm": 6.947137832641602, "learning_rate": 3.157894736842105e-05, "loss": 5.5221, "mean_token_accuracy": 0.7558678537607193, "step": 40 }, { "epoch": 0.328, "grad_norm": 7.61790657043457, "learning_rate": 3.105263157894737e-05, "loss": 5.0442, "mean_token_accuracy": 0.7664971798658371, "step": 41 }, { "epoch": 0.336, "grad_norm": 6.234894275665283, "learning_rate": 3.0526315789473684e-05, "loss": 5.1083, "mean_token_accuracy": 0.7650484591722488, "step": 42 }, { "epoch": 0.344, "grad_norm": 6.606740951538086, "learning_rate": 3e-05, "loss": 4.7578, "mean_token_accuracy": 0.7714228928089142, "step": 43 }, { "epoch": 0.352, "grad_norm": 7.6674957275390625, "learning_rate": 2.9473684210526314e-05, "loss": 4.8684, "mean_token_accuracy": 0.7777758836746216, "step": 44 }, { "epoch": 0.36, "grad_norm": 6.467043399810791, "learning_rate": 2.8947368421052634e-05, "loss": 4.8455, "mean_token_accuracy": 0.7818402796983719, "step": 45 }, { "epoch": 0.368, "grad_norm": 8.154311180114746, "learning_rate": 2.842105263157895e-05, "loss": 4.2716, "mean_token_accuracy": 0.7975698858499527, "step": 46 }, { "epoch": 0.376, "grad_norm": 6.493333339691162, "learning_rate": 2.7894736842105263e-05, "loss": 4.8411, "mean_token_accuracy": 0.7828859686851501, "step": 47 }, { "epoch": 0.384, "grad_norm": 6.82868766784668, "learning_rate": 2.7368421052631583e-05, "loss": 5.1772, "mean_token_accuracy": 0.7642232775688171, "step": 48 }, { "epoch": 0.392, "grad_norm": 7.6515793800354, "learning_rate": 2.6842105263157896e-05, "loss": 5.3152, "mean_token_accuracy": 0.7572789639234543, "step": 49 }, { "epoch": 0.4, "grad_norm": 7.436869144439697, "learning_rate": 2.6315789473684212e-05, "loss": 5.6628, "mean_token_accuracy": 0.7437437325716019, "step": 50 }, { "epoch": 0.408, "grad_norm": 7.494150161743164, "learning_rate": 2.578947368421053e-05, "loss": 4.7967, "mean_token_accuracy": 0.7860404402017593, "step": 51 }, { "epoch": 0.416, "grad_norm": 7.129732131958008, "learning_rate": 2.5263157894736845e-05, "loss": 4.1962, "mean_token_accuracy": 0.8091482371091843, "step": 52 }, { "epoch": 0.424, "grad_norm": 6.54312801361084, "learning_rate": 2.4736842105263158e-05, "loss": 5.1673, "mean_token_accuracy": 0.7621930986642838, "step": 53 }, { "epoch": 0.432, "grad_norm": 7.091205596923828, "learning_rate": 2.4210526315789474e-05, "loss": 4.9484, "mean_token_accuracy": 0.7637773156166077, "step": 54 }, { "epoch": 0.44, "grad_norm": 6.452225685119629, "learning_rate": 2.368421052631579e-05, "loss": 5.2121, "mean_token_accuracy": 0.7751423120498657, "step": 55 }, { "epoch": 0.448, "grad_norm": 7.079778671264648, "learning_rate": 2.3157894736842107e-05, "loss": 4.4913, "mean_token_accuracy": 0.7925348579883575, "step": 56 }, { "epoch": 0.456, "grad_norm": 6.021428108215332, "learning_rate": 2.2631578947368423e-05, "loss": 5.1743, "mean_token_accuracy": 0.7735058069229126, "step": 57 }, { "epoch": 0.464, "grad_norm": 6.639064788818359, "learning_rate": 2.2105263157894736e-05, "loss": 4.2683, "mean_token_accuracy": 0.7983661592006683, "step": 58 }, { "epoch": 0.472, "grad_norm": 7.0270256996154785, "learning_rate": 2.1578947368421053e-05, "loss": 4.7973, "mean_token_accuracy": 0.7833641171455383, "step": 59 }, { "epoch": 0.48, "grad_norm": 6.48342752456665, "learning_rate": 2.105263157894737e-05, "loss": 4.7573, "mean_token_accuracy": 0.7896635830402374, "step": 60 }, { "epoch": 0.488, "grad_norm": 7.281738758087158, "learning_rate": 2.0526315789473685e-05, "loss": 5.3465, "mean_token_accuracy": 0.7754404097795486, "step": 61 }, { "epoch": 0.496, "grad_norm": 7.372372627258301, "learning_rate": 2e-05, "loss": 4.3753, "mean_token_accuracy": 0.7940351366996765, "step": 62 }, { "epoch": 0.504, "grad_norm": 7.551279544830322, "learning_rate": 1.9473684210526315e-05, "loss": 4.7479, "mean_token_accuracy": 0.7812730073928833, "step": 63 }, { "epoch": 0.512, "grad_norm": 6.71967077255249, "learning_rate": 1.8947368421052634e-05, "loss": 4.7599, "mean_token_accuracy": 0.7793499678373337, "step": 64 }, { "epoch": 0.52, "grad_norm": 7.2533063888549805, "learning_rate": 1.8421052631578947e-05, "loss": 4.8565, "mean_token_accuracy": 0.7772107869386673, "step": 65 }, { "epoch": 0.528, "grad_norm": 6.089155197143555, "learning_rate": 1.7894736842105264e-05, "loss": 4.4817, "mean_token_accuracy": 0.7843845635652542, "step": 66 }, { "epoch": 0.536, "grad_norm": 7.141996383666992, "learning_rate": 1.736842105263158e-05, "loss": 4.8889, "mean_token_accuracy": 0.7912164181470871, "step": 67 }, { "epoch": 0.544, "grad_norm": 6.36722993850708, "learning_rate": 1.6842105263157896e-05, "loss": 5.663, "mean_token_accuracy": 0.7410993576049805, "step": 68 }, { "epoch": 0.552, "grad_norm": 6.695736885070801, "learning_rate": 1.6315789473684213e-05, "loss": 5.4292, "mean_token_accuracy": 0.7530855089426041, "step": 69 }, { "epoch": 0.56, "grad_norm": 6.8679518699646, "learning_rate": 1.5789473684210526e-05, "loss": 4.6897, "mean_token_accuracy": 0.7747458964586258, "step": 70 }, { "epoch": 0.568, "grad_norm": 7.445884704589844, "learning_rate": 1.5263157894736842e-05, "loss": 4.9695, "mean_token_accuracy": 0.782129317522049, "step": 71 }, { "epoch": 0.576, "grad_norm": 6.467252254486084, "learning_rate": 1.4736842105263157e-05, "loss": 4.2316, "mean_token_accuracy": 0.8010141849517822, "step": 72 }, { "epoch": 0.584, "grad_norm": 6.673328876495361, "learning_rate": 1.4210526315789475e-05, "loss": 5.5348, "mean_token_accuracy": 0.7492043673992157, "step": 73 }, { "epoch": 0.592, "grad_norm": 6.37332820892334, "learning_rate": 1.3684210526315791e-05, "loss": 4.1118, "mean_token_accuracy": 0.8063640743494034, "step": 74 }, { "epoch": 0.6, "grad_norm": 6.5554327964782715, "learning_rate": 1.3157894736842106e-05, "loss": 4.3587, "mean_token_accuracy": 0.7785357385873795, "step": 75 }, { "epoch": 0.608, "grad_norm": 7.242586135864258, "learning_rate": 1.2631578947368422e-05, "loss": 4.7259, "mean_token_accuracy": 0.7722227722406387, "step": 76 }, { "epoch": 0.616, "grad_norm": 6.499831676483154, "learning_rate": 1.2105263157894737e-05, "loss": 4.582, "mean_token_accuracy": 0.8019344508647919, "step": 77 }, { "epoch": 0.624, "grad_norm": 6.212286949157715, "learning_rate": 1.1578947368421053e-05, "loss": 4.3784, "mean_token_accuracy": 0.7915610671043396, "step": 78 }, { "epoch": 0.632, "grad_norm": 6.525815963745117, "learning_rate": 1.1052631578947368e-05, "loss": 4.6062, "mean_token_accuracy": 0.7760037034749985, "step": 79 }, { "epoch": 0.64, "grad_norm": 6.437106609344482, "learning_rate": 1.0526315789473684e-05, "loss": 4.3811, "mean_token_accuracy": 0.7914712876081467, "step": 80 }, { "epoch": 0.648, "grad_norm": 6.544358730316162, "learning_rate": 1e-05, "loss": 4.1778, "mean_token_accuracy": 0.813565582036972, "step": 81 }, { "epoch": 0.656, "grad_norm": 5.799524307250977, "learning_rate": 9.473684210526317e-06, "loss": 4.5642, "mean_token_accuracy": 0.7896921932697296, "step": 82 }, { "epoch": 0.664, "grad_norm": 6.26992130279541, "learning_rate": 8.947368421052632e-06, "loss": 4.4728, "mean_token_accuracy": 0.7932559102773666, "step": 83 }, { "epoch": 0.672, "grad_norm": 7.898167133331299, "learning_rate": 8.421052631578948e-06, "loss": 4.7496, "mean_token_accuracy": 0.7824009358882904, "step": 84 }, { "epoch": 0.68, "grad_norm": 5.840052127838135, "learning_rate": 7.894736842105263e-06, "loss": 4.4144, "mean_token_accuracy": 0.8065742999315262, "step": 85 }, { "epoch": 0.688, "grad_norm": 5.884693145751953, "learning_rate": 7.3684210526315784e-06, "loss": 4.0572, "mean_token_accuracy": 0.8163146525621414, "step": 86 }, { "epoch": 0.696, "grad_norm": 6.5094380378723145, "learning_rate": 6.842105263157896e-06, "loss": 4.692, "mean_token_accuracy": 0.7821700870990753, "step": 87 }, { "epoch": 0.704, "grad_norm": 6.171222686767578, "learning_rate": 6.315789473684211e-06, "loss": 4.7373, "mean_token_accuracy": 0.7865629494190216, "step": 88 }, { "epoch": 0.712, "grad_norm": 6.430420875549316, "learning_rate": 5.789473684210527e-06, "loss": 5.1443, "mean_token_accuracy": 0.7733548134565353, "step": 89 }, { "epoch": 0.72, "grad_norm": 6.6331987380981445, "learning_rate": 5.263157894736842e-06, "loss": 4.8172, "mean_token_accuracy": 0.7886734008789062, "step": 90 }, { "epoch": 0.728, "grad_norm": 6.18582820892334, "learning_rate": 4.736842105263159e-06, "loss": 4.0192, "mean_token_accuracy": 0.8108067065477371, "step": 91 }, { "epoch": 0.736, "grad_norm": 6.787937641143799, "learning_rate": 4.210526315789474e-06, "loss": 4.3634, "mean_token_accuracy": 0.7937927544116974, "step": 92 }, { "epoch": 0.744, "grad_norm": 6.101965427398682, "learning_rate": 3.6842105263157892e-06, "loss": 4.7228, "mean_token_accuracy": 0.7906162887811661, "step": 93 }, { "epoch": 0.752, "grad_norm": 6.77475118637085, "learning_rate": 3.1578947368421056e-06, "loss": 4.4398, "mean_token_accuracy": 0.7847026586532593, "step": 94 }, { "epoch": 0.76, "grad_norm": 6.1204023361206055, "learning_rate": 2.631578947368421e-06, "loss": 4.3872, "mean_token_accuracy": 0.7921232283115387, "step": 95 }, { "epoch": 0.768, "grad_norm": 6.262160301208496, "learning_rate": 2.105263157894737e-06, "loss": 4.2201, "mean_token_accuracy": 0.7995105534791946, "step": 96 }, { "epoch": 0.776, "grad_norm": 6.433416843414307, "learning_rate": 1.5789473684210528e-06, "loss": 4.2633, "mean_token_accuracy": 0.782690092921257, "step": 97 }, { "epoch": 0.784, "grad_norm": 6.012047290802002, "learning_rate": 1.0526315789473685e-06, "loss": 4.2714, "mean_token_accuracy": 0.791715532541275, "step": 98 }, { "epoch": 0.792, "grad_norm": 6.2551445960998535, "learning_rate": 5.263157894736843e-07, "loss": 4.4735, "mean_token_accuracy": 0.7887705862522125, "step": 99 }, { "epoch": 0.8, "grad_norm": 6.641271591186523, "learning_rate": 0.0, "loss": 4.7771, "mean_token_accuracy": 0.7853755056858063, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1546494935040000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }