{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09090909090909091, "grad_norm": 2.9437830448150635, "learning_rate": 9.818181818181818e-05, "loss": 0.2953, "step": 10 }, { "epoch": 0.18181818181818182, "grad_norm": 5.8472676277160645, "learning_rate": 9.636363636363637e-05, "loss": 0.349, "step": 20 }, { "epoch": 0.2727272727272727, "grad_norm": 2.7091057300567627, "learning_rate": 9.454545454545455e-05, "loss": 0.2583, "step": 30 }, { "epoch": 0.36363636363636365, "grad_norm": 3.0798821449279785, "learning_rate": 9.272727272727273e-05, "loss": 0.2806, "step": 40 }, { "epoch": 0.45454545454545453, "grad_norm": 0.9187162518501282, "learning_rate": 9.090909090909092e-05, "loss": 0.3229, "step": 50 }, { "epoch": 0.5454545454545454, "grad_norm": 1.0290968418121338, "learning_rate": 8.90909090909091e-05, "loss": 0.271, "step": 60 }, { "epoch": 0.6363636363636364, "grad_norm": 1.4106591939926147, "learning_rate": 8.727272727272727e-05, "loss": 0.3005, "step": 70 }, { "epoch": 0.7272727272727273, "grad_norm": 2.5027103424072266, "learning_rate": 8.545454545454545e-05, "loss": 0.3279, "step": 80 }, { "epoch": 0.8181818181818182, "grad_norm": 1.4250562191009521, "learning_rate": 8.363636363636364e-05, "loss": 0.261, "step": 90 }, { "epoch": 0.9090909090909091, "grad_norm": 2.6990318298339844, "learning_rate": 8.181818181818183e-05, "loss": 0.3056, "step": 100 }, { "epoch": 1.0, "grad_norm": 1.4321504831314087, "learning_rate": 8e-05, "loss": 0.3135, "step": 110 }, { "epoch": 1.0, "eval_loss": 0.26422303915023804, "eval_mse": 0.26422300934791565, "eval_runtime": 146.8663, "eval_samples_per_second": 5.992, "eval_steps_per_second": 0.191, "step": 110 }, { "epoch": 1.0909090909090908, "grad_norm": 1.3414746522903442, "learning_rate": 7.818181818181818e-05, "loss": 0.2876, "step": 120 }, { "epoch": 1.1818181818181819, "grad_norm": 2.420074939727783, "learning_rate": 7.636363636363637e-05, "loss": 0.2263, "step": 130 }, { "epoch": 1.2727272727272727, "grad_norm": 1.7859795093536377, "learning_rate": 7.454545454545455e-05, "loss": 0.2656, "step": 140 }, { "epoch": 1.3636363636363638, "grad_norm": 1.711840033531189, "learning_rate": 7.272727272727273e-05, "loss": 0.3181, "step": 150 }, { "epoch": 1.4545454545454546, "grad_norm": 1.9030543565750122, "learning_rate": 7.090909090909092e-05, "loss": 0.2948, "step": 160 }, { "epoch": 1.5454545454545454, "grad_norm": 4.079201698303223, "learning_rate": 6.90909090909091e-05, "loss": 0.3092, "step": 170 }, { "epoch": 1.6363636363636362, "grad_norm": 1.5639630556106567, "learning_rate": 6.727272727272727e-05, "loss": 0.3237, "step": 180 }, { "epoch": 1.7272727272727273, "grad_norm": 1.0921226739883423, "learning_rate": 6.545454545454546e-05, "loss": 0.2761, "step": 190 }, { "epoch": 1.8181818181818183, "grad_norm": 1.2289607524871826, "learning_rate": 6.363636363636364e-05, "loss": 0.2839, "step": 200 }, { "epoch": 1.9090909090909092, "grad_norm": 7.732837677001953, "learning_rate": 6.181818181818182e-05, "loss": 0.3596, "step": 210 }, { "epoch": 2.0, "grad_norm": 1.0822261571884155, "learning_rate": 6e-05, "loss": 0.2541, "step": 220 }, { "epoch": 2.0, "eval_loss": 0.2631426453590393, "eval_mse": 0.2631426453590393, "eval_runtime": 144.7981, "eval_samples_per_second": 6.077, "eval_steps_per_second": 0.193, "step": 220 }, { "epoch": 2.090909090909091, "grad_norm": 7.311972618103027, 
"learning_rate": 5.818181818181818e-05, "loss": 0.2924, "step": 230 }, { "epoch": 2.1818181818181817, "grad_norm": 2.816499948501587, "learning_rate": 5.636363636363636e-05, "loss": 0.2654, "step": 240 }, { "epoch": 2.2727272727272725, "grad_norm": 2.1038618087768555, "learning_rate": 5.4545454545454546e-05, "loss": 0.3056, "step": 250 }, { "epoch": 2.3636363636363638, "grad_norm": 2.7272915840148926, "learning_rate": 5.272727272727272e-05, "loss": 0.2994, "step": 260 }, { "epoch": 2.4545454545454546, "grad_norm": 1.5483731031417847, "learning_rate": 5.090909090909091e-05, "loss": 0.2938, "step": 270 }, { "epoch": 2.5454545454545454, "grad_norm": 1.6636422872543335, "learning_rate": 4.909090909090909e-05, "loss": 0.2762, "step": 280 }, { "epoch": 2.6363636363636362, "grad_norm": 2.477865219116211, "learning_rate": 4.7272727272727275e-05, "loss": 0.3129, "step": 290 }, { "epoch": 2.7272727272727275, "grad_norm": 2.1476693153381348, "learning_rate": 4.545454545454546e-05, "loss": 0.2656, "step": 300 }, { "epoch": 2.8181818181818183, "grad_norm": 1.3794795274734497, "learning_rate": 4.3636363636363636e-05, "loss": 0.2629, "step": 310 }, { "epoch": 2.909090909090909, "grad_norm": 0.8552964925765991, "learning_rate": 4.181818181818182e-05, "loss": 0.2593, "step": 320 }, { "epoch": 3.0, "grad_norm": 1.4841196537017822, "learning_rate": 4e-05, "loss": 0.2589, "step": 330 }, { "epoch": 3.0, "eval_loss": 0.2576321065425873, "eval_mse": 0.2576321065425873, "eval_runtime": 145.7562, "eval_samples_per_second": 6.037, "eval_steps_per_second": 0.192, "step": 330 }, { "epoch": 3.090909090909091, "grad_norm": 3.3481814861297607, "learning_rate": 3.818181818181819e-05, "loss": 0.267, "step": 340 }, { "epoch": 3.1818181818181817, "grad_norm": 1.704964280128479, "learning_rate": 3.6363636363636364e-05, "loss": 0.2177, "step": 350 }, { "epoch": 3.2727272727272725, "grad_norm": 1.3325728178024292, "learning_rate": 3.454545454545455e-05, "loss": 0.2815, "step": 360 }, { "epoch": 3.3636363636363638, "grad_norm": 2.831594705581665, "learning_rate": 3.272727272727273e-05, "loss": 0.2813, "step": 370 }, { "epoch": 3.4545454545454546, "grad_norm": 1.7833585739135742, "learning_rate": 3.090909090909091e-05, "loss": 0.2745, "step": 380 }, { "epoch": 3.5454545454545454, "grad_norm": 2.4361367225646973, "learning_rate": 2.909090909090909e-05, "loss": 0.3067, "step": 390 }, { "epoch": 3.6363636363636362, "grad_norm": 1.7597070932388306, "learning_rate": 2.7272727272727273e-05, "loss": 0.2706, "step": 400 }, { "epoch": 3.7272727272727275, "grad_norm": 2.5651698112487793, "learning_rate": 2.5454545454545454e-05, "loss": 0.2742, "step": 410 }, { "epoch": 3.8181818181818183, "grad_norm": 1.827591896057129, "learning_rate": 2.3636363636363637e-05, "loss": 0.2732, "step": 420 }, { "epoch": 3.909090909090909, "grad_norm": 3.2626404762268066, "learning_rate": 2.1818181818181818e-05, "loss": 0.2653, "step": 430 }, { "epoch": 4.0, "grad_norm": 2.742932081222534, "learning_rate": 2e-05, "loss": 0.3396, "step": 440 }, { "epoch": 4.0, "eval_loss": 0.2595486342906952, "eval_mse": 0.2595486640930176, "eval_runtime": 142.819, "eval_samples_per_second": 6.162, "eval_steps_per_second": 0.196, "step": 440 }, { "epoch": 4.090909090909091, "grad_norm": 1.4092158079147339, "learning_rate": 1.8181818181818182e-05, "loss": 0.2322, "step": 450 }, { "epoch": 4.181818181818182, "grad_norm": 4.581083297729492, "learning_rate": 1.6363636363636366e-05, "loss": 0.3015, "step": 460 }, { "epoch": 4.2727272727272725, "grad_norm": 1.8915051221847534, 
"learning_rate": 1.4545454545454545e-05, "loss": 0.2979, "step": 470 }, { "epoch": 4.363636363636363, "grad_norm": 2.127157688140869, "learning_rate": 1.2727272727272727e-05, "loss": 0.2525, "step": 480 }, { "epoch": 4.454545454545454, "grad_norm": 3.3702645301818848, "learning_rate": 1.0909090909090909e-05, "loss": 0.2603, "step": 490 }, { "epoch": 4.545454545454545, "grad_norm": 2.1041481494903564, "learning_rate": 9.090909090909091e-06, "loss": 0.2564, "step": 500 }, { "epoch": 4.636363636363637, "grad_norm": 2.5933034420013428, "learning_rate": 7.272727272727272e-06, "loss": 0.2677, "step": 510 }, { "epoch": 4.7272727272727275, "grad_norm": 2.349623918533325, "learning_rate": 5.4545454545454545e-06, "loss": 0.2752, "step": 520 }, { "epoch": 4.818181818181818, "grad_norm": 5.803585529327393, "learning_rate": 3.636363636363636e-06, "loss": 0.2943, "step": 530 }, { "epoch": 4.909090909090909, "grad_norm": 1.2074114084243774, "learning_rate": 1.818181818181818e-06, "loss": 0.2917, "step": 540 }, { "epoch": 5.0, "grad_norm": 0.9745060801506042, "learning_rate": 0.0, "loss": 0.2587, "step": 550 } ], "logging_steps": 10, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }