{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9974380871050385, "eval_steps": 500, "global_step": 292, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017079419299743808, "grad_norm": 16.74905776977539, "learning_rate": 0.0002, "loss": 0.7228, "step": 5 }, { "epoch": 0.034158838599487616, "grad_norm": 58.4304313659668, "learning_rate": 0.0002, "loss": 0.4776, "step": 10 }, { "epoch": 0.05123825789923143, "grad_norm": 5103.43115234375, "learning_rate": 0.0002, "loss": 0.4844, "step": 15 }, { "epoch": 0.06831767719897523, "grad_norm": 4.203484058380127, "learning_rate": 0.0002, "loss": 0.3647, "step": 20 }, { "epoch": 0.08539709649871904, "grad_norm": 5.00346040725708, "learning_rate": 0.0002, "loss": 0.2881, "step": 25 }, { "epoch": 0.10247651579846286, "grad_norm": 3.1734917163848877, "learning_rate": 0.0002, "loss": 0.2746, "step": 30 }, { "epoch": 0.11955593509820667, "grad_norm": 5.0422892570495605, "learning_rate": 0.0002, "loss": 0.2815, "step": 35 }, { "epoch": 0.13663535439795046, "grad_norm": 2.9244906902313232, "learning_rate": 0.0002, "loss": 0.2701, "step": 40 }, { "epoch": 0.1537147736976943, "grad_norm": 2.645214319229126, "learning_rate": 0.0002, "loss": 0.2735, "step": 45 }, { "epoch": 0.1707941929974381, "grad_norm": 4.402356147766113, "learning_rate": 0.0002, "loss": 0.2787, "step": 50 }, { "epoch": 0.18787361229718189, "grad_norm": 16.216989517211914, "learning_rate": 0.0002, "loss": 0.2773, "step": 55 }, { "epoch": 0.2049530315969257, "grad_norm": 2.3941385746002197, "learning_rate": 0.0002, "loss": 0.2555, "step": 60 }, { "epoch": 0.2220324508966695, "grad_norm": 4.721043109893799, "learning_rate": 0.0002, "loss": 0.2621, "step": 65 }, { "epoch": 0.23911187019641333, "grad_norm": 3.0449936389923096, "learning_rate": 0.0002, "loss": 0.2778, "step": 70 }, { "epoch": 0.2561912894961571, "grad_norm": 3.577408790588379, "learning_rate": 0.0002, "loss": 0.2719, "step": 75 }, { "epoch": 0.27327070879590093, "grad_norm": 5.593847751617432, "learning_rate": 0.0002, "loss": 0.2745, "step": 80 }, { "epoch": 0.29035012809564475, "grad_norm": 8.954093933105469, "learning_rate": 0.0002, "loss": 0.2729, "step": 85 }, { "epoch": 0.3074295473953886, "grad_norm": 3.8077216148376465, "learning_rate": 0.0002, "loss": 0.2858, "step": 90 }, { "epoch": 0.32450896669513235, "grad_norm": 3.859689235687256, "learning_rate": 0.0002, "loss": 0.2632, "step": 95 }, { "epoch": 0.3415883859948762, "grad_norm": 5.621855735778809, "learning_rate": 0.0002, "loss": 0.2741, "step": 100 }, { "epoch": 0.35866780529462, "grad_norm": 2.253854990005493, "learning_rate": 0.0002, "loss": 0.2703, "step": 105 }, { "epoch": 0.37574722459436377, "grad_norm": 4.003990650177002, "learning_rate": 0.0002, "loss": 0.2676, "step": 110 }, { "epoch": 0.3928266438941076, "grad_norm": 2.7299556732177734, "learning_rate": 0.0002, "loss": 0.257, "step": 115 }, { "epoch": 0.4099060631938514, "grad_norm": 2.6036922931671143, "learning_rate": 0.0002, "loss": 0.2559, "step": 120 }, { "epoch": 0.4269854824935952, "grad_norm": 4.3982439041137695, "learning_rate": 0.0002, "loss": 0.2618, "step": 125 }, { "epoch": 0.444064901793339, "grad_norm": 1.9455078840255737, "learning_rate": 0.0002, "loss": 0.2384, "step": 130 }, { "epoch": 0.46114432109308284, "grad_norm": 1.7560608386993408, "learning_rate": 0.0002, "loss": 0.2566, "step": 135 }, { "epoch": 0.47822374039282667, "grad_norm": 1.9673510789871216, "learning_rate": 0.0002, "loss": 0.2449, 
"step": 140 }, { "epoch": 0.49530315969257044, "grad_norm": 5.126251220703125, "learning_rate": 0.0002, "loss": 0.2447, "step": 145 }, { "epoch": 0.5123825789923142, "grad_norm": 2.063695192337036, "learning_rate": 0.0002, "loss": 0.2672, "step": 150 }, { "epoch": 0.5294619982920581, "grad_norm": 1.9485970735549927, "learning_rate": 0.0002, "loss": 0.2503, "step": 155 }, { "epoch": 0.5465414175918019, "grad_norm": 2.7796075344085693, "learning_rate": 0.0002, "loss": 0.2445, "step": 160 }, { "epoch": 0.5636208368915457, "grad_norm": 5.221657752990723, "learning_rate": 0.0002, "loss": 0.2611, "step": 165 }, { "epoch": 0.5807002561912895, "grad_norm": 1.8795976638793945, "learning_rate": 0.0002, "loss": 0.2534, "step": 170 }, { "epoch": 0.5977796754910333, "grad_norm": 1.9134082794189453, "learning_rate": 0.0002, "loss": 0.2367, "step": 175 }, { "epoch": 0.6148590947907772, "grad_norm": 1.7492831945419312, "learning_rate": 0.0002, "loss": 0.2488, "step": 180 }, { "epoch": 0.6319385140905209, "grad_norm": 1.947543740272522, "learning_rate": 0.0002, "loss": 0.2756, "step": 185 }, { "epoch": 0.6490179333902647, "grad_norm": 1.6915335655212402, "learning_rate": 0.0002, "loss": 0.2503, "step": 190 }, { "epoch": 0.6660973526900086, "grad_norm": 1.868281602859497, "learning_rate": 0.0002, "loss": 0.2539, "step": 195 }, { "epoch": 0.6831767719897524, "grad_norm": 1.8502358198165894, "learning_rate": 0.0002, "loss": 0.2486, "step": 200 }, { "epoch": 0.7002561912894961, "grad_norm": 1.773826241493225, "learning_rate": 0.0002, "loss": 0.2777, "step": 205 }, { "epoch": 0.71733561058924, "grad_norm": 1.6549079418182373, "learning_rate": 0.0002, "loss": 0.237, "step": 210 }, { "epoch": 0.7344150298889838, "grad_norm": 2.374314308166504, "learning_rate": 0.0002, "loss": 0.2348, "step": 215 }, { "epoch": 0.7514944491887275, "grad_norm": 1.527572751045227, "learning_rate": 0.0002, "loss": 0.2481, "step": 220 }, { "epoch": 0.7685738684884714, "grad_norm": 6.5215630531311035, "learning_rate": 0.0002, "loss": 0.255, "step": 225 }, { "epoch": 0.7856532877882152, "grad_norm": 1.6331310272216797, "learning_rate": 0.0002, "loss": 0.2463, "step": 230 }, { "epoch": 0.802732707087959, "grad_norm": 1.66902494430542, "learning_rate": 0.0002, "loss": 0.2565, "step": 235 }, { "epoch": 0.8198121263877028, "grad_norm": 1.773727536201477, "learning_rate": 0.0002, "loss": 0.2572, "step": 240 }, { "epoch": 0.8368915456874466, "grad_norm": 1.5620421171188354, "learning_rate": 0.0002, "loss": 0.2441, "step": 245 }, { "epoch": 0.8539709649871904, "grad_norm": 1.8150323629379272, "learning_rate": 0.0002, "loss": 0.2414, "step": 250 }, { "epoch": 0.8710503842869343, "grad_norm": 1.7679723501205444, "learning_rate": 0.0002, "loss": 0.2461, "step": 255 }, { "epoch": 0.888129803586678, "grad_norm": 1.8769031763076782, "learning_rate": 0.0002, "loss": 0.2541, "step": 260 }, { "epoch": 0.9052092228864219, "grad_norm": 1.8784369230270386, "learning_rate": 0.0002, "loss": 0.2451, "step": 265 }, { "epoch": 0.9222886421861657, "grad_norm": 2.3734097480773926, "learning_rate": 0.0002, "loss": 0.2465, "step": 270 }, { "epoch": 0.9393680614859095, "grad_norm": 1.8832842111587524, "learning_rate": 0.0002, "loss": 0.2532, "step": 275 }, { "epoch": 0.9564474807856533, "grad_norm": 1.890751600265503, "learning_rate": 0.0002, "loss": 0.246, "step": 280 }, { "epoch": 0.9735269000853971, "grad_norm": 1.5273067951202393, "learning_rate": 0.0002, "loss": 0.2547, "step": 285 }, { "epoch": 0.9906063193851409, "grad_norm": 1.633906602859497, 
"learning_rate": 0.0002, "loss": 0.2444, "step": 290 }, { "epoch": 0.9974380871050385, "step": 292, "total_flos": 2.0907667526516736e+17, "train_loss": 0.27567735936951965, "train_runtime": 6451.0878, "train_samples_per_second": 0.726, "train_steps_per_second": 0.045 } ], "logging_steps": 5, "max_steps": 292, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0907667526516736e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }