{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4998118178396688, "eval_steps": 1000, "global_step": 332, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015054572826496047, "grad_norm": 3.01820707321167, "learning_rate": 1.4705882352941177e-05, "loss": 1.4874, "step": 10 }, { "epoch": 0.030109145652992095, "grad_norm": 5.344997882843018, "learning_rate": 2.9411764705882354e-05, "loss": 1.4372, "step": 20 }, { "epoch": 0.04516371847948814, "grad_norm": 4.847251892089844, "learning_rate": 4.411764705882353e-05, "loss": 1.5973, "step": 30 }, { "epoch": 0.06021829130598419, "grad_norm": 31.4217586517334, "learning_rate": 4.995000403852057e-05, "loss": 1.8488, "step": 40 }, { "epoch": 0.07527286413248024, "grad_norm": 4.010560035705566, "learning_rate": 4.9645197025242506e-05, "loss": 1.6171, "step": 50 }, { "epoch": 0.09032743695897628, "grad_norm": 3.573289394378662, "learning_rate": 4.9066738880408945e-05, "loss": 1.5635, "step": 60 }, { "epoch": 0.10538200978547234, "grad_norm": 3.300258159637451, "learning_rate": 4.822105258882007e-05, "loss": 1.5138, "step": 70 }, { "epoch": 0.12043658261196838, "grad_norm": 2.9470131397247314, "learning_rate": 4.711752833755362e-05, "loss": 1.5063, "step": 80 }, { "epoch": 0.13549115543846443, "grad_norm": 3.3921890258789062, "learning_rate": 4.576841925080853e-05, "loss": 1.4952, "step": 90 }, { "epoch": 0.1505457282649605, "grad_norm": 2.8532607555389404, "learning_rate": 4.418870533575625e-05, "loss": 1.4475, "step": 100 }, { "epoch": 0.16560030109145654, "grad_norm": 2.333944082260132, "learning_rate": 4.239592715009429e-05, "loss": 1.4654, "step": 110 }, { "epoch": 0.18065487391795257, "grad_norm": 2.1384618282318115, "learning_rate": 4.040999103819606e-05, "loss": 1.4262, "step": 120 }, { "epoch": 0.19570944674444862, "grad_norm": 2.218015193939209, "learning_rate": 3.8252948098442344e-05, "loss": 1.4034, "step": 130 }, { "epoch": 0.21076401957094468, "grad_norm": 2.2272789478302, "learning_rate": 3.5948749335999496e-05, "loss": 1.3977, "step": 140 }, { "epoch": 0.22581859239744073, "grad_norm": 2.2398104667663574, "learning_rate": 3.3522979719736926e-05, "loss": 1.3365, "step": 150 }, { "epoch": 0.24087316522393676, "grad_norm": 1.9465758800506592, "learning_rate": 3.100257409621738e-05, "loss": 1.3573, "step": 160 }, { "epoch": 0.25592773805043284, "grad_norm": 1.9894447326660156, "learning_rate": 2.8415518115145674e-05, "loss": 1.3378, "step": 170 }, { "epoch": 0.27098231087692887, "grad_norm": 2.052354335784912, "learning_rate": 2.5790537487088974e-05, "loss": 1.3316, "step": 180 }, { "epoch": 0.2860368837034249, "grad_norm": 2.086660861968994, "learning_rate": 2.3156779023835525e-05, "loss": 1.3145, "step": 190 }, { "epoch": 0.301091456529921, "grad_norm": 1.757770299911499, "learning_rate": 2.054348700300158e-05, "loss": 1.3088, "step": 200 }, { "epoch": 0.316146029356417, "grad_norm": 1.7598217725753784, "learning_rate": 1.7979678450413845e-05, "loss": 1.2496, "step": 210 }, { "epoch": 0.3312006021829131, "grad_norm": 1.96985924243927, "learning_rate": 1.549382094581166e-05, "loss": 1.2671, "step": 220 }, { "epoch": 0.3462551750094091, "grad_norm": 2.18863582611084, "learning_rate": 1.3113516529394704e-05, "loss": 1.2616, "step": 230 }, { "epoch": 0.36130974783590514, "grad_norm": 1.9665237665176392, "learning_rate": 1.086519521900103e-05, "loss": 1.2281, "step": 240 }, { "epoch": 0.3763643206624012, "grad_norm": 1.5388092994689941, "learning_rate": 8.77382154098679e-06, "loss": 1.2435, "step": 250 }, { "epoch": 0.39141889348889725, "grad_norm": 2.0164120197296143, "learning_rate": 6.862617333380214e-06, "loss": 1.2359, "step": 260 }, { "epoch": 0.40647346631539333, "grad_norm": 1.8150506019592285, "learning_rate": 5.1528038992007e-06, "loss": 1.2225, "step": 270 }, { "epoch": 0.42152803914188935, "grad_norm": 1.8402557373046875, "learning_rate": 3.6633663729770008e-06, "loss": 1.2176, "step": 280 }, { "epoch": 0.4365826119683854, "grad_norm": 1.8227931261062622, "learning_rate": 2.4108429168514245e-06, "loss": 1.2321, "step": 290 }, { "epoch": 0.45163718479488146, "grad_norm": 1.6499103307724, "learning_rate": 1.4091410869582267e-06, "loss": 1.2331, "step": 300 }, { "epoch": 0.4666917576213775, "grad_norm": 1.8676466941833496, "learning_rate": 6.69383409075991e-07, "loss": 1.2173, "step": 310 }, { "epoch": 0.4817463304478735, "grad_norm": 1.8943265676498413, "learning_rate": 1.9978387822460197e-07, "loss": 1.2157, "step": 320 }, { "epoch": 0.4968009032743696, "grad_norm": 1.675297737121582, "learning_rate": 5.55675350954743e-09, "loss": 1.1926, "step": 330 }, { "epoch": 0.4998118178396688, "step": 332, "total_flos": 2.9336259752728986e+17, "train_loss": 1.3684147273201541, "train_runtime": 1819.5691, "train_samples_per_second": 23.361, "train_steps_per_second": 0.182 } ], "logging_steps": 10, "max_steps": 332, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.9336259752728986e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }