{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 500, "global_step": 555, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009009009009009009, "grad_norm": 1.4058982133865356, "learning_rate": 1.785714285714286e-05, "loss": 3.0329, "step": 10 }, { "epoch": 0.018018018018018018, "grad_norm": 1.5084704160690308, "learning_rate": 3.571428571428572e-05, "loss": 2.963, "step": 20 }, { "epoch": 0.02702702702702703, "grad_norm": 1.8131718635559082, "learning_rate": 5.3571428571428575e-05, "loss": 2.5483, "step": 30 }, { "epoch": 0.036036036036036036, "grad_norm": 1.7166498899459839, "learning_rate": 7.142857142857143e-05, "loss": 2.2173, "step": 40 }, { "epoch": 0.04504504504504504, "grad_norm": 1.9339869022369385, "learning_rate": 8.92857142857143e-05, "loss": 2.0999, "step": 50 }, { "epoch": 0.05405405405405406, "grad_norm": 1.7656594514846802, "learning_rate": 9.998414611537681e-05, "loss": 2.065, "step": 60 }, { "epoch": 0.06306306306306306, "grad_norm": 1.831594467163086, "learning_rate": 9.980590535514233e-05, "loss": 2.0001, "step": 70 }, { "epoch": 0.07207207207207207, "grad_norm": 1.879267692565918, "learning_rate": 9.943031509146825e-05, "loss": 1.9977, "step": 80 }, { "epoch": 0.08108108108108109, "grad_norm": 1.8009542226791382, "learning_rate": 9.885886355253758e-05, "loss": 2.0185, "step": 90 }, { "epoch": 0.09009009009009009, "grad_norm": 1.7639012336730957, "learning_rate": 9.809381504168234e-05, "loss": 1.9265, "step": 100 }, { "epoch": 0.0990990990990991, "grad_norm": 1.8977757692337036, "learning_rate": 9.713820096537225e-05, "loss": 1.9129, "step": 110 }, { "epoch": 0.10810810810810811, "grad_norm": 2.3275864124298096, "learning_rate": 9.599580782165598e-05, "loss": 1.9642, "step": 120 }, { "epoch": 0.11711711711711711, "grad_norm": 1.7085381746292114, "learning_rate": 9.467116219664894e-05, "loss": 1.9043, "step": 130 }, { "epoch": 0.12612612612612611, "grad_norm": 1.9735201597213745, "learning_rate": 9.316951282851707e-05, "loss": 1.8833, "step": 140 }, { "epoch": 0.13513513513513514, "grad_norm": 1.8018202781677246, "learning_rate": 9.149680981002609e-05, "loss": 1.9395, "step": 150 }, { "epoch": 0.14414414414414414, "grad_norm": 2.0004780292510986, "learning_rate": 8.965968101206291e-05, "loss": 1.8237, "step": 160 }, { "epoch": 0.15315315315315314, "grad_norm": 1.8323822021484375, "learning_rate": 8.766540582154859e-05, "loss": 1.9039, "step": 170 }, { "epoch": 0.16216216216216217, "grad_norm": 2.0277061462402344, "learning_rate": 8.552188629780244e-05, "loss": 1.8434, "step": 180 }, { "epoch": 0.17117117117117117, "grad_norm": 1.9151962995529175, "learning_rate": 8.323761586164695e-05, "loss": 1.8668, "step": 190 }, { "epoch": 0.18018018018018017, "grad_norm": 1.985343098640442, "learning_rate": 8.082164564131845e-05, "loss": 1.8238, "step": 200 }, { "epoch": 0.1891891891891892, "grad_norm": 1.9954603910446167, "learning_rate": 7.828354860853399e-05, "loss": 1.8616, "step": 210 }, { "epoch": 0.1981981981981982, "grad_norm": 2.0763890743255615, "learning_rate": 7.563338164682036e-05, "loss": 1.795, "step": 220 }, { "epoch": 0.2072072072072072, "grad_norm": 2.1642045974731445, "learning_rate": 7.288164570240463e-05, "loss": 1.7992, "step": 230 }, { "epoch": 0.21621621621621623, "grad_norm": 1.9592050313949585, "learning_rate": 7.003924417556343e-05, "loss": 1.7634, "step": 240 }, { "epoch": 0.22522522522522523, "grad_norm": 2.011563777923584, "learning_rate": 
6.711743971729967e-05, "loss": 1.796, "step": 250 }, { "epoch": 0.23423423423423423, "grad_norm": 2.1861069202423096, "learning_rate": 6.412780960253436e-05, "loss": 1.839, "step": 260 }, { "epoch": 0.24324324324324326, "grad_norm": 2.18581223487854, "learning_rate": 6.108219985664161e-05, "loss": 1.8259, "step": 270 }, { "epoch": 0.25225225225225223, "grad_norm": 2.018559694290161, "learning_rate": 5.799267831709442e-05, "loss": 1.7566, "step": 280 }, { "epoch": 0.26126126126126126, "grad_norm": 2.043818950653076, "learning_rate": 5.487148681620862e-05, "loss": 1.7795, "step": 290 }, { "epoch": 0.2702702702702703, "grad_norm": 2.263270854949951, "learning_rate": 5.173099267445451e-05, "loss": 1.7507, "step": 300 }, { "epoch": 0.27927927927927926, "grad_norm": 2.139925241470337, "learning_rate": 4.858363969653781e-05, "loss": 1.7462, "step": 310 }, { "epoch": 0.2882882882882883, "grad_norm": 2.1028239727020264, "learning_rate": 4.544189886442162e-05, "loss": 1.8059, "step": 320 }, { "epoch": 0.2972972972972973, "grad_norm": 2.3162405490875244, "learning_rate": 4.23182189226621e-05, "loss": 1.7456, "step": 330 }, { "epoch": 0.3063063063063063, "grad_norm": 2.153001546859741, "learning_rate": 3.9224977051856904e-05, "loss": 1.7294, "step": 340 }, { "epoch": 0.3153153153153153, "grad_norm": 2.3347864151000977, "learning_rate": 3.6174429825656685e-05, "loss": 1.7287, "step": 350 }, { "epoch": 0.32432432432432434, "grad_norm": 2.408170700073242, "learning_rate": 3.3178664645666066e-05, "loss": 1.7555, "step": 360 }, { "epoch": 0.3333333333333333, "grad_norm": 2.102254629135132, "learning_rate": 3.0249551846667207e-05, "loss": 1.7149, "step": 370 }, { "epoch": 0.34234234234234234, "grad_norm": 2.818976640701294, "learning_rate": 2.739869766194263e-05, "loss": 1.7532, "step": 380 }, { "epoch": 0.35135135135135137, "grad_norm": 2.4131245613098145, "learning_rate": 2.4637398235066527e-05, "loss": 1.6948, "step": 390 }, { "epoch": 0.36036036036036034, "grad_norm": 2.3351337909698486, "learning_rate": 2.1976594860386597e-05, "loss": 1.7533, "step": 400 }, { "epoch": 0.36936936936936937, "grad_norm": 2.7150731086730957, "learning_rate": 1.9426830629550242e-05, "loss": 1.7752, "step": 410 }, { "epoch": 0.3783783783783784, "grad_norm": 2.3618431091308594, "learning_rate": 1.6998208655858137e-05, "loss": 1.7286, "step": 420 }, { "epoch": 0.38738738738738737, "grad_norm": 2.6013503074645996, "learning_rate": 1.4700352041975168e-05, "loss": 1.7331, "step": 430 }, { "epoch": 0.3963963963963964, "grad_norm": 2.411003351211548, "learning_rate": 1.2542365749622049e-05, "loss": 1.7582, "step": 440 }, { "epoch": 0.40540540540540543, "grad_norm": 2.211366891860962, "learning_rate": 1.0532800522333902e-05, "loss": 1.6628, "step": 450 }, { "epoch": 0.4144144144144144, "grad_norm": 2.5683863162994385, "learning_rate": 8.67961900423711e-06, "loss": 1.6957, "step": 460 }, { "epoch": 0.42342342342342343, "grad_norm": 2.2518529891967773, "learning_rate": 6.990164189094589e-06, "loss": 1.7118, "step": 470 }, { "epoch": 0.43243243243243246, "grad_norm": 2.4307000637054443, "learning_rate": 5.4711303246361144e-06, "loss": 1.6488, "step": 480 }, { "epoch": 0.44144144144144143, "grad_norm": 2.3331761360168457, "learning_rate": 4.12853638746134e-06, "loss": 1.6793, "step": 490 }, { "epoch": 0.45045045045045046, "grad_norm": 2.3686130046844482, "learning_rate": 2.9677022336181413e-06, "loss": 1.6649, "step": 500 }, { "epoch": 0.45045045045045046, "eval_loss": 1.7574011087417603, "eval_runtime": 51.3954, 
"eval_samples_per_second": 76.758, "eval_steps_per_second": 19.204, "step": 500 }, { "epoch": 0.4594594594594595, "grad_norm": 2.348306655883789, "learning_rate": 1.993227519356189e-06, "loss": 1.7905, "step": 510 }, { "epoch": 0.46846846846846846, "grad_norm": 2.5951645374298096, "learning_rate": 1.208973475579761e-06, "loss": 1.7358, "step": 520 }, { "epoch": 0.4774774774774775, "grad_norm": 2.408781051635742, "learning_rate": 6.180476082162656e-07, "loss": 1.7079, "step": 530 }, { "epoch": 0.4864864864864865, "grad_norm": 2.519191265106201, "learning_rate": 2.2279138512300567e-07, "loss": 1.6984, "step": 540 }, { "epoch": 0.4954954954954955, "grad_norm": 2.511556625366211, "learning_rate": 2.4770958321568283e-08, "loss": 1.637, "step": 550 }, { "epoch": 0.5, "step": 555, "total_flos": 1.1620205475764634e+17, "train_loss": 1.8654380205515269, "train_runtime": 468.8189, "train_samples_per_second": 37.866, "train_steps_per_second": 1.184 } ], "logging_steps": 10, "max_steps": 555, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1620205475764634e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }