{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 300000000000000000, "global_step": 4230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02364066193853428, "grad_norm": 4265.77734375, "learning_rate": 2.695035460992908e-06, "loss": 19.7865, "step": 50 }, { "epoch": 0.04728132387706856, "grad_norm": Infinity, "learning_rate": 6.2411347517730495e-06, "loss": 9.5661, "step": 100 }, { "epoch": 0.07092198581560284, "grad_norm": 1082.762939453125, "learning_rate": 9.645390070921986e-06, "loss": 2.3602, "step": 150 }, { "epoch": 0.09456264775413711, "grad_norm": 249.59776306152344, "learning_rate": 1.3191489361702129e-05, "loss": 1.7304, "step": 200 }, { "epoch": 0.1182033096926714, "grad_norm": 171.15611267089844, "learning_rate": 1.673758865248227e-05, "loss": 1.6707, "step": 250 }, { "epoch": 0.14184397163120568, "grad_norm": 64.97687530517578, "learning_rate": 2.028368794326241e-05, "loss": 1.5563, "step": 300 }, { "epoch": 0.16548463356973994, "grad_norm": 22.22820472717285, "learning_rate": 2.3829787234042553e-05, "loss": 1.4809, "step": 350 }, { "epoch": 0.18912529550827423, "grad_norm": 17.94223403930664, "learning_rate": 2.7375886524822697e-05, "loss": 1.3877, "step": 400 }, { "epoch": 0.2127659574468085, "grad_norm": 6.133880615234375, "learning_rate": 2.999913686685402e-05, "loss": 1.2877, "step": 450 }, { "epoch": 0.2364066193853428, "grad_norm": 5.359964370727539, "learning_rate": 2.9979733510221677e-05, "loss": 1.1985, "step": 500 }, { "epoch": 0.26004728132387706, "grad_norm": 8.637283325195312, "learning_rate": 2.9934831545542617e-05, "loss": 1.1513, "step": 550 }, { "epoch": 0.28368794326241137, "grad_norm": 13.667312622070312, "learning_rate": 2.986450740525784e-05, "loss": 1.0896, "step": 600 }, { "epoch": 0.3073286052009456, "grad_norm": 6.358010292053223, "learning_rate": 2.9768880795615002e-05, "loss": 1.0343, "step": 650 }, { "epoch": 0.3309692671394799, "grad_norm": 4.579405784606934, "learning_rate": 2.9648114492903583e-05, "loss": 0.9778, "step": 700 }, { "epoch": 0.3546099290780142, "grad_norm": 4.688839435577393, "learning_rate": 2.950241406637593e-05, "loss": 0.9688, "step": 750 }, { "epoch": 0.37825059101654845, "grad_norm": 5.001623153686523, "learning_rate": 2.9332027528325834e-05, "loss": 0.9246, "step": 800 }, { "epoch": 0.40189125295508277, "grad_norm": 5.845919609069824, "learning_rate": 2.9137244911920255e-05, "loss": 0.9075, "step": 850 }, { "epoch": 0.425531914893617, "grad_norm": 6.357454299926758, "learning_rate": 2.891839777750281e-05, "loss": 0.9032, "step": 900 }, { "epoch": 0.4491725768321513, "grad_norm": 4.689474105834961, "learning_rate": 2.8675858648209442e-05, "loss": 0.8833, "step": 950 }, { "epoch": 0.4728132387706856, "grad_norm": 5.220682621002197, "learning_rate": 2.841004037585688e-05, "loss": 0.8512, "step": 1000 }, { "epoch": 0.49645390070921985, "grad_norm": 6.682931423187256, "learning_rate": 2.8121395438183372e-05, "loss": 0.8526, "step": 1050 }, { "epoch": 0.5200945626477541, "grad_norm": 5.913801670074463, "learning_rate": 2.7810415168637912e-05, "loss": 0.8187, "step": 1100 }, { "epoch": 0.5437352245862884, "grad_norm": 5.719775676727295, "learning_rate": 2.7477628920028935e-05, "loss": 0.8174, "step": 1150 }, { "epoch": 0.5673758865248227, "grad_norm": 7.033783912658691, "learning_rate": 2.712360316345627e-05, "loss": 0.8378, "step": 1200 }, { "epoch": 0.5910165484633569, "grad_norm": 
10.351150512695312, "learning_rate": 2.6748940524060027e-05, "loss": 0.81, "step": 1250 }, { "epoch": 0.6146572104018913, "grad_norm": 6.334929466247559, "learning_rate": 2.6354278755227802e-05, "loss": 0.8019, "step": 1300 }, { "epoch": 0.6382978723404256, "grad_norm": 4.533697605133057, "learning_rate": 2.5940289653006427e-05, "loss": 0.7874, "step": 1350 }, { "epoch": 0.6619385342789598, "grad_norm": 5.18574857711792, "learning_rate": 2.550767791256593e-05, "loss": 0.7758, "step": 1400 }, { "epoch": 0.6855791962174941, "grad_norm": 5.3152923583984375, "learning_rate": 2.5057179928662506e-05, "loss": 0.7811, "step": 1450 }, { "epoch": 0.7092198581560284, "grad_norm": 6.021467685699463, "learning_rate": 2.458956254214211e-05, "loss": 0.7849, "step": 1500 }, { "epoch": 0.7328605200945626, "grad_norm": 9.204541206359863, "learning_rate": 2.4105621734618613e-05, "loss": 0.7653, "step": 1550 }, { "epoch": 0.7565011820330969, "grad_norm": 7.0010504722595215, "learning_rate": 2.3606181273548253e-05, "loss": 0.7578, "step": 1600 }, { "epoch": 0.7801418439716312, "grad_norm": 4.713606357574463, "learning_rate": 2.309209131000687e-05, "loss": 0.743, "step": 1650 }, { "epoch": 0.8037825059101655, "grad_norm": 8.350544929504395, "learning_rate": 2.256422693155675e-05, "loss": 0.7327, "step": 1700 }, { "epoch": 0.8274231678486997, "grad_norm": 5.2699713706970215, "learning_rate": 2.2023486672666385e-05, "loss": 0.7252, "step": 1750 }, { "epoch": 0.851063829787234, "grad_norm": 5.1293625831604, "learning_rate": 2.1470790985218804e-05, "loss": 0.7169, "step": 1800 }, { "epoch": 0.8747044917257684, "grad_norm": 5.4763407707214355, "learning_rate": 2.0907080671711832e-05, "loss": 0.7208, "step": 1850 }, { "epoch": 0.8983451536643026, "grad_norm": 13.955887794494629, "learning_rate": 2.0333315283817486e-05, "loss": 0.7309, "step": 1900 }, { "epoch": 0.9219858156028369, "grad_norm": 8.627326011657715, "learning_rate": 1.975047148902632e-05, "loss": 0.7155, "step": 1950 }, { "epoch": 0.9456264775413712, "grad_norm": 8.94743537902832, "learning_rate": 1.9159541408157158e-05, "loss": 0.7068, "step": 2000 }, { "epoch": 0.9692671394799054, "grad_norm": 5.135090351104736, "learning_rate": 1.8561530926562023e-05, "loss": 0.6918, "step": 2050 }, { "epoch": 0.9929078014184397, "grad_norm": 38.368629455566406, "learning_rate": 1.795745798190099e-05, "loss": 0.6818, "step": 2100 }, { "epoch": 1.016548463356974, "grad_norm": 25.27336311340332, "learning_rate": 1.734835083140153e-05, "loss": 0.685, "step": 2150 }, { "epoch": 1.0401891252955082, "grad_norm": 7.319797992706299, "learning_rate": 1.6735246301551825e-05, "loss": 0.6739, "step": 2200 }, { "epoch": 1.0638297872340425, "grad_norm": 10.93574333190918, "learning_rate": 1.6119188023207348e-05, "loss": 0.6828, "step": 2250 }, { "epoch": 1.0874704491725768, "grad_norm": 51.116207122802734, "learning_rate": 1.5501224655115118e-05, "loss": 0.6514, "step": 2300 }, { "epoch": 1.1111111111111112, "grad_norm": 17.643468856811523, "learning_rate": 1.4882408098879367e-05, "loss": 0.6708, "step": 2350 }, { "epoch": 1.1347517730496455, "grad_norm": 24.42500114440918, "learning_rate": 1.426379170840718e-05, "loss": 0.6595, "step": 2400 }, { "epoch": 1.1583924349881798, "grad_norm": 7.433150291442871, "learning_rate": 1.364642849688209e-05, "loss": 0.6545, "step": 2450 }, { "epoch": 1.1820330969267139, "grad_norm": 12.09028148651123, "learning_rate": 1.3031369344317569e-05, "loss": 0.6533, "step": 2500 }, { "epoch": 1.2056737588652482, "grad_norm": 8.16505241394043, 
"learning_rate": 1.2419661208741687e-05, "loss": 0.6466, "step": 2550 }, { "epoch": 1.2293144208037825, "grad_norm": 8.545854568481445, "learning_rate": 1.181234534405775e-05, "loss": 0.6311, "step": 2600 }, { "epoch": 1.2529550827423168, "grad_norm": 7.951985836029053, "learning_rate": 1.1210455527614574e-05, "loss": 0.6338, "step": 2650 }, { "epoch": 1.2765957446808511, "grad_norm": 17.269718170166016, "learning_rate": 1.061501630050338e-05, "loss": 0.6214, "step": 2700 }, { "epoch": 1.3002364066193852, "grad_norm": 14.052486419677734, "learning_rate": 1.0027041223576735e-05, "loss": 0.6307, "step": 2750 }, { "epoch": 1.3238770685579198, "grad_norm": 7.13311767578125, "learning_rate": 9.447531152158089e-06, "loss": 0.6273, "step": 2800 }, { "epoch": 1.3475177304964538, "grad_norm": 9.32107162475586, "learning_rate": 8.877472532378836e-06, "loss": 0.6218, "step": 2850 }, { "epoch": 1.3711583924349882, "grad_norm": 6.4954118728637695, "learning_rate": 8.317835722042693e-06, "loss": 0.6085, "step": 2900 }, { "epoch": 1.3947990543735225, "grad_norm": 7.570118427276611, "learning_rate": 7.769573338875851e-06, "loss": 0.6007, "step": 2950 }, { "epoch": 1.4184397163120568, "grad_norm": 18.045612335205078, "learning_rate": 7.2336186389743095e-06, "loss": 0.6177, "step": 3000 }, { "epoch": 1.442080378250591, "grad_norm": 10.485590934753418, "learning_rate": 6.710883928208835e-06, "loss": 0.5859, "step": 3050 }, { "epoch": 1.4657210401891252, "grad_norm": 13.521890640258789, "learning_rate": 6.202259009291401e-06, "loss": 0.6117, "step": 3100 }, { "epoch": 1.4893617021276595, "grad_norm": 10.763672828674316, "learning_rate": 5.708609667146788e-06, "loss": 0.5647, "step": 3150 }, { "epoch": 1.5130023640661938, "grad_norm": 13.315629959106445, "learning_rate": 5.2307761951673425e-06, "loss": 0.6115, "step": 3200 }, { "epoch": 1.5366430260047281, "grad_norm": 13.952415466308594, "learning_rate": 4.769571964859664e-06, "loss": 0.594, "step": 3250 }, { "epoch": 1.5602836879432624, "grad_norm": 9.341636657714844, "learning_rate": 4.325782041317874e-06, "loss": 0.5803, "step": 3300 }, { "epoch": 1.5839243498817965, "grad_norm": 23.52338218688965, "learning_rate": 3.900161846880281e-06, "loss": 0.5648, "step": 3350 }, { "epoch": 1.607565011820331, "grad_norm": 22.669052124023438, "learning_rate": 3.4934358752441315e-06, "loss": 0.5588, "step": 3400 }, { "epoch": 1.6312056737588652, "grad_norm": 50.38961410522461, "learning_rate": 3.106296458227363e-06, "loss": 0.5777, "step": 3450 }, { "epoch": 1.6548463356973995, "grad_norm": 6.503214359283447, "learning_rate": 2.7394025872764556e-06, "loss": 0.5688, "step": 3500 }, { "epoch": 1.6784869976359338, "grad_norm": 81.57908630371094, "learning_rate": 2.39337879172658e-06, "loss": 0.5849, "step": 3550 }, { "epoch": 1.702127659574468, "grad_norm": 10.680161476135254, "learning_rate": 2.0688140757233428e-06, "loss": 0.5761, "step": 3600 }, { "epoch": 1.7257683215130024, "grad_norm": 6.659136772155762, "learning_rate": 1.7662609156157749e-06, "loss": 0.566, "step": 3650 }, { "epoch": 1.7494089834515365, "grad_norm": 13.552983283996582, "learning_rate": 1.486234319527186e-06, "loss": 0.5625, "step": 3700 }, { "epoch": 1.773049645390071, "grad_norm": 22.759199142456055, "learning_rate": 1.2292109507047273e-06, "loss": 0.557, "step": 3750 }, { "epoch": 1.7966903073286051, "grad_norm": 36.262474060058594, "learning_rate": 9.956283161398172e-07, "loss": 0.5561, "step": 3800 }, { "epoch": 1.8203309692671394, "grad_norm": 8.722681999206543, "learning_rate": 
7.858840218406921e-07, "loss": 0.5624, "step": 3850 }, { "epoch": 1.8439716312056738, "grad_norm": 6.087028980255127, "learning_rate": 6.00335096024619e-07, "loss": 0.5747, "step": 3900 }, { "epoch": 1.867612293144208, "grad_norm": 9.753291130065918, "learning_rate": 4.3929738138196787e-07, "loss": 0.5629, "step": 3950 }, { "epoch": 1.8912529550827424, "grad_norm": 13.233976364135742, "learning_rate": 3.030449974465471e-07, "loss": 0.5588, "step": 4000 }, { "epoch": 1.9148936170212765, "grad_norm": 23.338430404663086, "learning_rate": 1.9180987398740358e-07, "loss": 0.5632, "step": 4050 }, { "epoch": 1.938534278959811, "grad_norm": 30.731971740722656, "learning_rate": 1.0578135621633178e-07, "loss": 0.5566, "step": 4100 }, { "epoch": 1.962174940898345, "grad_norm": 12.004512786865234, "learning_rate": 4.5105882483119643e-08, "loss": 0.5446, "step": 4150 }, { "epoch": 1.9858156028368794, "grad_norm": 18.55535125732422, "learning_rate": 9.886735007152425e-09, "loss": 0.5539, "step": 4200 } ], "logging_steps": 50, "max_steps": 4230, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.480992965002527e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }