{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9968847352024922, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016614745586708203, "grad_norm": 0.061364494264125824, "learning_rate": 4.999991432639962e-05, "loss": 0.5857, "num_input_tokens_seen": 70408, "step": 1 }, { "epoch": 0.033229491173416406, "grad_norm": 0.0613991804420948, "learning_rate": 4.999965730618567e-05, "loss": 0.5331, "num_input_tokens_seen": 139640, "step": 2 }, { "epoch": 0.04984423676012461, "grad_norm": 0.06351307034492493, "learning_rate": 4.9999228941119745e-05, "loss": 0.5852, "num_input_tokens_seen": 223656, "step": 3 }, { "epoch": 0.06645898234683281, "grad_norm": 0.05762802064418793, "learning_rate": 4.999862923413781e-05, "loss": 0.5384, "num_input_tokens_seen": 300688, "step": 4 }, { "epoch": 0.08307372793354102, "grad_norm": 0.0632179006934166, "learning_rate": 4.999785818935018e-05, "loss": 0.5273, "num_input_tokens_seen": 366368, "step": 5 }, { "epoch": 0.09968847352024922, "grad_norm": 0.056689903140068054, "learning_rate": 4.999691581204152e-05, "loss": 0.5145, "num_input_tokens_seen": 445808, "step": 6 }, { "epoch": 0.11630321910695743, "grad_norm": 0.06574171781539917, "learning_rate": 4.9995802108670775e-05, "loss": 0.5301, "num_input_tokens_seen": 522800, "step": 7 }, { "epoch": 0.13291796469366562, "grad_norm": 0.06367070972919464, "learning_rate": 4.999451708687114e-05, "loss": 0.5552, "num_input_tokens_seen": 599608, "step": 8 }, { "epoch": 0.14953271028037382, "grad_norm": 0.0585966520011425, "learning_rate": 4.9993060755450015e-05, "loss": 0.5999, "num_input_tokens_seen": 681424, "step": 9 }, { "epoch": 0.16614745586708204, "grad_norm": 0.05650574713945389, "learning_rate": 4.999143312438893e-05, "loss": 0.4535, "num_input_tokens_seen": 756744, "step": 10 }, { "epoch": 0.18276220145379024, "grad_norm": 0.05954223498702049, "learning_rate": 4.998963420484349e-05, "loss": 0.4674, "num_input_tokens_seen": 842576, "step": 11 }, { "epoch": 0.19937694704049844, "grad_norm": 0.0663776770234108, "learning_rate": 4.998766400914329e-05, "loss": 0.4703, "num_input_tokens_seen": 917232, "step": 12 }, { "epoch": 0.21599169262720663, "grad_norm": 0.056374579668045044, "learning_rate": 4.9985522550791825e-05, "loss": 0.3725, "num_input_tokens_seen": 1006800, "step": 13 }, { "epoch": 0.23260643821391486, "grad_norm": 0.06437493115663528, "learning_rate": 4.998320984446641e-05, "loss": 0.4653, "num_input_tokens_seen": 1085824, "step": 14 }, { "epoch": 0.24922118380062305, "grad_norm": 0.06560757756233215, "learning_rate": 4.9980725906018074e-05, "loss": 0.5026, "num_input_tokens_seen": 1164160, "step": 15 }, { "epoch": 0.26583592938733125, "grad_norm": 0.06942517310380936, "learning_rate": 4.997807075247146e-05, "loss": 0.5401, "num_input_tokens_seen": 1242264, "step": 16 }, { "epoch": 0.2824506749740395, "grad_norm": 0.06349828094244003, "learning_rate": 4.997524440202469e-05, "loss": 0.4713, "num_input_tokens_seen": 1325904, "step": 17 }, { "epoch": 0.29906542056074764, "grad_norm": 0.08846385776996613, "learning_rate": 4.9972246874049254e-05, "loss": 0.5834, "num_input_tokens_seen": 1385632, "step": 18 }, { "epoch": 0.31568016614745587, "grad_norm": 0.062130190432071686, "learning_rate": 4.996907818908987e-05, "loss": 0.4045, "num_input_tokens_seen": 1470632, "step": 19 }, { "epoch": 0.3322949117341641, "grad_norm": 0.07743565738201141, "learning_rate": 4.996573836886435e-05, "loss": 0.5283, "num_input_tokens_seen": 1547536, "step": 20 }, { "epoch": 0.34890965732087226, "grad_norm": 0.06756695359945297, "learning_rate": 4.9962227436263453e-05, "loss": 0.4199, "num_input_tokens_seen": 1615528, "step": 21 }, { "epoch": 0.3655244029075805, "grad_norm": 0.08662309497594833, "learning_rate": 4.995854541535071e-05, "loss": 0.4775, "num_input_tokens_seen": 1694352, "step": 22 }, { "epoch": 0.3821391484942887, "grad_norm": 0.08380820602178574, "learning_rate": 4.9954692331362294e-05, "loss": 0.4871, "num_input_tokens_seen": 1753776, "step": 23 }, { "epoch": 0.3987538940809969, "grad_norm": 0.09967435896396637, "learning_rate": 4.995066821070679e-05, "loss": 0.4871, "num_input_tokens_seen": 1809048, "step": 24 }, { "epoch": 0.4153686396677051, "grad_norm": 0.0871267095208168, "learning_rate": 4.994647308096509e-05, "loss": 0.5461, "num_input_tokens_seen": 1884264, "step": 25 }, { "epoch": 0.43198338525441327, "grad_norm": 0.065020851790905, "learning_rate": 4.994210697089014e-05, "loss": 0.405, "num_input_tokens_seen": 1981704, "step": 26 }, { "epoch": 0.4485981308411215, "grad_norm": 0.09853450953960419, "learning_rate": 4.9937569910406756e-05, "loss": 0.4487, "num_input_tokens_seen": 2044144, "step": 27 }, { "epoch": 0.4652128764278297, "grad_norm": 0.08763110637664795, "learning_rate": 4.9932861930611454e-05, "loss": 0.3946, "num_input_tokens_seen": 2107584, "step": 28 }, { "epoch": 0.4818276220145379, "grad_norm": 0.08950547873973846, "learning_rate": 4.9927983063772196e-05, "loss": 0.4257, "num_input_tokens_seen": 2169248, "step": 29 }, { "epoch": 0.4984423676012461, "grad_norm": 0.09980211406946182, "learning_rate": 4.99229333433282e-05, "loss": 0.3911, "num_input_tokens_seen": 2230344, "step": 30 }, { "epoch": 0.5150571131879543, "grad_norm": 0.092055544257164, "learning_rate": 4.9917712803889674e-05, "loss": 0.3749, "num_input_tokens_seen": 2302368, "step": 31 }, { "epoch": 0.5316718587746625, "grad_norm": 0.10067818313837051, "learning_rate": 4.991232148123761e-05, "loss": 0.4761, "num_input_tokens_seen": 2369984, "step": 32 }, { "epoch": 0.5482866043613707, "grad_norm": 0.0717971920967102, "learning_rate": 4.990675941232353e-05, "loss": 0.4328, "num_input_tokens_seen": 2453032, "step": 33 }, { "epoch": 0.564901349948079, "grad_norm": 0.07436250895261765, "learning_rate": 4.990102663526924e-05, "loss": 0.417, "num_input_tokens_seen": 2527464, "step": 34 }, { "epoch": 0.5815160955347871, "grad_norm": 0.09256689995527267, "learning_rate": 4.989512318936655e-05, "loss": 0.4097, "num_input_tokens_seen": 2597032, "step": 35 }, { "epoch": 0.5981308411214953, "grad_norm": 0.09964177012443542, "learning_rate": 4.9889049115077005e-05, "loss": 0.4065, "num_input_tokens_seen": 2671704, "step": 36 }, { "epoch": 0.6147455867082036, "grad_norm": 0.06627887487411499, "learning_rate": 4.988280445403164e-05, "loss": 0.4136, "num_input_tokens_seen": 2767640, "step": 37 }, { "epoch": 0.6313603322949117, "grad_norm": 0.0746045857667923, "learning_rate": 4.987638924903067e-05, "loss": 0.4125, "num_input_tokens_seen": 2843720, "step": 38 }, { "epoch": 0.6479750778816199, "grad_norm": 0.0795741006731987, "learning_rate": 4.9869803544043166e-05, "loss": 0.3135, "num_input_tokens_seen": 2921472, "step": 39 }, { "epoch": 0.6645898234683282, "grad_norm": 0.08914181590080261, "learning_rate": 4.9863047384206835e-05, "loss": 0.4549, "num_input_tokens_seen": 2998400, "step": 40 }, { "epoch": 0.6812045690550363, "grad_norm": 0.11220043897628784, "learning_rate": 4.985612081582764e-05, "loss": 0.4135, "num_input_tokens_seen": 3059648, "step": 41 }, { "epoch": 0.6978193146417445, "grad_norm": 0.08390027284622192, "learning_rate": 4.98490238863795e-05, "loss": 0.3538, "num_input_tokens_seen": 3140184, "step": 42 }, { "epoch": 0.7144340602284528, "grad_norm": 0.08858532458543777, "learning_rate": 4.984175664450397e-05, "loss": 0.3644, "num_input_tokens_seen": 3207184, "step": 43 }, { "epoch": 0.731048805815161, "grad_norm": 0.07439564168453217, "learning_rate": 4.983431914000991e-05, "loss": 0.4019, "num_input_tokens_seen": 3292344, "step": 44 }, { "epoch": 0.7476635514018691, "grad_norm": 0.08694300055503845, "learning_rate": 4.982671142387316e-05, "loss": 0.4238, "num_input_tokens_seen": 3365384, "step": 45 }, { "epoch": 0.7642782969885774, "grad_norm": 0.0867784395813942, "learning_rate": 4.981893354823614e-05, "loss": 0.3702, "num_input_tokens_seen": 3440720, "step": 46 }, { "epoch": 0.7808930425752856, "grad_norm": 0.06278439611196518, "learning_rate": 4.9810985566407544e-05, "loss": 0.3354, "num_input_tokens_seen": 3533576, "step": 47 }, { "epoch": 0.7975077881619937, "grad_norm": 0.08999717980623245, "learning_rate": 4.980286753286195e-05, "loss": 0.4981, "num_input_tokens_seen": 3599744, "step": 48 }, { "epoch": 0.814122533748702, "grad_norm": 0.07938859611749649, "learning_rate": 4.979457950323945e-05, "loss": 0.4016, "num_input_tokens_seen": 3689520, "step": 49 }, { "epoch": 0.8307372793354102, "grad_norm": 0.1045590192079544, "learning_rate": 4.9786121534345265e-05, "loss": 0.388, "num_input_tokens_seen": 3751808, "step": 50 }, { "epoch": 0.8473520249221184, "grad_norm": 0.07890618592500687, "learning_rate": 4.9777493684149375e-05, "loss": 0.3674, "num_input_tokens_seen": 3839096, "step": 51 }, { "epoch": 0.8639667705088265, "grad_norm": 0.07802557945251465, "learning_rate": 4.976869601178609e-05, "loss": 0.4147, "num_input_tokens_seen": 3919824, "step": 52 }, { "epoch": 0.8805815160955348, "grad_norm": 0.0913538783788681, "learning_rate": 4.975972857755369e-05, "loss": 0.2978, "num_input_tokens_seen": 3989312, "step": 53 }, { "epoch": 0.897196261682243, "grad_norm": 0.08525951951742172, "learning_rate": 4.975059144291394e-05, "loss": 0.3923, "num_input_tokens_seen": 4060528, "step": 54 }, { "epoch": 0.9138110072689511, "grad_norm": 0.08649709820747375, "learning_rate": 4.974128467049176e-05, "loss": 0.3282, "num_input_tokens_seen": 4129368, "step": 55 }, { "epoch": 0.9304257528556594, "grad_norm": 0.11635593324899673, "learning_rate": 4.9731808324074717e-05, "loss": 0.3403, "num_input_tokens_seen": 4175208, "step": 56 }, { "epoch": 0.9470404984423676, "grad_norm": 0.1115177720785141, "learning_rate": 4.972216246861262e-05, "loss": 0.3191, "num_input_tokens_seen": 4218096, "step": 57 }, { "epoch": 0.9636552440290758, "grad_norm": 0.0986371859908104, "learning_rate": 4.971234717021709e-05, "loss": 0.3745, "num_input_tokens_seen": 4275968, "step": 58 }, { "epoch": 0.980269989615784, "grad_norm": 0.07860780507326126, "learning_rate": 4.9702362496161085e-05, "loss": 0.3129, "num_input_tokens_seen": 4346616, "step": 59 }, { "epoch": 0.9968847352024922, "grad_norm": 0.08581527322530746, "learning_rate": 4.9692208514878444e-05, "loss": 0.3324, "num_input_tokens_seen": 4425064, "step": 60 } ], "logging_steps": 1.0, "max_steps": 1200, "num_input_tokens_seen": 4425064, "num_train_epochs": 20, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8879949658967245e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }