{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 8.863966770508826, |
|
"eval_steps": 500, |
|
"global_step": 540, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016614745586708203, |
|
"grad_norm": 0.061364494264125824, |
|
"learning_rate": 4.999991432639962e-05, |
|
"loss": 0.5857, |
|
"num_input_tokens_seen": 70408, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.033229491173416406, |
|
"grad_norm": 0.0613991804420948, |
|
"learning_rate": 4.999965730618567e-05, |
|
"loss": 0.5331, |
|
"num_input_tokens_seen": 139640, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.04984423676012461, |
|
"grad_norm": 0.06351307034492493, |
|
"learning_rate": 4.9999228941119745e-05, |
|
"loss": 0.5852, |
|
"num_input_tokens_seen": 223656, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.06645898234683281, |
|
"grad_norm": 0.05762802064418793, |
|
"learning_rate": 4.999862923413781e-05, |
|
"loss": 0.5384, |
|
"num_input_tokens_seen": 300688, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.08307372793354102, |
|
"grad_norm": 0.0632179006934166, |
|
"learning_rate": 4.999785818935018e-05, |
|
"loss": 0.5273, |
|
"num_input_tokens_seen": 366368, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09968847352024922, |
|
"grad_norm": 0.056689903140068054, |
|
"learning_rate": 4.999691581204152e-05, |
|
"loss": 0.5145, |
|
"num_input_tokens_seen": 445808, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.11630321910695743, |
|
"grad_norm": 0.06574171781539917, |
|
"learning_rate": 4.9995802108670775e-05, |
|
"loss": 0.5301, |
|
"num_input_tokens_seen": 522800, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.13291796469366562, |
|
"grad_norm": 0.06367070972919464, |
|
"learning_rate": 4.999451708687114e-05, |
|
"loss": 0.5552, |
|
"num_input_tokens_seen": 599608, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.14953271028037382, |
|
"grad_norm": 0.0585966520011425, |
|
"learning_rate": 4.9993060755450015e-05, |
|
"loss": 0.5999, |
|
"num_input_tokens_seen": 681424, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.16614745586708204, |
|
"grad_norm": 0.05650574713945389, |
|
"learning_rate": 4.999143312438893e-05, |
|
"loss": 0.4535, |
|
"num_input_tokens_seen": 756744, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.18276220145379024, |
|
"grad_norm": 0.05954223498702049, |
|
"learning_rate": 4.998963420484349e-05, |
|
"loss": 0.4674, |
|
"num_input_tokens_seen": 842576, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.19937694704049844, |
|
"grad_norm": 0.0663776770234108, |
|
"learning_rate": 4.998766400914329e-05, |
|
"loss": 0.4703, |
|
"num_input_tokens_seen": 917232, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.21599169262720663, |
|
"grad_norm": 0.056374579668045044, |
|
"learning_rate": 4.9985522550791825e-05, |
|
"loss": 0.3725, |
|
"num_input_tokens_seen": 1006800, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.23260643821391486, |
|
"grad_norm": 0.06437493115663528, |
|
"learning_rate": 4.998320984446641e-05, |
|
"loss": 0.4653, |
|
"num_input_tokens_seen": 1085824, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.24922118380062305, |
|
"grad_norm": 0.06560757756233215, |
|
"learning_rate": 4.9980725906018074e-05, |
|
"loss": 0.5026, |
|
"num_input_tokens_seen": 1164160, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.26583592938733125, |
|
"grad_norm": 0.06942517310380936, |
|
"learning_rate": 4.997807075247146e-05, |
|
"loss": 0.5401, |
|
"num_input_tokens_seen": 1242264, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.2824506749740395, |
|
"grad_norm": 0.06349828094244003, |
|
"learning_rate": 4.997524440202469e-05, |
|
"loss": 0.4713, |
|
"num_input_tokens_seen": 1325904, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.29906542056074764, |
|
"grad_norm": 0.08846385776996613, |
|
"learning_rate": 4.9972246874049254e-05, |
|
"loss": 0.5834, |
|
"num_input_tokens_seen": 1385632, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.31568016614745587, |
|
"grad_norm": 0.062130190432071686, |
|
"learning_rate": 4.996907818908987e-05, |
|
"loss": 0.4045, |
|
"num_input_tokens_seen": 1470632, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.3322949117341641, |
|
"grad_norm": 0.07743565738201141, |
|
"learning_rate": 4.996573836886435e-05, |
|
"loss": 0.5283, |
|
"num_input_tokens_seen": 1547536, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.34890965732087226, |
|
"grad_norm": 0.06756695359945297, |
|
"learning_rate": 4.9962227436263453e-05, |
|
"loss": 0.4199, |
|
"num_input_tokens_seen": 1615528, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.3655244029075805, |
|
"grad_norm": 0.08662309497594833, |
|
"learning_rate": 4.995854541535071e-05, |
|
"loss": 0.4775, |
|
"num_input_tokens_seen": 1694352, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.3821391484942887, |
|
"grad_norm": 0.08380820602178574, |
|
"learning_rate": 4.9954692331362294e-05, |
|
"loss": 0.4871, |
|
"num_input_tokens_seen": 1753776, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.3987538940809969, |
|
"grad_norm": 0.09967435896396637, |
|
"learning_rate": 4.995066821070679e-05, |
|
"loss": 0.4871, |
|
"num_input_tokens_seen": 1809048, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.4153686396677051, |
|
"grad_norm": 0.0871267095208168, |
|
"learning_rate": 4.994647308096509e-05, |
|
"loss": 0.5461, |
|
"num_input_tokens_seen": 1884264, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.43198338525441327, |
|
"grad_norm": 0.065020851790905, |
|
"learning_rate": 4.994210697089014e-05, |
|
"loss": 0.405, |
|
"num_input_tokens_seen": 1981704, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.4485981308411215, |
|
"grad_norm": 0.09853450953960419, |
|
"learning_rate": 4.9937569910406756e-05, |
|
"loss": 0.4487, |
|
"num_input_tokens_seen": 2044144, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.4652128764278297, |
|
"grad_norm": 0.08763110637664795, |
|
"learning_rate": 4.9932861930611454e-05, |
|
"loss": 0.3946, |
|
"num_input_tokens_seen": 2107584, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.4818276220145379, |
|
"grad_norm": 0.08950547873973846, |
|
"learning_rate": 4.9927983063772196e-05, |
|
"loss": 0.4257, |
|
"num_input_tokens_seen": 2169248, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.4984423676012461, |
|
"grad_norm": 0.09980211406946182, |
|
"learning_rate": 4.99229333433282e-05, |
|
"loss": 0.3911, |
|
"num_input_tokens_seen": 2230344, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5150571131879543, |
|
"grad_norm": 0.092055544257164, |
|
"learning_rate": 4.9917712803889674e-05, |
|
"loss": 0.3749, |
|
"num_input_tokens_seen": 2302368, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.5316718587746625, |
|
"grad_norm": 0.10067818313837051, |
|
"learning_rate": 4.991232148123761e-05, |
|
"loss": 0.4761, |
|
"num_input_tokens_seen": 2369984, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.5482866043613707, |
|
"grad_norm": 0.0717971920967102, |
|
"learning_rate": 4.990675941232353e-05, |
|
"loss": 0.4328, |
|
"num_input_tokens_seen": 2453032, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.564901349948079, |
|
"grad_norm": 0.07436250895261765, |
|
"learning_rate": 4.990102663526924e-05, |
|
"loss": 0.417, |
|
"num_input_tokens_seen": 2527464, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5815160955347871, |
|
"grad_norm": 0.09256689995527267, |
|
"learning_rate": 4.989512318936655e-05, |
|
"loss": 0.4097, |
|
"num_input_tokens_seen": 2597032, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5981308411214953, |
|
"grad_norm": 0.09964177012443542, |
|
"learning_rate": 4.9889049115077005e-05, |
|
"loss": 0.4065, |
|
"num_input_tokens_seen": 2671704, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.6147455867082036, |
|
"grad_norm": 0.06627887487411499, |
|
"learning_rate": 4.988280445403164e-05, |
|
"loss": 0.4136, |
|
"num_input_tokens_seen": 2767640, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.6313603322949117, |
|
"grad_norm": 0.0746045857667923, |
|
"learning_rate": 4.987638924903067e-05, |
|
"loss": 0.4125, |
|
"num_input_tokens_seen": 2843720, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.6479750778816199, |
|
"grad_norm": 0.0795741006731987, |
|
"learning_rate": 4.9869803544043166e-05, |
|
"loss": 0.3135, |
|
"num_input_tokens_seen": 2921472, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.6645898234683282, |
|
"grad_norm": 0.08914181590080261, |
|
"learning_rate": 4.9863047384206835e-05, |
|
"loss": 0.4549, |
|
"num_input_tokens_seen": 2998400, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6812045690550363, |
|
"grad_norm": 0.11220043897628784, |
|
"learning_rate": 4.985612081582764e-05, |
|
"loss": 0.4135, |
|
"num_input_tokens_seen": 3059648, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.6978193146417445, |
|
"grad_norm": 0.08390027284622192, |
|
"learning_rate": 4.98490238863795e-05, |
|
"loss": 0.3538, |
|
"num_input_tokens_seen": 3140184, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.7144340602284528, |
|
"grad_norm": 0.08858532458543777, |
|
"learning_rate": 4.984175664450397e-05, |
|
"loss": 0.3644, |
|
"num_input_tokens_seen": 3207184, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.731048805815161, |
|
"grad_norm": 0.07439564168453217, |
|
"learning_rate": 4.983431914000991e-05, |
|
"loss": 0.4019, |
|
"num_input_tokens_seen": 3292344, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.7476635514018691, |
|
"grad_norm": 0.08694300055503845, |
|
"learning_rate": 4.982671142387316e-05, |
|
"loss": 0.4238, |
|
"num_input_tokens_seen": 3365384, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.7642782969885774, |
|
"grad_norm": 0.0867784395813942, |
|
"learning_rate": 4.981893354823614e-05, |
|
"loss": 0.3702, |
|
"num_input_tokens_seen": 3440720, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.7808930425752856, |
|
"grad_norm": 0.06278439611196518, |
|
"learning_rate": 4.9810985566407544e-05, |
|
"loss": 0.3354, |
|
"num_input_tokens_seen": 3533576, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.7975077881619937, |
|
"grad_norm": 0.08999717980623245, |
|
"learning_rate": 4.980286753286195e-05, |
|
"loss": 0.4981, |
|
"num_input_tokens_seen": 3599744, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.814122533748702, |
|
"grad_norm": 0.07938859611749649, |
|
"learning_rate": 4.979457950323945e-05, |
|
"loss": 0.4016, |
|
"num_input_tokens_seen": 3689520, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.8307372793354102, |
|
"grad_norm": 0.1045590192079544, |
|
"learning_rate": 4.9786121534345265e-05, |
|
"loss": 0.388, |
|
"num_input_tokens_seen": 3751808, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8473520249221184, |
|
"grad_norm": 0.07890618592500687, |
|
"learning_rate": 4.9777493684149375e-05, |
|
"loss": 0.3674, |
|
"num_input_tokens_seen": 3839096, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.8639667705088265, |
|
"grad_norm": 0.07802557945251465, |
|
"learning_rate": 4.976869601178609e-05, |
|
"loss": 0.4147, |
|
"num_input_tokens_seen": 3919824, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.8805815160955348, |
|
"grad_norm": 0.0913538783788681, |
|
"learning_rate": 4.975972857755369e-05, |
|
"loss": 0.2978, |
|
"num_input_tokens_seen": 3989312, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.897196261682243, |
|
"grad_norm": 0.08525951951742172, |
|
"learning_rate": 4.975059144291394e-05, |
|
"loss": 0.3923, |
|
"num_input_tokens_seen": 4060528, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.9138110072689511, |
|
"grad_norm": 0.08649709820747375, |
|
"learning_rate": 4.974128467049176e-05, |
|
"loss": 0.3282, |
|
"num_input_tokens_seen": 4129368, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.9304257528556594, |
|
"grad_norm": 0.11635593324899673, |
|
"learning_rate": 4.9731808324074717e-05, |
|
"loss": 0.3403, |
|
"num_input_tokens_seen": 4175208, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.9470404984423676, |
|
"grad_norm": 0.1115177720785141, |
|
"learning_rate": 4.972216246861262e-05, |
|
"loss": 0.3191, |
|
"num_input_tokens_seen": 4218096, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.9636552440290758, |
|
"grad_norm": 0.0986371859908104, |
|
"learning_rate": 4.971234717021709e-05, |
|
"loss": 0.3745, |
|
"num_input_tokens_seen": 4275968, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.980269989615784, |
|
"grad_norm": 0.07860780507326126, |
|
"learning_rate": 4.9702362496161085e-05, |
|
"loss": 0.3129, |
|
"num_input_tokens_seen": 4346616, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.9968847352024922, |
|
"grad_norm": 0.08581527322530746, |
|
"learning_rate": 4.9692208514878444e-05, |
|
"loss": 0.3324, |
|
"num_input_tokens_seen": 4425064, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.16779834032058716, |
|
"learning_rate": 4.968188529596342e-05, |
|
"loss": 0.2814, |
|
"num_input_tokens_seen": 4435328, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.0166147455867083, |
|
"grad_norm": 0.08948636800050735, |
|
"learning_rate": 4.9671392910170185e-05, |
|
"loss": 0.3467, |
|
"num_input_tokens_seen": 4500104, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.0332294911734163, |
|
"grad_norm": 0.07826830446720123, |
|
"learning_rate": 4.966073142941239e-05, |
|
"loss": 0.3892, |
|
"num_input_tokens_seen": 4581976, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.0498442367601246, |
|
"grad_norm": 0.08562575280666351, |
|
"learning_rate": 4.964990092676263e-05, |
|
"loss": 0.3354, |
|
"num_input_tokens_seen": 4652160, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.066458982346833, |
|
"grad_norm": 0.1057090312242508, |
|
"learning_rate": 4.9638901476451946e-05, |
|
"loss": 0.3457, |
|
"num_input_tokens_seen": 4709368, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.083073727933541, |
|
"grad_norm": 0.08131146430969238, |
|
"learning_rate": 4.962773315386935e-05, |
|
"loss": 0.3672, |
|
"num_input_tokens_seen": 4798256, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.0996884735202492, |
|
"grad_norm": 0.09464936703443527, |
|
"learning_rate": 4.961639603556127e-05, |
|
"loss": 0.3157, |
|
"num_input_tokens_seen": 4859200, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.1163032191069575, |
|
"grad_norm": 0.0999661460518837, |
|
"learning_rate": 4.960489019923105e-05, |
|
"loss": 0.3968, |
|
"num_input_tokens_seen": 4925992, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.1329179646936656, |
|
"grad_norm": 0.09851639717817307, |
|
"learning_rate": 4.9593215723738404e-05, |
|
"loss": 0.329, |
|
"num_input_tokens_seen": 4998808, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.1495327102803738, |
|
"grad_norm": 0.08382592350244522, |
|
"learning_rate": 4.958137268909887e-05, |
|
"loss": 0.2856, |
|
"num_input_tokens_seen": 5089672, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.1661474558670821, |
|
"grad_norm": 0.09073847532272339, |
|
"learning_rate": 4.9569361176483286e-05, |
|
"loss": 0.3512, |
|
"num_input_tokens_seen": 5166744, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.1827622014537902, |
|
"grad_norm": 0.10290185362100601, |
|
"learning_rate": 4.9557181268217227e-05, |
|
"loss": 0.4263, |
|
"num_input_tokens_seen": 5228264, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.1993769470404985, |
|
"grad_norm": 0.07421435415744781, |
|
"learning_rate": 4.9544833047780394e-05, |
|
"loss": 0.3126, |
|
"num_input_tokens_seen": 5338224, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.2159916926272065, |
|
"grad_norm": 0.10284842550754547, |
|
"learning_rate": 4.9532316599806124e-05, |
|
"loss": 0.3473, |
|
"num_input_tokens_seen": 5399848, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.2326064382139148, |
|
"grad_norm": 0.10817047953605652, |
|
"learning_rate": 4.951963201008076e-05, |
|
"loss": 0.3275, |
|
"num_input_tokens_seen": 5468624, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.249221183800623, |
|
"grad_norm": 0.09662210941314697, |
|
"learning_rate": 4.9506779365543046e-05, |
|
"loss": 0.3296, |
|
"num_input_tokens_seen": 5536776, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.2658359293873311, |
|
"grad_norm": 0.11193853616714478, |
|
"learning_rate": 4.949375875428357e-05, |
|
"loss": 0.3605, |
|
"num_input_tokens_seen": 5609296, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.2824506749740394, |
|
"grad_norm": 0.11866679787635803, |
|
"learning_rate": 4.9480570265544144e-05, |
|
"loss": 0.3133, |
|
"num_input_tokens_seen": 5663824, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.2990654205607477, |
|
"grad_norm": 0.09865846484899521, |
|
"learning_rate": 4.94672139897172e-05, |
|
"loss": 0.3464, |
|
"num_input_tokens_seen": 5742032, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.3156801661474558, |
|
"grad_norm": 0.09930054098367691, |
|
"learning_rate": 4.9453690018345144e-05, |
|
"loss": 0.3346, |
|
"num_input_tokens_seen": 5816864, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.332294911734164, |
|
"grad_norm": 0.1085321381688118, |
|
"learning_rate": 4.943999844411977e-05, |
|
"loss": 0.3102, |
|
"num_input_tokens_seen": 5881624, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.3489096573208723, |
|
"grad_norm": 0.08012478053569794, |
|
"learning_rate": 4.94261393608816e-05, |
|
"loss": 0.2853, |
|
"num_input_tokens_seen": 5970272, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.3655244029075804, |
|
"grad_norm": 0.10291877388954163, |
|
"learning_rate": 4.941211286361922e-05, |
|
"loss": 0.3038, |
|
"num_input_tokens_seen": 6058752, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.3821391484942886, |
|
"grad_norm": 0.11999356001615524, |
|
"learning_rate": 4.939791904846869e-05, |
|
"loss": 0.3283, |
|
"num_input_tokens_seen": 6120064, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.398753894080997, |
|
"grad_norm": 0.10502559691667557, |
|
"learning_rate": 4.938355801271282e-05, |
|
"loss": 0.321, |
|
"num_input_tokens_seen": 6182072, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.415368639667705, |
|
"grad_norm": 0.12620873749256134, |
|
"learning_rate": 4.936902985478055e-05, |
|
"loss": 0.3296, |
|
"num_input_tokens_seen": 6269680, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.4319833852544133, |
|
"grad_norm": 0.13212910294532776, |
|
"learning_rate": 4.935433467424624e-05, |
|
"loss": 0.3225, |
|
"num_input_tokens_seen": 6347424, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.4485981308411215, |
|
"grad_norm": 0.11600925773382187, |
|
"learning_rate": 4.933947257182901e-05, |
|
"loss": 0.3479, |
|
"num_input_tokens_seen": 6412584, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.4652128764278296, |
|
"grad_norm": 0.11683235317468643, |
|
"learning_rate": 4.932444364939205e-05, |
|
"loss": 0.3322, |
|
"num_input_tokens_seen": 6482728, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.4818276220145379, |
|
"grad_norm": 0.11446017026901245, |
|
"learning_rate": 4.9309248009941914e-05, |
|
"loss": 0.3802, |
|
"num_input_tokens_seen": 6562104, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.4984423676012462, |
|
"grad_norm": 0.10500892251729965, |
|
"learning_rate": 4.929388575762782e-05, |
|
"loss": 0.3371, |
|
"num_input_tokens_seen": 6656552, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.5150571131879542, |
|
"grad_norm": 0.13279151916503906, |
|
"learning_rate": 4.9278356997740904e-05, |
|
"loss": 0.293, |
|
"num_input_tokens_seen": 6714184, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.5316718587746625, |
|
"grad_norm": 0.107506163418293, |
|
"learning_rate": 4.9262661836713564e-05, |
|
"loss": 0.3127, |
|
"num_input_tokens_seen": 6793552, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.5482866043613708, |
|
"grad_norm": 0.124021977186203, |
|
"learning_rate": 4.924680038211867e-05, |
|
"loss": 0.3263, |
|
"num_input_tokens_seen": 6865256, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.5649013499480788, |
|
"grad_norm": 0.14172782003879547, |
|
"learning_rate": 4.9230772742668866e-05, |
|
"loss": 0.3204, |
|
"num_input_tokens_seen": 6931152, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.5815160955347871, |
|
"grad_norm": 0.12229758501052856, |
|
"learning_rate": 4.9214579028215776e-05, |
|
"loss": 0.326, |
|
"num_input_tokens_seen": 6998408, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.5981308411214954, |
|
"grad_norm": 0.1242135688662529, |
|
"learning_rate": 4.919821934974933e-05, |
|
"loss": 0.2814, |
|
"num_input_tokens_seen": 7053008, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.6147455867082035, |
|
"grad_norm": 0.12830108404159546, |
|
"learning_rate": 4.918169381939692e-05, |
|
"loss": 0.3254, |
|
"num_input_tokens_seen": 7106440, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.6313603322949117, |
|
"grad_norm": 0.12180659174919128, |
|
"learning_rate": 4.916500255042268e-05, |
|
"loss": 0.3228, |
|
"num_input_tokens_seen": 7167032, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.64797507788162, |
|
"grad_norm": 0.10792312026023865, |
|
"learning_rate": 4.914814565722671e-05, |
|
"loss": 0.2729, |
|
"num_input_tokens_seen": 7245720, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.664589823468328, |
|
"grad_norm": 0.18523500859737396, |
|
"learning_rate": 4.913112325534426e-05, |
|
"loss": 0.3462, |
|
"num_input_tokens_seen": 7326320, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.6812045690550363, |
|
"grad_norm": 0.09529964625835419, |
|
"learning_rate": 4.9113935461444955e-05, |
|
"loss": 0.3096, |
|
"num_input_tokens_seen": 7442232, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.6978193146417446, |
|
"grad_norm": 0.14481183886528015, |
|
"learning_rate": 4.9096582393332025e-05, |
|
"loss": 0.3014, |
|
"num_input_tokens_seen": 7502496, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.7144340602284527, |
|
"grad_norm": 0.14645016193389893, |
|
"learning_rate": 4.907906416994146e-05, |
|
"loss": 0.3336, |
|
"num_input_tokens_seen": 7566496, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.731048805815161, |
|
"grad_norm": 0.1306885927915573, |
|
"learning_rate": 4.906138091134118e-05, |
|
"loss": 0.3911, |
|
"num_input_tokens_seen": 7629056, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.7476635514018692, |
|
"grad_norm": 0.10863160341978073, |
|
"learning_rate": 4.9043532738730284e-05, |
|
"loss": 0.3201, |
|
"num_input_tokens_seen": 7706096, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.7642782969885773, |
|
"grad_norm": 0.11725673079490662, |
|
"learning_rate": 4.9025519774438136e-05, |
|
"loss": 0.2783, |
|
"num_input_tokens_seen": 7780072, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.7808930425752856, |
|
"grad_norm": 0.1243867501616478, |
|
"learning_rate": 4.900734214192358e-05, |
|
"loss": 0.3044, |
|
"num_input_tokens_seen": 7857712, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.7975077881619939, |
|
"grad_norm": 0.13539955019950867, |
|
"learning_rate": 4.898899996577407e-05, |
|
"loss": 0.3009, |
|
"num_input_tokens_seen": 7916832, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.814122533748702, |
|
"grad_norm": 0.11198178678750992, |
|
"learning_rate": 4.8970493371704826e-05, |
|
"loss": 0.3229, |
|
"num_input_tokens_seen": 7993056, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.8307372793354102, |
|
"grad_norm": 0.11881165206432343, |
|
"learning_rate": 4.8951822486557986e-05, |
|
"loss": 0.3414, |
|
"num_input_tokens_seen": 8090056, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.8473520249221185, |
|
"grad_norm": 0.12841404974460602, |
|
"learning_rate": 4.893298743830168e-05, |
|
"loss": 0.2907, |
|
"num_input_tokens_seen": 8164808, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.8639667705088265, |
|
"grad_norm": 0.14767521619796753, |
|
"learning_rate": 4.891398835602925e-05, |
|
"loss": 0.2901, |
|
"num_input_tokens_seen": 8223568, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.8805815160955348, |
|
"grad_norm": 0.15326914191246033, |
|
"learning_rate": 4.8894825369958255e-05, |
|
"loss": 0.2918, |
|
"num_input_tokens_seen": 8276160, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.897196261682243, |
|
"grad_norm": 0.1210051029920578, |
|
"learning_rate": 4.8875498611429674e-05, |
|
"loss": 0.3074, |
|
"num_input_tokens_seen": 8354904, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.9138110072689511, |
|
"grad_norm": 0.13544373214244843, |
|
"learning_rate": 4.8856008212906925e-05, |
|
"loss": 0.3461, |
|
"num_input_tokens_seen": 8442584, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.9304257528556594, |
|
"grad_norm": 0.13535892963409424, |
|
"learning_rate": 4.8836354307975026e-05, |
|
"loss": 0.3078, |
|
"num_input_tokens_seen": 8506688, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.9470404984423677, |
|
"grad_norm": 0.10383590310811996, |
|
"learning_rate": 4.881653703133966e-05, |
|
"loss": 0.2432, |
|
"num_input_tokens_seen": 8610712, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.9636552440290758, |
|
"grad_norm": 0.12125886976718903, |
|
"learning_rate": 4.87965565188262e-05, |
|
"loss": 0.2915, |
|
"num_input_tokens_seen": 8692624, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.980269989615784, |
|
"grad_norm": 0.1351424902677536, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 0.3006, |
|
"num_input_tokens_seen": 8772208, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.9968847352024923, |
|
"grad_norm": 0.11472523212432861, |
|
"learning_rate": 4.8756106335059646e-05, |
|
"loss": 0.2774, |
|
"num_input_tokens_seen": 8854904, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.3606414794921875, |
|
"learning_rate": 4.87356369410476e-05, |
|
"loss": 0.2786, |
|
"num_input_tokens_seen": 8872656, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 2.016614745586708, |
|
"grad_norm": 0.13124766945838928, |
|
"learning_rate": 4.8715004865637614e-05, |
|
"loss": 0.294, |
|
"num_input_tokens_seen": 8946480, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 2.0332294911734166, |
|
"grad_norm": 0.12415049225091934, |
|
"learning_rate": 4.869421025023965e-05, |
|
"loss": 0.2931, |
|
"num_input_tokens_seen": 9023328, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 2.0498442367601246, |
|
"grad_norm": 0.16626115143299103, |
|
"learning_rate": 4.867325323737765e-05, |
|
"loss": 0.2887, |
|
"num_input_tokens_seen": 9074320, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.0664589823468327, |
|
"grad_norm": 0.153628870844841, |
|
"learning_rate": 4.8652133970688636e-05, |
|
"loss": 0.2776, |
|
"num_input_tokens_seen": 9148784, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.083073727933541, |
|
"grad_norm": 0.12231138348579407, |
|
"learning_rate": 4.8630852594921706e-05, |
|
"loss": 0.3091, |
|
"num_input_tokens_seen": 9246624, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 2.0996884735202492, |
|
"grad_norm": 0.15192057192325592, |
|
"learning_rate": 4.860940925593703e-05, |
|
"loss": 0.3354, |
|
"num_input_tokens_seen": 9328176, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.1163032191069573, |
|
"grad_norm": 0.13820070028305054, |
|
"learning_rate": 4.8587804100704845e-05, |
|
"loss": 0.282, |
|
"num_input_tokens_seen": 9388936, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 2.132917964693666, |
|
"grad_norm": 0.14466816186904907, |
|
"learning_rate": 4.856603727730447e-05, |
|
"loss": 0.2801, |
|
"num_input_tokens_seen": 9461664, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.149532710280374, |
|
"grad_norm": 0.14671838283538818, |
|
"learning_rate": 4.854410893492326e-05, |
|
"loss": 0.2927, |
|
"num_input_tokens_seen": 9535000, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.166147455867082, |
|
"grad_norm": 0.1757712960243225, |
|
"learning_rate": 4.852201922385564e-05, |
|
"loss": 0.2807, |
|
"num_input_tokens_seen": 9600296, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.1827622014537904, |
|
"grad_norm": 0.17755423486232758, |
|
"learning_rate": 4.8499768295502004e-05, |
|
"loss": 0.2765, |
|
"num_input_tokens_seen": 9686784, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 2.1993769470404985, |
|
"grad_norm": 0.13321827352046967, |
|
"learning_rate": 4.847735630236773e-05, |
|
"loss": 0.3068, |
|
"num_input_tokens_seen": 9781112, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.2159916926272065, |
|
"grad_norm": 0.15012745559215546, |
|
"learning_rate": 4.8454783398062106e-05, |
|
"loss": 0.2737, |
|
"num_input_tokens_seen": 9849528, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.232606438213915, |
|
"grad_norm": 0.14000360667705536, |
|
"learning_rate": 4.843204973729729e-05, |
|
"loss": 0.2831, |
|
"num_input_tokens_seen": 9931080, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.249221183800623, |
|
"grad_norm": 0.14742712676525116, |
|
"learning_rate": 4.840915547588725e-05, |
|
"loss": 0.3047, |
|
"num_input_tokens_seen": 10011176, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 2.265835929387331, |
|
"grad_norm": 0.16192346811294556, |
|
"learning_rate": 4.838610077074669e-05, |
|
"loss": 0.2759, |
|
"num_input_tokens_seen": 10084128, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.2824506749740396, |
|
"grad_norm": 0.1502583771944046, |
|
"learning_rate": 4.836288577988996e-05, |
|
"loss": 0.298, |
|
"num_input_tokens_seen": 10155536, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 2.2990654205607477, |
|
"grad_norm": 0.12661044299602509, |
|
"learning_rate": 4.8339510662430046e-05, |
|
"loss": 0.255, |
|
"num_input_tokens_seen": 10251160, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.3156801661474558, |
|
"grad_norm": 0.14002998173236847, |
|
"learning_rate": 4.8315975578577355e-05, |
|
"loss": 0.2566, |
|
"num_input_tokens_seen": 10345864, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 2.3322949117341643, |
|
"grad_norm": 0.17870523035526276, |
|
"learning_rate": 4.8292280689638725e-05, |
|
"loss": 0.4367, |
|
"num_input_tokens_seen": 10417616, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 2.3489096573208723, |
|
"grad_norm": 0.17209866642951965, |
|
"learning_rate": 4.826842615801628e-05, |
|
"loss": 0.2954, |
|
"num_input_tokens_seen": 10481816, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 2.3655244029075804, |
|
"grad_norm": 0.1665940284729004, |
|
"learning_rate": 4.8244412147206284e-05, |
|
"loss": 0.341, |
|
"num_input_tokens_seen": 10562056, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.382139148494289, |
|
"grad_norm": 0.18919898569583893, |
|
"learning_rate": 4.822023882179811e-05, |
|
"loss": 0.2716, |
|
"num_input_tokens_seen": 10612808, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.398753894080997, |
|
"grad_norm": 0.1681865006685257, |
|
"learning_rate": 4.8195906347473e-05, |
|
"loss": 0.2716, |
|
"num_input_tokens_seen": 10682328, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 2.415368639667705, |
|
"grad_norm": 0.13141104578971863, |
|
"learning_rate": 4.817141489100302e-05, |
|
"loss": 0.2829, |
|
"num_input_tokens_seen": 10771912, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 2.431983385254413, |
|
"grad_norm": 0.16544249653816223, |
|
"learning_rate": 4.814676462024988e-05, |
|
"loss": 0.3038, |
|
"num_input_tokens_seen": 10842232, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 2.4485981308411215, |
|
"grad_norm": 0.17946277558803558, |
|
"learning_rate": 4.8121955704163745e-05, |
|
"loss": 0.2792, |
|
"num_input_tokens_seen": 10902264, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 2.4652128764278296, |
|
"grad_norm": 0.14012685418128967, |
|
"learning_rate": 4.8096988312782174e-05, |
|
"loss": 0.2403, |
|
"num_input_tokens_seen": 10992744, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.4818276220145377, |
|
"grad_norm": 0.103813536465168, |
|
"learning_rate": 4.8071862617228855e-05, |
|
"loss": 0.1605, |
|
"num_input_tokens_seen": 11090064, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 2.498442367601246, |
|
"grad_norm": 0.1596001833677292, |
|
"learning_rate": 4.8046578789712515e-05, |
|
"loss": 0.2547, |
|
"num_input_tokens_seen": 11162864, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 2.515057113187954, |
|
"grad_norm": 0.17366129159927368, |
|
"learning_rate": 4.8021137003525664e-05, |
|
"loss": 0.2676, |
|
"num_input_tokens_seen": 11224368, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.5316718587746623, |
|
"grad_norm": 0.1615227609872818, |
|
"learning_rate": 4.7995537433043446e-05, |
|
"loss": 0.2898, |
|
"num_input_tokens_seen": 11291056, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 2.5482866043613708, |
|
"grad_norm": 0.1951528787612915, |
|
"learning_rate": 4.796978025372246e-05, |
|
"loss": 0.2546, |
|
"num_input_tokens_seen": 11345464, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.564901349948079, |
|
"grad_norm": 0.15065862238407135, |
|
"learning_rate": 4.794386564209953e-05, |
|
"loss": 0.3134, |
|
"num_input_tokens_seen": 11418912, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.581516095534787, |
|
"grad_norm": 0.17094938457012177, |
|
"learning_rate": 4.79177937757905e-05, |
|
"loss": 0.2689, |
|
"num_input_tokens_seen": 11491216, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 2.5981308411214954, |
|
"grad_norm": 0.16850312054157257, |
|
"learning_rate": 4.7891564833489035e-05, |
|
"loss": 0.2359, |
|
"num_input_tokens_seen": 11558016, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 2.6147455867082035, |
|
"grad_norm": 0.16789822280406952, |
|
"learning_rate": 4.7865178994965344e-05, |
|
"loss": 0.2735, |
|
"num_input_tokens_seen": 11630432, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.6313603322949115, |
|
"grad_norm": 0.19538354873657227, |
|
"learning_rate": 4.783863644106502e-05, |
|
"loss": 0.254, |
|
"num_input_tokens_seen": 11684624, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.64797507788162, |
|
"grad_norm": 0.1609475016593933, |
|
"learning_rate": 4.781193735370777e-05, |
|
"loss": 0.2763, |
|
"num_input_tokens_seen": 11770232, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 2.664589823468328, |
|
"grad_norm": 0.1964447796344757, |
|
"learning_rate": 4.7785081915886134e-05, |
|
"loss": 0.2663, |
|
"num_input_tokens_seen": 11828360, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.681204569055036, |
|
"grad_norm": 0.18869946897029877, |
|
"learning_rate": 4.775807031166428e-05, |
|
"loss": 0.2625, |
|
"num_input_tokens_seen": 11915944, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 2.6978193146417446, |
|
"grad_norm": 0.20539921522140503, |
|
"learning_rate": 4.773090272617672e-05, |
|
"loss": 0.2615, |
|
"num_input_tokens_seen": 11981792, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 2.7144340602284527, |
|
"grad_norm": 0.1616145521402359, |
|
"learning_rate": 4.7703579345627035e-05, |
|
"loss": 0.3453, |
|
"num_input_tokens_seen": 12044024, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.7310488058151607, |
|
"grad_norm": 0.22601978480815887, |
|
"learning_rate": 4.7676100357286624e-05, |
|
"loss": 0.3036, |
|
"num_input_tokens_seen": 12093424, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 2.7476635514018692, |
|
"grad_norm": 0.15262462198734283, |
|
"learning_rate": 4.76484659494934e-05, |
|
"loss": 0.2523, |
|
"num_input_tokens_seen": 12167792, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 2.7642782969885773, |
|
"grad_norm": 0.17928001284599304, |
|
"learning_rate": 4.762067631165049e-05, |
|
"loss": 0.2791, |
|
"num_input_tokens_seen": 12233712, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.7808930425752854, |
|
"grad_norm": 0.15228766202926636, |
|
"learning_rate": 4.7592731634224966e-05, |
|
"loss": 0.2291, |
|
"num_input_tokens_seen": 12310544, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 2.797507788161994, |
|
"grad_norm": 0.18862110376358032, |
|
"learning_rate": 4.756463210874652e-05, |
|
"loss": 0.2628, |
|
"num_input_tokens_seen": 12400160, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.814122533748702, |
|
"grad_norm": 0.16640189290046692, |
|
"learning_rate": 4.753637792780614e-05, |
|
"loss": 0.2824, |
|
"num_input_tokens_seen": 12480432, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.83073727933541, |
|
"grad_norm": 0.151117205619812, |
|
"learning_rate": 4.7507969285054845e-05, |
|
"loss": 0.2663, |
|
"num_input_tokens_seen": 12568064, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 2.8473520249221185, |
|
"grad_norm": 0.26551589369773865, |
|
"learning_rate": 4.7479406375202264e-05, |
|
"loss": 0.28, |
|
"num_input_tokens_seen": 12647400, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 2.8639667705088265, |
|
"grad_norm": 0.22416891157627106, |
|
"learning_rate": 4.745068939401539e-05, |
|
"loss": 0.2424, |
|
"num_input_tokens_seen": 12698208, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.8805815160955346, |
|
"grad_norm": 0.2024654597043991, |
|
"learning_rate": 4.742181853831721e-05, |
|
"loss": 0.2518, |
|
"num_input_tokens_seen": 12758528, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.897196261682243, |
|
"grad_norm": 0.18288369476795197, |
|
"learning_rate": 4.7392794005985326e-05, |
|
"loss": 0.259, |
|
"num_input_tokens_seen": 12837264, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 2.913811007268951, |
|
"grad_norm": 0.18088208138942719, |
|
"learning_rate": 4.7363615995950626e-05, |
|
"loss": 0.247, |
|
"num_input_tokens_seen": 12902368, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.930425752855659, |
|
"grad_norm": 0.16595424711704254, |
|
"learning_rate": 4.733428470819594e-05, |
|
"loss": 0.2438, |
|
"num_input_tokens_seen": 12974296, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 2.9470404984423677, |
|
"grad_norm": 0.17989091575145721, |
|
"learning_rate": 4.730480034375462e-05, |
|
"loss": 0.2708, |
|
"num_input_tokens_seen": 13057280, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 2.9636552440290758, |
|
"grad_norm": 0.16136637330055237, |
|
"learning_rate": 4.72751631047092e-05, |
|
"loss": 0.3171, |
|
"num_input_tokens_seen": 13158232, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.980269989615784, |
|
"grad_norm": 0.1870911419391632, |
|
"learning_rate": 4.7245373194189994e-05, |
|
"loss": 0.24, |
|
"num_input_tokens_seen": 13229840, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 2.9968847352024923, |
|
"grad_norm": 0.1857272833585739, |
|
"learning_rate": 4.7215430816373726e-05, |
|
"loss": 0.2674, |
|
"num_input_tokens_seen": 13296520, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.31230616569519043, |
|
"learning_rate": 4.718533617648209e-05, |
|
"loss": 0.1677, |
|
"num_input_tokens_seen": 13309672, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 3.016614745586708, |
|
"grad_norm": 0.20352481305599213, |
|
"learning_rate": 4.715508948078037e-05, |
|
"loss": 0.2272, |
|
"num_input_tokens_seen": 13371544, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 3.0332294911734166, |
|
"grad_norm": 0.20679159462451935, |
|
"learning_rate": 4.712469093657605e-05, |
|
"loss": 0.2133, |
|
"num_input_tokens_seen": 13432984, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 3.0498442367601246, |
|
"grad_norm": 0.18731139600276947, |
|
"learning_rate": 4.709414075221734e-05, |
|
"loss": 0.2695, |
|
"num_input_tokens_seen": 13500016, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 3.0664589823468327, |
|
"grad_norm": 0.21216924488544464, |
|
"learning_rate": 4.706343913709178e-05, |
|
"loss": 0.2524, |
|
"num_input_tokens_seen": 13579672, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 3.083073727933541, |
|
"grad_norm": 0.2222682386636734, |
|
"learning_rate": 4.70325863016248e-05, |
|
"loss": 0.2396, |
|
"num_input_tokens_seen": 13630704, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 3.0996884735202492, |
|
"grad_norm": 0.21611332893371582, |
|
"learning_rate": 4.7001582457278304e-05, |
|
"loss": 0.3057, |
|
"num_input_tokens_seen": 13695472, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 3.1163032191069573, |
|
"grad_norm": 0.23094947636127472, |
|
"learning_rate": 4.697042781654913e-05, |
|
"loss": 0.2436, |
|
"num_input_tokens_seen": 13767792, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.132917964693666, |
|
"grad_norm": 0.19241105020046234, |
|
"learning_rate": 4.693912259296773e-05, |
|
"loss": 0.2974, |
|
"num_input_tokens_seen": 13857352, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 3.149532710280374, |
|
"grad_norm": 0.19635124504566193, |
|
"learning_rate": 4.690766700109659e-05, |
|
"loss": 0.2457, |
|
"num_input_tokens_seen": 13939928, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 3.166147455867082, |
|
"grad_norm": 0.1822366714477539, |
|
"learning_rate": 4.687606125652882e-05, |
|
"loss": 0.2205, |
|
"num_input_tokens_seen": 14017936, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 3.1827622014537904, |
|
"grad_norm": 0.22182051837444305, |
|
"learning_rate": 4.684430557588664e-05, |
|
"loss": 0.2116, |
|
"num_input_tokens_seen": 14074176, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 3.1993769470404985, |
|
"grad_norm": 0.19278937578201294, |
|
"learning_rate": 4.681240017681993e-05, |
|
"loss": 0.2839, |
|
"num_input_tokens_seen": 14167656, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 3.2159916926272065, |
|
"grad_norm": 0.181584894657135, |
|
"learning_rate": 4.678034527800474e-05, |
|
"loss": 0.2115, |
|
"num_input_tokens_seen": 14235800, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 3.232606438213915, |
|
"grad_norm": 0.19878999888896942, |
|
"learning_rate": 4.674814109914174e-05, |
|
"loss": 0.1982, |
|
"num_input_tokens_seen": 14301272, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 3.249221183800623, |
|
"grad_norm": 0.23485153913497925, |
|
"learning_rate": 4.671578786095478e-05, |
|
"loss": 0.2494, |
|
"num_input_tokens_seen": 14347352, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 3.265835929387331, |
|
"grad_norm": 0.20542015135288239, |
|
"learning_rate": 4.668328578518933e-05, |
|
"loss": 0.3186, |
|
"num_input_tokens_seen": 14434600, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 3.2824506749740396, |
|
"grad_norm": 0.27169138193130493, |
|
"learning_rate": 4.665063509461097e-05, |
|
"loss": 0.2361, |
|
"num_input_tokens_seen": 14484104, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.2990654205607477, |
|
"grad_norm": 0.18548518419265747, |
|
"learning_rate": 4.661783601300388e-05, |
|
"loss": 0.2457, |
|
"num_input_tokens_seen": 14567152, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 3.3156801661474558, |
|
"grad_norm": 0.2257690727710724, |
|
"learning_rate": 4.6584888765169296e-05, |
|
"loss": 0.2885, |
|
"num_input_tokens_seen": 14647040, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 3.3322949117341643, |
|
"grad_norm": 0.1882171332836151, |
|
"learning_rate": 4.6551793576923964e-05, |
|
"loss": 0.259, |
|
"num_input_tokens_seen": 14738216, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 3.3489096573208723, |
|
"grad_norm": 0.1976601928472519, |
|
"learning_rate": 4.65185506750986e-05, |
|
"loss": 0.2102, |
|
"num_input_tokens_seen": 14811216, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 3.3655244029075804, |
|
"grad_norm": 0.20320351421833038, |
|
"learning_rate": 4.648516028753632e-05, |
|
"loss": 0.1858, |
|
"num_input_tokens_seen": 14885992, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 3.382139148494289, |
|
"grad_norm": 0.20090511441230774, |
|
"learning_rate": 4.645162264309112e-05, |
|
"loss": 0.272, |
|
"num_input_tokens_seen": 14961984, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 3.398753894080997, |
|
"grad_norm": 0.21391013264656067, |
|
"learning_rate": 4.6417937971626245e-05, |
|
"loss": 0.2036, |
|
"num_input_tokens_seen": 15021240, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 3.415368639667705, |
|
"grad_norm": 0.23000560700893402, |
|
"learning_rate": 4.638410650401267e-05, |
|
"loss": 0.2011, |
|
"num_input_tokens_seen": 15092016, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 3.431983385254413, |
|
"grad_norm": 0.17034712433815002, |
|
"learning_rate": 4.635012847212748e-05, |
|
"loss": 0.2007, |
|
"num_input_tokens_seen": 15198192, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 3.4485981308411215, |
|
"grad_norm": 0.23642270267009735, |
|
"learning_rate": 4.6316004108852305e-05, |
|
"loss": 0.2139, |
|
"num_input_tokens_seen": 15258432, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.4652128764278296, |
|
"grad_norm": 0.22289924323558807, |
|
"learning_rate": 4.628173364807171e-05, |
|
"loss": 0.2441, |
|
"num_input_tokens_seen": 15329600, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 3.4818276220145377, |
|
"grad_norm": 0.19895371794700623, |
|
"learning_rate": 4.6247317324671605e-05, |
|
"loss": 0.2368, |
|
"num_input_tokens_seen": 15407920, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 3.498442367601246, |
|
"grad_norm": 0.19507868587970734, |
|
"learning_rate": 4.6212755374537596e-05, |
|
"loss": 0.231, |
|
"num_input_tokens_seen": 15479640, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 3.515057113187954, |
|
"grad_norm": 0.23565508425235748, |
|
"learning_rate": 4.617804803455344e-05, |
|
"loss": 0.2336, |
|
"num_input_tokens_seen": 15561960, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 3.5316718587746623, |
|
"grad_norm": 0.21157748997211456, |
|
"learning_rate": 4.614319554259934e-05, |
|
"loss": 0.2638, |
|
"num_input_tokens_seen": 15641440, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 3.5482866043613708, |
|
"grad_norm": 0.2260795533657074, |
|
"learning_rate": 4.610819813755038e-05, |
|
"loss": 0.2646, |
|
"num_input_tokens_seen": 15728872, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 3.564901349948079, |
|
"grad_norm": 0.20947663486003876, |
|
"learning_rate": 4.607305605927487e-05, |
|
"loss": 0.2211, |
|
"num_input_tokens_seen": 15798112, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 3.581516095534787, |
|
"grad_norm": 0.22466345131397247, |
|
"learning_rate": 4.6037769548632656e-05, |
|
"loss": 0.2901, |
|
"num_input_tokens_seen": 15865936, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 3.5981308411214954, |
|
"grad_norm": 0.2176472693681717, |
|
"learning_rate": 4.600233884747355e-05, |
|
"loss": 0.2713, |
|
"num_input_tokens_seen": 15941368, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 3.6147455867082035, |
|
"grad_norm": 0.20593321323394775, |
|
"learning_rate": 4.5966764198635606e-05, |
|
"loss": 0.2047, |
|
"num_input_tokens_seen": 16028208, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.6313603322949115, |
|
"grad_norm": 0.21239908039569855, |
|
"learning_rate": 4.5931045845943474e-05, |
|
"loss": 0.1872, |
|
"num_input_tokens_seen": 16104408, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 3.64797507788162, |
|
"grad_norm": 0.22399188578128815, |
|
"learning_rate": 4.5895184034206765e-05, |
|
"loss": 0.3526, |
|
"num_input_tokens_seen": 16156800, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 3.664589823468328, |
|
"grad_norm": 0.18427705764770508, |
|
"learning_rate": 4.585917900921829e-05, |
|
"loss": 0.2905, |
|
"num_input_tokens_seen": 16256712, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 3.681204569055036, |
|
"grad_norm": 0.22559204697608948, |
|
"learning_rate": 4.5823031017752485e-05, |
|
"loss": 0.2014, |
|
"num_input_tokens_seen": 16330344, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 3.6978193146417446, |
|
"grad_norm": 0.22590497136116028, |
|
"learning_rate": 4.5786740307563636e-05, |
|
"loss": 0.2178, |
|
"num_input_tokens_seen": 16399792, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 3.7144340602284527, |
|
"grad_norm": 0.20590999722480774, |
|
"learning_rate": 4.575030712738419e-05, |
|
"loss": 0.2149, |
|
"num_input_tokens_seen": 16466368, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 3.7310488058151607, |
|
"grad_norm": 0.23313546180725098, |
|
"learning_rate": 4.571373172692309e-05, |
|
"loss": 0.2164, |
|
"num_input_tokens_seen": 16530976, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 3.7476635514018692, |
|
"grad_norm": 0.2081821709871292, |
|
"learning_rate": 4.567701435686404e-05, |
|
"loss": 0.2197, |
|
"num_input_tokens_seen": 16600216, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 3.7642782969885773, |
|
"grad_norm": 0.22633281350135803, |
|
"learning_rate": 4.5640155268863796e-05, |
|
"loss": 0.2527, |
|
"num_input_tokens_seen": 16673192, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 3.7808930425752854, |
|
"grad_norm": 0.2317536324262619, |
|
"learning_rate": 4.5603154715550386e-05, |
|
"loss": 0.1974, |
|
"num_input_tokens_seen": 16739912, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.797507788161994, |
|
"grad_norm": 0.18925197422504425, |
|
"learning_rate": 4.55660129505215e-05, |
|
"loss": 0.2098, |
|
"num_input_tokens_seen": 16834632, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 3.814122533748702, |
|
"grad_norm": 0.1934100240468979, |
|
"learning_rate": 4.5528730228342605e-05, |
|
"loss": 0.2109, |
|
"num_input_tokens_seen": 16914728, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 3.83073727933541, |
|
"grad_norm": 0.1923092156648636, |
|
"learning_rate": 4.549130680454532e-05, |
|
"loss": 0.2492, |
|
"num_input_tokens_seen": 17014304, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 3.8473520249221185, |
|
"grad_norm": 0.2139277458190918, |
|
"learning_rate": 4.545374293562559e-05, |
|
"loss": 0.2415, |
|
"num_input_tokens_seen": 17106664, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 3.8639667705088265, |
|
"grad_norm": 0.2056339681148529, |
|
"learning_rate": 4.541603887904198e-05, |
|
"loss": 0.2311, |
|
"num_input_tokens_seen": 17193744, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 3.8805815160955346, |
|
"grad_norm": 0.2485012263059616, |
|
"learning_rate": 4.537819489321386e-05, |
|
"loss": 0.2309, |
|
"num_input_tokens_seen": 17254656, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 3.897196261682243, |
|
"grad_norm": 0.21751578152179718, |
|
"learning_rate": 4.534021123751968e-05, |
|
"loss": 0.2334, |
|
"num_input_tokens_seen": 17325896, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 3.913811007268951, |
|
"grad_norm": 0.27809038758277893, |
|
"learning_rate": 4.5302088172295156e-05, |
|
"loss": 0.2598, |
|
"num_input_tokens_seen": 17394424, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 3.930425752855659, |
|
"grad_norm": 0.23579885065555573, |
|
"learning_rate": 4.526382595883152e-05, |
|
"loss": 0.2132, |
|
"num_input_tokens_seen": 17456352, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 3.9470404984423677, |
|
"grad_norm": 0.2328771948814392, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 0.2139, |
|
"num_input_tokens_seen": 17519168, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.9636552440290758, |
|
"grad_norm": 0.25951239466667175, |
|
"learning_rate": 4.51868851371185e-05, |
|
"loss": 0.2358, |
|
"num_input_tokens_seen": 17585144, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 3.980269989615784, |
|
"grad_norm": 0.2149265706539154, |
|
"learning_rate": 4.5148207056212896e-05, |
|
"loss": 0.1937, |
|
"num_input_tokens_seen": 17662024, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 3.9968847352024923, |
|
"grad_norm": 0.24635690450668335, |
|
"learning_rate": 4.5109390881752114e-05, |
|
"loss": 0.222, |
|
"num_input_tokens_seen": 17724360, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.4375430643558502, |
|
"learning_rate": 4.5070436879777865e-05, |
|
"loss": 0.2036, |
|
"num_input_tokens_seen": 17746200, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 4.0166147455867085, |
|
"grad_norm": 0.18866199254989624, |
|
"learning_rate": 4.503134531727652e-05, |
|
"loss": 0.189, |
|
"num_input_tokens_seen": 17830760, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 4.033229491173416, |
|
"grad_norm": 0.22479456663131714, |
|
"learning_rate": 4.499211646217727e-05, |
|
"loss": 0.2027, |
|
"num_input_tokens_seen": 17903840, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 4.049844236760125, |
|
"grad_norm": 0.2447003722190857, |
|
"learning_rate": 4.495275058335029e-05, |
|
"loss": 0.2018, |
|
"num_input_tokens_seen": 17990448, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 4.066458982346833, |
|
"grad_norm": 0.281250536441803, |
|
"learning_rate": 4.491324795060491e-05, |
|
"loss": 0.2182, |
|
"num_input_tokens_seen": 18069520, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 4.083073727933541, |
|
"grad_norm": 0.2666569948196411, |
|
"learning_rate": 4.487360883468775e-05, |
|
"loss": 0.1997, |
|
"num_input_tokens_seen": 18129128, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 4.099688473520249, |
|
"grad_norm": 0.23250125348567963, |
|
"learning_rate": 4.4833833507280884e-05, |
|
"loss": 0.2237, |
|
"num_input_tokens_seen": 18202472, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.116303219106958, |
|
"grad_norm": 0.2713671028614044, |
|
"learning_rate": 4.4793922240999933e-05, |
|
"loss": 0.2012, |
|
"num_input_tokens_seen": 18267232, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 4.132917964693665, |
|
"grad_norm": 0.2696637213230133, |
|
"learning_rate": 4.4753875309392266e-05, |
|
"loss": 0.2189, |
|
"num_input_tokens_seen": 18325216, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 4.149532710280374, |
|
"grad_norm": 0.25705742835998535, |
|
"learning_rate": 4.471369298693505e-05, |
|
"loss": 0.2333, |
|
"num_input_tokens_seen": 18406184, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 4.166147455867082, |
|
"grad_norm": 0.2252712994813919, |
|
"learning_rate": 4.467337554903344e-05, |
|
"loss": 0.191, |
|
"num_input_tokens_seen": 18481056, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 4.18276220145379, |
|
"grad_norm": 0.2379617840051651, |
|
"learning_rate": 4.463292327201862e-05, |
|
"loss": 0.1707, |
|
"num_input_tokens_seen": 18554864, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 4.1993769470404985, |
|
"grad_norm": 0.32383039593696594, |
|
"learning_rate": 4.4592336433146e-05, |
|
"loss": 0.2362, |
|
"num_input_tokens_seen": 18612120, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 4.215991692627207, |
|
"grad_norm": 0.2559884786605835, |
|
"learning_rate": 4.4551615310593195e-05, |
|
"loss": 0.2385, |
|
"num_input_tokens_seen": 18710408, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 4.232606438213915, |
|
"grad_norm": 0.2688326835632324, |
|
"learning_rate": 4.451076018345825e-05, |
|
"loss": 0.2154, |
|
"num_input_tokens_seen": 18769400, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 4.249221183800623, |
|
"grad_norm": 0.23779121041297913, |
|
"learning_rate": 4.4469771331757604e-05, |
|
"loss": 0.2021, |
|
"num_input_tokens_seen": 18849704, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 4.265835929387332, |
|
"grad_norm": 0.2318015843629837, |
|
"learning_rate": 4.442864903642428e-05, |
|
"loss": 0.2245, |
|
"num_input_tokens_seen": 18943328, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.282450674974039, |
|
"grad_norm": 0.25898367166519165, |
|
"learning_rate": 4.4387393579305865e-05, |
|
"loss": 0.2279, |
|
"num_input_tokens_seen": 19022536, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 4.299065420560748, |
|
"grad_norm": 0.5110782980918884, |
|
"learning_rate": 4.434600524316266e-05, |
|
"loss": 0.1913, |
|
"num_input_tokens_seen": 19089200, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 4.315680166147456, |
|
"grad_norm": 0.2796955108642578, |
|
"learning_rate": 4.430448431166567e-05, |
|
"loss": 0.3056, |
|
"num_input_tokens_seen": 19171216, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 4.332294911734164, |
|
"grad_norm": 0.19809302687644958, |
|
"learning_rate": 4.426283106939474e-05, |
|
"loss": 0.1719, |
|
"num_input_tokens_seen": 19271872, |
|
"step": 264 |
|
}, |
|
    { "epoch": 4.348909657320872, "grad_norm": 0.2518528997898102, "learning_rate": 4.4221045801836494e-05, "loss": 0.2856, "num_input_tokens_seen": 19342984, "step": 265 },
    { "epoch": 4.365524402907581, "grad_norm": 0.2863263189792633, "learning_rate": 4.41791287953825e-05, "loss": 0.2079, "num_input_tokens_seen": 19391640, "step": 266 },
    { "epoch": 4.382139148494288, "grad_norm": 0.2902291417121887, "learning_rate": 4.4137080337327205e-05, "loss": 0.2321, "num_input_tokens_seen": 19463232, "step": 267 },
    { "epoch": 4.398753894080997, "grad_norm": 0.2613106071949005, "learning_rate": 4.4094900715866064e-05, "loss": 0.2147, "num_input_tokens_seen": 19523728, "step": 268 },
    { "epoch": 4.415368639667705, "grad_norm": 0.26637744903564453, "learning_rate": 4.4052590220093446e-05, "loss": 0.2283, "num_input_tokens_seen": 19598960, "step": 269 },
    { "epoch": 4.431983385254413, "grad_norm": 0.3226986229419708, "learning_rate": 4.401014914000078e-05, "loss": 0.2041, "num_input_tokens_seen": 19666136, "step": 270 },
    { "epoch": 4.4485981308411215, "grad_norm": 0.3052164912223816, "learning_rate": 4.3967577766474455e-05, "loss": 0.21, "num_input_tokens_seen": 19728600, "step": 271 },
    { "epoch": 4.46521287642783, "grad_norm": 0.2585601210594177, "learning_rate": 4.3924876391293915e-05, "loss": 0.2471, "num_input_tokens_seen": 19801032, "step": 272 },
    { "epoch": 4.481827622014538, "grad_norm": 0.2675788402557373, "learning_rate": 4.3882045307129594e-05, "loss": 0.2173, "num_input_tokens_seen": 19885496, "step": 273 },
    { "epoch": 4.498442367601246, "grad_norm": 0.2575731873512268, "learning_rate": 4.383908480754095e-05, "loss": 0.2104, "num_input_tokens_seen": 19952072, "step": 274 },
    { "epoch": 4.515057113187955, "grad_norm": 0.22937491536140442, "learning_rate": 4.379599518697444e-05, "loss": 0.1908, "num_input_tokens_seen": 20026536, "step": 275 },
    { "epoch": 4.531671858774662, "grad_norm": 0.28578364849090576, "learning_rate": 4.375277674076149e-05, "loss": 0.1778, "num_input_tokens_seen": 20079112, "step": 276 },
    { "epoch": 4.548286604361371, "grad_norm": 0.2623717486858368, "learning_rate": 4.3709429765116504e-05, "loss": 0.302, "num_input_tokens_seen": 20144264, "step": 277 },
    { "epoch": 4.564901349948079, "grad_norm": 0.3099273443222046, "learning_rate": 4.366595455713479e-05, "loss": 0.2113, "num_input_tokens_seen": 20207568, "step": 278 },
    { "epoch": 4.581516095534787, "grad_norm": 0.2775528132915497, "learning_rate": 4.3622351414790554e-05, "loss": 0.2519, "num_input_tokens_seen": 20292376, "step": 279 },
    { "epoch": 4.598130841121495, "grad_norm": 0.23497633635997772, "learning_rate": 4.357862063693486e-05, "loss": 0.1628, "num_input_tokens_seen": 20383048, "step": 280 },
    { "epoch": 4.614745586708204, "grad_norm": 0.25743210315704346, "learning_rate": 4.353476252329356e-05, "loss": 0.1923, "num_input_tokens_seen": 20463376, "step": 281 },
    { "epoch": 4.6313603322949115, "grad_norm": 0.2595055103302002, "learning_rate": 4.349077737446525e-05, "loss": 0.1745, "num_input_tokens_seen": 20537808, "step": 282 },
    { "epoch": 4.64797507788162, "grad_norm": 0.269157350063324, "learning_rate": 4.344666549191921e-05, "loss": 0.207, "num_input_tokens_seen": 20605496, "step": 283 },
    { "epoch": 4.6645898234683285, "grad_norm": 0.2762012481689453, "learning_rate": 4.3402427177993366e-05, "loss": 0.2412, "num_input_tokens_seen": 20692096, "step": 284 },
    { "epoch": 4.681204569055036, "grad_norm": 0.3109856843948364, "learning_rate": 4.335806273589214e-05, "loss": 0.2219, "num_input_tokens_seen": 20762800, "step": 285 },
    { "epoch": 4.697819314641745, "grad_norm": 0.2506738305091858, "learning_rate": 4.3313572469684474e-05, "loss": 0.1831, "num_input_tokens_seen": 20831584, "step": 286 },
    { "epoch": 4.714434060228453, "grad_norm": 0.25760403275489807, "learning_rate": 4.326895668430166e-05, "loss": 0.1457, "num_input_tokens_seen": 20897320, "step": 287 },
    { "epoch": 4.731048805815161, "grad_norm": 0.298622727394104, "learning_rate": 4.3224215685535294e-05, "loss": 0.193, "num_input_tokens_seen": 20966136, "step": 288 },
    { "epoch": 4.747663551401869, "grad_norm": 0.2863025665283203, "learning_rate": 4.317934978003517e-05, "loss": 0.1868, "num_input_tokens_seen": 21034800, "step": 289 },
    { "epoch": 4.764278296988578, "grad_norm": 0.2865165174007416, "learning_rate": 4.313435927530719e-05, "loss": 0.2251, "num_input_tokens_seen": 21098672, "step": 290 },
    { "epoch": 4.780893042575285, "grad_norm": 0.2902335226535797, "learning_rate": 4.3089244479711236e-05, "loss": 0.1853, "num_input_tokens_seen": 21177632, "step": 291 },
    { "epoch": 4.797507788161994, "grad_norm": 0.31741780042648315, "learning_rate": 4.304400570245906e-05, "loss": 0.2135, "num_input_tokens_seen": 21240896, "step": 292 },
    { "epoch": 4.814122533748702, "grad_norm": 0.22312244772911072, "learning_rate": 4.299864325361217e-05, "loss": 0.177, "num_input_tokens_seen": 21322984, "step": 293 },
    { "epoch": 4.83073727933541, "grad_norm": 0.244970440864563, "learning_rate": 4.295315744407972e-05, "loss": 0.1877, "num_input_tokens_seen": 21389128, "step": 294 },
    { "epoch": 4.8473520249221185, "grad_norm": 0.2605350613594055, "learning_rate": 4.290754858561637e-05, "loss": 0.2124, "num_input_tokens_seen": 21469912, "step": 295 },
    { "epoch": 4.863966770508826, "grad_norm": 0.27169349789619446, "learning_rate": 4.2861816990820084e-05, "loss": 0.1833, "num_input_tokens_seen": 21540320, "step": 296 },
    { "epoch": 4.880581516095535, "grad_norm": 0.2539166510105133, "learning_rate": 4.281596297313013e-05, "loss": 0.2134, "num_input_tokens_seen": 21626312, "step": 297 },
    { "epoch": 4.897196261682243, "grad_norm": 0.28907957673072815, "learning_rate": 4.2769986846824815e-05, "loss": 0.1912, "num_input_tokens_seen": 21702792, "step": 298 },
    { "epoch": 4.913811007268951, "grad_norm": 0.3405742645263672, "learning_rate": 4.272388892701934e-05, "loss": 0.2051, "num_input_tokens_seen": 21771880, "step": 299 },
    { "epoch": 4.930425752855659, "grad_norm": 0.2592983543872833, "learning_rate": 4.267766952966369e-05, "loss": 0.1926, "num_input_tokens_seen": 21844024, "step": 300 },
    { "epoch": 4.947040498442368, "grad_norm": 0.24671317636966705, "learning_rate": 4.2631328971540444e-05, "loss": 0.2039, "num_input_tokens_seen": 21925632, "step": 301 },
    { "epoch": 4.963655244029075, "grad_norm": 0.3087393641471863, "learning_rate": 4.2584867570262597e-05, "loss": 0.2077, "num_input_tokens_seen": 21981952, "step": 302 },
    { "epoch": 4.980269989615784, "grad_norm": 0.22413718700408936, "learning_rate": 4.25382856442714e-05, "loss": 0.174, "num_input_tokens_seen": 22070440, "step": 303 },
    { "epoch": 4.996884735202492, "grad_norm": 0.22630493342876434, "learning_rate": 4.249158351283414e-05, "loss": 0.204, "num_input_tokens_seen": 22170184, "step": 304 },
    { "epoch": 5.0, "grad_norm": 0.7261853814125061, "learning_rate": 4.244476149604201e-05, "loss": 0.2849, "num_input_tokens_seen": 22181856, "step": 305 },
    { "epoch": 5.0166147455867085, "grad_norm": 0.2537758946418762, "learning_rate": 4.2397819914807856e-05, "loss": 0.1879, "num_input_tokens_seen": 22256808, "step": 306 },
    { "epoch": 5.033229491173416, "grad_norm": 0.34757834672927856, "learning_rate": 4.2350759090864046e-05, "loss": 0.2158, "num_input_tokens_seen": 22325224, "step": 307 },
    { "epoch": 5.049844236760125, "grad_norm": 0.23899881541728973, "learning_rate": 4.230357934676017e-05, "loss": 0.1685, "num_input_tokens_seen": 22389624, "step": 308 },
    { "epoch": 5.066458982346833, "grad_norm": 0.38237079977989197, "learning_rate": 4.225628100586093e-05, "loss": 0.2253, "num_input_tokens_seen": 22463872, "step": 309 },
    { "epoch": 5.083073727933541, "grad_norm": 0.30238866806030273, "learning_rate": 4.220886439234385e-05, "loss": 0.1762, "num_input_tokens_seen": 22515824, "step": 310 },
    { "epoch": 5.099688473520249, "grad_norm": 0.2617652416229248, "learning_rate": 4.2161329831197095e-05, "loss": 0.1772, "num_input_tokens_seen": 22602336, "step": 311 },
    { "epoch": 5.116303219106958, "grad_norm": 0.31059470772743225, "learning_rate": 4.211367764821722e-05, "loss": 0.1729, "num_input_tokens_seen": 22655176, "step": 312 },
    { "epoch": 5.132917964693665, "grad_norm": 0.2957116663455963, "learning_rate": 4.2065908170006955e-05, "loss": 0.1857, "num_input_tokens_seen": 22728680, "step": 313 },
    { "epoch": 5.149532710280374, "grad_norm": 0.30844616889953613, "learning_rate": 4.201802172397295e-05, "loss": 0.176, "num_input_tokens_seen": 22806784, "step": 314 },
    { "epoch": 5.166147455867082, "grad_norm": 0.3961475193500519, "learning_rate": 4.197001863832355e-05, "loss": 0.1903, "num_input_tokens_seen": 22880648, "step": 315 },
    { "epoch": 5.18276220145379, "grad_norm": 0.3515004813671112, "learning_rate": 4.192189924206652e-05, "loss": 0.1706, "num_input_tokens_seen": 22953184, "step": 316 },
    { "epoch": 5.1993769470404985, "grad_norm": 0.6038290858268738, "learning_rate": 4.187366386500683e-05, "loss": 0.2127, "num_input_tokens_seen": 23037392, "step": 317 },
    { "epoch": 5.215991692627207, "grad_norm": 0.29695793986320496, "learning_rate": 4.182531283774434e-05, "loss": 0.293, "num_input_tokens_seen": 23086552, "step": 318 },
    { "epoch": 5.232606438213915, "grad_norm": 0.30214065313339233, "learning_rate": 4.177684649167158e-05, "loss": 0.1843, "num_input_tokens_seen": 23153152, "step": 319 },
    { "epoch": 5.249221183800623, "grad_norm": 0.3034592568874359, "learning_rate": 4.172826515897146e-05, "loss": 0.1945, "num_input_tokens_seen": 23240928, "step": 320 },
    { "epoch": 5.265835929387332, "grad_norm": 0.27193683385849, "learning_rate": 4.1679569172614996e-05, "loss": 0.1825, "num_input_tokens_seen": 23325912, "step": 321 },
    { "epoch": 5.282450674974039, "grad_norm": 0.3327508866786957, "learning_rate": 4.163075886635902e-05, "loss": 0.2044, "num_input_tokens_seen": 23401952, "step": 322 },
    { "epoch": 5.299065420560748, "grad_norm": 0.32610246539115906, "learning_rate": 4.1581834574743915e-05, "loss": 0.1718, "num_input_tokens_seen": 23463760, "step": 323 },
    { "epoch": 5.315680166147456, "grad_norm": 0.30451127886772156, "learning_rate": 4.1532796633091296e-05, "loss": 0.1768, "num_input_tokens_seen": 23535272, "step": 324 },
    { "epoch": 5.332294911734164, "grad_norm": 0.26648980379104614, "learning_rate": 4.148364537750172e-05, "loss": 0.1609, "num_input_tokens_seen": 23607752, "step": 325 },
    { "epoch": 5.348909657320872, "grad_norm": 0.2634081244468689, "learning_rate": 4.14343811448524e-05, "loss": 0.1769, "num_input_tokens_seen": 23674872, "step": 326 },
    { "epoch": 5.365524402907581, "grad_norm": 0.30805498361587524, "learning_rate": 4.138500427279485e-05, "loss": 0.2087, "num_input_tokens_seen": 23736384, "step": 327 },
    { "epoch": 5.382139148494288, "grad_norm": 0.2669171988964081, "learning_rate": 4.133551509975264e-05, "loss": 0.162, "num_input_tokens_seen": 23835000, "step": 328 },
    { "epoch": 5.398753894080997, "grad_norm": 0.25709372758865356, "learning_rate": 4.128591396491901e-05, "loss": 0.1614, "num_input_tokens_seen": 23912552, "step": 329 },
    { "epoch": 5.415368639667705, "grad_norm": 0.2793630063533783, "learning_rate": 4.123620120825459e-05, "loss": 0.2, "num_input_tokens_seen": 23987368, "step": 330 },
    { "epoch": 5.431983385254413, "grad_norm": 0.32492437958717346, "learning_rate": 4.118637717048506e-05, "loss": 0.1719, "num_input_tokens_seen": 24050848, "step": 331 },
    { "epoch": 5.4485981308411215, "grad_norm": 0.26048383116722107, "learning_rate": 4.113644219309877e-05, "loss": 0.1678, "num_input_tokens_seen": 24146104, "step": 332 },
    { "epoch": 5.46521287642783, "grad_norm": 0.3429310619831085, "learning_rate": 4.1086396618344476e-05, "loss": 0.176, "num_input_tokens_seen": 24194184, "step": 333 },
    { "epoch": 5.481827622014538, "grad_norm": 0.2843048870563507, "learning_rate": 4.1036240789228954e-05, "loss": 0.1844, "num_input_tokens_seen": 24275368, "step": 334 },
    { "epoch": 5.498442367601246, "grad_norm": 0.3061092495918274, "learning_rate": 4.098597504951462e-05, "loss": 0.1901, "num_input_tokens_seen": 24329192, "step": 335 },
    { "epoch": 5.515057113187955, "grad_norm": 0.28139594197273254, "learning_rate": 4.093559974371725e-05, "loss": 0.1751, "num_input_tokens_seen": 24426696, "step": 336 },
    { "epoch": 5.531671858774662, "grad_norm": 0.2379189133644104, "learning_rate": 4.088511521710352e-05, "loss": 0.171, "num_input_tokens_seen": 24514344, "step": 337 },
    { "epoch": 5.548286604361371, "grad_norm": 0.3467542827129364, "learning_rate": 4.083452181568875e-05, "loss": 0.1766, "num_input_tokens_seen": 24584464, "step": 338 },
    { "epoch": 5.564901349948079, "grad_norm": 0.2928486764431, "learning_rate": 4.0783819886234445e-05, "loss": 0.1584, "num_input_tokens_seen": 24660600, "step": 339 },
    { "epoch": 5.581516095534787, "grad_norm": 0.31528523564338684, "learning_rate": 4.073300977624594e-05, "loss": 0.1711, "num_input_tokens_seen": 24717088, "step": 340 },
    { "epoch": 5.598130841121495, "grad_norm": 0.313851922750473, "learning_rate": 4.068209183397004e-05, "loss": 0.1798, "num_input_tokens_seen": 24775352, "step": 341 },
    { "epoch": 5.614745586708204, "grad_norm": 0.2714058756828308, "learning_rate": 4.063106640839264e-05, "loss": 0.1666, "num_input_tokens_seen": 24860072, "step": 342 },
    { "epoch": 5.6313603322949115, "grad_norm": 0.2646903693675995, "learning_rate": 4.057993384923626e-05, "loss": 0.168, "num_input_tokens_seen": 24947856, "step": 343 },
    { "epoch": 5.64797507788162, "grad_norm": 0.29279816150665283, "learning_rate": 4.052869450695776e-05, "loss": 0.1801, "num_input_tokens_seen": 25024992, "step": 344 },
    { "epoch": 5.6645898234683285, "grad_norm": 0.3221881091594696, "learning_rate": 4.047734873274586e-05, "loss": 0.183, "num_input_tokens_seen": 25092248, "step": 345 },
    { "epoch": 5.681204569055036, "grad_norm": 0.31584057211875916, "learning_rate": 4.042589687851872e-05, "loss": 0.1752, "num_input_tokens_seen": 25170496, "step": 346 },
    { "epoch": 5.697819314641745, "grad_norm": 0.2635148763656616, "learning_rate": 4.037433929692161e-05, "loss": 0.1798, "num_input_tokens_seen": 25268720, "step": 347 },
    { "epoch": 5.714434060228453, "grad_norm": 0.3024344742298126, "learning_rate": 4.0322676341324415e-05, "loss": 0.1793, "num_input_tokens_seen": 25332688, "step": 348 },
    { "epoch": 5.731048805815161, "grad_norm": 0.35282352566719055, "learning_rate": 4.027090836581925e-05, "loss": 0.2022, "num_input_tokens_seen": 25413904, "step": 349 },
    { "epoch": 5.747663551401869, "grad_norm": 0.3042284846305847, "learning_rate": 4.021903572521802e-05, "loss": 0.1848, "num_input_tokens_seen": 25503720, "step": 350 },
    { "epoch": 5.764278296988578, "grad_norm": 0.33289963006973267, "learning_rate": 4.0167058775049996e-05, "loss": 0.1931, "num_input_tokens_seen": 25568560, "step": 351 },
    { "epoch": 5.780893042575285, "grad_norm": 0.3255409598350525, "learning_rate": 4.011497787155938e-05, "loss": 0.1675, "num_input_tokens_seen": 25635184, "step": 352 },
    { "epoch": 5.797507788161994, "grad_norm": 0.2858707904815674, "learning_rate": 4.006279337170283e-05, "loss": 0.176, "num_input_tokens_seen": 25719768, "step": 353 },
    { "epoch": 5.814122533748702, "grad_norm": 0.30737245082855225, "learning_rate": 4.0010505633147106e-05, "loss": 0.1705, "num_input_tokens_seen": 25795016, "step": 354 },
    { "epoch": 5.83073727933541, "grad_norm": 0.29503941535949707, "learning_rate": 3.995811501426648e-05, "loss": 0.153, "num_input_tokens_seen": 25863184, "step": 355 },
    { "epoch": 5.8473520249221185, "grad_norm": 0.3046650290489197, "learning_rate": 3.99056218741404e-05, "loss": 0.1795, "num_input_tokens_seen": 25935752, "step": 356 },
    { "epoch": 5.863966770508826, "grad_norm": 0.33626508712768555, "learning_rate": 3.985302657255097e-05, "loss": 0.1744, "num_input_tokens_seen": 25995760, "step": 357 },
    { "epoch": 5.880581516095535, "grad_norm": 0.2900950312614441, "learning_rate": 3.980032946998049e-05, "loss": 0.1538, "num_input_tokens_seen": 26061240, "step": 358 },
    { "epoch": 5.897196261682243, "grad_norm": 0.32002708315849304, "learning_rate": 3.974753092760901e-05, "loss": 0.172, "num_input_tokens_seen": 26131024, "step": 359 },
    { "epoch": 5.913811007268951, "grad_norm": 0.22458530962467194, "learning_rate": 3.969463130731183e-05, "loss": 0.197, "num_input_tokens_seen": 26233672, "step": 360 },
    { "epoch": 5.930425752855659, "grad_norm": 0.2662505805492401, "learning_rate": 3.964163097165702e-05, "loss": 0.1359, "num_input_tokens_seen": 26303488, "step": 361 },
    { "epoch": 5.947040498442368, "grad_norm": 0.2906314432621002, "learning_rate": 3.958853028390294e-05, "loss": 0.1622, "num_input_tokens_seen": 26377768, "step": 362 },
    { "epoch": 5.963655244029075, "grad_norm": 0.33236268162727356, "learning_rate": 3.953532960799577e-05, "loss": 0.3039, "num_input_tokens_seen": 26435984, "step": 363 },
    { "epoch": 5.980269989615784, "grad_norm": 0.3479491174221039, "learning_rate": 3.948202930856697e-05, "loss": 0.185, "num_input_tokens_seen": 26513960, "step": 364 },
    { "epoch": 5.996884735202492, "grad_norm": 0.2604476511478424, "learning_rate": 3.942862975093085e-05, "loss": 0.1671, "num_input_tokens_seen": 26599064, "step": 365 },
    { "epoch": 6.0, "grad_norm": 0.5614582300186157, "learning_rate": 3.937513130108197e-05, "loss": 0.2127, "num_input_tokens_seen": 26617264, "step": 366 },
    { "epoch": 6.0166147455867085, "grad_norm": 0.29380959272384644, "learning_rate": 3.9321534325692726e-05, "loss": 0.1736, "num_input_tokens_seen": 26717024, "step": 367 },
    { "epoch": 6.033229491173416, "grad_norm": 0.3308286666870117, "learning_rate": 3.92678391921108e-05, "loss": 0.166, "num_input_tokens_seen": 26788120, "step": 368 },
    { "epoch": 6.049844236760125, "grad_norm": 0.34477588534355164, "learning_rate": 3.92140462683566e-05, "loss": 0.182, "num_input_tokens_seen": 26853760, "step": 369 },
    { "epoch": 6.066458982346833, "grad_norm": 0.33609539270401, "learning_rate": 3.916015592312082e-05, "loss": 0.1621, "num_input_tokens_seen": 26923848, "step": 370 },
    { "epoch": 6.083073727933541, "grad_norm": 0.2985127866268158, "learning_rate": 3.9106168525761855e-05, "loss": 0.1488, "num_input_tokens_seen": 26976184, "step": 371 },
    { "epoch": 6.099688473520249, "grad_norm": 0.3301701545715332, "learning_rate": 3.905208444630327e-05, "loss": 0.1554, "num_input_tokens_seen": 27065712, "step": 372 },
    { "epoch": 6.116303219106958, "grad_norm": 0.22536417841911316, "learning_rate": 3.899790405543129e-05, "loss": 0.1653, "num_input_tokens_seen": 27145472, "step": 373 },
    { "epoch": 6.132917964693665, "grad_norm": 0.28911101818084717, "learning_rate": 3.894362772449226e-05, "loss": 0.1503, "num_input_tokens_seen": 27233904, "step": 374 },
    { "epoch": 6.149532710280374, "grad_norm": 0.30377212166786194, "learning_rate": 3.888925582549006e-05, "loss": 0.1358, "num_input_tokens_seen": 27311512, "step": 375 },
    { "epoch": 6.166147455867082, "grad_norm": 0.32584136724472046, "learning_rate": 3.883478873108361e-05, "loss": 0.15, "num_input_tokens_seen": 27387400, "step": 376 },
    { "epoch": 6.18276220145379, "grad_norm": 0.33625590801239014, "learning_rate": 3.878022681458426e-05, "loss": 0.1588, "num_input_tokens_seen": 27461280, "step": 377 },
    { "epoch": 6.1993769470404985, "grad_norm": 0.2865462601184845, "learning_rate": 3.87255704499533e-05, "loss": 0.143, "num_input_tokens_seen": 27556400, "step": 378 },
    { "epoch": 6.215991692627207, "grad_norm": 0.3429831266403198, "learning_rate": 3.8670820011799315e-05, "loss": 0.1713, "num_input_tokens_seen": 27613664, "step": 379 },
    { "epoch": 6.232606438213915, "grad_norm": 0.32414478063583374, "learning_rate": 3.861597587537568e-05, "loss": 0.1893, "num_input_tokens_seen": 27681024, "step": 380 },
    { "epoch": 6.249221183800623, "grad_norm": 0.3242063522338867, "learning_rate": 3.856103841657797e-05, "loss": 0.156, "num_input_tokens_seen": 27759536, "step": 381 },
    { "epoch": 6.265835929387332, "grad_norm": 0.22485007345676422, "learning_rate": 3.850600801194138e-05, "loss": 0.1247, "num_input_tokens_seen": 27857288, "step": 382 },
    { "epoch": 6.282450674974039, "grad_norm": 0.4592108726501465, "learning_rate": 3.8450885038638127e-05, "loss": 0.172, "num_input_tokens_seen": 27940528, "step": 383 },
    { "epoch": 6.299065420560748, "grad_norm": 0.3695475459098816, "learning_rate": 3.8395669874474915e-05, "loss": 0.166, "num_input_tokens_seen": 28033824, "step": 384 },
    { "epoch": 6.315680166147456, "grad_norm": 0.33483219146728516, "learning_rate": 3.834036289789029e-05, "loss": 0.1415, "num_input_tokens_seen": 28096192, "step": 385 },
    { "epoch": 6.332294911734164, "grad_norm": 0.2734437882900238, "learning_rate": 3.828496448795207e-05, "loss": 0.1369, "num_input_tokens_seen": 28181256, "step": 386 },
    { "epoch": 6.348909657320872, "grad_norm": 0.3039200007915497, "learning_rate": 3.822947502435477e-05, "loss": 0.1465, "num_input_tokens_seen": 28245480, "step": 387 },
    { "epoch": 6.365524402907581, "grad_norm": 0.3409143388271332, "learning_rate": 3.8173894887416945e-05, "loss": 0.1456, "num_input_tokens_seen": 28307200, "step": 388 },
    { "epoch": 6.382139148494288, "grad_norm": 0.390440970659256, "learning_rate": 3.811822445807863e-05, "loss": 0.1752, "num_input_tokens_seen": 28384640, "step": 389 },
    { "epoch": 6.398753894080997, "grad_norm": 0.3439676761627197, "learning_rate": 3.8062464117898724e-05, "loss": 0.1344, "num_input_tokens_seen": 28447992, "step": 390 },
    { "epoch": 6.415368639667705, "grad_norm": 0.4008062779903412, "learning_rate": 3.800661424905235e-05, "loss": 0.1506, "num_input_tokens_seen": 28513856, "step": 391 },
    { "epoch": 6.431983385254413, "grad_norm": 0.5333216190338135, "learning_rate": 3.795067523432826e-05, "loss": 0.1648, "num_input_tokens_seen": 28596584, "step": 392 },
    { "epoch": 6.4485981308411215, "grad_norm": 0.3630414605140686, "learning_rate": 3.789464745712619e-05, "loss": 0.1762, "num_input_tokens_seen": 28664560, "step": 393 },
    { "epoch": 6.46521287642783, "grad_norm": 0.3014258146286011, "learning_rate": 3.7838531301454254e-05, "loss": 0.14, "num_input_tokens_seen": 28739512, "step": 394 },
    { "epoch": 6.481827622014538, "grad_norm": 0.4165439307689667, "learning_rate": 3.77823271519263e-05, "loss": 0.1591, "num_input_tokens_seen": 28831848, "step": 395 },
    { "epoch": 6.498442367601246, "grad_norm": 0.36684006452560425, "learning_rate": 3.7726035393759285e-05, "loss": 0.163, "num_input_tokens_seen": 28898408, "step": 396 },
    { "epoch": 6.515057113187955, "grad_norm": 0.3208068609237671, "learning_rate": 3.76696564127706e-05, "loss": 0.1558, "num_input_tokens_seen": 28960224, "step": 397 },
    { "epoch": 6.531671858774662, "grad_norm": 0.42699623107910156, "learning_rate": 3.761319059537548e-05, "loss": 0.1906, "num_input_tokens_seen": 29020568, "step": 398 },
    { "epoch": 6.548286604361371, "grad_norm": 0.3583790957927704, "learning_rate": 3.755663832858432e-05, "loss": 0.1399, "num_input_tokens_seen": 29095448, "step": 399 },
    { "epoch": 6.564901349948079, "grad_norm": 0.34571129083633423, "learning_rate": 3.7500000000000003e-05, "loss": 0.1554, "num_input_tokens_seen": 29186600, "step": 400 },
    { "epoch": 6.581516095534787, "grad_norm": 0.30998197197914124, "learning_rate": 3.744327599781531e-05, "loss": 0.2448, "num_input_tokens_seen": 29258552, "step": 401 },
    { "epoch": 6.598130841121495, "grad_norm": 0.35567161440849304, "learning_rate": 3.7386466710810194e-05, "loss": 0.1853, "num_input_tokens_seen": 29344848, "step": 402 },
    { "epoch": 6.614745586708204, "grad_norm": 0.3521808683872223, "learning_rate": 3.7329572528349146e-05, "loss": 0.2248, "num_input_tokens_seen": 29410184, "step": 403 },
    { "epoch": 6.6313603322949115, "grad_norm": 0.31165575981140137, "learning_rate": 3.727259384037852e-05, "loss": 0.1647, "num_input_tokens_seen": 29484928, "step": 404 },
    { "epoch": 6.64797507788162, "grad_norm": 0.3182571530342102, "learning_rate": 3.721553103742388e-05, "loss": 0.1622, "num_input_tokens_seen": 29566432, "step": 405 },
    { "epoch": 6.6645898234683285, "grad_norm": 0.3434613347053528, "learning_rate": 3.715838451058726e-05, "loss": 0.1509, "num_input_tokens_seen": 29634032, "step": 406 },
    { "epoch": 6.681204569055036, "grad_norm": 0.3849641978740692, "learning_rate": 3.7101154651544584e-05, "loss": 0.1719, "num_input_tokens_seen": 29681424, "step": 407 },
    { "epoch": 6.697819314641745, "grad_norm": 0.3322864770889282, "learning_rate": 3.704384185254288e-05, "loss": 0.1478, "num_input_tokens_seen": 29762208, "step": 408 },
    { "epoch": 6.714434060228453, "grad_norm": 0.3480667769908905, "learning_rate": 3.6986446506397666e-05, "loss": 0.1563, "num_input_tokens_seen": 29816280, "step": 409 },
    { "epoch": 6.731048805815161, "grad_norm": 0.31062281131744385, "learning_rate": 3.692896900649021e-05, "loss": 0.1456, "num_input_tokens_seen": 29893040, "step": 410 },
    { "epoch": 6.747663551401869, "grad_norm": 0.3606508672237396, "learning_rate": 3.6871409746764865e-05, "loss": 0.1617, "num_input_tokens_seen": 29971688, "step": 411 },
    { "epoch": 6.764278296988578, "grad_norm": 0.2981427013874054, "learning_rate": 3.681376912172636e-05, "loss": 0.1417, "num_input_tokens_seen": 30051784, "step": 412 },
    { "epoch": 6.780893042575285, "grad_norm": 0.30427512526512146, "learning_rate": 3.675604752643706e-05, "loss": 0.1527, "num_input_tokens_seen": 30146048, "step": 413 },
    { "epoch": 6.797507788161994, "grad_norm": 0.3024544417858124, "learning_rate": 3.6698245356514335e-05, "loss": 0.1498, "num_input_tokens_seen": 30221296, "step": 414 },
    { "epoch": 6.814122533748702, "grad_norm": 0.2973288297653198, "learning_rate": 3.6640363008127784e-05, "loss": 0.1594, "num_input_tokens_seen": 30287664, "step": 415 },
    { "epoch": 6.83073727933541, "grad_norm": 0.37876859307289124, "learning_rate": 3.6582400877996546e-05, "loss": 0.1691, "num_input_tokens_seen": 30352816, "step": 416 },
    { "epoch": 6.8473520249221185, "grad_norm": 0.31672796607017517, "learning_rate": 3.652435936338656e-05, "loss": 0.1556, "num_input_tokens_seen": 30439688, "step": 417 },
    { "epoch": 6.863966770508826, "grad_norm": 0.41883349418640137, "learning_rate": 3.646623886210788e-05, "loss": 0.1729, "num_input_tokens_seen": 30506856, "step": 418 },
    { "epoch": 6.880581516095535, "grad_norm": 0.3436118960380554, "learning_rate": 3.64080397725119e-05, "loss": 0.1459, "num_input_tokens_seen": 30565848, "step": 419 },
    { "epoch": 6.897196261682243, "grad_norm": 0.42407792806625366, "learning_rate": 3.634976249348867e-05, "loss": 0.1753, "num_input_tokens_seen": 30633944, "step": 420 },
    { "epoch": 6.913811007268951, "grad_norm": 0.32624706625938416, "learning_rate": 3.629140742446414e-05, "loss": 0.1538, "num_input_tokens_seen": 30704760, "step": 421 },
    { "epoch": 6.930425752855659, "grad_norm": 0.3517460227012634, "learning_rate": 3.623297496539741e-05, "loss": 0.1475, "num_input_tokens_seen": 30773792, "step": 422 },
    { "epoch": 6.947040498442368, "grad_norm": 0.36355283856391907, "learning_rate": 3.6174465516778035e-05, "loss": 0.16, "num_input_tokens_seen": 30848672, "step": 423 },
    { "epoch": 6.963655244029075, "grad_norm": 0.3491535186767578, "learning_rate": 3.611587947962319e-05, "loss": 0.1505, "num_input_tokens_seen": 30906064, "step": 424 },
    { "epoch": 6.980269989615784, "grad_norm": 0.4260394275188446, "learning_rate": 3.6057217255475034e-05, "loss": 0.1796, "num_input_tokens_seen": 30964720, "step": 425 },
    { "epoch": 6.996884735202492, "grad_norm": 0.3254542052745819, "learning_rate": 3.599847924639788e-05, "loss": 0.159, "num_input_tokens_seen": 31043152, "step": 426 },
    { "epoch": 7.0, "grad_norm": 0.6581417918205261, "learning_rate": 3.593966585497547e-05, "loss": 0.1275, "num_input_tokens_seen": 31056056, "step": 427 },
    { "epoch": 7.0166147455867085, "grad_norm": 0.3190588057041168, "learning_rate": 3.588077748430819e-05, "loss": 0.1349, "num_input_tokens_seen": 31135304, "step": 428 },
    { "epoch": 7.033229491173416, "grad_norm": 0.3564179241657257, "learning_rate": 3.582181453801036e-05, "loss": 0.148, "num_input_tokens_seen": 31185600, "step": 429 },
    { "epoch": 7.049844236760125, "grad_norm": 0.3535906672477722, "learning_rate": 3.576277742020738e-05, "loss": 0.1439, "num_input_tokens_seen": 31254312, "step": 430 },
    { "epoch": 7.066458982346833, "grad_norm": 0.5310954451560974, "learning_rate": 3.570366653553307e-05, "loss": 0.1345, "num_input_tokens_seen": 31339112, "step": 431 },
    { "epoch": 7.083073727933541, "grad_norm": 0.35149645805358887, "learning_rate": 3.564448228912682e-05, "loss": 0.1196, "num_input_tokens_seen": 31424024, "step": 432 },
    { "epoch": 7.099688473520249, "grad_norm": 0.3583419620990753, "learning_rate": 3.558522508663081e-05, "loss": 0.1533, "num_input_tokens_seen": 31494656, "step": 433 },
    { "epoch": 7.116303219106958, "grad_norm": 0.2901877462863922, "learning_rate": 3.552589533418728e-05, "loss": 0.1404, "num_input_tokens_seen": 31588536, "step": 434 },
    { "epoch": 7.132917964693665, "grad_norm": 0.35585492849349976, "learning_rate": 3.54664934384357e-05, "loss": 0.2437, "num_input_tokens_seen": 31657560, "step": 435 },
    { "epoch": 7.149532710280374, "grad_norm": 0.3314085006713867, "learning_rate": 3.540701980651003e-05, "loss": 0.15, "num_input_tokens_seen": 31743992, "step": 436 },
    { "epoch": 7.166147455867082, "grad_norm": 0.34194597601890564, "learning_rate": 3.534747484603587e-05, "loss": 0.1375, "num_input_tokens_seen": 31806520, "step": 437 },
    { "epoch": 7.18276220145379, "grad_norm": 0.3955981731414795, "learning_rate": 3.528785896512772e-05, "loss": 0.1388, "num_input_tokens_seen": 31860464, "step": 438 },
    { "epoch": 7.1993769470404985, "grad_norm": 0.33545568585395813, "learning_rate": 3.5228172572386146e-05, "loss": 0.2926, "num_input_tokens_seen": 31921424, "step": 439 },
    { "epoch": 7.215991692627207, "grad_norm": 0.43362271785736084, "learning_rate": 3.516841607689501e-05, "loss": 0.1454, "num_input_tokens_seen": 31981064, "step": 440 },
    { "epoch": 7.232606438213915, "grad_norm": 0.40348634123802185, "learning_rate": 3.510858988821863e-05, "loss": 0.1388, "num_input_tokens_seen": 32050648, "step": 441 },
    { "epoch": 7.249221183800623, "grad_norm": 0.35088223218917847, "learning_rate": 3.504869441639901e-05, "loss": 0.1248, "num_input_tokens_seen": 32118584, "step": 442 },
    { "epoch": 7.265835929387332, "grad_norm": 0.3033677041530609, "learning_rate": 3.4988730071953004e-05, "loss": 0.1252, "num_input_tokens_seen": 32206384, "step": 443 },
    { "epoch": 7.282450674974039, "grad_norm": 0.3067215383052826, "learning_rate": 3.4928697265869515e-05, "loss": 0.1101, "num_input_tokens_seen": 32299040, "step": 444 },
    { "epoch": 7.299065420560748, "grad_norm": 0.33541834354400635, "learning_rate": 3.486859640960668e-05, "loss": 0.1351, "num_input_tokens_seen": 32355624, "step": 445 },
    { "epoch": 7.315680166147456, "grad_norm": 0.3549206554889679, "learning_rate": 3.480842791508904e-05, "loss": 0.1513, "num_input_tokens_seen": 32427792, "step": 446 },
    { "epoch": 7.332294911734164, "grad_norm": 0.3848089277744293, "learning_rate": 3.474819219470471e-05, "loss": 0.1342, "num_input_tokens_seen": 32508696, "step": 447 },
    { "epoch": 7.348909657320872, "grad_norm": 0.31206491589546204, "learning_rate": 3.4687889661302576e-05, "loss": 0.136, "num_input_tokens_seen": 32601312, "step": 448 },
    { "epoch": 7.365524402907581, "grad_norm": 0.2999168336391449, "learning_rate": 3.4627520728189456e-05, "loss": 0.1183, "num_input_tokens_seen": 32680256, "step": 449 },
    { "epoch": 7.382139148494288, "grad_norm": 0.3667398989200592, "learning_rate": 3.456708580912725e-05, "loss": 0.1375, "num_input_tokens_seen": 32738816, "step": 450 },
    { "epoch": 7.398753894080997, "grad_norm": 0.3525990843772888, "learning_rate": 3.4506585318330125e-05, "loss": 0.1265, "num_input_tokens_seen": 32813240, "step": 451 },
    { "epoch": 7.415368639667705, "grad_norm": 0.3496154546737671, "learning_rate": 3.444601967046168e-05, "loss": 0.1422, "num_input_tokens_seen": 32889680, "step": 452 },
    { "epoch": 7.431983385254413, "grad_norm": 0.38129401206970215, "learning_rate": 3.438538928063208e-05, "loss": 0.1534, "num_input_tokens_seen": 32964760, "step": 453 },
    { "epoch": 7.4485981308411215, "grad_norm": 0.3664973974227905, "learning_rate": 3.432469456439523e-05, "loss": 0.1517, "num_input_tokens_seen": 33048992, "step": 454 },
    { "epoch": 7.46521287642783, "grad_norm": 0.3663897216320038, "learning_rate": 3.426393593774591e-05, "loss": 0.1345, "num_input_tokens_seen": 33130200, "step": 455 },
    { "epoch": 7.481827622014538, "grad_norm": 0.33521801233291626, "learning_rate": 3.4203113817116957e-05, "loss": 0.133, "num_input_tokens_seen": 33223024, "step": 456 },
    { "epoch": 7.498442367601246, "grad_norm": 0.32186245918273926, "learning_rate": 3.414222861937636e-05, "loss": 0.1394, "num_input_tokens_seen": 33303120, "step": 457 },
    { "epoch": 7.515057113187955, "grad_norm": 0.35747572779655457, "learning_rate": 3.408128076182446e-05, "loss": 0.1474, "num_input_tokens_seen": 33364984, "step": 458 },
    { "epoch": 7.531671858774662, "grad_norm": 0.37195590138435364, "learning_rate": 3.402027066219105e-05, "loss": 0.1585, "num_input_tokens_seen": 33427352, "step": 459 },
    { "epoch": 7.548286604361371, "grad_norm": 0.3562054932117462, "learning_rate": 3.39591987386325e-05, "loss": 0.1315, "num_input_tokens_seen": 33481272, "step": 460 },
    { "epoch": 7.564901349948079, "grad_norm": 0.3821605443954468, "learning_rate": 3.389806540972898e-05, "loss": 0.1252, "num_input_tokens_seen": 33538904, "step": 461 },
    { "epoch": 7.581516095534787, "grad_norm": 0.3299770653247833, "learning_rate": 3.383687109448143e-05, "loss": 0.1375, "num_input_tokens_seen": 33635976, "step": 462 },
    { "epoch": 7.598130841121495, "grad_norm": 0.38955169916152954, "learning_rate": 3.377561621230887e-05, "loss": 0.137, "num_input_tokens_seen": 33711184, "step": 463 },
    { "epoch": 7.614745586708204, "grad_norm": 0.3389476537704468, "learning_rate": 3.3714301183045385e-05, "loss": 0.1133, "num_input_tokens_seen": 33778848, "step": 464 },
    { "epoch": 7.6313603322949115, "grad_norm": 0.33927062153816223, "learning_rate": 3.365292642693732e-05, "loss": 0.129, "num_input_tokens_seen": 33866024, "step": 465 },
    { "epoch": 7.64797507788162, "grad_norm": 0.26980310678482056, "learning_rate": 3.359149236464041e-05, "loss": 0.1453, "num_input_tokens_seen": 33978144, "step": 466 },
    { "epoch": 7.6645898234683285, "grad_norm": 0.3778550922870636, "learning_rate": 3.35299994172168e-05, "loss": 0.1642, "num_input_tokens_seen": 34047480, "step": 467 },
    { "epoch": 7.681204569055036, "grad_norm": 0.2846803069114685, "learning_rate": 3.346844800613229e-05, "loss": 0.1296, "num_input_tokens_seen": 34134480, "step": 468 },
    { "epoch": 7.697819314641745, "grad_norm": 0.3579311668872833, "learning_rate": 3.340683855325335e-05, "loss": 0.1299, "num_input_tokens_seen": 34190176, "step": 469 },
    { "epoch": 7.714434060228453, "grad_norm": 0.37811708450317383, "learning_rate": 3.3345171480844275e-05, "loss": 0.155, "num_input_tokens_seen": 34267336, "step": 470 },
    { "epoch": 7.731048805815161, "grad_norm": 0.35094380378723145, "learning_rate": 3.3283447211564276e-05, "loss": 0.1439, "num_input_tokens_seen": 34333616, "step": 471 },
    { "epoch": 7.747663551401869, "grad_norm": 0.313473105430603, "learning_rate": 3.322166616846458e-05, "loss": 0.1451, "num_input_tokens_seen": 34404000, "step": 472 },
    { "epoch": 7.764278296988578, "grad_norm": 0.36839237809181213, "learning_rate": 3.315982877498555e-05, "loss": 0.1403, "num_input_tokens_seen": 34466048, "step": 473 },
    { "epoch": 7.780893042575285, "grad_norm": 0.32588714361190796, "learning_rate": 3.309793545495374e-05, "loss": 0.1355, "num_input_tokens_seen": 34547312, "step": 474 },
    { "epoch": 7.797507788161994, "grad_norm": 0.3828336000442505, "learning_rate": 3.303598663257904e-05, "loss": 0.1295, "num_input_tokens_seen": 34600544, "step": 475 },
    { "epoch": 7.814122533748702, "grad_norm": 0.35497623682022095, "learning_rate": 3.2973982732451755e-05, "loss": 0.1405, "num_input_tokens_seen": 34660792, "step": 476 },
    { "epoch": 7.83073727933541, "grad_norm": 0.28363677859306335, "learning_rate": 3.2911924179539656e-05, "loss": 0.1858, "num_input_tokens_seen": 34778440, "step": 477 },
    { "epoch": 7.8473520249221185, "grad_norm": 0.376277893781662, "learning_rate": 3.284981139918513e-05, "loss": 0.1454, "num_input_tokens_seen": 34849760, "step": 478 },
    { "epoch": 7.863966770508826, "grad_norm": 0.32492315769195557, "learning_rate": 3.278764481710221e-05, "loss": 0.1177, "num_input_tokens_seen": 34940776, "step": 479 },
    { "epoch": 7.880581516095535, "grad_norm": 0.33199772238731384, "learning_rate": 3.272542485937369e-05, "loss": 0.1369, "num_input_tokens_seen": 35018104, "step": 480 },
    { "epoch": 7.897196261682243, "grad_norm": 0.3320644795894623, "learning_rate": 3.26631519524482e-05, "loss": 0.1267, "num_input_tokens_seen": 35079744, "step": 481 },
    { "epoch": 7.913811007268951, "grad_norm": 0.3606792390346527, "learning_rate": 3.260082652313726e-05, "loss": 0.1236, "num_input_tokens_seen": 35132808, "step": 482 },
    { "epoch": 7.930425752855659, "grad_norm": 0.36535707116127014, "learning_rate": 3.253844899861239e-05, "loss": 0.131, "num_input_tokens_seen": 35197816, "step": 483 },
    { "epoch": 7.947040498442368, "grad_norm": 0.294482946395874, "learning_rate": 3.247601980640217e-05, "loss": 0.1129, "num_input_tokens_seen": 35275528, "step": 484 },
    { "epoch": 7.963655244029075, "grad_norm": 0.3573679029941559, "learning_rate": 3.241353937438927e-05, "loss": 0.1448, "num_input_tokens_seen": 35333280, "step": 485 },
    { "epoch": 7.980269989615784, "grad_norm": 0.3585527837276459, "learning_rate": 3.23510081308076e-05, "loss": 0.1488, "num_input_tokens_seen": 35412944, "step": 486 },
    { "epoch": 7.996884735202492, "grad_norm": 0.3394106924533844, "learning_rate": 3.228842650423929e-05, "loss": 0.148, "num_input_tokens_seen": 35485056, "step": 487 },
    { "epoch": 8.0, "grad_norm": 0.7672997117042542, "learning_rate": 3.222579492361179e-05, "loss": 0.1484, "num_input_tokens_seen": 35494824, "step": 488 },
    { "epoch": 8.016614745586708, "grad_norm": 0.3408578932285309, "learning_rate": 3.2163113818194964e-05, "loss": 0.124, "num_input_tokens_seen": 35557768, "step": 489 },
    { "epoch": 8.033229491173417, "grad_norm": 0.3427956700325012, "learning_rate": 3.210038361759807e-05, "loss": 0.124, "num_input_tokens_seen": 35613120, "step": 490 },
    { "epoch": 8.049844236760125, "grad_norm": 0.3450639247894287, "learning_rate": 3.2037604751766885e-05, "loss": 0.1214, "num_input_tokens_seen": 35674176, "step": 491 },
    { "epoch": 8.066458982346832, "grad_norm": 0.34786927700042725, "learning_rate": 3.1974777650980735e-05, "loss": 0.1295, "num_input_tokens_seen": 35786664, "step": 492 },
    { "epoch": 8.083073727933542, "grad_norm": 0.44464972615242004, "learning_rate": 3.191190274584952e-05, "loss": 0.1376, "num_input_tokens_seen": 35840720, "step": 493 },
    { "epoch": 8.09968847352025, "grad_norm": 0.32324495911598206, "learning_rate": 3.184898046731082e-05, "loss": 0.1087, "num_input_tokens_seen": 35936736, "step": 494 },
    { "epoch": 8.116303219106957, "grad_norm": 0.3147285580635071, "learning_rate": 3.178601124662686e-05, "loss": 0.1227, "num_input_tokens_seen": 36013800, "step": 495 },
    { "epoch": 8.132917964693666, "grad_norm": 0.31754013895988464, "learning_rate": 3.172299551538164e-05, "loss": 0.1221, "num_input_tokens_seen": 36097904, "step": 496 },
    { "epoch": 8.149532710280374, "grad_norm": 0.28259411454200745, "learning_rate": 3.165993370547794e-05, "loss": 0.0985, "num_input_tokens_seen": 36195544, "step": 497 },
    { "epoch": 8.166147455867081, "grad_norm": 0.39379462599754333, "learning_rate": 3.1596826249134324e-05, "loss": 0.1529, "num_input_tokens_seen": 36261256, "step": 498 },
    { "epoch": 8.18276220145379, "grad_norm": 0.44324612617492676, "learning_rate": 3.153367357888224e-05, "loss": 0.1489, "num_input_tokens_seen": 36325024, "step": 499 },
    { "epoch": 8.199376947040498, "grad_norm": 0.34766262769699097, "learning_rate": 3.147047612756302e-05, "loss": 0.1288, "num_input_tokens_seen": 36377368, "step": 500 },
    { "epoch": 8.215991692627206, "grad_norm": 0.3318942189216614, "learning_rate": 3.140723432832492e-05, "loss": 0.1153, "num_input_tokens_seen": 36459240, "step": 501 },
    { "epoch": 8.232606438213915, "grad_norm": 0.38428008556365967, "learning_rate": 3.1343948614620145e-05, "loss": 0.1305, "num_input_tokens_seen": 36553088, "step": 502 },
    { "epoch": 8.249221183800623, "grad_norm": 0.40489867329597473, "learning_rate": 3.128061942020189e-05, "loss": 0.2824, "num_input_tokens_seen": 36611464, "step": 503 },
    { "epoch": 8.26583592938733, "grad_norm": 0.26625269651412964, "learning_rate": 3.121724717912138e-05, "loss": 0.1033, "num_input_tokens_seen": 36705696, "step": 504 },
    { "epoch": 8.28245067497404, "grad_norm": 0.3326374590396881, "learning_rate": 3.115383232572483e-05, "loss": 0.1124, "num_input_tokens_seen": 36762744, "step": 505 },
    { "epoch": 8.299065420560748, "grad_norm": 0.3752160966396332, "learning_rate": 3.109037529465056e-05, "loss": 0.1297, "num_input_tokens_seen": 36827816, "step": 506 },
    { "epoch": 8.315680166147455, "grad_norm": 0.3128851056098938, "learning_rate": 3.102687652082597e-05, "loss": 0.1158, "num_input_tokens_seen": 36931424, "step": 507 },
    { "epoch": 8.332294911734165, "grad_norm": 0.3378419280052185, "learning_rate": 3.0963336439464526e-05, "loss": 0.1146, "num_input_tokens_seen": 36991464, "step": 508 },
    { "epoch": 8.348909657320872, "grad_norm": 0.33648964762687683, "learning_rate": 3.089975548606283e-05, "loss": 0.1044, "num_input_tokens_seen": 37092928, "step": 509 },
    { "epoch": 8.36552440290758, "grad_norm": 0.36446887254714966, "learning_rate": 3.083613409639764e-05, "loss": 0.1192, "num_input_tokens_seen": 37168792, "step": 510 },
    { "epoch": 8.38213914849429, "grad_norm": 0.36081403493881226, "learning_rate": 3.0772472706522806e-05, "loss": 0.1197, "num_input_tokens_seen": 37258864, "step": 511 },
    { "epoch": 8.398753894080997, "grad_norm": 0.34557875990867615, "learning_rate": 3.0708771752766394e-05, "loss": 0.1351, "num_input_tokens_seen": 37343224, "step": 512 },
    { "epoch": 8.415368639667705, "grad_norm": 0.42923229932785034, "learning_rate": 3.06450316717276e-05, "loss": 0.1336, "num_input_tokens_seen": 37395488, "step": 513 },
    { "epoch": 8.431983385254414, "grad_norm": 0.29999154806137085, "learning_rate": 3.0581252900273786e-05, "loss": 0.1057, "num_input_tokens_seen": 37473248, "step": 514 },
    { "epoch": 8.448598130841122, "grad_norm": 0.36136433482170105, "learning_rate": 3.0517435875537536e-05, "loss": 0.1101, "num_input_tokens_seen": 37532096, "step": 515 },
    { "epoch": 8.46521287642783, "grad_norm": 0.2820552587509155, "learning_rate": 3.045358103491357e-05, "loss": 0.1079, "num_input_tokens_seen": 37622328, "step": 516 },
    { "epoch": 8.481827622014539, "grad_norm": 0.4173890948295593, "learning_rate": 3.038968881605583e-05, "loss": 0.1245, "num_input_tokens_seen": 37686304, "step": 517 },
    { "epoch": 8.498442367601246, "grad_norm": 0.3525732159614563, "learning_rate": 3.0325759656874418e-05, "loss": 0.1275, "num_input_tokens_seen": 37770856, "step": 518 },
    { "epoch": 8.515057113187954, "grad_norm": 0.419452428817749, "learning_rate": 3.026179399553264e-05, "loss": 0.1123, "num_input_tokens_seen": 37834072, "step": 519 },
    { "epoch": 8.531671858774663, "grad_norm": 0.38209953904151917, "learning_rate": 3.0197792270443982e-05, "loss": 0.112, "num_input_tokens_seen": 37889928, "step": 520 },
    { "epoch": 8.54828660436137, "grad_norm": 0.3691641390323639, "learning_rate": 3.0133754920269103e-05, "loss": 0.2376, "num_input_tokens_seen": 37971296, "step": 521 },
    { "epoch": 8.564901349948078, "grad_norm": 0.3733454644680023, "learning_rate": 3.0069682383912813e-05, "loss": 0.123, "num_input_tokens_seen": 38049288, "step": 522 },
    { "epoch": 8.581516095534788, "grad_norm": 0.3974218964576721, "learning_rate": 3.0005575100521118e-05, "loss": 0.1386, "num_input_tokens_seen": 38123392, "step": 523 },
    { "epoch": 8.598130841121495, "grad_norm": 0.2970719337463379, "learning_rate": 2.9941433509478156e-05, "loss": 0.1194, "num_input_tokens_seen": 38208264, "step": 524 },
    { "epoch": 8.614745586708203, "grad_norm": 0.3720918595790863, "learning_rate": 2.9877258050403212e-05, "loss": 0.126, "num_input_tokens_seen": 38258192, "step": 525 },
    { "epoch": 8.631360332294912, "grad_norm": 0.3574189841747284, "learning_rate": 2.9813049163147688e-05, "loss": 0.1295, "num_input_tokens_seen": 38332408, "step": 526 },
    { "epoch": 8.64797507788162, "grad_norm": 0.3034169375896454, "learning_rate": 2.974880728779212e-05, "loss": 0.1035, "num_input_tokens_seen": 38404960, "step": 527 },
    { "epoch": 8.664589823468328, "grad_norm": 0.35143762826919556, "learning_rate": 2.9684532864643122e-05, "loss": 0.1347, "num_input_tokens_seen": 38481704, "step": 528 },
    { "epoch": 8.681204569055037, "grad_norm": 0.3551865518093109, "learning_rate": 2.9620226334230388e-05, "loss": 0.1076, "num_input_tokens_seen": 38546304, "step": 529 },
    { "epoch": 8.697819314641745, "grad_norm": 0.39726606011390686, "learning_rate": 2.9555888137303695e-05, "loss": 0.1514, "num_input_tokens_seen": 38621024, "step": 530 },
    { "epoch": 8.714434060228452, "grad_norm": 0.3454006314277649, "learning_rate": 2.949151871482982e-05, "loss": 0.1119, "num_input_tokens_seen": 38679368, "step": 531 },
    { "epoch": 8.731048805815162, "grad_norm": 0.3723394572734833, "learning_rate": 2.9427118507989586e-05, "loss": 0.1331, "num_input_tokens_seen": 38753984, "step": 532 },
    { "epoch": 8.74766355140187, "grad_norm": 0.35104718804359436, "learning_rate": 2.93626879581748e-05, "loss": 0.1158, "num_input_tokens_seen": 38808336, "step": 533 },
    { "epoch": 8.764278296988577, "grad_norm": 0.37009456753730774, "learning_rate": 2.929822750698524e-05, "loss": 0.2268, "num_input_tokens_seen": 38876624, "step": 534 },
    { "epoch": 8.780893042575286, "grad_norm": 0.3663157820701599, "learning_rate": 2.9233737596225613e-05, "loss": 0.1155, "num_input_tokens_seen": 38933576, "step": 535 },
    { "epoch": 8.797507788161994, "grad_norm": 0.3528357446193695, "learning_rate": 2.916921866790256e-05, "loss": 0.114, "num_input_tokens_seen": 39050816, "step": 536 },
    { "epoch": 8.814122533748701, "grad_norm": 0.3449118435382843, "learning_rate": 2.9104671164221576e-05, "loss": 0.119, "num_input_tokens_seen": 39101856, "step": 537 },
    { "epoch": 8.83073727933541, "grad_norm": 0.3620098829269409, "learning_rate": 2.9040095527584032e-05, "loss": 0.115, "num_input_tokens_seen": 39161928, "step": 538 },
    { "epoch": 8.847352024922118, "grad_norm": 0.39881882071495056, "learning_rate": 2.897549220058411e-05, "loss": 0.1312, "num_input_tokens_seen": 39216048, "step": 539 },
    { "epoch": 8.863966770508826, "grad_norm": 0.35401323437690735, "learning_rate": 2.8910861626005776e-05, "loss": 0.1107, "num_input_tokens_seen": 39317320, "step": 540 }
  ],
  "logging_steps": 1.0,
  "max_steps": 1200,
  "num_input_tokens_seen": 39317320,
  "num_train_epochs": 20,
  "save_steps": 60,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6775102547928023e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}