{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.863966770508826,
"eval_steps": 500,
"global_step": 540,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016614745586708203,
"grad_norm": 0.061364494264125824,
"learning_rate": 4.999991432639962e-05,
"loss": 0.5857,
"num_input_tokens_seen": 70408,
"step": 1
},
{
"epoch": 0.033229491173416406,
"grad_norm": 0.0613991804420948,
"learning_rate": 4.999965730618567e-05,
"loss": 0.5331,
"num_input_tokens_seen": 139640,
"step": 2
},
{
"epoch": 0.04984423676012461,
"grad_norm": 0.06351307034492493,
"learning_rate": 4.9999228941119745e-05,
"loss": 0.5852,
"num_input_tokens_seen": 223656,
"step": 3
},
{
"epoch": 0.06645898234683281,
"grad_norm": 0.05762802064418793,
"learning_rate": 4.999862923413781e-05,
"loss": 0.5384,
"num_input_tokens_seen": 300688,
"step": 4
},
{
"epoch": 0.08307372793354102,
"grad_norm": 0.0632179006934166,
"learning_rate": 4.999785818935018e-05,
"loss": 0.5273,
"num_input_tokens_seen": 366368,
"step": 5
},
{
"epoch": 0.09968847352024922,
"grad_norm": 0.056689903140068054,
"learning_rate": 4.999691581204152e-05,
"loss": 0.5145,
"num_input_tokens_seen": 445808,
"step": 6
},
{
"epoch": 0.11630321910695743,
"grad_norm": 0.06574171781539917,
"learning_rate": 4.9995802108670775e-05,
"loss": 0.5301,
"num_input_tokens_seen": 522800,
"step": 7
},
{
"epoch": 0.13291796469366562,
"grad_norm": 0.06367070972919464,
"learning_rate": 4.999451708687114e-05,
"loss": 0.5552,
"num_input_tokens_seen": 599608,
"step": 8
},
{
"epoch": 0.14953271028037382,
"grad_norm": 0.0585966520011425,
"learning_rate": 4.9993060755450015e-05,
"loss": 0.5999,
"num_input_tokens_seen": 681424,
"step": 9
},
{
"epoch": 0.16614745586708204,
"grad_norm": 0.05650574713945389,
"learning_rate": 4.999143312438893e-05,
"loss": 0.4535,
"num_input_tokens_seen": 756744,
"step": 10
},
{
"epoch": 0.18276220145379024,
"grad_norm": 0.05954223498702049,
"learning_rate": 4.998963420484349e-05,
"loss": 0.4674,
"num_input_tokens_seen": 842576,
"step": 11
},
{
"epoch": 0.19937694704049844,
"grad_norm": 0.0663776770234108,
"learning_rate": 4.998766400914329e-05,
"loss": 0.4703,
"num_input_tokens_seen": 917232,
"step": 12
},
{
"epoch": 0.21599169262720663,
"grad_norm": 0.056374579668045044,
"learning_rate": 4.9985522550791825e-05,
"loss": 0.3725,
"num_input_tokens_seen": 1006800,
"step": 13
},
{
"epoch": 0.23260643821391486,
"grad_norm": 0.06437493115663528,
"learning_rate": 4.998320984446641e-05,
"loss": 0.4653,
"num_input_tokens_seen": 1085824,
"step": 14
},
{
"epoch": 0.24922118380062305,
"grad_norm": 0.06560757756233215,
"learning_rate": 4.9980725906018074e-05,
"loss": 0.5026,
"num_input_tokens_seen": 1164160,
"step": 15
},
{
"epoch": 0.26583592938733125,
"grad_norm": 0.06942517310380936,
"learning_rate": 4.997807075247146e-05,
"loss": 0.5401,
"num_input_tokens_seen": 1242264,
"step": 16
},
{
"epoch": 0.2824506749740395,
"grad_norm": 0.06349828094244003,
"learning_rate": 4.997524440202469e-05,
"loss": 0.4713,
"num_input_tokens_seen": 1325904,
"step": 17
},
{
"epoch": 0.29906542056074764,
"grad_norm": 0.08846385776996613,
"learning_rate": 4.9972246874049254e-05,
"loss": 0.5834,
"num_input_tokens_seen": 1385632,
"step": 18
},
{
"epoch": 0.31568016614745587,
"grad_norm": 0.062130190432071686,
"learning_rate": 4.996907818908987e-05,
"loss": 0.4045,
"num_input_tokens_seen": 1470632,
"step": 19
},
{
"epoch": 0.3322949117341641,
"grad_norm": 0.07743565738201141,
"learning_rate": 4.996573836886435e-05,
"loss": 0.5283,
"num_input_tokens_seen": 1547536,
"step": 20
},
{
"epoch": 0.34890965732087226,
"grad_norm": 0.06756695359945297,
"learning_rate": 4.9962227436263453e-05,
"loss": 0.4199,
"num_input_tokens_seen": 1615528,
"step": 21
},
{
"epoch": 0.3655244029075805,
"grad_norm": 0.08662309497594833,
"learning_rate": 4.995854541535071e-05,
"loss": 0.4775,
"num_input_tokens_seen": 1694352,
"step": 22
},
{
"epoch": 0.3821391484942887,
"grad_norm": 0.08380820602178574,
"learning_rate": 4.9954692331362294e-05,
"loss": 0.4871,
"num_input_tokens_seen": 1753776,
"step": 23
},
{
"epoch": 0.3987538940809969,
"grad_norm": 0.09967435896396637,
"learning_rate": 4.995066821070679e-05,
"loss": 0.4871,
"num_input_tokens_seen": 1809048,
"step": 24
},
{
"epoch": 0.4153686396677051,
"grad_norm": 0.0871267095208168,
"learning_rate": 4.994647308096509e-05,
"loss": 0.5461,
"num_input_tokens_seen": 1884264,
"step": 25
},
{
"epoch": 0.43198338525441327,
"grad_norm": 0.065020851790905,
"learning_rate": 4.994210697089014e-05,
"loss": 0.405,
"num_input_tokens_seen": 1981704,
"step": 26
},
{
"epoch": 0.4485981308411215,
"grad_norm": 0.09853450953960419,
"learning_rate": 4.9937569910406756e-05,
"loss": 0.4487,
"num_input_tokens_seen": 2044144,
"step": 27
},
{
"epoch": 0.4652128764278297,
"grad_norm": 0.08763110637664795,
"learning_rate": 4.9932861930611454e-05,
"loss": 0.3946,
"num_input_tokens_seen": 2107584,
"step": 28
},
{
"epoch": 0.4818276220145379,
"grad_norm": 0.08950547873973846,
"learning_rate": 4.9927983063772196e-05,
"loss": 0.4257,
"num_input_tokens_seen": 2169248,
"step": 29
},
{
"epoch": 0.4984423676012461,
"grad_norm": 0.09980211406946182,
"learning_rate": 4.99229333433282e-05,
"loss": 0.3911,
"num_input_tokens_seen": 2230344,
"step": 30
},
{
"epoch": 0.5150571131879543,
"grad_norm": 0.092055544257164,
"learning_rate": 4.9917712803889674e-05,
"loss": 0.3749,
"num_input_tokens_seen": 2302368,
"step": 31
},
{
"epoch": 0.5316718587746625,
"grad_norm": 0.10067818313837051,
"learning_rate": 4.991232148123761e-05,
"loss": 0.4761,
"num_input_tokens_seen": 2369984,
"step": 32
},
{
"epoch": 0.5482866043613707,
"grad_norm": 0.0717971920967102,
"learning_rate": 4.990675941232353e-05,
"loss": 0.4328,
"num_input_tokens_seen": 2453032,
"step": 33
},
{
"epoch": 0.564901349948079,
"grad_norm": 0.07436250895261765,
"learning_rate": 4.990102663526924e-05,
"loss": 0.417,
"num_input_tokens_seen": 2527464,
"step": 34
},
{
"epoch": 0.5815160955347871,
"grad_norm": 0.09256689995527267,
"learning_rate": 4.989512318936655e-05,
"loss": 0.4097,
"num_input_tokens_seen": 2597032,
"step": 35
},
{
"epoch": 0.5981308411214953,
"grad_norm": 0.09964177012443542,
"learning_rate": 4.9889049115077005e-05,
"loss": 0.4065,
"num_input_tokens_seen": 2671704,
"step": 36
},
{
"epoch": 0.6147455867082036,
"grad_norm": 0.06627887487411499,
"learning_rate": 4.988280445403164e-05,
"loss": 0.4136,
"num_input_tokens_seen": 2767640,
"step": 37
},
{
"epoch": 0.6313603322949117,
"grad_norm": 0.0746045857667923,
"learning_rate": 4.987638924903067e-05,
"loss": 0.4125,
"num_input_tokens_seen": 2843720,
"step": 38
},
{
"epoch": 0.6479750778816199,
"grad_norm": 0.0795741006731987,
"learning_rate": 4.9869803544043166e-05,
"loss": 0.3135,
"num_input_tokens_seen": 2921472,
"step": 39
},
{
"epoch": 0.6645898234683282,
"grad_norm": 0.08914181590080261,
"learning_rate": 4.9863047384206835e-05,
"loss": 0.4549,
"num_input_tokens_seen": 2998400,
"step": 40
},
{
"epoch": 0.6812045690550363,
"grad_norm": 0.11220043897628784,
"learning_rate": 4.985612081582764e-05,
"loss": 0.4135,
"num_input_tokens_seen": 3059648,
"step": 41
},
{
"epoch": 0.6978193146417445,
"grad_norm": 0.08390027284622192,
"learning_rate": 4.98490238863795e-05,
"loss": 0.3538,
"num_input_tokens_seen": 3140184,
"step": 42
},
{
"epoch": 0.7144340602284528,
"grad_norm": 0.08858532458543777,
"learning_rate": 4.984175664450397e-05,
"loss": 0.3644,
"num_input_tokens_seen": 3207184,
"step": 43
},
{
"epoch": 0.731048805815161,
"grad_norm": 0.07439564168453217,
"learning_rate": 4.983431914000991e-05,
"loss": 0.4019,
"num_input_tokens_seen": 3292344,
"step": 44
},
{
"epoch": 0.7476635514018691,
"grad_norm": 0.08694300055503845,
"learning_rate": 4.982671142387316e-05,
"loss": 0.4238,
"num_input_tokens_seen": 3365384,
"step": 45
},
{
"epoch": 0.7642782969885774,
"grad_norm": 0.0867784395813942,
"learning_rate": 4.981893354823614e-05,
"loss": 0.3702,
"num_input_tokens_seen": 3440720,
"step": 46
},
{
"epoch": 0.7808930425752856,
"grad_norm": 0.06278439611196518,
"learning_rate": 4.9810985566407544e-05,
"loss": 0.3354,
"num_input_tokens_seen": 3533576,
"step": 47
},
{
"epoch": 0.7975077881619937,
"grad_norm": 0.08999717980623245,
"learning_rate": 4.980286753286195e-05,
"loss": 0.4981,
"num_input_tokens_seen": 3599744,
"step": 48
},
{
"epoch": 0.814122533748702,
"grad_norm": 0.07938859611749649,
"learning_rate": 4.979457950323945e-05,
"loss": 0.4016,
"num_input_tokens_seen": 3689520,
"step": 49
},
{
"epoch": 0.8307372793354102,
"grad_norm": 0.1045590192079544,
"learning_rate": 4.9786121534345265e-05,
"loss": 0.388,
"num_input_tokens_seen": 3751808,
"step": 50
},
{
"epoch": 0.8473520249221184,
"grad_norm": 0.07890618592500687,
"learning_rate": 4.9777493684149375e-05,
"loss": 0.3674,
"num_input_tokens_seen": 3839096,
"step": 51
},
{
"epoch": 0.8639667705088265,
"grad_norm": 0.07802557945251465,
"learning_rate": 4.976869601178609e-05,
"loss": 0.4147,
"num_input_tokens_seen": 3919824,
"step": 52
},
{
"epoch": 0.8805815160955348,
"grad_norm": 0.0913538783788681,
"learning_rate": 4.975972857755369e-05,
"loss": 0.2978,
"num_input_tokens_seen": 3989312,
"step": 53
},
{
"epoch": 0.897196261682243,
"grad_norm": 0.08525951951742172,
"learning_rate": 4.975059144291394e-05,
"loss": 0.3923,
"num_input_tokens_seen": 4060528,
"step": 54
},
{
"epoch": 0.9138110072689511,
"grad_norm": 0.08649709820747375,
"learning_rate": 4.974128467049176e-05,
"loss": 0.3282,
"num_input_tokens_seen": 4129368,
"step": 55
},
{
"epoch": 0.9304257528556594,
"grad_norm": 0.11635593324899673,
"learning_rate": 4.9731808324074717e-05,
"loss": 0.3403,
"num_input_tokens_seen": 4175208,
"step": 56
},
{
"epoch": 0.9470404984423676,
"grad_norm": 0.1115177720785141,
"learning_rate": 4.972216246861262e-05,
"loss": 0.3191,
"num_input_tokens_seen": 4218096,
"step": 57
},
{
"epoch": 0.9636552440290758,
"grad_norm": 0.0986371859908104,
"learning_rate": 4.971234717021709e-05,
"loss": 0.3745,
"num_input_tokens_seen": 4275968,
"step": 58
},
{
"epoch": 0.980269989615784,
"grad_norm": 0.07860780507326126,
"learning_rate": 4.9702362496161085e-05,
"loss": 0.3129,
"num_input_tokens_seen": 4346616,
"step": 59
},
{
"epoch": 0.9968847352024922,
"grad_norm": 0.08581527322530746,
"learning_rate": 4.9692208514878444e-05,
"loss": 0.3324,
"num_input_tokens_seen": 4425064,
"step": 60
},
{
"epoch": 1.0,
"grad_norm": 0.16779834032058716,
"learning_rate": 4.968188529596342e-05,
"loss": 0.2814,
"num_input_tokens_seen": 4435328,
"step": 61
},
{
"epoch": 1.0166147455867083,
"grad_norm": 0.08948636800050735,
"learning_rate": 4.9671392910170185e-05,
"loss": 0.3467,
"num_input_tokens_seen": 4500104,
"step": 62
},
{
"epoch": 1.0332294911734163,
"grad_norm": 0.07826830446720123,
"learning_rate": 4.966073142941239e-05,
"loss": 0.3892,
"num_input_tokens_seen": 4581976,
"step": 63
},
{
"epoch": 1.0498442367601246,
"grad_norm": 0.08562575280666351,
"learning_rate": 4.964990092676263e-05,
"loss": 0.3354,
"num_input_tokens_seen": 4652160,
"step": 64
},
{
"epoch": 1.066458982346833,
"grad_norm": 0.1057090312242508,
"learning_rate": 4.9638901476451946e-05,
"loss": 0.3457,
"num_input_tokens_seen": 4709368,
"step": 65
},
{
"epoch": 1.083073727933541,
"grad_norm": 0.08131146430969238,
"learning_rate": 4.962773315386935e-05,
"loss": 0.3672,
"num_input_tokens_seen": 4798256,
"step": 66
},
{
"epoch": 1.0996884735202492,
"grad_norm": 0.09464936703443527,
"learning_rate": 4.961639603556127e-05,
"loss": 0.3157,
"num_input_tokens_seen": 4859200,
"step": 67
},
{
"epoch": 1.1163032191069575,
"grad_norm": 0.0999661460518837,
"learning_rate": 4.960489019923105e-05,
"loss": 0.3968,
"num_input_tokens_seen": 4925992,
"step": 68
},
{
"epoch": 1.1329179646936656,
"grad_norm": 0.09851639717817307,
"learning_rate": 4.9593215723738404e-05,
"loss": 0.329,
"num_input_tokens_seen": 4998808,
"step": 69
},
{
"epoch": 1.1495327102803738,
"grad_norm": 0.08382592350244522,
"learning_rate": 4.958137268909887e-05,
"loss": 0.2856,
"num_input_tokens_seen": 5089672,
"step": 70
},
{
"epoch": 1.1661474558670821,
"grad_norm": 0.09073847532272339,
"learning_rate": 4.9569361176483286e-05,
"loss": 0.3512,
"num_input_tokens_seen": 5166744,
"step": 71
},
{
"epoch": 1.1827622014537902,
"grad_norm": 0.10290185362100601,
"learning_rate": 4.9557181268217227e-05,
"loss": 0.4263,
"num_input_tokens_seen": 5228264,
"step": 72
},
{
"epoch": 1.1993769470404985,
"grad_norm": 0.07421435415744781,
"learning_rate": 4.9544833047780394e-05,
"loss": 0.3126,
"num_input_tokens_seen": 5338224,
"step": 73
},
{
"epoch": 1.2159916926272065,
"grad_norm": 0.10284842550754547,
"learning_rate": 4.9532316599806124e-05,
"loss": 0.3473,
"num_input_tokens_seen": 5399848,
"step": 74
},
{
"epoch": 1.2326064382139148,
"grad_norm": 0.10817047953605652,
"learning_rate": 4.951963201008076e-05,
"loss": 0.3275,
"num_input_tokens_seen": 5468624,
"step": 75
},
{
"epoch": 1.249221183800623,
"grad_norm": 0.09662210941314697,
"learning_rate": 4.9506779365543046e-05,
"loss": 0.3296,
"num_input_tokens_seen": 5536776,
"step": 76
},
{
"epoch": 1.2658359293873311,
"grad_norm": 0.11193853616714478,
"learning_rate": 4.949375875428357e-05,
"loss": 0.3605,
"num_input_tokens_seen": 5609296,
"step": 77
},
{
"epoch": 1.2824506749740394,
"grad_norm": 0.11866679787635803,
"learning_rate": 4.9480570265544144e-05,
"loss": 0.3133,
"num_input_tokens_seen": 5663824,
"step": 78
},
{
"epoch": 1.2990654205607477,
"grad_norm": 0.09865846484899521,
"learning_rate": 4.94672139897172e-05,
"loss": 0.3464,
"num_input_tokens_seen": 5742032,
"step": 79
},
{
"epoch": 1.3156801661474558,
"grad_norm": 0.09930054098367691,
"learning_rate": 4.9453690018345144e-05,
"loss": 0.3346,
"num_input_tokens_seen": 5816864,
"step": 80
},
{
"epoch": 1.332294911734164,
"grad_norm": 0.1085321381688118,
"learning_rate": 4.943999844411977e-05,
"loss": 0.3102,
"num_input_tokens_seen": 5881624,
"step": 81
},
{
"epoch": 1.3489096573208723,
"grad_norm": 0.08012478053569794,
"learning_rate": 4.94261393608816e-05,
"loss": 0.2853,
"num_input_tokens_seen": 5970272,
"step": 82
},
{
"epoch": 1.3655244029075804,
"grad_norm": 0.10291877388954163,
"learning_rate": 4.941211286361922e-05,
"loss": 0.3038,
"num_input_tokens_seen": 6058752,
"step": 83
},
{
"epoch": 1.3821391484942886,
"grad_norm": 0.11999356001615524,
"learning_rate": 4.939791904846869e-05,
"loss": 0.3283,
"num_input_tokens_seen": 6120064,
"step": 84
},
{
"epoch": 1.398753894080997,
"grad_norm": 0.10502559691667557,
"learning_rate": 4.938355801271282e-05,
"loss": 0.321,
"num_input_tokens_seen": 6182072,
"step": 85
},
{
"epoch": 1.415368639667705,
"grad_norm": 0.12620873749256134,
"learning_rate": 4.936902985478055e-05,
"loss": 0.3296,
"num_input_tokens_seen": 6269680,
"step": 86
},
{
"epoch": 1.4319833852544133,
"grad_norm": 0.13212910294532776,
"learning_rate": 4.935433467424624e-05,
"loss": 0.3225,
"num_input_tokens_seen": 6347424,
"step": 87
},
{
"epoch": 1.4485981308411215,
"grad_norm": 0.11600925773382187,
"learning_rate": 4.933947257182901e-05,
"loss": 0.3479,
"num_input_tokens_seen": 6412584,
"step": 88
},
{
"epoch": 1.4652128764278296,
"grad_norm": 0.11683235317468643,
"learning_rate": 4.932444364939205e-05,
"loss": 0.3322,
"num_input_tokens_seen": 6482728,
"step": 89
},
{
"epoch": 1.4818276220145379,
"grad_norm": 0.11446017026901245,
"learning_rate": 4.9309248009941914e-05,
"loss": 0.3802,
"num_input_tokens_seen": 6562104,
"step": 90
},
{
"epoch": 1.4984423676012462,
"grad_norm": 0.10500892251729965,
"learning_rate": 4.929388575762782e-05,
"loss": 0.3371,
"num_input_tokens_seen": 6656552,
"step": 91
},
{
"epoch": 1.5150571131879542,
"grad_norm": 0.13279151916503906,
"learning_rate": 4.9278356997740904e-05,
"loss": 0.293,
"num_input_tokens_seen": 6714184,
"step": 92
},
{
"epoch": 1.5316718587746625,
"grad_norm": 0.107506163418293,
"learning_rate": 4.9262661836713564e-05,
"loss": 0.3127,
"num_input_tokens_seen": 6793552,
"step": 93
},
{
"epoch": 1.5482866043613708,
"grad_norm": 0.124021977186203,
"learning_rate": 4.924680038211867e-05,
"loss": 0.3263,
"num_input_tokens_seen": 6865256,
"step": 94
},
{
"epoch": 1.5649013499480788,
"grad_norm": 0.14172782003879547,
"learning_rate": 4.9230772742668866e-05,
"loss": 0.3204,
"num_input_tokens_seen": 6931152,
"step": 95
},
{
"epoch": 1.5815160955347871,
"grad_norm": 0.12229758501052856,
"learning_rate": 4.9214579028215776e-05,
"loss": 0.326,
"num_input_tokens_seen": 6998408,
"step": 96
},
{
"epoch": 1.5981308411214954,
"grad_norm": 0.1242135688662529,
"learning_rate": 4.919821934974933e-05,
"loss": 0.2814,
"num_input_tokens_seen": 7053008,
"step": 97
},
{
"epoch": 1.6147455867082035,
"grad_norm": 0.12830108404159546,
"learning_rate": 4.918169381939692e-05,
"loss": 0.3254,
"num_input_tokens_seen": 7106440,
"step": 98
},
{
"epoch": 1.6313603322949117,
"grad_norm": 0.12180659174919128,
"learning_rate": 4.916500255042268e-05,
"loss": 0.3228,
"num_input_tokens_seen": 7167032,
"step": 99
},
{
"epoch": 1.64797507788162,
"grad_norm": 0.10792312026023865,
"learning_rate": 4.914814565722671e-05,
"loss": 0.2729,
"num_input_tokens_seen": 7245720,
"step": 100
},
{
"epoch": 1.664589823468328,
"grad_norm": 0.18523500859737396,
"learning_rate": 4.913112325534426e-05,
"loss": 0.3462,
"num_input_tokens_seen": 7326320,
"step": 101
},
{
"epoch": 1.6812045690550363,
"grad_norm": 0.09529964625835419,
"learning_rate": 4.9113935461444955e-05,
"loss": 0.3096,
"num_input_tokens_seen": 7442232,
"step": 102
},
{
"epoch": 1.6978193146417446,
"grad_norm": 0.14481183886528015,
"learning_rate": 4.9096582393332025e-05,
"loss": 0.3014,
"num_input_tokens_seen": 7502496,
"step": 103
},
{
"epoch": 1.7144340602284527,
"grad_norm": 0.14645016193389893,
"learning_rate": 4.907906416994146e-05,
"loss": 0.3336,
"num_input_tokens_seen": 7566496,
"step": 104
},
{
"epoch": 1.731048805815161,
"grad_norm": 0.1306885927915573,
"learning_rate": 4.906138091134118e-05,
"loss": 0.3911,
"num_input_tokens_seen": 7629056,
"step": 105
},
{
"epoch": 1.7476635514018692,
"grad_norm": 0.10863160341978073,
"learning_rate": 4.9043532738730284e-05,
"loss": 0.3201,
"num_input_tokens_seen": 7706096,
"step": 106
},
{
"epoch": 1.7642782969885773,
"grad_norm": 0.11725673079490662,
"learning_rate": 4.9025519774438136e-05,
"loss": 0.2783,
"num_input_tokens_seen": 7780072,
"step": 107
},
{
"epoch": 1.7808930425752856,
"grad_norm": 0.1243867501616478,
"learning_rate": 4.900734214192358e-05,
"loss": 0.3044,
"num_input_tokens_seen": 7857712,
"step": 108
},
{
"epoch": 1.7975077881619939,
"grad_norm": 0.13539955019950867,
"learning_rate": 4.898899996577407e-05,
"loss": 0.3009,
"num_input_tokens_seen": 7916832,
"step": 109
},
{
"epoch": 1.814122533748702,
"grad_norm": 0.11198178678750992,
"learning_rate": 4.8970493371704826e-05,
"loss": 0.3229,
"num_input_tokens_seen": 7993056,
"step": 110
},
{
"epoch": 1.8307372793354102,
"grad_norm": 0.11881165206432343,
"learning_rate": 4.8951822486557986e-05,
"loss": 0.3414,
"num_input_tokens_seen": 8090056,
"step": 111
},
{
"epoch": 1.8473520249221185,
"grad_norm": 0.12841404974460602,
"learning_rate": 4.893298743830168e-05,
"loss": 0.2907,
"num_input_tokens_seen": 8164808,
"step": 112
},
{
"epoch": 1.8639667705088265,
"grad_norm": 0.14767521619796753,
"learning_rate": 4.891398835602925e-05,
"loss": 0.2901,
"num_input_tokens_seen": 8223568,
"step": 113
},
{
"epoch": 1.8805815160955348,
"grad_norm": 0.15326914191246033,
"learning_rate": 4.8894825369958255e-05,
"loss": 0.2918,
"num_input_tokens_seen": 8276160,
"step": 114
},
{
"epoch": 1.897196261682243,
"grad_norm": 0.1210051029920578,
"learning_rate": 4.8875498611429674e-05,
"loss": 0.3074,
"num_input_tokens_seen": 8354904,
"step": 115
},
{
"epoch": 1.9138110072689511,
"grad_norm": 0.13544373214244843,
"learning_rate": 4.8856008212906925e-05,
"loss": 0.3461,
"num_input_tokens_seen": 8442584,
"step": 116
},
{
"epoch": 1.9304257528556594,
"grad_norm": 0.13535892963409424,
"learning_rate": 4.8836354307975026e-05,
"loss": 0.3078,
"num_input_tokens_seen": 8506688,
"step": 117
},
{
"epoch": 1.9470404984423677,
"grad_norm": 0.10383590310811996,
"learning_rate": 4.881653703133966e-05,
"loss": 0.2432,
"num_input_tokens_seen": 8610712,
"step": 118
},
{
"epoch": 1.9636552440290758,
"grad_norm": 0.12125886976718903,
"learning_rate": 4.87965565188262e-05,
"loss": 0.2915,
"num_input_tokens_seen": 8692624,
"step": 119
},
{
"epoch": 1.980269989615784,
"grad_norm": 0.1351424902677536,
"learning_rate": 4.877641290737884e-05,
"loss": 0.3006,
"num_input_tokens_seen": 8772208,
"step": 120
},
{
"epoch": 1.9968847352024923,
"grad_norm": 0.11472523212432861,
"learning_rate": 4.8756106335059646e-05,
"loss": 0.2774,
"num_input_tokens_seen": 8854904,
"step": 121
},
{
"epoch": 2.0,
"grad_norm": 0.3606414794921875,
"learning_rate": 4.87356369410476e-05,
"loss": 0.2786,
"num_input_tokens_seen": 8872656,
"step": 122
},
{
"epoch": 2.016614745586708,
"grad_norm": 0.13124766945838928,
"learning_rate": 4.8715004865637614e-05,
"loss": 0.294,
"num_input_tokens_seen": 8946480,
"step": 123
},
{
"epoch": 2.0332294911734166,
"grad_norm": 0.12415049225091934,
"learning_rate": 4.869421025023965e-05,
"loss": 0.2931,
"num_input_tokens_seen": 9023328,
"step": 124
},
{
"epoch": 2.0498442367601246,
"grad_norm": 0.16626115143299103,
"learning_rate": 4.867325323737765e-05,
"loss": 0.2887,
"num_input_tokens_seen": 9074320,
"step": 125
},
{
"epoch": 2.0664589823468327,
"grad_norm": 0.153628870844841,
"learning_rate": 4.8652133970688636e-05,
"loss": 0.2776,
"num_input_tokens_seen": 9148784,
"step": 126
},
{
"epoch": 2.083073727933541,
"grad_norm": 0.12231138348579407,
"learning_rate": 4.8630852594921706e-05,
"loss": 0.3091,
"num_input_tokens_seen": 9246624,
"step": 127
},
{
"epoch": 2.0996884735202492,
"grad_norm": 0.15192057192325592,
"learning_rate": 4.860940925593703e-05,
"loss": 0.3354,
"num_input_tokens_seen": 9328176,
"step": 128
},
{
"epoch": 2.1163032191069573,
"grad_norm": 0.13820070028305054,
"learning_rate": 4.8587804100704845e-05,
"loss": 0.282,
"num_input_tokens_seen": 9388936,
"step": 129
},
{
"epoch": 2.132917964693666,
"grad_norm": 0.14466816186904907,
"learning_rate": 4.856603727730447e-05,
"loss": 0.2801,
"num_input_tokens_seen": 9461664,
"step": 130
},
{
"epoch": 2.149532710280374,
"grad_norm": 0.14671838283538818,
"learning_rate": 4.854410893492326e-05,
"loss": 0.2927,
"num_input_tokens_seen": 9535000,
"step": 131
},
{
"epoch": 2.166147455867082,
"grad_norm": 0.1757712960243225,
"learning_rate": 4.852201922385564e-05,
"loss": 0.2807,
"num_input_tokens_seen": 9600296,
"step": 132
},
{
"epoch": 2.1827622014537904,
"grad_norm": 0.17755423486232758,
"learning_rate": 4.8499768295502004e-05,
"loss": 0.2765,
"num_input_tokens_seen": 9686784,
"step": 133
},
{
"epoch": 2.1993769470404985,
"grad_norm": 0.13321827352046967,
"learning_rate": 4.847735630236773e-05,
"loss": 0.3068,
"num_input_tokens_seen": 9781112,
"step": 134
},
{
"epoch": 2.2159916926272065,
"grad_norm": 0.15012745559215546,
"learning_rate": 4.8454783398062106e-05,
"loss": 0.2737,
"num_input_tokens_seen": 9849528,
"step": 135
},
{
"epoch": 2.232606438213915,
"grad_norm": 0.14000360667705536,
"learning_rate": 4.843204973729729e-05,
"loss": 0.2831,
"num_input_tokens_seen": 9931080,
"step": 136
},
{
"epoch": 2.249221183800623,
"grad_norm": 0.14742712676525116,
"learning_rate": 4.840915547588725e-05,
"loss": 0.3047,
"num_input_tokens_seen": 10011176,
"step": 137
},
{
"epoch": 2.265835929387331,
"grad_norm": 0.16192346811294556,
"learning_rate": 4.838610077074669e-05,
"loss": 0.2759,
"num_input_tokens_seen": 10084128,
"step": 138
},
{
"epoch": 2.2824506749740396,
"grad_norm": 0.1502583771944046,
"learning_rate": 4.836288577988996e-05,
"loss": 0.298,
"num_input_tokens_seen": 10155536,
"step": 139
},
{
"epoch": 2.2990654205607477,
"grad_norm": 0.12661044299602509,
"learning_rate": 4.8339510662430046e-05,
"loss": 0.255,
"num_input_tokens_seen": 10251160,
"step": 140
},
{
"epoch": 2.3156801661474558,
"grad_norm": 0.14002998173236847,
"learning_rate": 4.8315975578577355e-05,
"loss": 0.2566,
"num_input_tokens_seen": 10345864,
"step": 141
},
{
"epoch": 2.3322949117341643,
"grad_norm": 0.17870523035526276,
"learning_rate": 4.8292280689638725e-05,
"loss": 0.4367,
"num_input_tokens_seen": 10417616,
"step": 142
},
{
"epoch": 2.3489096573208723,
"grad_norm": 0.17209866642951965,
"learning_rate": 4.826842615801628e-05,
"loss": 0.2954,
"num_input_tokens_seen": 10481816,
"step": 143
},
{
"epoch": 2.3655244029075804,
"grad_norm": 0.1665940284729004,
"learning_rate": 4.8244412147206284e-05,
"loss": 0.341,
"num_input_tokens_seen": 10562056,
"step": 144
},
{
"epoch": 2.382139148494289,
"grad_norm": 0.18919898569583893,
"learning_rate": 4.822023882179811e-05,
"loss": 0.2716,
"num_input_tokens_seen": 10612808,
"step": 145
},
{
"epoch": 2.398753894080997,
"grad_norm": 0.1681865006685257,
"learning_rate": 4.8195906347473e-05,
"loss": 0.2716,
"num_input_tokens_seen": 10682328,
"step": 146
},
{
"epoch": 2.415368639667705,
"grad_norm": 0.13141104578971863,
"learning_rate": 4.817141489100302e-05,
"loss": 0.2829,
"num_input_tokens_seen": 10771912,
"step": 147
},
{
"epoch": 2.431983385254413,
"grad_norm": 0.16544249653816223,
"learning_rate": 4.814676462024988e-05,
"loss": 0.3038,
"num_input_tokens_seen": 10842232,
"step": 148
},
{
"epoch": 2.4485981308411215,
"grad_norm": 0.17946277558803558,
"learning_rate": 4.8121955704163745e-05,
"loss": 0.2792,
"num_input_tokens_seen": 10902264,
"step": 149
},
{
"epoch": 2.4652128764278296,
"grad_norm": 0.14012685418128967,
"learning_rate": 4.8096988312782174e-05,
"loss": 0.2403,
"num_input_tokens_seen": 10992744,
"step": 150
},
{
"epoch": 2.4818276220145377,
"grad_norm": 0.103813536465168,
"learning_rate": 4.8071862617228855e-05,
"loss": 0.1605,
"num_input_tokens_seen": 11090064,
"step": 151
},
{
"epoch": 2.498442367601246,
"grad_norm": 0.1596001833677292,
"learning_rate": 4.8046578789712515e-05,
"loss": 0.2547,
"num_input_tokens_seen": 11162864,
"step": 152
},
{
"epoch": 2.515057113187954,
"grad_norm": 0.17366129159927368,
"learning_rate": 4.8021137003525664e-05,
"loss": 0.2676,
"num_input_tokens_seen": 11224368,
"step": 153
},
{
"epoch": 2.5316718587746623,
"grad_norm": 0.1615227609872818,
"learning_rate": 4.7995537433043446e-05,
"loss": 0.2898,
"num_input_tokens_seen": 11291056,
"step": 154
},
{
"epoch": 2.5482866043613708,
"grad_norm": 0.1951528787612915,
"learning_rate": 4.796978025372246e-05,
"loss": 0.2546,
"num_input_tokens_seen": 11345464,
"step": 155
},
{
"epoch": 2.564901349948079,
"grad_norm": 0.15065862238407135,
"learning_rate": 4.794386564209953e-05,
"loss": 0.3134,
"num_input_tokens_seen": 11418912,
"step": 156
},
{
"epoch": 2.581516095534787,
"grad_norm": 0.17094938457012177,
"learning_rate": 4.79177937757905e-05,
"loss": 0.2689,
"num_input_tokens_seen": 11491216,
"step": 157
},
{
"epoch": 2.5981308411214954,
"grad_norm": 0.16850312054157257,
"learning_rate": 4.7891564833489035e-05,
"loss": 0.2359,
"num_input_tokens_seen": 11558016,
"step": 158
},
{
"epoch": 2.6147455867082035,
"grad_norm": 0.16789822280406952,
"learning_rate": 4.7865178994965344e-05,
"loss": 0.2735,
"num_input_tokens_seen": 11630432,
"step": 159
},
{
"epoch": 2.6313603322949115,
"grad_norm": 0.19538354873657227,
"learning_rate": 4.783863644106502e-05,
"loss": 0.254,
"num_input_tokens_seen": 11684624,
"step": 160
},
{
"epoch": 2.64797507788162,
"grad_norm": 0.1609475016593933,
"learning_rate": 4.781193735370777e-05,
"loss": 0.2763,
"num_input_tokens_seen": 11770232,
"step": 161
},
{
"epoch": 2.664589823468328,
"grad_norm": 0.1964447796344757,
"learning_rate": 4.7785081915886134e-05,
"loss": 0.2663,
"num_input_tokens_seen": 11828360,
"step": 162
},
{
"epoch": 2.681204569055036,
"grad_norm": 0.18869946897029877,
"learning_rate": 4.775807031166428e-05,
"loss": 0.2625,
"num_input_tokens_seen": 11915944,
"step": 163
},
{
"epoch": 2.6978193146417446,
"grad_norm": 0.20539921522140503,
"learning_rate": 4.773090272617672e-05,
"loss": 0.2615,
"num_input_tokens_seen": 11981792,
"step": 164
},
{
"epoch": 2.7144340602284527,
"grad_norm": 0.1616145521402359,
"learning_rate": 4.7703579345627035e-05,
"loss": 0.3453,
"num_input_tokens_seen": 12044024,
"step": 165
},
{
"epoch": 2.7310488058151607,
"grad_norm": 0.22601978480815887,
"learning_rate": 4.7676100357286624e-05,
"loss": 0.3036,
"num_input_tokens_seen": 12093424,
"step": 166
},
{
"epoch": 2.7476635514018692,
"grad_norm": 0.15262462198734283,
"learning_rate": 4.76484659494934e-05,
"loss": 0.2523,
"num_input_tokens_seen": 12167792,
"step": 167
},
{
"epoch": 2.7642782969885773,
"grad_norm": 0.17928001284599304,
"learning_rate": 4.762067631165049e-05,
"loss": 0.2791,
"num_input_tokens_seen": 12233712,
"step": 168
},
{
"epoch": 2.7808930425752854,
"grad_norm": 0.15228766202926636,
"learning_rate": 4.7592731634224966e-05,
"loss": 0.2291,
"num_input_tokens_seen": 12310544,
"step": 169
},
{
"epoch": 2.797507788161994,
"grad_norm": 0.18862110376358032,
"learning_rate": 4.756463210874652e-05,
"loss": 0.2628,
"num_input_tokens_seen": 12400160,
"step": 170
},
{
"epoch": 2.814122533748702,
"grad_norm": 0.16640189290046692,
"learning_rate": 4.753637792780614e-05,
"loss": 0.2824,
"num_input_tokens_seen": 12480432,
"step": 171
},
{
"epoch": 2.83073727933541,
"grad_norm": 0.151117205619812,
"learning_rate": 4.7507969285054845e-05,
"loss": 0.2663,
"num_input_tokens_seen": 12568064,
"step": 172
},
{
"epoch": 2.8473520249221185,
"grad_norm": 0.26551589369773865,
"learning_rate": 4.7479406375202264e-05,
"loss": 0.28,
"num_input_tokens_seen": 12647400,
"step": 173
},
{
"epoch": 2.8639667705088265,
"grad_norm": 0.22416891157627106,
"learning_rate": 4.745068939401539e-05,
"loss": 0.2424,
"num_input_tokens_seen": 12698208,
"step": 174
},
{
"epoch": 2.8805815160955346,
"grad_norm": 0.2024654597043991,
"learning_rate": 4.742181853831721e-05,
"loss": 0.2518,
"num_input_tokens_seen": 12758528,
"step": 175
},
{
"epoch": 2.897196261682243,
"grad_norm": 0.18288369476795197,
"learning_rate": 4.7392794005985326e-05,
"loss": 0.259,
"num_input_tokens_seen": 12837264,
"step": 176
},
{
"epoch": 2.913811007268951,
"grad_norm": 0.18088208138942719,
"learning_rate": 4.7363615995950626e-05,
"loss": 0.247,
"num_input_tokens_seen": 12902368,
"step": 177
},
{
"epoch": 2.930425752855659,
"grad_norm": 0.16595424711704254,
"learning_rate": 4.733428470819594e-05,
"loss": 0.2438,
"num_input_tokens_seen": 12974296,
"step": 178
},
{
"epoch": 2.9470404984423677,
"grad_norm": 0.17989091575145721,
"learning_rate": 4.730480034375462e-05,
"loss": 0.2708,
"num_input_tokens_seen": 13057280,
"step": 179
},
{
"epoch": 2.9636552440290758,
"grad_norm": 0.16136637330055237,
"learning_rate": 4.72751631047092e-05,
"loss": 0.3171,
"num_input_tokens_seen": 13158232,
"step": 180
},
{
"epoch": 2.980269989615784,
"grad_norm": 0.1870911419391632,
"learning_rate": 4.7245373194189994e-05,
"loss": 0.24,
"num_input_tokens_seen": 13229840,
"step": 181
},
{
"epoch": 2.9968847352024923,
"grad_norm": 0.1857272833585739,
"learning_rate": 4.7215430816373726e-05,
"loss": 0.2674,
"num_input_tokens_seen": 13296520,
"step": 182
},
{
"epoch": 3.0,
"grad_norm": 0.31230616569519043,
"learning_rate": 4.718533617648209e-05,
"loss": 0.1677,
"num_input_tokens_seen": 13309672,
"step": 183
},
{
"epoch": 3.016614745586708,
"grad_norm": 0.20352481305599213,
"learning_rate": 4.715508948078037e-05,
"loss": 0.2272,
"num_input_tokens_seen": 13371544,
"step": 184
},
{
"epoch": 3.0332294911734166,
"grad_norm": 0.20679159462451935,
"learning_rate": 4.712469093657605e-05,
"loss": 0.2133,
"num_input_tokens_seen": 13432984,
"step": 185
},
{
"epoch": 3.0498442367601246,
"grad_norm": 0.18731139600276947,
"learning_rate": 4.709414075221734e-05,
"loss": 0.2695,
"num_input_tokens_seen": 13500016,
"step": 186
},
{
"epoch": 3.0664589823468327,
"grad_norm": 0.21216924488544464,
"learning_rate": 4.706343913709178e-05,
"loss": 0.2524,
"num_input_tokens_seen": 13579672,
"step": 187
},
{
"epoch": 3.083073727933541,
"grad_norm": 0.2222682386636734,
"learning_rate": 4.70325863016248e-05,
"loss": 0.2396,
"num_input_tokens_seen": 13630704,
"step": 188
},
{
"epoch": 3.0996884735202492,
"grad_norm": 0.21611332893371582,
"learning_rate": 4.7001582457278304e-05,
"loss": 0.3057,
"num_input_tokens_seen": 13695472,
"step": 189
},
{
"epoch": 3.1163032191069573,
"grad_norm": 0.23094947636127472,
"learning_rate": 4.697042781654913e-05,
"loss": 0.2436,
"num_input_tokens_seen": 13767792,
"step": 190
},
{
"epoch": 3.132917964693666,
"grad_norm": 0.19241105020046234,
"learning_rate": 4.693912259296773e-05,
"loss": 0.2974,
"num_input_tokens_seen": 13857352,
"step": 191
},
{
"epoch": 3.149532710280374,
"grad_norm": 0.19635124504566193,
"learning_rate": 4.690766700109659e-05,
"loss": 0.2457,
"num_input_tokens_seen": 13939928,
"step": 192
},
{
"epoch": 3.166147455867082,
"grad_norm": 0.1822366714477539,
"learning_rate": 4.687606125652882e-05,
"loss": 0.2205,
"num_input_tokens_seen": 14017936,
"step": 193
},
{
"epoch": 3.1827622014537904,
"grad_norm": 0.22182051837444305,
"learning_rate": 4.684430557588664e-05,
"loss": 0.2116,
"num_input_tokens_seen": 14074176,
"step": 194
},
{
"epoch": 3.1993769470404985,
"grad_norm": 0.19278937578201294,
"learning_rate": 4.681240017681993e-05,
"loss": 0.2839,
"num_input_tokens_seen": 14167656,
"step": 195
},
{
"epoch": 3.2159916926272065,
"grad_norm": 0.181584894657135,
"learning_rate": 4.678034527800474e-05,
"loss": 0.2115,
"num_input_tokens_seen": 14235800,
"step": 196
},
{
"epoch": 3.232606438213915,
"grad_norm": 0.19878999888896942,
"learning_rate": 4.674814109914174e-05,
"loss": 0.1982,
"num_input_tokens_seen": 14301272,
"step": 197
},
{
"epoch": 3.249221183800623,
"grad_norm": 0.23485153913497925,
"learning_rate": 4.671578786095478e-05,
"loss": 0.2494,
"num_input_tokens_seen": 14347352,
"step": 198
},
{
"epoch": 3.265835929387331,
"grad_norm": 0.20542015135288239,
"learning_rate": 4.668328578518933e-05,
"loss": 0.3186,
"num_input_tokens_seen": 14434600,
"step": 199
},
{
"epoch": 3.2824506749740396,
"grad_norm": 0.27169138193130493,
"learning_rate": 4.665063509461097e-05,
"loss": 0.2361,
"num_input_tokens_seen": 14484104,
"step": 200
},
{
"epoch": 3.2990654205607477,
"grad_norm": 0.18548518419265747,
"learning_rate": 4.661783601300388e-05,
"loss": 0.2457,
"num_input_tokens_seen": 14567152,
"step": 201
},
{
"epoch": 3.3156801661474558,
"grad_norm": 0.2257690727710724,
"learning_rate": 4.6584888765169296e-05,
"loss": 0.2885,
"num_input_tokens_seen": 14647040,
"step": 202
},
{
"epoch": 3.3322949117341643,
"grad_norm": 0.1882171332836151,
"learning_rate": 4.6551793576923964e-05,
"loss": 0.259,
"num_input_tokens_seen": 14738216,
"step": 203
},
{
"epoch": 3.3489096573208723,
"grad_norm": 0.1976601928472519,
"learning_rate": 4.65185506750986e-05,
"loss": 0.2102,
"num_input_tokens_seen": 14811216,
"step": 204
},
{
"epoch": 3.3655244029075804,
"grad_norm": 0.20320351421833038,
"learning_rate": 4.648516028753632e-05,
"loss": 0.1858,
"num_input_tokens_seen": 14885992,
"step": 205
},
{
"epoch": 3.382139148494289,
"grad_norm": 0.20090511441230774,
"learning_rate": 4.645162264309112e-05,
"loss": 0.272,
"num_input_tokens_seen": 14961984,
"step": 206
},
{
"epoch": 3.398753894080997,
"grad_norm": 0.21391013264656067,
"learning_rate": 4.6417937971626245e-05,
"loss": 0.2036,
"num_input_tokens_seen": 15021240,
"step": 207
},
{
"epoch": 3.415368639667705,
"grad_norm": 0.23000560700893402,
"learning_rate": 4.638410650401267e-05,
"loss": 0.2011,
"num_input_tokens_seen": 15092016,
"step": 208
},
{
"epoch": 3.431983385254413,
"grad_norm": 0.17034712433815002,
"learning_rate": 4.635012847212748e-05,
"loss": 0.2007,
"num_input_tokens_seen": 15198192,
"step": 209
},
{
"epoch": 3.4485981308411215,
"grad_norm": 0.23642270267009735,
"learning_rate": 4.6316004108852305e-05,
"loss": 0.2139,
"num_input_tokens_seen": 15258432,
"step": 210
},
{
"epoch": 3.4652128764278296,
"grad_norm": 0.22289924323558807,
"learning_rate": 4.628173364807171e-05,
"loss": 0.2441,
"num_input_tokens_seen": 15329600,
"step": 211
},
{
"epoch": 3.4818276220145377,
"grad_norm": 0.19895371794700623,
"learning_rate": 4.6247317324671605e-05,
"loss": 0.2368,
"num_input_tokens_seen": 15407920,
"step": 212
},
{
"epoch": 3.498442367601246,
"grad_norm": 0.19507868587970734,
"learning_rate": 4.6212755374537596e-05,
"loss": 0.231,
"num_input_tokens_seen": 15479640,
"step": 213
},
{
"epoch": 3.515057113187954,
"grad_norm": 0.23565508425235748,
"learning_rate": 4.617804803455344e-05,
"loss": 0.2336,
"num_input_tokens_seen": 15561960,
"step": 214
},
{
"epoch": 3.5316718587746623,
"grad_norm": 0.21157748997211456,
"learning_rate": 4.614319554259934e-05,
"loss": 0.2638,
"num_input_tokens_seen": 15641440,
"step": 215
},
{
"epoch": 3.5482866043613708,
"grad_norm": 0.2260795533657074,
"learning_rate": 4.610819813755038e-05,
"loss": 0.2646,
"num_input_tokens_seen": 15728872,
"step": 216
},
{
"epoch": 3.564901349948079,
"grad_norm": 0.20947663486003876,
"learning_rate": 4.607305605927487e-05,
"loss": 0.2211,
"num_input_tokens_seen": 15798112,
"step": 217
},
{
"epoch": 3.581516095534787,
"grad_norm": 0.22466345131397247,
"learning_rate": 4.6037769548632656e-05,
"loss": 0.2901,
"num_input_tokens_seen": 15865936,
"step": 218
},
{
"epoch": 3.5981308411214954,
"grad_norm": 0.2176472693681717,
"learning_rate": 4.600233884747355e-05,
"loss": 0.2713,
"num_input_tokens_seen": 15941368,
"step": 219
},
{
"epoch": 3.6147455867082035,
"grad_norm": 0.20593321323394775,
"learning_rate": 4.5966764198635606e-05,
"loss": 0.2047,
"num_input_tokens_seen": 16028208,
"step": 220
},
{
"epoch": 3.6313603322949115,
"grad_norm": 0.21239908039569855,
"learning_rate": 4.5931045845943474e-05,
"loss": 0.1872,
"num_input_tokens_seen": 16104408,
"step": 221
},
{
"epoch": 3.64797507788162,
"grad_norm": 0.22399188578128815,
"learning_rate": 4.5895184034206765e-05,
"loss": 0.3526,
"num_input_tokens_seen": 16156800,
"step": 222
},
{
"epoch": 3.664589823468328,
"grad_norm": 0.18427705764770508,
"learning_rate": 4.585917900921829e-05,
"loss": 0.2905,
"num_input_tokens_seen": 16256712,
"step": 223
},
{
"epoch": 3.681204569055036,
"grad_norm": 0.22559204697608948,
"learning_rate": 4.5823031017752485e-05,
"loss": 0.2014,
"num_input_tokens_seen": 16330344,
"step": 224
},
{
"epoch": 3.6978193146417446,
"grad_norm": 0.22590497136116028,
"learning_rate": 4.5786740307563636e-05,
"loss": 0.2178,
"num_input_tokens_seen": 16399792,
"step": 225
},
{
"epoch": 3.7144340602284527,
"grad_norm": 0.20590999722480774,
"learning_rate": 4.575030712738419e-05,
"loss": 0.2149,
"num_input_tokens_seen": 16466368,
"step": 226
},
{
"epoch": 3.7310488058151607,
"grad_norm": 0.23313546180725098,
"learning_rate": 4.571373172692309e-05,
"loss": 0.2164,
"num_input_tokens_seen": 16530976,
"step": 227
},
{
"epoch": 3.7476635514018692,
"grad_norm": 0.2081821709871292,
"learning_rate": 4.567701435686404e-05,
"loss": 0.2197,
"num_input_tokens_seen": 16600216,
"step": 228
},
{
"epoch": 3.7642782969885773,
"grad_norm": 0.22633281350135803,
"learning_rate": 4.5640155268863796e-05,
"loss": 0.2527,
"num_input_tokens_seen": 16673192,
"step": 229
},
{
"epoch": 3.7808930425752854,
"grad_norm": 0.2317536324262619,
"learning_rate": 4.5603154715550386e-05,
"loss": 0.1974,
"num_input_tokens_seen": 16739912,
"step": 230
},
{
"epoch": 3.797507788161994,
"grad_norm": 0.18925197422504425,
"learning_rate": 4.55660129505215e-05,
"loss": 0.2098,
"num_input_tokens_seen": 16834632,
"step": 231
},
{
"epoch": 3.814122533748702,
"grad_norm": 0.1934100240468979,
"learning_rate": 4.5528730228342605e-05,
"loss": 0.2109,
"num_input_tokens_seen": 16914728,
"step": 232
},
{
"epoch": 3.83073727933541,
"grad_norm": 0.1923092156648636,
"learning_rate": 4.549130680454532e-05,
"loss": 0.2492,
"num_input_tokens_seen": 17014304,
"step": 233
},
{
"epoch": 3.8473520249221185,
"grad_norm": 0.2139277458190918,
"learning_rate": 4.545374293562559e-05,
"loss": 0.2415,
"num_input_tokens_seen": 17106664,
"step": 234
},
{
"epoch": 3.8639667705088265,
"grad_norm": 0.2056339681148529,
"learning_rate": 4.541603887904198e-05,
"loss": 0.2311,
"num_input_tokens_seen": 17193744,
"step": 235
},
{
"epoch": 3.8805815160955346,
"grad_norm": 0.2485012263059616,
"learning_rate": 4.537819489321386e-05,
"loss": 0.2309,
"num_input_tokens_seen": 17254656,
"step": 236
},
{
"epoch": 3.897196261682243,
"grad_norm": 0.21751578152179718,
"learning_rate": 4.534021123751968e-05,
"loss": 0.2334,
"num_input_tokens_seen": 17325896,
"step": 237
},
{
"epoch": 3.913811007268951,
"grad_norm": 0.27809038758277893,
"learning_rate": 4.5302088172295156e-05,
"loss": 0.2598,
"num_input_tokens_seen": 17394424,
"step": 238
},
{
"epoch": 3.930425752855659,
"grad_norm": 0.23579885065555573,
"learning_rate": 4.526382595883152e-05,
"loss": 0.2132,
"num_input_tokens_seen": 17456352,
"step": 239
},
{
"epoch": 3.9470404984423677,
"grad_norm": 0.2328771948814392,
"learning_rate": 4.522542485937369e-05,
"loss": 0.2139,
"num_input_tokens_seen": 17519168,
"step": 240
},
{
"epoch": 3.9636552440290758,
"grad_norm": 0.25951239466667175,
"learning_rate": 4.51868851371185e-05,
"loss": 0.2358,
"num_input_tokens_seen": 17585144,
"step": 241
},
{
"epoch": 3.980269989615784,
"grad_norm": 0.2149265706539154,
"learning_rate": 4.5148207056212896e-05,
"loss": 0.1937,
"num_input_tokens_seen": 17662024,
"step": 242
},
{
"epoch": 3.9968847352024923,
"grad_norm": 0.24635690450668335,
"learning_rate": 4.5109390881752114e-05,
"loss": 0.222,
"num_input_tokens_seen": 17724360,
"step": 243
},
{
"epoch": 4.0,
"grad_norm": 0.4375430643558502,
"learning_rate": 4.5070436879777865e-05,
"loss": 0.2036,
"num_input_tokens_seen": 17746200,
"step": 244
},
{
"epoch": 4.0166147455867085,
"grad_norm": 0.18866199254989624,
"learning_rate": 4.503134531727652e-05,
"loss": 0.189,
"num_input_tokens_seen": 17830760,
"step": 245
},
{
"epoch": 4.033229491173416,
"grad_norm": 0.22479456663131714,
"learning_rate": 4.499211646217727e-05,
"loss": 0.2027,
"num_input_tokens_seen": 17903840,
"step": 246
},
{
"epoch": 4.049844236760125,
"grad_norm": 0.2447003722190857,
"learning_rate": 4.495275058335029e-05,
"loss": 0.2018,
"num_input_tokens_seen": 17990448,
"step": 247
},
{
"epoch": 4.066458982346833,
"grad_norm": 0.281250536441803,
"learning_rate": 4.491324795060491e-05,
"loss": 0.2182,
"num_input_tokens_seen": 18069520,
"step": 248
},
{
"epoch": 4.083073727933541,
"grad_norm": 0.2666569948196411,
"learning_rate": 4.487360883468775e-05,
"loss": 0.1997,
"num_input_tokens_seen": 18129128,
"step": 249
},
{
"epoch": 4.099688473520249,
"grad_norm": 0.23250125348567963,
"learning_rate": 4.4833833507280884e-05,
"loss": 0.2237,
"num_input_tokens_seen": 18202472,
"step": 250
},
{
"epoch": 4.116303219106958,
"grad_norm": 0.2713671028614044,
"learning_rate": 4.4793922240999933e-05,
"loss": 0.2012,
"num_input_tokens_seen": 18267232,
"step": 251
},
{
"epoch": 4.132917964693665,
"grad_norm": 0.2696637213230133,
"learning_rate": 4.4753875309392266e-05,
"loss": 0.2189,
"num_input_tokens_seen": 18325216,
"step": 252
},
{
"epoch": 4.149532710280374,
"grad_norm": 0.25705742835998535,
"learning_rate": 4.471369298693505e-05,
"loss": 0.2333,
"num_input_tokens_seen": 18406184,
"step": 253
},
{
"epoch": 4.166147455867082,
"grad_norm": 0.2252712994813919,
"learning_rate": 4.467337554903344e-05,
"loss": 0.191,
"num_input_tokens_seen": 18481056,
"step": 254
},
{
"epoch": 4.18276220145379,
"grad_norm": 0.2379617840051651,
"learning_rate": 4.463292327201862e-05,
"loss": 0.1707,
"num_input_tokens_seen": 18554864,
"step": 255
},
{
"epoch": 4.1993769470404985,
"grad_norm": 0.32383039593696594,
"learning_rate": 4.4592336433146e-05,
"loss": 0.2362,
"num_input_tokens_seen": 18612120,
"step": 256
},
{
"epoch": 4.215991692627207,
"grad_norm": 0.2559884786605835,
"learning_rate": 4.4551615310593195e-05,
"loss": 0.2385,
"num_input_tokens_seen": 18710408,
"step": 257
},
{
"epoch": 4.232606438213915,
"grad_norm": 0.2688326835632324,
"learning_rate": 4.451076018345825e-05,
"loss": 0.2154,
"num_input_tokens_seen": 18769400,
"step": 258
},
{
"epoch": 4.249221183800623,
"grad_norm": 0.23779121041297913,
"learning_rate": 4.4469771331757604e-05,
"loss": 0.2021,
"num_input_tokens_seen": 18849704,
"step": 259
},
{
"epoch": 4.265835929387332,
"grad_norm": 0.2318015843629837,
"learning_rate": 4.442864903642428e-05,
"loss": 0.2245,
"num_input_tokens_seen": 18943328,
"step": 260
},
{
"epoch": 4.282450674974039,
"grad_norm": 0.25898367166519165,
"learning_rate": 4.4387393579305865e-05,
"loss": 0.2279,
"num_input_tokens_seen": 19022536,
"step": 261
},
{
"epoch": 4.299065420560748,
"grad_norm": 0.5110782980918884,
"learning_rate": 4.434600524316266e-05,
"loss": 0.1913,
"num_input_tokens_seen": 19089200,
"step": 262
},
{
"epoch": 4.315680166147456,
"grad_norm": 0.2796955108642578,
"learning_rate": 4.430448431166567e-05,
"loss": 0.3056,
"num_input_tokens_seen": 19171216,
"step": 263
},
{
"epoch": 4.332294911734164,
"grad_norm": 0.19809302687644958,
"learning_rate": 4.426283106939474e-05,
"loss": 0.1719,
"num_input_tokens_seen": 19271872,
"step": 264
},
{
"epoch": 4.348909657320872,
"grad_norm": 0.2518528997898102,
"learning_rate": 4.4221045801836494e-05,
"loss": 0.2856,
"num_input_tokens_seen": 19342984,
"step": 265
},
{
"epoch": 4.365524402907581,
"grad_norm": 0.2863263189792633,
"learning_rate": 4.41791287953825e-05,
"loss": 0.2079,
"num_input_tokens_seen": 19391640,
"step": 266
},
{
"epoch": 4.382139148494288,
"grad_norm": 0.2902291417121887,
"learning_rate": 4.4137080337327205e-05,
"loss": 0.2321,
"num_input_tokens_seen": 19463232,
"step": 267
},
{
"epoch": 4.398753894080997,
"grad_norm": 0.2613106071949005,
"learning_rate": 4.4094900715866064e-05,
"loss": 0.2147,
"num_input_tokens_seen": 19523728,
"step": 268
},
{
"epoch": 4.415368639667705,
"grad_norm": 0.26637744903564453,
"learning_rate": 4.4052590220093446e-05,
"loss": 0.2283,
"num_input_tokens_seen": 19598960,
"step": 269
},
{
"epoch": 4.431983385254413,
"grad_norm": 0.3226986229419708,
"learning_rate": 4.401014914000078e-05,
"loss": 0.2041,
"num_input_tokens_seen": 19666136,
"step": 270
},
{
"epoch": 4.4485981308411215,
"grad_norm": 0.3052164912223816,
"learning_rate": 4.3967577766474455e-05,
"loss": 0.21,
"num_input_tokens_seen": 19728600,
"step": 271
},
{
"epoch": 4.46521287642783,
"grad_norm": 0.2585601210594177,
"learning_rate": 4.3924876391293915e-05,
"loss": 0.2471,
"num_input_tokens_seen": 19801032,
"step": 272
},
{
"epoch": 4.481827622014538,
"grad_norm": 0.2675788402557373,
"learning_rate": 4.3882045307129594e-05,
"loss": 0.2173,
"num_input_tokens_seen": 19885496,
"step": 273
},
{
"epoch": 4.498442367601246,
"grad_norm": 0.2575731873512268,
"learning_rate": 4.383908480754095e-05,
"loss": 0.2104,
"num_input_tokens_seen": 19952072,
"step": 274
},
{
"epoch": 4.515057113187955,
"grad_norm": 0.22937491536140442,
"learning_rate": 4.379599518697444e-05,
"loss": 0.1908,
"num_input_tokens_seen": 20026536,
"step": 275
},
{
"epoch": 4.531671858774662,
"grad_norm": 0.28578364849090576,
"learning_rate": 4.375277674076149e-05,
"loss": 0.1778,
"num_input_tokens_seen": 20079112,
"step": 276
},
{
"epoch": 4.548286604361371,
"grad_norm": 0.2623717486858368,
"learning_rate": 4.3709429765116504e-05,
"loss": 0.302,
"num_input_tokens_seen": 20144264,
"step": 277
},
{
"epoch": 4.564901349948079,
"grad_norm": 0.3099273443222046,
"learning_rate": 4.366595455713479e-05,
"loss": 0.2113,
"num_input_tokens_seen": 20207568,
"step": 278
},
{
"epoch": 4.581516095534787,
"grad_norm": 0.2775528132915497,
"learning_rate": 4.3622351414790554e-05,
"loss": 0.2519,
"num_input_tokens_seen": 20292376,
"step": 279
},
{
"epoch": 4.598130841121495,
"grad_norm": 0.23497633635997772,
"learning_rate": 4.357862063693486e-05,
"loss": 0.1628,
"num_input_tokens_seen": 20383048,
"step": 280
},
{
"epoch": 4.614745586708204,
"grad_norm": 0.25743210315704346,
"learning_rate": 4.353476252329356e-05,
"loss": 0.1923,
"num_input_tokens_seen": 20463376,
"step": 281
},
{
"epoch": 4.6313603322949115,
"grad_norm": 0.2595055103302002,
"learning_rate": 4.349077737446525e-05,
"loss": 0.1745,
"num_input_tokens_seen": 20537808,
"step": 282
},
{
"epoch": 4.64797507788162,
"grad_norm": 0.269157350063324,
"learning_rate": 4.344666549191921e-05,
"loss": 0.207,
"num_input_tokens_seen": 20605496,
"step": 283
},
{
"epoch": 4.6645898234683285,
"grad_norm": 0.2762012481689453,
"learning_rate": 4.3402427177993366e-05,
"loss": 0.2412,
"num_input_tokens_seen": 20692096,
"step": 284
},
{
"epoch": 4.681204569055036,
"grad_norm": 0.3109856843948364,
"learning_rate": 4.335806273589214e-05,
"loss": 0.2219,
"num_input_tokens_seen": 20762800,
"step": 285
},
{
"epoch": 4.697819314641745,
"grad_norm": 0.2506738305091858,
"learning_rate": 4.3313572469684474e-05,
"loss": 0.1831,
"num_input_tokens_seen": 20831584,
"step": 286
},
{
"epoch": 4.714434060228453,
"grad_norm": 0.25760403275489807,
"learning_rate": 4.326895668430166e-05,
"loss": 0.1457,
"num_input_tokens_seen": 20897320,
"step": 287
},
{
"epoch": 4.731048805815161,
"grad_norm": 0.298622727394104,
"learning_rate": 4.3224215685535294e-05,
"loss": 0.193,
"num_input_tokens_seen": 20966136,
"step": 288
},
{
"epoch": 4.747663551401869,
"grad_norm": 0.2863025665283203,
"learning_rate": 4.317934978003517e-05,
"loss": 0.1868,
"num_input_tokens_seen": 21034800,
"step": 289
},
{
"epoch": 4.764278296988578,
"grad_norm": 0.2865165174007416,
"learning_rate": 4.313435927530719e-05,
"loss": 0.2251,
"num_input_tokens_seen": 21098672,
"step": 290
},
{
"epoch": 4.780893042575285,
"grad_norm": 0.2902335226535797,
"learning_rate": 4.3089244479711236e-05,
"loss": 0.1853,
"num_input_tokens_seen": 21177632,
"step": 291
},
{
"epoch": 4.797507788161994,
"grad_norm": 0.31741780042648315,
"learning_rate": 4.304400570245906e-05,
"loss": 0.2135,
"num_input_tokens_seen": 21240896,
"step": 292
},
{
"epoch": 4.814122533748702,
"grad_norm": 0.22312244772911072,
"learning_rate": 4.299864325361217e-05,
"loss": 0.177,
"num_input_tokens_seen": 21322984,
"step": 293
},
{
"epoch": 4.83073727933541,
"grad_norm": 0.244970440864563,
"learning_rate": 4.295315744407972e-05,
"loss": 0.1877,
"num_input_tokens_seen": 21389128,
"step": 294
},
{
"epoch": 4.8473520249221185,
"grad_norm": 0.2605350613594055,
"learning_rate": 4.290754858561637e-05,
"loss": 0.2124,
"num_input_tokens_seen": 21469912,
"step": 295
},
{
"epoch": 4.863966770508826,
"grad_norm": 0.27169349789619446,
"learning_rate": 4.2861816990820084e-05,
"loss": 0.1833,
"num_input_tokens_seen": 21540320,
"step": 296
},
{
"epoch": 4.880581516095535,
"grad_norm": 0.2539166510105133,
"learning_rate": 4.281596297313013e-05,
"loss": 0.2134,
"num_input_tokens_seen": 21626312,
"step": 297
},
{
"epoch": 4.897196261682243,
"grad_norm": 0.28907957673072815,
"learning_rate": 4.2769986846824815e-05,
"loss": 0.1912,
"num_input_tokens_seen": 21702792,
"step": 298
},
{
"epoch": 4.913811007268951,
"grad_norm": 0.3405742645263672,
"learning_rate": 4.272388892701934e-05,
"loss": 0.2051,
"num_input_tokens_seen": 21771880,
"step": 299
},
{
"epoch": 4.930425752855659,
"grad_norm": 0.2592983543872833,
"learning_rate": 4.267766952966369e-05,
"loss": 0.1926,
"num_input_tokens_seen": 21844024,
"step": 300
},
{
"epoch": 4.947040498442368,
"grad_norm": 0.24671317636966705,
"learning_rate": 4.2631328971540444e-05,
"loss": 0.2039,
"num_input_tokens_seen": 21925632,
"step": 301
},
{
"epoch": 4.963655244029075,
"grad_norm": 0.3087393641471863,
"learning_rate": 4.2584867570262597e-05,
"loss": 0.2077,
"num_input_tokens_seen": 21981952,
"step": 302
},
{
"epoch": 4.980269989615784,
"grad_norm": 0.22413718700408936,
"learning_rate": 4.25382856442714e-05,
"loss": 0.174,
"num_input_tokens_seen": 22070440,
"step": 303
},
{
"epoch": 4.996884735202492,
"grad_norm": 0.22630493342876434,
"learning_rate": 4.249158351283414e-05,
"loss": 0.204,
"num_input_tokens_seen": 22170184,
"step": 304
},
{
"epoch": 5.0,
"grad_norm": 0.7261853814125061,
"learning_rate": 4.244476149604201e-05,
"loss": 0.2849,
"num_input_tokens_seen": 22181856,
"step": 305
},
{
"epoch": 5.0166147455867085,
"grad_norm": 0.2537758946418762,
"learning_rate": 4.2397819914807856e-05,
"loss": 0.1879,
"num_input_tokens_seen": 22256808,
"step": 306
},
{
"epoch": 5.033229491173416,
"grad_norm": 0.34757834672927856,
"learning_rate": 4.2350759090864046e-05,
"loss": 0.2158,
"num_input_tokens_seen": 22325224,
"step": 307
},
{
"epoch": 5.049844236760125,
"grad_norm": 0.23899881541728973,
"learning_rate": 4.230357934676017e-05,
"loss": 0.1685,
"num_input_tokens_seen": 22389624,
"step": 308
},
{
"epoch": 5.066458982346833,
"grad_norm": 0.38237079977989197,
"learning_rate": 4.225628100586093e-05,
"loss": 0.2253,
"num_input_tokens_seen": 22463872,
"step": 309
},
{
"epoch": 5.083073727933541,
"grad_norm": 0.30238866806030273,
"learning_rate": 4.220886439234385e-05,
"loss": 0.1762,
"num_input_tokens_seen": 22515824,
"step": 310
},
{
"epoch": 5.099688473520249,
"grad_norm": 0.2617652416229248,
"learning_rate": 4.2161329831197095e-05,
"loss": 0.1772,
"num_input_tokens_seen": 22602336,
"step": 311
},
{
"epoch": 5.116303219106958,
"grad_norm": 0.31059470772743225,
"learning_rate": 4.211367764821722e-05,
"loss": 0.1729,
"num_input_tokens_seen": 22655176,
"step": 312
},
{
"epoch": 5.132917964693665,
"grad_norm": 0.2957116663455963,
"learning_rate": 4.2065908170006955e-05,
"loss": 0.1857,
"num_input_tokens_seen": 22728680,
"step": 313
},
{
"epoch": 5.149532710280374,
"grad_norm": 0.30844616889953613,
"learning_rate": 4.201802172397295e-05,
"loss": 0.176,
"num_input_tokens_seen": 22806784,
"step": 314
},
{
"epoch": 5.166147455867082,
"grad_norm": 0.3961475193500519,
"learning_rate": 4.197001863832355e-05,
"loss": 0.1903,
"num_input_tokens_seen": 22880648,
"step": 315
},
{
"epoch": 5.18276220145379,
"grad_norm": 0.3515004813671112,
"learning_rate": 4.192189924206652e-05,
"loss": 0.1706,
"num_input_tokens_seen": 22953184,
"step": 316
},
{
"epoch": 5.1993769470404985,
"grad_norm": 0.6038290858268738,
"learning_rate": 4.187366386500683e-05,
"loss": 0.2127,
"num_input_tokens_seen": 23037392,
"step": 317
},
{
"epoch": 5.215991692627207,
"grad_norm": 0.29695793986320496,
"learning_rate": 4.182531283774434e-05,
"loss": 0.293,
"num_input_tokens_seen": 23086552,
"step": 318
},
{
"epoch": 5.232606438213915,
"grad_norm": 0.30214065313339233,
"learning_rate": 4.177684649167158e-05,
"loss": 0.1843,
"num_input_tokens_seen": 23153152,
"step": 319
},
{
"epoch": 5.249221183800623,
"grad_norm": 0.3034592568874359,
"learning_rate": 4.172826515897146e-05,
"loss": 0.1945,
"num_input_tokens_seen": 23240928,
"step": 320
},
{
"epoch": 5.265835929387332,
"grad_norm": 0.27193683385849,
"learning_rate": 4.1679569172614996e-05,
"loss": 0.1825,
"num_input_tokens_seen": 23325912,
"step": 321
},
{
"epoch": 5.282450674974039,
"grad_norm": 0.3327508866786957,
"learning_rate": 4.163075886635902e-05,
"loss": 0.2044,
"num_input_tokens_seen": 23401952,
"step": 322
},
{
"epoch": 5.299065420560748,
"grad_norm": 0.32610246539115906,
"learning_rate": 4.1581834574743915e-05,
"loss": 0.1718,
"num_input_tokens_seen": 23463760,
"step": 323
},
{
"epoch": 5.315680166147456,
"grad_norm": 0.30451127886772156,
"learning_rate": 4.1532796633091296e-05,
"loss": 0.1768,
"num_input_tokens_seen": 23535272,
"step": 324
},
{
"epoch": 5.332294911734164,
"grad_norm": 0.26648980379104614,
"learning_rate": 4.148364537750172e-05,
"loss": 0.1609,
"num_input_tokens_seen": 23607752,
"step": 325
},
{
"epoch": 5.348909657320872,
"grad_norm": 0.2634081244468689,
"learning_rate": 4.14343811448524e-05,
"loss": 0.1769,
"num_input_tokens_seen": 23674872,
"step": 326
},
{
"epoch": 5.365524402907581,
"grad_norm": 0.30805498361587524,
"learning_rate": 4.138500427279485e-05,
"loss": 0.2087,
"num_input_tokens_seen": 23736384,
"step": 327
},
{
"epoch": 5.382139148494288,
"grad_norm": 0.2669171988964081,
"learning_rate": 4.133551509975264e-05,
"loss": 0.162,
"num_input_tokens_seen": 23835000,
"step": 328
},
{
"epoch": 5.398753894080997,
"grad_norm": 0.25709372758865356,
"learning_rate": 4.128591396491901e-05,
"loss": 0.1614,
"num_input_tokens_seen": 23912552,
"step": 329
},
{
"epoch": 5.415368639667705,
"grad_norm": 0.2793630063533783,
"learning_rate": 4.123620120825459e-05,
"loss": 0.2,
"num_input_tokens_seen": 23987368,
"step": 330
},
{
"epoch": 5.431983385254413,
"grad_norm": 0.32492437958717346,
"learning_rate": 4.118637717048506e-05,
"loss": 0.1719,
"num_input_tokens_seen": 24050848,
"step": 331
},
{
"epoch": 5.4485981308411215,
"grad_norm": 0.26048383116722107,
"learning_rate": 4.113644219309877e-05,
"loss": 0.1678,
"num_input_tokens_seen": 24146104,
"step": 332
},
{
"epoch": 5.46521287642783,
"grad_norm": 0.3429310619831085,
"learning_rate": 4.1086396618344476e-05,
"loss": 0.176,
"num_input_tokens_seen": 24194184,
"step": 333
},
{
"epoch": 5.481827622014538,
"grad_norm": 0.2843048870563507,
"learning_rate": 4.1036240789228954e-05,
"loss": 0.1844,
"num_input_tokens_seen": 24275368,
"step": 334
},
{
"epoch": 5.498442367601246,
"grad_norm": 0.3061092495918274,
"learning_rate": 4.098597504951462e-05,
"loss": 0.1901,
"num_input_tokens_seen": 24329192,
"step": 335
},
{
"epoch": 5.515057113187955,
"grad_norm": 0.28139594197273254,
"learning_rate": 4.093559974371725e-05,
"loss": 0.1751,
"num_input_tokens_seen": 24426696,
"step": 336
},
{
"epoch": 5.531671858774662,
"grad_norm": 0.2379189133644104,
"learning_rate": 4.088511521710352e-05,
"loss": 0.171,
"num_input_tokens_seen": 24514344,
"step": 337
},
{
"epoch": 5.548286604361371,
"grad_norm": 0.3467542827129364,
"learning_rate": 4.083452181568875e-05,
"loss": 0.1766,
"num_input_tokens_seen": 24584464,
"step": 338
},
{
"epoch": 5.564901349948079,
"grad_norm": 0.2928486764431,
"learning_rate": 4.0783819886234445e-05,
"loss": 0.1584,
"num_input_tokens_seen": 24660600,
"step": 339
},
{
"epoch": 5.581516095534787,
"grad_norm": 0.31528523564338684,
"learning_rate": 4.073300977624594e-05,
"loss": 0.1711,
"num_input_tokens_seen": 24717088,
"step": 340
},
{
"epoch": 5.598130841121495,
"grad_norm": 0.313851922750473,
"learning_rate": 4.068209183397004e-05,
"loss": 0.1798,
"num_input_tokens_seen": 24775352,
"step": 341
},
{
"epoch": 5.614745586708204,
"grad_norm": 0.2714058756828308,
"learning_rate": 4.063106640839264e-05,
"loss": 0.1666,
"num_input_tokens_seen": 24860072,
"step": 342
},
{
"epoch": 5.6313603322949115,
"grad_norm": 0.2646903693675995,
"learning_rate": 4.057993384923626e-05,
"loss": 0.168,
"num_input_tokens_seen": 24947856,
"step": 343
},
{
"epoch": 5.64797507788162,
"grad_norm": 0.29279816150665283,
"learning_rate": 4.052869450695776e-05,
"loss": 0.1801,
"num_input_tokens_seen": 25024992,
"step": 344
},
{
"epoch": 5.6645898234683285,
"grad_norm": 0.3221881091594696,
"learning_rate": 4.047734873274586e-05,
"loss": 0.183,
"num_input_tokens_seen": 25092248,
"step": 345
},
{
"epoch": 5.681204569055036,
"grad_norm": 0.31584057211875916,
"learning_rate": 4.042589687851872e-05,
"loss": 0.1752,
"num_input_tokens_seen": 25170496,
"step": 346
},
{
"epoch": 5.697819314641745,
"grad_norm": 0.2635148763656616,
"learning_rate": 4.037433929692161e-05,
"loss": 0.1798,
"num_input_tokens_seen": 25268720,
"step": 347
},
{
"epoch": 5.714434060228453,
"grad_norm": 0.3024344742298126,
"learning_rate": 4.0322676341324415e-05,
"loss": 0.1793,
"num_input_tokens_seen": 25332688,
"step": 348
},
{
"epoch": 5.731048805815161,
"grad_norm": 0.35282352566719055,
"learning_rate": 4.027090836581925e-05,
"loss": 0.2022,
"num_input_tokens_seen": 25413904,
"step": 349
},
{
"epoch": 5.747663551401869,
"grad_norm": 0.3042284846305847,
"learning_rate": 4.021903572521802e-05,
"loss": 0.1848,
"num_input_tokens_seen": 25503720,
"step": 350
},
{
"epoch": 5.764278296988578,
"grad_norm": 0.33289963006973267,
"learning_rate": 4.0167058775049996e-05,
"loss": 0.1931,
"num_input_tokens_seen": 25568560,
"step": 351
},
{
"epoch": 5.780893042575285,
"grad_norm": 0.3255409598350525,
"learning_rate": 4.011497787155938e-05,
"loss": 0.1675,
"num_input_tokens_seen": 25635184,
"step": 352
},
{
"epoch": 5.797507788161994,
"grad_norm": 0.2858707904815674,
"learning_rate": 4.006279337170283e-05,
"loss": 0.176,
"num_input_tokens_seen": 25719768,
"step": 353
},
{
"epoch": 5.814122533748702,
"grad_norm": 0.30737245082855225,
"learning_rate": 4.0010505633147106e-05,
"loss": 0.1705,
"num_input_tokens_seen": 25795016,
"step": 354
},
{
"epoch": 5.83073727933541,
"grad_norm": 0.29503941535949707,
"learning_rate": 3.995811501426648e-05,
"loss": 0.153,
"num_input_tokens_seen": 25863184,
"step": 355
},
{
"epoch": 5.8473520249221185,
"grad_norm": 0.3046650290489197,
"learning_rate": 3.99056218741404e-05,
"loss": 0.1795,
"num_input_tokens_seen": 25935752,
"step": 356
},
{
"epoch": 5.863966770508826,
"grad_norm": 0.33626508712768555,
"learning_rate": 3.985302657255097e-05,
"loss": 0.1744,
"num_input_tokens_seen": 25995760,
"step": 357
},
{
"epoch": 5.880581516095535,
"grad_norm": 0.2900950312614441,
"learning_rate": 3.980032946998049e-05,
"loss": 0.1538,
"num_input_tokens_seen": 26061240,
"step": 358
},
{
"epoch": 5.897196261682243,
"grad_norm": 0.32002708315849304,
"learning_rate": 3.974753092760901e-05,
"loss": 0.172,
"num_input_tokens_seen": 26131024,
"step": 359
},
{
"epoch": 5.913811007268951,
"grad_norm": 0.22458530962467194,
"learning_rate": 3.969463130731183e-05,
"loss": 0.197,
"num_input_tokens_seen": 26233672,
"step": 360
},
{
"epoch": 5.930425752855659,
"grad_norm": 0.2662505805492401,
"learning_rate": 3.964163097165702e-05,
"loss": 0.1359,
"num_input_tokens_seen": 26303488,
"step": 361
},
{
"epoch": 5.947040498442368,
"grad_norm": 0.2906314432621002,
"learning_rate": 3.958853028390294e-05,
"loss": 0.1622,
"num_input_tokens_seen": 26377768,
"step": 362
},
{
"epoch": 5.963655244029075,
"grad_norm": 0.33236268162727356,
"learning_rate": 3.953532960799577e-05,
"loss": 0.3039,
"num_input_tokens_seen": 26435984,
"step": 363
},
{
"epoch": 5.980269989615784,
"grad_norm": 0.3479491174221039,
"learning_rate": 3.948202930856697e-05,
"loss": 0.185,
"num_input_tokens_seen": 26513960,
"step": 364
},
{
"epoch": 5.996884735202492,
"grad_norm": 0.2604476511478424,
"learning_rate": 3.942862975093085e-05,
"loss": 0.1671,
"num_input_tokens_seen": 26599064,
"step": 365
},
{
"epoch": 6.0,
"grad_norm": 0.5614582300186157,
"learning_rate": 3.937513130108197e-05,
"loss": 0.2127,
"num_input_tokens_seen": 26617264,
"step": 366
},
{
"epoch": 6.0166147455867085,
"grad_norm": 0.29380959272384644,
"learning_rate": 3.9321534325692726e-05,
"loss": 0.1736,
"num_input_tokens_seen": 26717024,
"step": 367
},
{
"epoch": 6.033229491173416,
"grad_norm": 0.3308286666870117,
"learning_rate": 3.92678391921108e-05,
"loss": 0.166,
"num_input_tokens_seen": 26788120,
"step": 368
},
{
"epoch": 6.049844236760125,
"grad_norm": 0.34477588534355164,
"learning_rate": 3.92140462683566e-05,
"loss": 0.182,
"num_input_tokens_seen": 26853760,
"step": 369
},
{
"epoch": 6.066458982346833,
"grad_norm": 0.33609539270401,
"learning_rate": 3.916015592312082e-05,
"loss": 0.1621,
"num_input_tokens_seen": 26923848,
"step": 370
},
{
"epoch": 6.083073727933541,
"grad_norm": 0.2985127866268158,
"learning_rate": 3.9106168525761855e-05,
"loss": 0.1488,
"num_input_tokens_seen": 26976184,
"step": 371
},
{
"epoch": 6.099688473520249,
"grad_norm": 0.3301701545715332,
"learning_rate": 3.905208444630327e-05,
"loss": 0.1554,
"num_input_tokens_seen": 27065712,
"step": 372
},
{
"epoch": 6.116303219106958,
"grad_norm": 0.22536417841911316,
"learning_rate": 3.899790405543129e-05,
"loss": 0.1653,
"num_input_tokens_seen": 27145472,
"step": 373
},
{
"epoch": 6.132917964693665,
"grad_norm": 0.28911101818084717,
"learning_rate": 3.894362772449226e-05,
"loss": 0.1503,
"num_input_tokens_seen": 27233904,
"step": 374
},
{
"epoch": 6.149532710280374,
"grad_norm": 0.30377212166786194,
"learning_rate": 3.888925582549006e-05,
"loss": 0.1358,
"num_input_tokens_seen": 27311512,
"step": 375
},
{
"epoch": 6.166147455867082,
"grad_norm": 0.32584136724472046,
"learning_rate": 3.883478873108361e-05,
"loss": 0.15,
"num_input_tokens_seen": 27387400,
"step": 376
},
{
"epoch": 6.18276220145379,
"grad_norm": 0.33625590801239014,
"learning_rate": 3.878022681458426e-05,
"loss": 0.1588,
"num_input_tokens_seen": 27461280,
"step": 377
},
{
"epoch": 6.1993769470404985,
"grad_norm": 0.2865462601184845,
"learning_rate": 3.87255704499533e-05,
"loss": 0.143,
"num_input_tokens_seen": 27556400,
"step": 378
},
{
"epoch": 6.215991692627207,
"grad_norm": 0.3429831266403198,
"learning_rate": 3.8670820011799315e-05,
"loss": 0.1713,
"num_input_tokens_seen": 27613664,
"step": 379
},
{
"epoch": 6.232606438213915,
"grad_norm": 0.32414478063583374,
"learning_rate": 3.861597587537568e-05,
"loss": 0.1893,
"num_input_tokens_seen": 27681024,
"step": 380
},
{
"epoch": 6.249221183800623,
"grad_norm": 0.3242063522338867,
"learning_rate": 3.856103841657797e-05,
"loss": 0.156,
"num_input_tokens_seen": 27759536,
"step": 381
},
{
"epoch": 6.265835929387332,
"grad_norm": 0.22485007345676422,
"learning_rate": 3.850600801194138e-05,
"loss": 0.1247,
"num_input_tokens_seen": 27857288,
"step": 382
},
{
"epoch": 6.282450674974039,
"grad_norm": 0.4592108726501465,
"learning_rate": 3.8450885038638127e-05,
"loss": 0.172,
"num_input_tokens_seen": 27940528,
"step": 383
},
{
"epoch": 6.299065420560748,
"grad_norm": 0.3695475459098816,
"learning_rate": 3.8395669874474915e-05,
"loss": 0.166,
"num_input_tokens_seen": 28033824,
"step": 384
},
{
"epoch": 6.315680166147456,
"grad_norm": 0.33483219146728516,
"learning_rate": 3.834036289789029e-05,
"loss": 0.1415,
"num_input_tokens_seen": 28096192,
"step": 385
},
{
"epoch": 6.332294911734164,
"grad_norm": 0.2734437882900238,
"learning_rate": 3.828496448795207e-05,
"loss": 0.1369,
"num_input_tokens_seen": 28181256,
"step": 386
},
{
"epoch": 6.348909657320872,
"grad_norm": 0.3039200007915497,
"learning_rate": 3.822947502435477e-05,
"loss": 0.1465,
"num_input_tokens_seen": 28245480,
"step": 387
},
{
"epoch": 6.365524402907581,
"grad_norm": 0.3409143388271332,
"learning_rate": 3.8173894887416945e-05,
"loss": 0.1456,
"num_input_tokens_seen": 28307200,
"step": 388
},
{
"epoch": 6.382139148494288,
"grad_norm": 0.390440970659256,
"learning_rate": 3.811822445807863e-05,
"loss": 0.1752,
"num_input_tokens_seen": 28384640,
"step": 389
},
{
"epoch": 6.398753894080997,
"grad_norm": 0.3439676761627197,
"learning_rate": 3.8062464117898724e-05,
"loss": 0.1344,
"num_input_tokens_seen": 28447992,
"step": 390
},
{
"epoch": 6.415368639667705,
"grad_norm": 0.4008062779903412,
"learning_rate": 3.800661424905235e-05,
"loss": 0.1506,
"num_input_tokens_seen": 28513856,
"step": 391
},
{
"epoch": 6.431983385254413,
"grad_norm": 0.5333216190338135,
"learning_rate": 3.795067523432826e-05,
"loss": 0.1648,
"num_input_tokens_seen": 28596584,
"step": 392
},
{
"epoch": 6.4485981308411215,
"grad_norm": 0.3630414605140686,
"learning_rate": 3.789464745712619e-05,
"loss": 0.1762,
"num_input_tokens_seen": 28664560,
"step": 393
},
{
"epoch": 6.46521287642783,
"grad_norm": 0.3014258146286011,
"learning_rate": 3.7838531301454254e-05,
"loss": 0.14,
"num_input_tokens_seen": 28739512,
"step": 394
},
{
"epoch": 6.481827622014538,
"grad_norm": 0.4165439307689667,
"learning_rate": 3.77823271519263e-05,
"loss": 0.1591,
"num_input_tokens_seen": 28831848,
"step": 395
},
{
"epoch": 6.498442367601246,
"grad_norm": 0.36684006452560425,
"learning_rate": 3.7726035393759285e-05,
"loss": 0.163,
"num_input_tokens_seen": 28898408,
"step": 396
},
{
"epoch": 6.515057113187955,
"grad_norm": 0.3208068609237671,
"learning_rate": 3.76696564127706e-05,
"loss": 0.1558,
"num_input_tokens_seen": 28960224,
"step": 397
},
{
"epoch": 6.531671858774662,
"grad_norm": 0.42699623107910156,
"learning_rate": 3.761319059537548e-05,
"loss": 0.1906,
"num_input_tokens_seen": 29020568,
"step": 398
},
{
"epoch": 6.548286604361371,
"grad_norm": 0.3583790957927704,
"learning_rate": 3.755663832858432e-05,
"loss": 0.1399,
"num_input_tokens_seen": 29095448,
"step": 399
},
{
"epoch": 6.564901349948079,
"grad_norm": 0.34571129083633423,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.1554,
"num_input_tokens_seen": 29186600,
"step": 400
},
{
"epoch": 6.581516095534787,
"grad_norm": 0.30998197197914124,
"learning_rate": 3.744327599781531e-05,
"loss": 0.2448,
"num_input_tokens_seen": 29258552,
"step": 401
},
{
"epoch": 6.598130841121495,
"grad_norm": 0.35567161440849304,
"learning_rate": 3.7386466710810194e-05,
"loss": 0.1853,
"num_input_tokens_seen": 29344848,
"step": 402
},
{
"epoch": 6.614745586708204,
"grad_norm": 0.3521808683872223,
"learning_rate": 3.7329572528349146e-05,
"loss": 0.2248,
"num_input_tokens_seen": 29410184,
"step": 403
},
{
"epoch": 6.6313603322949115,
"grad_norm": 0.31165575981140137,
"learning_rate": 3.727259384037852e-05,
"loss": 0.1647,
"num_input_tokens_seen": 29484928,
"step": 404
},
{
"epoch": 6.64797507788162,
"grad_norm": 0.3182571530342102,
"learning_rate": 3.721553103742388e-05,
"loss": 0.1622,
"num_input_tokens_seen": 29566432,
"step": 405
},
{
"epoch": 6.6645898234683285,
"grad_norm": 0.3434613347053528,
"learning_rate": 3.715838451058726e-05,
"loss": 0.1509,
"num_input_tokens_seen": 29634032,
"step": 406
},
{
"epoch": 6.681204569055036,
"grad_norm": 0.3849641978740692,
"learning_rate": 3.7101154651544584e-05,
"loss": 0.1719,
"num_input_tokens_seen": 29681424,
"step": 407
},
{
"epoch": 6.697819314641745,
"grad_norm": 0.3322864770889282,
"learning_rate": 3.704384185254288e-05,
"loss": 0.1478,
"num_input_tokens_seen": 29762208,
"step": 408
},
{
"epoch": 6.714434060228453,
"grad_norm": 0.3480667769908905,
"learning_rate": 3.6986446506397666e-05,
"loss": 0.1563,
"num_input_tokens_seen": 29816280,
"step": 409
},
{
"epoch": 6.731048805815161,
"grad_norm": 0.31062281131744385,
"learning_rate": 3.692896900649021e-05,
"loss": 0.1456,
"num_input_tokens_seen": 29893040,
"step": 410
},
{
"epoch": 6.747663551401869,
"grad_norm": 0.3606508672237396,
"learning_rate": 3.6871409746764865e-05,
"loss": 0.1617,
"num_input_tokens_seen": 29971688,
"step": 411
},
{
"epoch": 6.764278296988578,
"grad_norm": 0.2981427013874054,
"learning_rate": 3.681376912172636e-05,
"loss": 0.1417,
"num_input_tokens_seen": 30051784,
"step": 412
},
{
"epoch": 6.780893042575285,
"grad_norm": 0.30427512526512146,
"learning_rate": 3.675604752643706e-05,
"loss": 0.1527,
"num_input_tokens_seen": 30146048,
"step": 413
},
{
"epoch": 6.797507788161994,
"grad_norm": 0.3024544417858124,
"learning_rate": 3.6698245356514335e-05,
"loss": 0.1498,
"num_input_tokens_seen": 30221296,
"step": 414
},
{
"epoch": 6.814122533748702,
"grad_norm": 0.2973288297653198,
"learning_rate": 3.6640363008127784e-05,
"loss": 0.1594,
"num_input_tokens_seen": 30287664,
"step": 415
},
{
"epoch": 6.83073727933541,
"grad_norm": 0.37876859307289124,
"learning_rate": 3.6582400877996546e-05,
"loss": 0.1691,
"num_input_tokens_seen": 30352816,
"step": 416
},
{
"epoch": 6.8473520249221185,
"grad_norm": 0.31672796607017517,
"learning_rate": 3.652435936338656e-05,
"loss": 0.1556,
"num_input_tokens_seen": 30439688,
"step": 417
},
{
"epoch": 6.863966770508826,
"grad_norm": 0.41883349418640137,
"learning_rate": 3.646623886210788e-05,
"loss": 0.1729,
"num_input_tokens_seen": 30506856,
"step": 418
},
{
"epoch": 6.880581516095535,
"grad_norm": 0.3436118960380554,
"learning_rate": 3.64080397725119e-05,
"loss": 0.1459,
"num_input_tokens_seen": 30565848,
"step": 419
},
{
"epoch": 6.897196261682243,
"grad_norm": 0.42407792806625366,
"learning_rate": 3.634976249348867e-05,
"loss": 0.1753,
"num_input_tokens_seen": 30633944,
"step": 420
},
{
"epoch": 6.913811007268951,
"grad_norm": 0.32624706625938416,
"learning_rate": 3.629140742446414e-05,
"loss": 0.1538,
"num_input_tokens_seen": 30704760,
"step": 421
},
{
"epoch": 6.930425752855659,
"grad_norm": 0.3517460227012634,
"learning_rate": 3.623297496539741e-05,
"loss": 0.1475,
"num_input_tokens_seen": 30773792,
"step": 422
},
{
"epoch": 6.947040498442368,
"grad_norm": 0.36355283856391907,
"learning_rate": 3.6174465516778035e-05,
"loss": 0.16,
"num_input_tokens_seen": 30848672,
"step": 423
},
{
"epoch": 6.963655244029075,
"grad_norm": 0.3491535186767578,
"learning_rate": 3.611587947962319e-05,
"loss": 0.1505,
"num_input_tokens_seen": 30906064,
"step": 424
},
{
"epoch": 6.980269989615784,
"grad_norm": 0.4260394275188446,
"learning_rate": 3.6057217255475034e-05,
"loss": 0.1796,
"num_input_tokens_seen": 30964720,
"step": 425
},
{
"epoch": 6.996884735202492,
"grad_norm": 0.3254542052745819,
"learning_rate": 3.599847924639788e-05,
"loss": 0.159,
"num_input_tokens_seen": 31043152,
"step": 426
},
{
"epoch": 7.0,
"grad_norm": 0.6581417918205261,
"learning_rate": 3.593966585497547e-05,
"loss": 0.1275,
"num_input_tokens_seen": 31056056,
"step": 427
},
{
"epoch": 7.0166147455867085,
"grad_norm": 0.3190588057041168,
"learning_rate": 3.588077748430819e-05,
"loss": 0.1349,
"num_input_tokens_seen": 31135304,
"step": 428
},
{
"epoch": 7.033229491173416,
"grad_norm": 0.3564179241657257,
"learning_rate": 3.582181453801036e-05,
"loss": 0.148,
"num_input_tokens_seen": 31185600,
"step": 429
},
{
"epoch": 7.049844236760125,
"grad_norm": 0.3535906672477722,
"learning_rate": 3.576277742020738e-05,
"loss": 0.1439,
"num_input_tokens_seen": 31254312,
"step": 430
},
{
"epoch": 7.066458982346833,
"grad_norm": 0.5310954451560974,
"learning_rate": 3.570366653553307e-05,
"loss": 0.1345,
"num_input_tokens_seen": 31339112,
"step": 431
},
{
"epoch": 7.083073727933541,
"grad_norm": 0.35149645805358887,
"learning_rate": 3.564448228912682e-05,
"loss": 0.1196,
"num_input_tokens_seen": 31424024,
"step": 432
},
{
"epoch": 7.099688473520249,
"grad_norm": 0.3583419620990753,
"learning_rate": 3.558522508663081e-05,
"loss": 0.1533,
"num_input_tokens_seen": 31494656,
"step": 433
},
{
"epoch": 7.116303219106958,
"grad_norm": 0.2901877462863922,
"learning_rate": 3.552589533418728e-05,
"loss": 0.1404,
"num_input_tokens_seen": 31588536,
"step": 434
},
{
"epoch": 7.132917964693665,
"grad_norm": 0.35585492849349976,
"learning_rate": 3.54664934384357e-05,
"loss": 0.2437,
"num_input_tokens_seen": 31657560,
"step": 435
},
{
"epoch": 7.149532710280374,
"grad_norm": 0.3314085006713867,
"learning_rate": 3.540701980651003e-05,
"loss": 0.15,
"num_input_tokens_seen": 31743992,
"step": 436
},
{
"epoch": 7.166147455867082,
"grad_norm": 0.34194597601890564,
"learning_rate": 3.534747484603587e-05,
"loss": 0.1375,
"num_input_tokens_seen": 31806520,
"step": 437
},
{
"epoch": 7.18276220145379,
"grad_norm": 0.3955981731414795,
"learning_rate": 3.528785896512772e-05,
"loss": 0.1388,
"num_input_tokens_seen": 31860464,
"step": 438
},
{
"epoch": 7.1993769470404985,
"grad_norm": 0.33545568585395813,
"learning_rate": 3.5228172572386146e-05,
"loss": 0.2926,
"num_input_tokens_seen": 31921424,
"step": 439
},
{
"epoch": 7.215991692627207,
"grad_norm": 0.43362271785736084,
"learning_rate": 3.516841607689501e-05,
"loss": 0.1454,
"num_input_tokens_seen": 31981064,
"step": 440
},
{
"epoch": 7.232606438213915,
"grad_norm": 0.40348634123802185,
"learning_rate": 3.510858988821863e-05,
"loss": 0.1388,
"num_input_tokens_seen": 32050648,
"step": 441
},
{
"epoch": 7.249221183800623,
"grad_norm": 0.35088223218917847,
"learning_rate": 3.504869441639901e-05,
"loss": 0.1248,
"num_input_tokens_seen": 32118584,
"step": 442
},
{
"epoch": 7.265835929387332,
"grad_norm": 0.3033677041530609,
"learning_rate": 3.4988730071953004e-05,
"loss": 0.1252,
"num_input_tokens_seen": 32206384,
"step": 443
},
{
"epoch": 7.282450674974039,
"grad_norm": 0.3067215383052826,
"learning_rate": 3.4928697265869515e-05,
"loss": 0.1101,
"num_input_tokens_seen": 32299040,
"step": 444
},
{
"epoch": 7.299065420560748,
"grad_norm": 0.33541834354400635,
"learning_rate": 3.486859640960668e-05,
"loss": 0.1351,
"num_input_tokens_seen": 32355624,
"step": 445
},
{
"epoch": 7.315680166147456,
"grad_norm": 0.3549206554889679,
"learning_rate": 3.480842791508904e-05,
"loss": 0.1513,
"num_input_tokens_seen": 32427792,
"step": 446
},
{
"epoch": 7.332294911734164,
"grad_norm": 0.3848089277744293,
"learning_rate": 3.474819219470471e-05,
"loss": 0.1342,
"num_input_tokens_seen": 32508696,
"step": 447
},
{
"epoch": 7.348909657320872,
"grad_norm": 0.31206491589546204,
"learning_rate": 3.4687889661302576e-05,
"loss": 0.136,
"num_input_tokens_seen": 32601312,
"step": 448
},
{
"epoch": 7.365524402907581,
"grad_norm": 0.2999168336391449,
"learning_rate": 3.4627520728189456e-05,
"loss": 0.1183,
"num_input_tokens_seen": 32680256,
"step": 449
},
{
"epoch": 7.382139148494288,
"grad_norm": 0.3667398989200592,
"learning_rate": 3.456708580912725e-05,
"loss": 0.1375,
"num_input_tokens_seen": 32738816,
"step": 450
},
{
"epoch": 7.398753894080997,
"grad_norm": 0.3525990843772888,
"learning_rate": 3.4506585318330125e-05,
"loss": 0.1265,
"num_input_tokens_seen": 32813240,
"step": 451
},
{
"epoch": 7.415368639667705,
"grad_norm": 0.3496154546737671,
"learning_rate": 3.444601967046168e-05,
"loss": 0.1422,
"num_input_tokens_seen": 32889680,
"step": 452
},
{
"epoch": 7.431983385254413,
"grad_norm": 0.38129401206970215,
"learning_rate": 3.438538928063208e-05,
"loss": 0.1534,
"num_input_tokens_seen": 32964760,
"step": 453
},
{
"epoch": 7.4485981308411215,
"grad_norm": 0.3664973974227905,
"learning_rate": 3.432469456439523e-05,
"loss": 0.1517,
"num_input_tokens_seen": 33048992,
"step": 454
},
{
"epoch": 7.46521287642783,
"grad_norm": 0.3663897216320038,
"learning_rate": 3.426393593774591e-05,
"loss": 0.1345,
"num_input_tokens_seen": 33130200,
"step": 455
},
{
"epoch": 7.481827622014538,
"grad_norm": 0.33521801233291626,
"learning_rate": 3.4203113817116957e-05,
"loss": 0.133,
"num_input_tokens_seen": 33223024,
"step": 456
},
{
"epoch": 7.498442367601246,
"grad_norm": 0.32186245918273926,
"learning_rate": 3.414222861937636e-05,
"loss": 0.1394,
"num_input_tokens_seen": 33303120,
"step": 457
},
{
"epoch": 7.515057113187955,
"grad_norm": 0.35747572779655457,
"learning_rate": 3.408128076182446e-05,
"loss": 0.1474,
"num_input_tokens_seen": 33364984,
"step": 458
},
{
"epoch": 7.531671858774662,
"grad_norm": 0.37195590138435364,
"learning_rate": 3.402027066219105e-05,
"loss": 0.1585,
"num_input_tokens_seen": 33427352,
"step": 459
},
{
"epoch": 7.548286604361371,
"grad_norm": 0.3562054932117462,
"learning_rate": 3.39591987386325e-05,
"loss": 0.1315,
"num_input_tokens_seen": 33481272,
"step": 460
},
{
"epoch": 7.564901349948079,
"grad_norm": 0.3821605443954468,
"learning_rate": 3.389806540972898e-05,
"loss": 0.1252,
"num_input_tokens_seen": 33538904,
"step": 461
},
{
"epoch": 7.581516095534787,
"grad_norm": 0.3299770653247833,
"learning_rate": 3.383687109448143e-05,
"loss": 0.1375,
"num_input_tokens_seen": 33635976,
"step": 462
},
{
"epoch": 7.598130841121495,
"grad_norm": 0.38955169916152954,
"learning_rate": 3.377561621230887e-05,
"loss": 0.137,
"num_input_tokens_seen": 33711184,
"step": 463
},
{
"epoch": 7.614745586708204,
"grad_norm": 0.3389476537704468,
"learning_rate": 3.3714301183045385e-05,
"loss": 0.1133,
"num_input_tokens_seen": 33778848,
"step": 464
},
{
"epoch": 7.6313603322949115,
"grad_norm": 0.33927062153816223,
"learning_rate": 3.365292642693732e-05,
"loss": 0.129,
"num_input_tokens_seen": 33866024,
"step": 465
},
{
"epoch": 7.64797507788162,
"grad_norm": 0.26980310678482056,
"learning_rate": 3.359149236464041e-05,
"loss": 0.1453,
"num_input_tokens_seen": 33978144,
"step": 466
},
{
"epoch": 7.6645898234683285,
"grad_norm": 0.3778550922870636,
"learning_rate": 3.35299994172168e-05,
"loss": 0.1642,
"num_input_tokens_seen": 34047480,
"step": 467
},
{
"epoch": 7.681204569055036,
"grad_norm": 0.2846803069114685,
"learning_rate": 3.346844800613229e-05,
"loss": 0.1296,
"num_input_tokens_seen": 34134480,
"step": 468
},
{
"epoch": 7.697819314641745,
"grad_norm": 0.3579311668872833,
"learning_rate": 3.340683855325335e-05,
"loss": 0.1299,
"num_input_tokens_seen": 34190176,
"step": 469
},
{
"epoch": 7.714434060228453,
"grad_norm": 0.37811708450317383,
"learning_rate": 3.3345171480844275e-05,
"loss": 0.155,
"num_input_tokens_seen": 34267336,
"step": 470
},
{
"epoch": 7.731048805815161,
"grad_norm": 0.35094380378723145,
"learning_rate": 3.3283447211564276e-05,
"loss": 0.1439,
"num_input_tokens_seen": 34333616,
"step": 471
},
{
"epoch": 7.747663551401869,
"grad_norm": 0.313473105430603,
"learning_rate": 3.322166616846458e-05,
"loss": 0.1451,
"num_input_tokens_seen": 34404000,
"step": 472
},
{
"epoch": 7.764278296988578,
"grad_norm": 0.36839237809181213,
"learning_rate": 3.315982877498555e-05,
"loss": 0.1403,
"num_input_tokens_seen": 34466048,
"step": 473
},
{
"epoch": 7.780893042575285,
"grad_norm": 0.32588714361190796,
"learning_rate": 3.309793545495374e-05,
"loss": 0.1355,
"num_input_tokens_seen": 34547312,
"step": 474
},
{
"epoch": 7.797507788161994,
"grad_norm": 0.3828336000442505,
"learning_rate": 3.303598663257904e-05,
"loss": 0.1295,
"num_input_tokens_seen": 34600544,
"step": 475
},
{
"epoch": 7.814122533748702,
"grad_norm": 0.35497623682022095,
"learning_rate": 3.2973982732451755e-05,
"loss": 0.1405,
"num_input_tokens_seen": 34660792,
"step": 476
},
{
"epoch": 7.83073727933541,
"grad_norm": 0.28363677859306335,
"learning_rate": 3.2911924179539656e-05,
"loss": 0.1858,
"num_input_tokens_seen": 34778440,
"step": 477
},
{
"epoch": 7.8473520249221185,
"grad_norm": 0.376277893781662,
"learning_rate": 3.284981139918513e-05,
"loss": 0.1454,
"num_input_tokens_seen": 34849760,
"step": 478
},
{
"epoch": 7.863966770508826,
"grad_norm": 0.32492315769195557,
"learning_rate": 3.278764481710221e-05,
"loss": 0.1177,
"num_input_tokens_seen": 34940776,
"step": 479
},
{
"epoch": 7.880581516095535,
"grad_norm": 0.33199772238731384,
"learning_rate": 3.272542485937369e-05,
"loss": 0.1369,
"num_input_tokens_seen": 35018104,
"step": 480
},
{
"epoch": 7.897196261682243,
"grad_norm": 0.3320644795894623,
"learning_rate": 3.26631519524482e-05,
"loss": 0.1267,
"num_input_tokens_seen": 35079744,
"step": 481
},
{
"epoch": 7.913811007268951,
"grad_norm": 0.3606792390346527,
"learning_rate": 3.260082652313726e-05,
"loss": 0.1236,
"num_input_tokens_seen": 35132808,
"step": 482
},
{
"epoch": 7.930425752855659,
"grad_norm": 0.36535707116127014,
"learning_rate": 3.253844899861239e-05,
"loss": 0.131,
"num_input_tokens_seen": 35197816,
"step": 483
},
{
"epoch": 7.947040498442368,
"grad_norm": 0.294482946395874,
"learning_rate": 3.247601980640217e-05,
"loss": 0.1129,
"num_input_tokens_seen": 35275528,
"step": 484
},
{
"epoch": 7.963655244029075,
"grad_norm": 0.3573679029941559,
"learning_rate": 3.241353937438927e-05,
"loss": 0.1448,
"num_input_tokens_seen": 35333280,
"step": 485
},
{
"epoch": 7.980269989615784,
"grad_norm": 0.3585527837276459,
"learning_rate": 3.23510081308076e-05,
"loss": 0.1488,
"num_input_tokens_seen": 35412944,
"step": 486
},
{
"epoch": 7.996884735202492,
"grad_norm": 0.3394106924533844,
"learning_rate": 3.228842650423929e-05,
"loss": 0.148,
"num_input_tokens_seen": 35485056,
"step": 487
},
{
"epoch": 8.0,
"grad_norm": 0.7672997117042542,
"learning_rate": 3.222579492361179e-05,
"loss": 0.1484,
"num_input_tokens_seen": 35494824,
"step": 488
},
{
"epoch": 8.016614745586708,
"grad_norm": 0.3408578932285309,
"learning_rate": 3.2163113818194964e-05,
"loss": 0.124,
"num_input_tokens_seen": 35557768,
"step": 489
},
{
"epoch": 8.033229491173417,
"grad_norm": 0.3427956700325012,
"learning_rate": 3.210038361759807e-05,
"loss": 0.124,
"num_input_tokens_seen": 35613120,
"step": 490
},
{
"epoch": 8.049844236760125,
"grad_norm": 0.3450639247894287,
"learning_rate": 3.2037604751766885e-05,
"loss": 0.1214,
"num_input_tokens_seen": 35674176,
"step": 491
},
{
"epoch": 8.066458982346832,
"grad_norm": 0.34786927700042725,
"learning_rate": 3.1974777650980735e-05,
"loss": 0.1295,
"num_input_tokens_seen": 35786664,
"step": 492
},
{
"epoch": 8.083073727933542,
"grad_norm": 0.44464972615242004,
"learning_rate": 3.191190274584952e-05,
"loss": 0.1376,
"num_input_tokens_seen": 35840720,
"step": 493
},
{
"epoch": 8.09968847352025,
"grad_norm": 0.32324495911598206,
"learning_rate": 3.184898046731082e-05,
"loss": 0.1087,
"num_input_tokens_seen": 35936736,
"step": 494
},
{
"epoch": 8.116303219106957,
"grad_norm": 0.3147285580635071,
"learning_rate": 3.178601124662686e-05,
"loss": 0.1227,
"num_input_tokens_seen": 36013800,
"step": 495
},
{
"epoch": 8.132917964693666,
"grad_norm": 0.31754013895988464,
"learning_rate": 3.172299551538164e-05,
"loss": 0.1221,
"num_input_tokens_seen": 36097904,
"step": 496
},
{
"epoch": 8.149532710280374,
"grad_norm": 0.28259411454200745,
"learning_rate": 3.165993370547794e-05,
"loss": 0.0985,
"num_input_tokens_seen": 36195544,
"step": 497
},
{
"epoch": 8.166147455867081,
"grad_norm": 0.39379462599754333,
"learning_rate": 3.1596826249134324e-05,
"loss": 0.1529,
"num_input_tokens_seen": 36261256,
"step": 498
},
{
"epoch": 8.18276220145379,
"grad_norm": 0.44324612617492676,
"learning_rate": 3.153367357888224e-05,
"loss": 0.1489,
"num_input_tokens_seen": 36325024,
"step": 499
},
{
"epoch": 8.199376947040498,
"grad_norm": 0.34766262769699097,
"learning_rate": 3.147047612756302e-05,
"loss": 0.1288,
"num_input_tokens_seen": 36377368,
"step": 500
},
{
"epoch": 8.215991692627206,
"grad_norm": 0.3318942189216614,
"learning_rate": 3.140723432832492e-05,
"loss": 0.1153,
"num_input_tokens_seen": 36459240,
"step": 501
},
{
"epoch": 8.232606438213915,
"grad_norm": 0.38428008556365967,
"learning_rate": 3.1343948614620145e-05,
"loss": 0.1305,
"num_input_tokens_seen": 36553088,
"step": 502
},
{
"epoch": 8.249221183800623,
"grad_norm": 0.40489867329597473,
"learning_rate": 3.128061942020189e-05,
"loss": 0.2824,
"num_input_tokens_seen": 36611464,
"step": 503
},
{
"epoch": 8.26583592938733,
"grad_norm": 0.26625269651412964,
"learning_rate": 3.121724717912138e-05,
"loss": 0.1033,
"num_input_tokens_seen": 36705696,
"step": 504
},
{
"epoch": 8.28245067497404,
"grad_norm": 0.3326374590396881,
"learning_rate": 3.115383232572483e-05,
"loss": 0.1124,
"num_input_tokens_seen": 36762744,
"step": 505
},
{
"epoch": 8.299065420560748,
"grad_norm": 0.3752160966396332,
"learning_rate": 3.109037529465056e-05,
"loss": 0.1297,
"num_input_tokens_seen": 36827816,
"step": 506
},
{
"epoch": 8.315680166147455,
"grad_norm": 0.3128851056098938,
"learning_rate": 3.102687652082597e-05,
"loss": 0.1158,
"num_input_tokens_seen": 36931424,
"step": 507
},
{
"epoch": 8.332294911734165,
"grad_norm": 0.3378419280052185,
"learning_rate": 3.0963336439464526e-05,
"loss": 0.1146,
"num_input_tokens_seen": 36991464,
"step": 508
},
{
"epoch": 8.348909657320872,
"grad_norm": 0.33648964762687683,
"learning_rate": 3.089975548606283e-05,
"loss": 0.1044,
"num_input_tokens_seen": 37092928,
"step": 509
},
{
"epoch": 8.36552440290758,
"grad_norm": 0.36446887254714966,
"learning_rate": 3.083613409639764e-05,
"loss": 0.1192,
"num_input_tokens_seen": 37168792,
"step": 510
},
{
"epoch": 8.38213914849429,
"grad_norm": 0.36081403493881226,
"learning_rate": 3.0772472706522806e-05,
"loss": 0.1197,
"num_input_tokens_seen": 37258864,
"step": 511
},
{
"epoch": 8.398753894080997,
"grad_norm": 0.34557875990867615,
"learning_rate": 3.0708771752766394e-05,
"loss": 0.1351,
"num_input_tokens_seen": 37343224,
"step": 512
},
{
"epoch": 8.415368639667705,
"grad_norm": 0.42923229932785034,
"learning_rate": 3.06450316717276e-05,
"loss": 0.1336,
"num_input_tokens_seen": 37395488,
"step": 513
},
{
"epoch": 8.431983385254414,
"grad_norm": 0.29999154806137085,
"learning_rate": 3.0581252900273786e-05,
"loss": 0.1057,
"num_input_tokens_seen": 37473248,
"step": 514
},
{
"epoch": 8.448598130841122,
"grad_norm": 0.36136433482170105,
"learning_rate": 3.0517435875537536e-05,
"loss": 0.1101,
"num_input_tokens_seen": 37532096,
"step": 515
},
{
"epoch": 8.46521287642783,
"grad_norm": 0.2820552587509155,
"learning_rate": 3.045358103491357e-05,
"loss": 0.1079,
"num_input_tokens_seen": 37622328,
"step": 516
},
{
"epoch": 8.481827622014539,
"grad_norm": 0.4173890948295593,
"learning_rate": 3.038968881605583e-05,
"loss": 0.1245,
"num_input_tokens_seen": 37686304,
"step": 517
},
{
"epoch": 8.498442367601246,
"grad_norm": 0.3525732159614563,
"learning_rate": 3.0325759656874418e-05,
"loss": 0.1275,
"num_input_tokens_seen": 37770856,
"step": 518
},
{
"epoch": 8.515057113187954,
"grad_norm": 0.419452428817749,
"learning_rate": 3.026179399553264e-05,
"loss": 0.1123,
"num_input_tokens_seen": 37834072,
"step": 519
},
{
"epoch": 8.531671858774663,
"grad_norm": 0.38209953904151917,
"learning_rate": 3.0197792270443982e-05,
"loss": 0.112,
"num_input_tokens_seen": 37889928,
"step": 520
},
{
"epoch": 8.54828660436137,
"grad_norm": 0.3691641390323639,
"learning_rate": 3.0133754920269103e-05,
"loss": 0.2376,
"num_input_tokens_seen": 37971296,
"step": 521
},
{
"epoch": 8.564901349948078,
"grad_norm": 0.3733454644680023,
"learning_rate": 3.0069682383912813e-05,
"loss": 0.123,
"num_input_tokens_seen": 38049288,
"step": 522
},
{
"epoch": 8.581516095534788,
"grad_norm": 0.3974218964576721,
"learning_rate": 3.0005575100521118e-05,
"loss": 0.1386,
"num_input_tokens_seen": 38123392,
"step": 523
},
{
"epoch": 8.598130841121495,
"grad_norm": 0.2970719337463379,
"learning_rate": 2.9941433509478156e-05,
"loss": 0.1194,
"num_input_tokens_seen": 38208264,
"step": 524
},
{
"epoch": 8.614745586708203,
"grad_norm": 0.3720918595790863,
"learning_rate": 2.9877258050403212e-05,
"loss": 0.126,
"num_input_tokens_seen": 38258192,
"step": 525
},
{
"epoch": 8.631360332294912,
"grad_norm": 0.3574189841747284,
"learning_rate": 2.9813049163147688e-05,
"loss": 0.1295,
"num_input_tokens_seen": 38332408,
"step": 526
},
{
"epoch": 8.64797507788162,
"grad_norm": 0.3034169375896454,
"learning_rate": 2.974880728779212e-05,
"loss": 0.1035,
"num_input_tokens_seen": 38404960,
"step": 527
},
{
"epoch": 8.664589823468328,
"grad_norm": 0.35143762826919556,
"learning_rate": 2.9684532864643122e-05,
"loss": 0.1347,
"num_input_tokens_seen": 38481704,
"step": 528
},
{
"epoch": 8.681204569055037,
"grad_norm": 0.3551865518093109,
"learning_rate": 2.9620226334230388e-05,
"loss": 0.1076,
"num_input_tokens_seen": 38546304,
"step": 529
},
{
"epoch": 8.697819314641745,
"grad_norm": 0.39726606011390686,
"learning_rate": 2.9555888137303695e-05,
"loss": 0.1514,
"num_input_tokens_seen": 38621024,
"step": 530
},
{
"epoch": 8.714434060228452,
"grad_norm": 0.3454006314277649,
"learning_rate": 2.949151871482982e-05,
"loss": 0.1119,
"num_input_tokens_seen": 38679368,
"step": 531
},
{
"epoch": 8.731048805815162,
"grad_norm": 0.3723394572734833,
"learning_rate": 2.9427118507989586e-05,
"loss": 0.1331,
"num_input_tokens_seen": 38753984,
"step": 532
},
{
"epoch": 8.74766355140187,
"grad_norm": 0.35104718804359436,
"learning_rate": 2.93626879581748e-05,
"loss": 0.1158,
"num_input_tokens_seen": 38808336,
"step": 533
},
{
"epoch": 8.764278296988577,
"grad_norm": 0.37009456753730774,
"learning_rate": 2.929822750698524e-05,
"loss": 0.2268,
"num_input_tokens_seen": 38876624,
"step": 534
},
{
"epoch": 8.780893042575286,
"grad_norm": 0.3663157820701599,
"learning_rate": 2.9233737596225613e-05,
"loss": 0.1155,
"num_input_tokens_seen": 38933576,
"step": 535
},
{
"epoch": 8.797507788161994,
"grad_norm": 0.3528357446193695,
"learning_rate": 2.916921866790256e-05,
"loss": 0.114,
"num_input_tokens_seen": 39050816,
"step": 536
},
{
"epoch": 8.814122533748701,
"grad_norm": 0.3449118435382843,
"learning_rate": 2.9104671164221576e-05,
"loss": 0.119,
"num_input_tokens_seen": 39101856,
"step": 537
},
{
"epoch": 8.83073727933541,
"grad_norm": 0.3620098829269409,
"learning_rate": 2.9040095527584032e-05,
"loss": 0.115,
"num_input_tokens_seen": 39161928,
"step": 538
},
{
"epoch": 8.847352024922118,
"grad_norm": 0.39881882071495056,
"learning_rate": 2.897549220058411e-05,
"loss": 0.1312,
"num_input_tokens_seen": 39216048,
"step": 539
},
{
"epoch": 8.863966770508826,
"grad_norm": 0.35401323437690735,
"learning_rate": 2.8910861626005776e-05,
"loss": 0.1107,
"num_input_tokens_seen": 39317320,
"step": 540
}
],
"logging_steps": 1.0,
"max_steps": 1200,
"num_input_tokens_seen": 39317320,
"num_train_epochs": 20,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.6775102547928023e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}