{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 10665, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015002344116268168, "grad_norm": 13.25, "learning_rate": 2.905342080599813e-07, "loss": 0.8909, "step": 32 }, { "epoch": 0.030004688232536336, "grad_norm": 14.25, "learning_rate": 5.904404873477039e-07, "loss": 0.8198, "step": 64 }, { "epoch": 0.0450070323488045, "grad_norm": 13.0625, "learning_rate": 8.903467666354265e-07, "loss": 0.7483, "step": 96 }, { "epoch": 0.06000937646507267, "grad_norm": 14.25, "learning_rate": 1.1902530459231491e-06, "loss": 0.8148, "step": 128 }, { "epoch": 0.07501172058134084, "grad_norm": 7.375, "learning_rate": 1.4901593252108717e-06, "loss": 0.7295, "step": 160 }, { "epoch": 0.090014064697609, "grad_norm": 14.75, "learning_rate": 1.7900656044985943e-06, "loss": 0.692, "step": 192 }, { "epoch": 0.10501640881387717, "grad_norm": 7.9375, "learning_rate": 2.089971883786317e-06, "loss": 0.656, "step": 224 }, { "epoch": 0.12001875293014534, "grad_norm": 11.5, "learning_rate": 2.3898781630740394e-06, "loss": 0.5897, "step": 256 }, { "epoch": 0.1350210970464135, "grad_norm": 12.6875, "learning_rate": 2.689784442361762e-06, "loss": 0.5782, "step": 288 }, { "epoch": 0.15002344116268168, "grad_norm": 7.5, "learning_rate": 2.9896907216494846e-06, "loss": 0.5428, "step": 320 }, { "epoch": 0.16502578527894984, "grad_norm": 10.0625, "learning_rate": 3.2895970009372076e-06, "loss": 0.5027, "step": 352 }, { "epoch": 0.180028129395218, "grad_norm": 9.625, "learning_rate": 3.58950328022493e-06, "loss": 0.4795, "step": 384 }, { "epoch": 0.19503047351148617, "grad_norm": 9.375, "learning_rate": 3.889409559512652e-06, "loss": 0.4694, "step": 416 }, { "epoch": 0.21003281762775433, "grad_norm": 11.3125, "learning_rate": 4.189315838800375e-06, "loss": 0.4382, "step": 448 }, { "epoch": 0.2250351617440225, "grad_norm": 11.5625, "learning_rate": 4.489222118088098e-06, "loss": 0.4647, "step": 480 }, { "epoch": 0.24003750586029068, "grad_norm": 8.375, "learning_rate": 4.789128397375821e-06, "loss": 0.4764, "step": 512 }, { "epoch": 0.2550398499765588, "grad_norm": 13.25, "learning_rate": 5.0890346766635435e-06, "loss": 0.4333, "step": 544 }, { "epoch": 0.270042194092827, "grad_norm": 9.1875, "learning_rate": 5.388940955951266e-06, "loss": 0.4481, "step": 576 }, { "epoch": 0.28504453820909514, "grad_norm": 13.1875, "learning_rate": 5.688847235238988e-06, "loss": 0.4314, "step": 608 }, { "epoch": 0.30004688232536336, "grad_norm": 12.625, "learning_rate": 5.98875351452671e-06, "loss": 0.3941, "step": 640 }, { "epoch": 0.3150492264416315, "grad_norm": 6.59375, "learning_rate": 6.288659793814433e-06, "loss": 0.3834, "step": 672 }, { "epoch": 0.3300515705578997, "grad_norm": 8.625, "learning_rate": 6.588566073102156e-06, "loss": 0.3548, "step": 704 }, { "epoch": 0.34505391467416785, "grad_norm": 8.6875, "learning_rate": 6.888472352389879e-06, "loss": 0.3715, "step": 736 }, { "epoch": 0.360056258790436, "grad_norm": 11.8125, "learning_rate": 7.1883786316776015e-06, "loss": 0.3073, "step": 768 }, { "epoch": 0.3750586029067042, "grad_norm": 16.25, "learning_rate": 7.488284910965324e-06, "loss": 0.3536, "step": 800 }, { "epoch": 0.39006094702297234, "grad_norm": 9.125, "learning_rate": 7.788191190253046e-06, "loss": 0.3526, "step": 832 }, { "epoch": 0.4050632911392405, "grad_norm": 13.375, "learning_rate": 8.08809746954077e-06, "loss": 0.3374, "step": 864 }, { "epoch": 0.42006563525550866, "grad_norm": 8.9375, "learning_rate": 8.388003748828491e-06, "loss": 0.3048, "step": 896 }, { "epoch": 0.4350679793717768, "grad_norm": 10.625, "learning_rate": 8.687910028116214e-06, "loss": 0.3356, "step": 928 }, { "epoch": 0.450070323488045, "grad_norm": 19.75, "learning_rate": 8.987816307403938e-06, "loss": 0.3006, "step": 960 }, { "epoch": 0.46507266760431315, "grad_norm": 11.625, "learning_rate": 9.28772258669166e-06, "loss": 0.2912, "step": 992 }, { "epoch": 0.48007501172058137, "grad_norm": 10.625, "learning_rate": 9.587628865979383e-06, "loss": 0.3286, "step": 1024 }, { "epoch": 0.49507735583684953, "grad_norm": 8.4375, "learning_rate": 9.887535145267105e-06, "loss": 0.2886, "step": 1056 }, { "epoch": 0.5100796999531176, "grad_norm": 8.9375, "learning_rate": 9.999892863685326e-06, "loss": 0.2993, "step": 1088 }, { "epoch": 0.5250820440693859, "grad_norm": 9.375, "learning_rate": 9.999275773410506e-06, "loss": 0.2741, "step": 1120 }, { "epoch": 0.540084388185654, "grad_norm": 7.65625, "learning_rate": 9.998110227713216e-06, "loss": 0.3421, "step": 1152 }, { "epoch": 0.5550867323019222, "grad_norm": 23.375, "learning_rate": 9.996396354461945e-06, "loss": 0.3402, "step": 1184 }, { "epoch": 0.5700890764181903, "grad_norm": 9.5625, "learning_rate": 9.994134341680546e-06, "loss": 0.3022, "step": 1216 }, { "epoch": 0.5850914205344585, "grad_norm": 14.6875, "learning_rate": 9.991324437527599e-06, "loss": 0.3044, "step": 1248 }, { "epoch": 0.6000937646507267, "grad_norm": 11.1875, "learning_rate": 9.987966950269184e-06, "loss": 0.3214, "step": 1280 }, { "epoch": 0.6150961087669948, "grad_norm": 12.0, "learning_rate": 9.984062248245078e-06, "loss": 0.3197, "step": 1312 }, { "epoch": 0.630098452883263, "grad_norm": 11.5, "learning_rate": 9.979610759828324e-06, "loss": 0.2518, "step": 1344 }, { "epoch": 0.6451007969995312, "grad_norm": 10.8125, "learning_rate": 9.974612973378252e-06, "loss": 0.3286, "step": 1376 }, { "epoch": 0.6601031411157994, "grad_norm": 11.8125, "learning_rate": 9.969069437186899e-06, "loss": 0.3097, "step": 1408 }, { "epoch": 0.6751054852320675, "grad_norm": 9.875, "learning_rate": 9.962980759418844e-06, "loss": 0.2941, "step": 1440 }, { "epoch": 0.6901078293483357, "grad_norm": 10.3125, "learning_rate": 9.956347608044512e-06, "loss": 0.308, "step": 1472 }, { "epoch": 0.7051101734646038, "grad_norm": 12.3125, "learning_rate": 9.949170710766875e-06, "loss": 0.2987, "step": 1504 }, { "epoch": 0.720112517580872, "grad_norm": 9.75, "learning_rate": 9.94145085494162e-06, "loss": 0.2916, "step": 1536 }, { "epoch": 0.7351148616971401, "grad_norm": 6.75, "learning_rate": 9.933188887490784e-06, "loss": 0.2931, "step": 1568 }, { "epoch": 0.7501172058134083, "grad_norm": 12.125, "learning_rate": 9.924385714809818e-06, "loss": 0.3164, "step": 1600 }, { "epoch": 0.7651195499296765, "grad_norm": 7.65625, "learning_rate": 9.91504230266817e-06, "loss": 0.2986, "step": 1632 }, { "epoch": 0.7801218940459447, "grad_norm": 13.3125, "learning_rate": 9.905159676103322e-06, "loss": 0.2648, "step": 1664 }, { "epoch": 0.7951242381622129, "grad_norm": 8.5, "learning_rate": 9.89473891930834e-06, "loss": 0.291, "step": 1696 }, { "epoch": 0.810126582278481, "grad_norm": 9.125, "learning_rate": 9.88378117551293e-06, "loss": 0.2966, "step": 1728 }, { "epoch": 0.8251289263947492, "grad_norm": 12.1875, "learning_rate": 9.872287646858015e-06, "loss": 0.2927, "step": 1760 }, { "epoch": 0.8401312705110173, "grad_norm": 12.875, "learning_rate": 9.860259594263858e-06, "loss": 0.2829, "step": 1792 }, { "epoch": 0.8551336146272855, "grad_norm": 11.3125, "learning_rate": 9.847698337291725e-06, "loss": 0.2519, "step": 1824 }, { "epoch": 0.8701359587435537, "grad_norm": 10.375, "learning_rate": 9.834605253999119e-06, "loss": 0.2922, "step": 1856 }, { "epoch": 0.8851383028598219, "grad_norm": 13.125, "learning_rate": 9.820981780788604e-06, "loss": 0.2954, "step": 1888 }, { "epoch": 0.90014064697609, "grad_norm": 8.875, "learning_rate": 9.806829412250215e-06, "loss": 0.3013, "step": 1920 }, { "epoch": 0.9151429910923582, "grad_norm": 7.96875, "learning_rate": 9.792149700997492e-06, "loss": 0.284, "step": 1952 }, { "epoch": 0.9301453352086263, "grad_norm": 10.3125, "learning_rate": 9.776944257497157e-06, "loss": 0.3089, "step": 1984 }, { "epoch": 0.9451476793248945, "grad_norm": 13.4375, "learning_rate": 9.761214749892411e-06, "loss": 0.3033, "step": 2016 }, { "epoch": 0.9601500234411627, "grad_norm": 8.25, "learning_rate": 9.74496290381996e-06, "loss": 0.3033, "step": 2048 }, { "epoch": 0.9751523675574308, "grad_norm": 7.4375, "learning_rate": 9.728190502220673e-06, "loss": 0.3294, "step": 2080 }, { "epoch": 0.9901547116736991, "grad_norm": 10.625, "learning_rate": 9.710899385143993e-06, "loss": 0.306, "step": 2112 }, { "epoch": 1.0051570557899672, "grad_norm": 6.1875, "learning_rate": 9.693091449546068e-06, "loss": 0.2592, "step": 2144 }, { "epoch": 1.0201593999062353, "grad_norm": 13.5625, "learning_rate": 9.674768649081647e-06, "loss": 0.2325, "step": 2176 }, { "epoch": 1.0351617440225036, "grad_norm": 14.1875, "learning_rate": 9.655932993889742e-06, "loss": 0.2529, "step": 2208 }, { "epoch": 1.0501640881387717, "grad_norm": 10.0, "learning_rate": 9.636586550373105e-06, "loss": 0.2018, "step": 2240 }, { "epoch": 1.0651664322550398, "grad_norm": 14.6875, "learning_rate": 9.616731440971536e-06, "loss": 0.2427, "step": 2272 }, { "epoch": 1.080168776371308, "grad_norm": 14.0625, "learning_rate": 9.596369843929022e-06, "loss": 0.2289, "step": 2304 }, { "epoch": 1.0951711204875763, "grad_norm": 8.1875, "learning_rate": 9.575503993054787e-06, "loss": 0.2156, "step": 2336 }, { "epoch": 1.1101734646038444, "grad_norm": 8.5, "learning_rate": 9.554136177478206e-06, "loss": 0.2186, "step": 2368 }, { "epoch": 1.1251758087201125, "grad_norm": 11.3125, "learning_rate": 9.532268741397692e-06, "loss": 0.2451, "step": 2400 }, { "epoch": 1.1401781528363806, "grad_norm": 8.25, "learning_rate": 9.50990408382351e-06, "loss": 0.2421, "step": 2432 }, { "epoch": 1.155180496952649, "grad_norm": 7.59375, "learning_rate": 9.487044658314585e-06, "loss": 0.2165, "step": 2464 }, { "epoch": 1.170182841068917, "grad_norm": 7.75, "learning_rate": 9.463692972709349e-06, "loss": 0.2326, "step": 2496 }, { "epoch": 1.1851851851851851, "grad_norm": 10.875, "learning_rate": 9.439851588850586e-06, "loss": 0.2585, "step": 2528 }, { "epoch": 1.2001875293014534, "grad_norm": 5.90625, "learning_rate": 9.4155231223044e-06, "loss": 0.2165, "step": 2560 }, { "epoch": 1.2151898734177216, "grad_norm": 8.9375, "learning_rate": 9.390710242073265e-06, "loss": 0.268, "step": 2592 }, { "epoch": 1.2301922175339897, "grad_norm": 5.8125, "learning_rate": 9.365415670303214e-06, "loss": 0.2386, "step": 2624 }, { "epoch": 1.2451945616502578, "grad_norm": 9.1875, "learning_rate": 9.339642181985196e-06, "loss": 0.259, "step": 2656 }, { "epoch": 1.260196905766526, "grad_norm": 10.25, "learning_rate": 9.313392604650655e-06, "loss": 0.2222, "step": 2688 }, { "epoch": 1.2751992498827942, "grad_norm": 8.75, "learning_rate": 9.286669818061316e-06, "loss": 0.2383, "step": 2720 }, { "epoch": 1.2902015939990623, "grad_norm": 7.75, "learning_rate": 9.259476753893258e-06, "loss": 0.2221, "step": 2752 }, { "epoch": 1.3052039381153304, "grad_norm": 7.6875, "learning_rate": 9.231816395415294e-06, "loss": 0.2397, "step": 2784 }, { "epoch": 1.3202062822315987, "grad_norm": 9.875, "learning_rate": 9.20369177716168e-06, "loss": 0.2348, "step": 2816 }, { "epoch": 1.3352086263478669, "grad_norm": 6.34375, "learning_rate": 9.17510598459921e-06, "loss": 0.2499, "step": 2848 }, { "epoch": 1.350210970464135, "grad_norm": 4.84375, "learning_rate": 9.146062153788716e-06, "loss": 0.227, "step": 2880 }, { "epoch": 1.3652133145804033, "grad_norm": 9.6875, "learning_rate": 9.116563471041018e-06, "loss": 0.2308, "step": 2912 }, { "epoch": 1.3802156586966714, "grad_norm": 13.8125, "learning_rate": 9.086613172567368e-06, "loss": 0.2016, "step": 2944 }, { "epoch": 1.3952180028129395, "grad_norm": 11.9375, "learning_rate": 9.056214544124414e-06, "loss": 0.2356, "step": 2976 }, { "epoch": 1.4102203469292076, "grad_norm": 9.8125, "learning_rate": 9.025370920653723e-06, "loss": 0.2306, "step": 3008 }, { "epoch": 1.4252226910454757, "grad_norm": 10.75, "learning_rate": 8.994085685915934e-06, "loss": 0.2276, "step": 3040 }, { "epoch": 1.440225035161744, "grad_norm": 11.4375, "learning_rate": 8.962362272119504e-06, "loss": 0.2352, "step": 3072 }, { "epoch": 1.4552273792780122, "grad_norm": 9.5625, "learning_rate": 8.930204159544208e-06, "loss": 0.2316, "step": 3104 }, { "epoch": 1.4702297233942803, "grad_norm": 11.0625, "learning_rate": 8.89761487615929e-06, "loss": 0.2264, "step": 3136 }, { "epoch": 1.4852320675105486, "grad_norm": 6.375, "learning_rate": 8.864597997236454e-06, "loss": 0.2414, "step": 3168 }, { "epoch": 1.5002344116268167, "grad_norm": 14.5625, "learning_rate": 8.831157144957612e-06, "loss": 0.2165, "step": 3200 }, { "epoch": 1.5152367557430848, "grad_norm": 7.0625, "learning_rate": 8.797295988017506e-06, "loss": 0.2418, "step": 3232 }, { "epoch": 1.5302390998593531, "grad_norm": 10.5625, "learning_rate": 8.763018241221241e-06, "loss": 0.2129, "step": 3264 }, { "epoch": 1.5452414439756212, "grad_norm": 9.9375, "learning_rate": 8.728327665076726e-06, "loss": 0.253, "step": 3296 }, { "epoch": 1.5602437880918893, "grad_norm": 13.875, "learning_rate": 8.693228065382131e-06, "loss": 0.2156, "step": 3328 }, { "epoch": 1.5752461322081577, "grad_norm": 15.75, "learning_rate": 8.657723292808365e-06, "loss": 0.2261, "step": 3360 }, { "epoch": 1.5902484763244256, "grad_norm": 11.3125, "learning_rate": 8.621817242476626e-06, "loss": 0.2187, "step": 3392 }, { "epoch": 1.605250820440694, "grad_norm": 15.1875, "learning_rate": 8.58551385353108e-06, "loss": 0.2559, "step": 3424 }, { "epoch": 1.620253164556962, "grad_norm": 7.34375, "learning_rate": 8.548817108706714e-06, "loss": 0.2257, "step": 3456 }, { "epoch": 1.63525550867323, "grad_norm": 18.875, "learning_rate": 8.511731033892397e-06, "loss": 0.247, "step": 3488 }, { "epoch": 1.6502578527894984, "grad_norm": 6.90625, "learning_rate": 8.474259697689211e-06, "loss": 0.2775, "step": 3520 }, { "epoch": 1.6652601969057665, "grad_norm": 7.4375, "learning_rate": 8.436407210964101e-06, "loss": 0.2468, "step": 3552 }, { "epoch": 1.6802625410220347, "grad_norm": 6.8125, "learning_rate": 8.398177726398887e-06, "loss": 0.2642, "step": 3584 }, { "epoch": 1.695264885138303, "grad_norm": 12.125, "learning_rate": 8.359575438034671e-06, "loss": 0.2571, "step": 3616 }, { "epoch": 1.7102672292545709, "grad_norm": 9.625, "learning_rate": 8.320604580811744e-06, "loss": 0.2121, "step": 3648 }, { "epoch": 1.7252695733708392, "grad_norm": 10.3125, "learning_rate": 8.281269430104965e-06, "loss": 0.2512, "step": 3680 }, { "epoch": 1.7402719174871075, "grad_norm": 13.0625, "learning_rate": 8.241574301254733e-06, "loss": 0.2273, "step": 3712 }, { "epoch": 1.7552742616033754, "grad_norm": 12.0625, "learning_rate": 8.201523549093552e-06, "loss": 0.2298, "step": 3744 }, { "epoch": 1.7702766057196437, "grad_norm": 9.6875, "learning_rate": 8.161121567468298e-06, "loss": 0.2484, "step": 3776 }, { "epoch": 1.7852789498359118, "grad_norm": 8.8125, "learning_rate": 8.120372788758152e-06, "loss": 0.2269, "step": 3808 }, { "epoch": 1.80028129395218, "grad_norm": 8.6875, "learning_rate": 8.079281683388368e-06, "loss": 0.2263, "step": 3840 }, { "epoch": 1.8152836380684483, "grad_norm": 11.4375, "learning_rate": 8.037852759339814e-06, "loss": 0.2294, "step": 3872 }, { "epoch": 1.8302859821847164, "grad_norm": 8.5625, "learning_rate": 7.99609056165443e-06, "loss": 0.2391, "step": 3904 }, { "epoch": 1.8452883263009845, "grad_norm": 12.3125, "learning_rate": 7.953999671936591e-06, "loss": 0.241, "step": 3936 }, { "epoch": 1.8602906704172528, "grad_norm": 9.1875, "learning_rate": 7.911584707850487e-06, "loss": 0.1985, "step": 3968 }, { "epoch": 1.8752930145335207, "grad_norm": 6.8125, "learning_rate": 7.868850322613525e-06, "loss": 0.2431, "step": 4000 }, { "epoch": 1.890295358649789, "grad_norm": 11.125, "learning_rate": 7.825801204485837e-06, "loss": 0.2325, "step": 4032 }, { "epoch": 1.9052977027660571, "grad_norm": 9.4375, "learning_rate": 7.782442076255952e-06, "loss": 0.2256, "step": 4064 }, { "epoch": 1.9203000468823253, "grad_norm": 11.0, "learning_rate": 7.738777694722666e-06, "loss": 0.2618, "step": 4096 }, { "epoch": 1.9353023909985936, "grad_norm": 8.9375, "learning_rate": 7.694812850173197e-06, "loss": 0.224, "step": 4128 }, { "epoch": 1.9503047351148617, "grad_norm": 10.8125, "learning_rate": 7.650552365857648e-06, "loss": 0.2272, "step": 4160 }, { "epoch": 1.9653070792311298, "grad_norm": 12.875, "learning_rate": 7.606001097459865e-06, "loss": 0.2467, "step": 4192 }, { "epoch": 1.9803094233473981, "grad_norm": 8.125, "learning_rate": 7.561163932564739e-06, "loss": 0.2399, "step": 4224 }, { "epoch": 1.9953117674636662, "grad_norm": 4.78125, "learning_rate": 7.516045790122e-06, "loss": 0.2398, "step": 4256 }, { "epoch": 2.0103141115799343, "grad_norm": 8.5625, "learning_rate": 7.470651619906574e-06, "loss": 0.1666, "step": 4288 }, { "epoch": 2.0253164556962027, "grad_norm": 13.125, "learning_rate": 7.424986401975561e-06, "loss": 0.226, "step": 4320 }, { "epoch": 2.0403187998124706, "grad_norm": 6.09375, "learning_rate": 7.379055146121884e-06, "loss": 0.1728, "step": 4352 }, { "epoch": 2.055321143928739, "grad_norm": 12.875, "learning_rate": 7.332862891324681e-06, "loss": 0.2048, "step": 4384 }, { "epoch": 2.070323488045007, "grad_norm": 9.75, "learning_rate": 7.286414705196499e-06, "loss": 0.1943, "step": 4416 }, { "epoch": 2.085325832161275, "grad_norm": 14.0625, "learning_rate": 7.2397156834273295e-06, "loss": 0.2017, "step": 4448 }, { "epoch": 2.1003281762775434, "grad_norm": 7.625, "learning_rate": 7.192770949225591e-06, "loss": 0.1626, "step": 4480 }, { "epoch": 2.1153305203938118, "grad_norm": 6.78125, "learning_rate": 7.1455856527560666e-06, "loss": 0.1603, "step": 4512 }, { "epoch": 2.1303328645100796, "grad_norm": 14.3125, "learning_rate": 7.0981649705748955e-06, "loss": 0.1555, "step": 4544 }, { "epoch": 2.145335208626348, "grad_norm": 11.1875, "learning_rate": 7.050514105061679e-06, "loss": 0.1704, "step": 4576 }, { "epoch": 2.160337552742616, "grad_norm": 6.25, "learning_rate": 7.002638283848726e-06, "loss": 0.1642, "step": 4608 }, { "epoch": 2.175339896858884, "grad_norm": 6.0, "learning_rate": 6.95454275924756e-06, "loss": 0.173, "step": 4640 }, { "epoch": 2.1903422409751525, "grad_norm": 7.84375, "learning_rate": 6.906232807672699e-06, "loss": 0.1726, "step": 4672 }, { "epoch": 2.2053445850914204, "grad_norm": 8.0, "learning_rate": 6.857713729062794e-06, "loss": 0.1741, "step": 4704 }, { "epoch": 2.2203469292076887, "grad_norm": 12.9375, "learning_rate": 6.80899084629919e-06, "loss": 0.2037, "step": 4736 }, { "epoch": 2.235349273323957, "grad_norm": 12.5625, "learning_rate": 6.760069504621971e-06, "loss": 0.2404, "step": 4768 }, { "epoch": 2.250351617440225, "grad_norm": 6.09375, "learning_rate": 6.710955071043547e-06, "loss": 0.1778, "step": 4800 }, { "epoch": 2.2653539615564933, "grad_norm": 6.4375, "learning_rate": 6.661652933759856e-06, "loss": 0.1708, "step": 4832 }, { "epoch": 2.280356305672761, "grad_norm": 10.9375, "learning_rate": 6.612168501559242e-06, "loss": 0.1854, "step": 4864 }, { "epoch": 2.2953586497890295, "grad_norm": 6.0625, "learning_rate": 6.5625072032290735e-06, "loss": 0.1601, "step": 4896 }, { "epoch": 2.310360993905298, "grad_norm": 14.125, "learning_rate": 6.512674486960166e-06, "loss": 0.1539, "step": 4928 }, { "epoch": 2.3253633380215657, "grad_norm": 8.8125, "learning_rate": 6.462675819749082e-06, "loss": 0.1474, "step": 4960 }, { "epoch": 2.340365682137834, "grad_norm": 12.1875, "learning_rate": 6.412516686798354e-06, "loss": 0.166, "step": 4992 }, { "epoch": 2.3553680262541024, "grad_norm": 12.1875, "learning_rate": 6.362202590914728e-06, "loss": 0.1863, "step": 5024 }, { "epoch": 2.3703703703703702, "grad_norm": 13.75, "learning_rate": 6.311739051905468e-06, "loss": 0.1523, "step": 5056 }, { "epoch": 2.3853727144866386, "grad_norm": 11.4375, "learning_rate": 6.261131605972785e-06, "loss": 0.1795, "step": 5088 }, { "epoch": 2.400375058602907, "grad_norm": 7.78125, "learning_rate": 6.2103858051064915e-06, "loss": 0.187, "step": 5120 }, { "epoch": 2.415377402719175, "grad_norm": 8.625, "learning_rate": 6.159507216474891e-06, "loss": 0.2099, "step": 5152 }, { "epoch": 2.430379746835443, "grad_norm": 10.0, "learning_rate": 6.108501421814039e-06, "loss": 0.2008, "step": 5184 }, { "epoch": 2.4453820909517114, "grad_norm": 11.4375, "learning_rate": 6.057374016815376e-06, "loss": 0.2017, "step": 5216 }, { "epoch": 2.4603844350679793, "grad_norm": 11.875, "learning_rate": 6.0061306105118474e-06, "loss": 0.1826, "step": 5248 }, { "epoch": 2.4753867791842477, "grad_norm": 11.9375, "learning_rate": 5.954776824662547e-06, "loss": 0.1757, "step": 5280 }, { "epoch": 2.4903891233005155, "grad_norm": 8.3125, "learning_rate": 5.90331829313598e-06, "loss": 0.1693, "step": 5312 }, { "epoch": 2.505391467416784, "grad_norm": 4.84375, "learning_rate": 5.851760661291977e-06, "loss": 0.1838, "step": 5344 }, { "epoch": 2.520393811533052, "grad_norm": 8.8125, "learning_rate": 5.80010958536237e-06, "loss": 0.1887, "step": 5376 }, { "epoch": 2.53539615564932, "grad_norm": 7.84375, "learning_rate": 5.748370731830456e-06, "loss": 0.1932, "step": 5408 }, { "epoch": 2.5503984997655884, "grad_norm": 16.375, "learning_rate": 5.696549776809346e-06, "loss": 0.1739, "step": 5440 }, { "epoch": 2.5654008438818563, "grad_norm": 8.5, "learning_rate": 5.6446524054192605e-06, "loss": 0.1857, "step": 5472 }, { "epoch": 2.5804031879981246, "grad_norm": 9.4375, "learning_rate": 5.592684311163827e-06, "loss": 0.1872, "step": 5504 }, { "epoch": 2.595405532114393, "grad_norm": 7.75, "learning_rate": 5.540651195305464e-06, "loss": 0.2011, "step": 5536 }, { "epoch": 2.610407876230661, "grad_norm": 7.46875, "learning_rate": 5.488558766239916e-06, "loss": 0.1989, "step": 5568 }, { "epoch": 2.625410220346929, "grad_norm": 17.25, "learning_rate": 5.436412738869995e-06, "loss": 0.1745, "step": 5600 }, { "epoch": 2.6404125644631975, "grad_norm": 12.1875, "learning_rate": 5.384218833978626e-06, "loss": 0.1712, "step": 5632 }, { "epoch": 2.6554149085794654, "grad_norm": 10.75, "learning_rate": 5.331982777601228e-06, "loss": 0.1865, "step": 5664 }, { "epoch": 2.6704172526957337, "grad_norm": 8.3125, "learning_rate": 5.279710300397537e-06, "loss": 0.1839, "step": 5696 }, { "epoch": 2.685419596812002, "grad_norm": 6.46875, "learning_rate": 5.227407137022902e-06, "loss": 0.2113, "step": 5728 }, { "epoch": 2.70042194092827, "grad_norm": 12.25, "learning_rate": 5.175079025499163e-06, "loss": 0.1619, "step": 5760 }, { "epoch": 2.7154242850445383, "grad_norm": 9.375, "learning_rate": 5.1227317065851445e-06, "loss": 0.1825, "step": 5792 }, { "epoch": 2.7304266291608066, "grad_norm": 9.5625, "learning_rate": 5.070370923146855e-06, "loss": 0.1654, "step": 5824 }, { "epoch": 2.7454289732770745, "grad_norm": 12.8125, "learning_rate": 5.0180024195274555e-06, "loss": 0.1499, "step": 5856 }, { "epoch": 2.760431317393343, "grad_norm": 10.1875, "learning_rate": 4.965631940917068e-06, "loss": 0.1633, "step": 5888 }, { "epoch": 2.775433661509611, "grad_norm": 18.25, "learning_rate": 4.91326523272248e-06, "loss": 0.1708, "step": 5920 }, { "epoch": 2.790436005625879, "grad_norm": 6.40625, "learning_rate": 4.860908039936839e-06, "loss": 0.202, "step": 5952 }, { "epoch": 2.8054383497421473, "grad_norm": 8.3125, "learning_rate": 4.80856610650939e-06, "loss": 0.18, "step": 5984 }, { "epoch": 2.8204406938584152, "grad_norm": 11.375, "learning_rate": 4.756245174715315e-06, "loss": 0.1835, "step": 6016 }, { "epoch": 2.8354430379746836, "grad_norm": 10.875, "learning_rate": 4.703950984525774e-06, "loss": 0.2188, "step": 6048 }, { "epoch": 2.8504453820909514, "grad_norm": 10.3125, "learning_rate": 4.6516892729781815e-06, "loss": 0.1718, "step": 6080 }, { "epoch": 2.8654477262072198, "grad_norm": 9.625, "learning_rate": 4.599465773546822e-06, "loss": 0.1803, "step": 6112 }, { "epoch": 2.880450070323488, "grad_norm": 12.5, "learning_rate": 4.547286215513846e-06, "loss": 0.1736, "step": 6144 }, { "epoch": 2.895452414439756, "grad_norm": 9.25, "learning_rate": 4.495156323340724e-06, "loss": 0.2059, "step": 6176 }, { "epoch": 2.9104547585560243, "grad_norm": 7.71875, "learning_rate": 4.443081816040233e-06, "loss": 0.2204, "step": 6208 }, { "epoch": 2.9254571026722926, "grad_norm": 10.25, "learning_rate": 4.391068406549049e-06, "loss": 0.1991, "step": 6240 }, { "epoch": 2.9404594467885605, "grad_norm": 13.375, "learning_rate": 4.339121801100982e-06, "loss": 0.2167, "step": 6272 }, { "epoch": 2.955461790904829, "grad_norm": 4.3125, "learning_rate": 4.287247698600987e-06, "loss": 0.1526, "step": 6304 }, { "epoch": 2.970464135021097, "grad_norm": 7.15625, "learning_rate": 4.235451789999928e-06, "loss": 0.1693, "step": 6336 }, { "epoch": 2.985466479137365, "grad_norm": 11.5625, "learning_rate": 4.1837397576702576e-06, "loss": 0.2256, "step": 6368 }, { "epoch": 3.0004688232536334, "grad_norm": 12.5625, "learning_rate": 4.132117274782616e-06, "loss": 0.2014, "step": 6400 }, { "epoch": 3.0154711673699017, "grad_norm": 10.9375, "learning_rate": 4.0805900046834405e-06, "loss": 0.1782, "step": 6432 }, { "epoch": 3.0304735114861696, "grad_norm": 6.59375, "learning_rate": 4.0291636002736725e-06, "loss": 0.1622, "step": 6464 }, { "epoch": 3.045475855602438, "grad_norm": 16.125, "learning_rate": 3.977843703388572e-06, "loss": 0.1664, "step": 6496 }, { "epoch": 3.0604781997187063, "grad_norm": 15.3125, "learning_rate": 3.926635944178788e-06, "loss": 0.1734, "step": 6528 }, { "epoch": 3.075480543834974, "grad_norm": 11.9375, "learning_rate": 3.875545940492681e-06, "loss": 0.1617, "step": 6560 }, { "epoch": 3.0904828879512425, "grad_norm": 13.75, "learning_rate": 3.824579297260006e-06, "loss": 0.1748, "step": 6592 }, { "epoch": 3.1054852320675104, "grad_norm": 13.5625, "learning_rate": 3.773741605877026e-06, "loss": 0.1841, "step": 6624 }, { "epoch": 3.1204875761837787, "grad_norm": 13.5, "learning_rate": 3.7230384435930785e-06, "loss": 0.1718, "step": 6656 }, { "epoch": 3.135489920300047, "grad_norm": 11.625, "learning_rate": 3.6724753728987206e-06, "loss": 0.1408, "step": 6688 }, { "epoch": 3.150492264416315, "grad_norm": 12.25, "learning_rate": 3.6220579409154888e-06, "loss": 0.1576, "step": 6720 }, { "epoch": 3.1654946085325832, "grad_norm": 7.5, "learning_rate": 3.571791678787332e-06, "loss": 0.1769, "step": 6752 }, { "epoch": 3.1804969526488516, "grad_norm": 9.625, "learning_rate": 3.521682101073818e-06, "loss": 0.1473, "step": 6784 }, { "epoch": 3.1954992967651195, "grad_norm": 12.625, "learning_rate": 3.471734705145138e-06, "loss": 0.162, "step": 6816 }, { "epoch": 3.210501640881388, "grad_norm": 14.3125, "learning_rate": 3.421954970579008e-06, "loss": 0.1884, "step": 6848 }, { "epoch": 3.2255039849976557, "grad_norm": 18.25, "learning_rate": 3.3723483585595256e-06, "loss": 0.1563, "step": 6880 }, { "epoch": 3.240506329113924, "grad_norm": 9.875, "learning_rate": 3.3229203112780382e-06, "loss": 0.1876, "step": 6912 }, { "epoch": 3.2555086732301923, "grad_norm": 9.75, "learning_rate": 3.2736762513360963e-06, "loss": 0.1674, "step": 6944 }, { "epoch": 3.27051101734646, "grad_norm": 13.25, "learning_rate": 3.224621581150553e-06, "loss": 0.1422, "step": 6976 }, { "epoch": 3.2855133614627285, "grad_norm": 8.375, "learning_rate": 3.175761682360885e-06, "loss": 0.1676, "step": 7008 }, { "epoch": 3.300515705578997, "grad_norm": 12.25, "learning_rate": 3.1271019152387917e-06, "loss": 0.1543, "step": 7040 }, { "epoch": 3.3155180496952648, "grad_norm": 13.1875, "learning_rate": 3.0786476181001263e-06, "loss": 0.1648, "step": 7072 }, { "epoch": 3.330520393811533, "grad_norm": 8.375, "learning_rate": 3.030404106719259e-06, "loss": 0.1869, "step": 7104 }, { "epoch": 3.3455227379278014, "grad_norm": 12.5, "learning_rate": 2.982376673745887e-06, "loss": 0.1838, "step": 7136 }, { "epoch": 3.3605250820440693, "grad_norm": 7.40625, "learning_rate": 2.934570588124399e-06, "loss": 0.14, "step": 7168 }, { "epoch": 3.3755274261603376, "grad_norm": 9.625, "learning_rate": 2.8869910945158407e-06, "loss": 0.1635, "step": 7200 }, { "epoch": 3.390529770276606, "grad_norm": 6.5, "learning_rate": 2.839643412722525e-06, "loss": 0.1738, "step": 7232 }, { "epoch": 3.405532114392874, "grad_norm": 7.28125, "learning_rate": 2.7925327371153998e-06, "loss": 0.1335, "step": 7264 }, { "epoch": 3.420534458509142, "grad_norm": 7.6875, "learning_rate": 2.7456642360641772e-06, "loss": 0.178, "step": 7296 }, { "epoch": 3.43553680262541, "grad_norm": 9.5, "learning_rate": 2.6990430513703316e-06, "loss": 0.1827, "step": 7328 }, { "epoch": 3.4505391467416784, "grad_norm": 8.625, "learning_rate": 2.6526742977030084e-06, "loss": 0.1587, "step": 7360 }, { "epoch": 3.4655414908579467, "grad_norm": 8.75, "learning_rate": 2.6065630620379062e-06, "loss": 0.1642, "step": 7392 }, { "epoch": 3.4805438349742146, "grad_norm": 10.4375, "learning_rate": 2.5607144030992093e-06, "loss": 0.1472, "step": 7424 }, { "epoch": 3.495546179090483, "grad_norm": 9.5625, "learning_rate": 2.515133350804598e-06, "loss": 0.1556, "step": 7456 }, { "epoch": 3.510548523206751, "grad_norm": 6.6875, "learning_rate": 2.4698249057134377e-06, "loss": 0.1422, "step": 7488 }, { "epoch": 3.525550867323019, "grad_norm": 10.125, "learning_rate": 2.4247940384781834e-06, "loss": 0.1675, "step": 7520 }, { "epoch": 3.5405532114392875, "grad_norm": 12.1875, "learning_rate": 2.38004568929906e-06, "loss": 0.157, "step": 7552 }, { "epoch": 3.5555555555555554, "grad_norm": 12.375, "learning_rate": 2.335584767382098e-06, "loss": 0.1715, "step": 7584 }, { "epoch": 3.5705578996718237, "grad_norm": 7.3125, "learning_rate": 2.291416150400547e-06, "loss": 0.1809, "step": 7616 }, { "epoch": 3.585560243788092, "grad_norm": 10.875, "learning_rate": 2.247544683959767e-06, "loss": 0.1555, "step": 7648 }, { "epoch": 3.60056258790436, "grad_norm": 7.03125, "learning_rate": 2.203975181065632e-06, "loss": 0.1868, "step": 7680 }, { "epoch": 3.6155649320206282, "grad_norm": 9.75, "learning_rate": 2.160712421596506e-06, "loss": 0.1816, "step": 7712 }, { "epoch": 3.6305672761368966, "grad_norm": 14.4375, "learning_rate": 2.1177611517788655e-06, "loss": 0.1442, "step": 7744 }, { "epoch": 3.6455696202531644, "grad_norm": 8.8125, "learning_rate": 2.0751260836665947e-06, "loss": 0.1596, "step": 7776 }, { "epoch": 3.6605719643694328, "grad_norm": 7.5, "learning_rate": 2.0328118946240473e-06, "loss": 0.1852, "step": 7808 }, { "epoch": 3.675574308485701, "grad_norm": 4.40625, "learning_rate": 1.9908232268129037e-06, "loss": 0.1865, "step": 7840 }, { "epoch": 3.690576652601969, "grad_norm": 5.84375, "learning_rate": 1.9491646866828927e-06, "loss": 0.1619, "step": 7872 }, { "epoch": 3.7055789967182373, "grad_norm": 20.5, "learning_rate": 1.9078408444664417e-06, "loss": 0.1725, "step": 7904 }, { "epoch": 3.7205813408345056, "grad_norm": 10.75, "learning_rate": 1.8668562336772734e-06, "loss": 0.2191, "step": 7936 }, { "epoch": 3.7355836849507735, "grad_norm": 25.375, "learning_rate": 1.826215350613062e-06, "loss": 0.1488, "step": 7968 }, { "epoch": 3.750586029067042, "grad_norm": 9.75, "learning_rate": 1.7859226538621487e-06, "loss": 0.1691, "step": 8000 }, { "epoch": 3.7655883731833097, "grad_norm": 13.0, "learning_rate": 1.745982563814414e-06, "loss": 0.1527, "step": 8032 }, { "epoch": 3.780590717299578, "grad_norm": 8.375, "learning_rate": 1.7063994621763176e-06, "loss": 0.1541, "step": 8064 }, { "epoch": 3.795593061415846, "grad_norm": 5.6875, "learning_rate": 1.6671776914902027e-06, "loss": 0.1823, "step": 8096 }, { "epoch": 3.8105954055321143, "grad_norm": 15.5, "learning_rate": 1.6283215546578862e-06, "loss": 0.1835, "step": 8128 }, { "epoch": 3.8255977496483826, "grad_norm": 13.5, "learning_rate": 1.5898353144686036e-06, "loss": 0.157, "step": 8160 }, { "epoch": 3.8406000937646505, "grad_norm": 13.5625, "learning_rate": 1.5517231931313454e-06, "loss": 0.1473, "step": 8192 }, { "epoch": 3.855602437880919, "grad_norm": 8.5, "learning_rate": 1.513989371811656e-06, "loss": 0.119, "step": 8224 }, { "epoch": 3.870604781997187, "grad_norm": 8.3125, "learning_rate": 1.4766379901729272e-06, "loss": 0.1714, "step": 8256 }, { "epoch": 3.885607126113455, "grad_norm": 6.5625, "learning_rate": 1.4396731459222546e-06, "loss": 0.1421, "step": 8288 }, { "epoch": 3.9006094702297234, "grad_norm": 11.8125, "learning_rate": 1.4030988943608826e-06, "loss": 0.1796, "step": 8320 }, { "epoch": 3.9156118143459917, "grad_norm": 6.3125, "learning_rate": 1.3669192479393145e-06, "loss": 0.1611, "step": 8352 }, { "epoch": 3.9306141584622596, "grad_norm": 12.1875, "learning_rate": 1.3311381758171165e-06, "loss": 0.1537, "step": 8384 }, { "epoch": 3.945616502578528, "grad_norm": 7.40625, "learning_rate": 1.2957596034274732e-06, "loss": 0.1732, "step": 8416 }, { "epoch": 3.9606188466947962, "grad_norm": 9.9375, "learning_rate": 1.2607874120465457e-06, "loss": 0.1605, "step": 8448 }, { "epoch": 3.975621190811064, "grad_norm": 7.25, "learning_rate": 1.2262254383676597e-06, "loss": 0.1486, "step": 8480 }, { "epoch": 3.9906235349273325, "grad_norm": 12.75, "learning_rate": 1.192077474080398e-06, "loss": 0.1754, "step": 8512 }, { "epoch": 4.005625879043601, "grad_norm": 12.5, "learning_rate": 1.1583472654546257e-06, "loss": 0.1472, "step": 8544 }, { "epoch": 4.020628223159869, "grad_norm": 11.4375, "learning_rate": 1.1250385129295005e-06, "loss": 0.1482, "step": 8576 }, { "epoch": 4.035630567276137, "grad_norm": 11.875, "learning_rate": 1.0921548707075026e-06, "loss": 0.1918, "step": 8608 }, { "epoch": 4.050632911392405, "grad_norm": 6.15625, "learning_rate": 1.059699946353549e-06, "loss": 0.1499, "step": 8640 }, { "epoch": 4.065635255508673, "grad_norm": 13.0, "learning_rate": 1.0276773003992157e-06, "loss": 0.166, "step": 8672 }, { "epoch": 4.080637599624941, "grad_norm": 11.6875, "learning_rate": 9.96090445952121e-07, "loss": 0.1712, "step": 8704 }, { "epoch": 4.09563994374121, "grad_norm": 10.375, "learning_rate": 9.649428483105204e-07, "loss": 0.1802, "step": 8736 }, { "epoch": 4.110642287857478, "grad_norm": 7.40625, "learning_rate": 9.34237924583129e-07, "loss": 0.138, "step": 8768 }, { "epoch": 4.125644631973746, "grad_norm": 11.25, "learning_rate": 9.039790433142481e-07, "loss": 0.1896, "step": 8800 }, { "epoch": 4.140646976090014, "grad_norm": 10.5, "learning_rate": 8.741695241142095e-07, "loss": 0.1624, "step": 8832 }, { "epoch": 4.155649320206282, "grad_norm": 8.5625, "learning_rate": 8.448126372951904e-07, "loss": 0.1418, "step": 8864 }, { "epoch": 4.17065166432255, "grad_norm": 17.0, "learning_rate": 8.159116035124431e-07, "loss": 0.1635, "step": 8896 }, { "epoch": 4.185654008438819, "grad_norm": 8.25, "learning_rate": 7.874695934109583e-07, "loss": 0.1583, "step": 8928 }, { "epoch": 4.200656352555087, "grad_norm": 9.5, "learning_rate": 7.594897272776275e-07, "loss": 0.1837, "step": 8960 }, { "epoch": 4.215658696671355, "grad_norm": 14.1875, "learning_rate": 7.319750746989262e-07, "loss": 0.1752, "step": 8992 }, { "epoch": 4.2306610407876235, "grad_norm": 11.5, "learning_rate": 7.049286542241573e-07, "loss": 0.1455, "step": 9024 }, { "epoch": 4.245663384903891, "grad_norm": 6.59375, "learning_rate": 6.783534330342984e-07, "loss": 0.15, "step": 9056 }, { "epoch": 4.260665729020159, "grad_norm": 7.0, "learning_rate": 6.522523266164759e-07, "loss": 0.1644, "step": 9088 }, { "epoch": 4.275668073136427, "grad_norm": 6.4375, "learning_rate": 6.266281984441214e-07, "loss": 0.1311, "step": 9120 }, { "epoch": 4.290670417252696, "grad_norm": 8.625, "learning_rate": 6.014838596628225e-07, "loss": 0.1386, "step": 9152 }, { "epoch": 4.305672761368964, "grad_norm": 8.125, "learning_rate": 5.768220687819271e-07, "loss": 0.167, "step": 9184 }, { "epoch": 4.320675105485232, "grad_norm": 8.4375, "learning_rate": 5.526455313719126e-07, "loss": 0.1587, "step": 9216 }, { "epoch": 4.3356774496015005, "grad_norm": 12.125, "learning_rate": 5.289568997675643e-07, "loss": 0.1834, "step": 9248 }, { "epoch": 4.350679793717768, "grad_norm": 11.6875, "learning_rate": 5.057587727769981e-07, "loss": 0.1602, "step": 9280 }, { "epoch": 4.365682137834036, "grad_norm": 7.1875, "learning_rate": 4.830536953965531e-07, "loss": 0.1472, "step": 9312 }, { "epoch": 4.380684481950305, "grad_norm": 11.9375, "learning_rate": 4.6084415853158537e-07, "loss": 0.1334, "step": 9344 }, { "epoch": 4.395686826066573, "grad_norm": 25.0, "learning_rate": 4.391325987232037e-07, "loss": 0.2039, "step": 9376 }, { "epoch": 4.410689170182841, "grad_norm": 19.125, "learning_rate": 4.17921397880956e-07, "loss": 0.1607, "step": 9408 }, { "epoch": 4.42569151429911, "grad_norm": 4.875, "learning_rate": 3.9721288302152493e-07, "loss": 0.187, "step": 9440 }, { "epoch": 4.4406938584153774, "grad_norm": 7.71875, "learning_rate": 3.770093260134322e-07, "loss": 0.1658, "step": 9472 }, { "epoch": 4.455696202531645, "grad_norm": 8.875, "learning_rate": 3.573129433278011e-07, "loss": 0.1951, "step": 9504 }, { "epoch": 4.470698546647914, "grad_norm": 12.5, "learning_rate": 3.381258957951983e-07, "loss": 0.162, "step": 9536 }, { "epoch": 4.485700890764182, "grad_norm": 20.875, "learning_rate": 3.194502883685663e-07, "loss": 0.1915, "step": 9568 }, { "epoch": 4.50070323488045, "grad_norm": 11.8125, "learning_rate": 3.0128816989230315e-07, "loss": 0.1639, "step": 9600 }, { "epoch": 4.515705578996718, "grad_norm": 3.625, "learning_rate": 2.836415328774872e-07, "loss": 0.1495, "step": 9632 }, { "epoch": 4.5307079231129865, "grad_norm": 8.3125, "learning_rate": 2.665123132832842e-07, "loss": 0.1508, "step": 9664 }, { "epoch": 4.545710267229254, "grad_norm": 10.375, "learning_rate": 2.499023903045622e-07, "loss": 0.1933, "step": 9696 }, { "epoch": 4.560712611345522, "grad_norm": 9.125, "learning_rate": 2.3381358616572593e-07, "loss": 0.1807, "step": 9728 }, { "epoch": 4.575714955461791, "grad_norm": 5.25, "learning_rate": 2.1824766592080937e-07, "loss": 0.145, "step": 9760 }, { "epoch": 4.590717299578059, "grad_norm": 13.75, "learning_rate": 2.0320633725983641e-07, "loss": 0.1707, "step": 9792 }, { "epoch": 4.605719643694327, "grad_norm": 8.1875, "learning_rate": 1.8869125032147384e-07, "loss": 0.1769, "step": 9824 }, { "epoch": 4.620721987810596, "grad_norm": 15.625, "learning_rate": 1.747039975120035e-07, "loss": 0.1786, "step": 9856 }, { "epoch": 4.6357243319268635, "grad_norm": 13.3125, "learning_rate": 1.6124611333062036e-07, "loss": 0.1512, "step": 9888 }, { "epoch": 4.650726676043131, "grad_norm": 8.6875, "learning_rate": 1.4831907420108705e-07, "loss": 0.133, "step": 9920 }, { "epoch": 4.6657290201594, "grad_norm": 8.75, "learning_rate": 1.3592429830976362e-07, "loss": 0.1517, "step": 9952 }, { "epoch": 4.680731364275668, "grad_norm": 10.5, "learning_rate": 1.2406314545001795e-07, "loss": 0.1268, "step": 9984 }, { "epoch": 4.695733708391936, "grad_norm": 10.375, "learning_rate": 1.1273691687305299e-07, "loss": 0.1799, "step": 10016 }, { "epoch": 4.710736052508205, "grad_norm": 13.375, "learning_rate": 1.0194685514514302e-07, "loss": 0.1589, "step": 10048 }, { "epoch": 4.725738396624473, "grad_norm": 7.1875, "learning_rate": 9.16941440113206e-08, "loss": 0.1619, "step": 10080 }, { "epoch": 4.7407407407407405, "grad_norm": 6.78125, "learning_rate": 8.197990826551094e-08, "loss": 0.1367, "step": 10112 }, { "epoch": 4.755743084857009, "grad_norm": 10.75, "learning_rate": 7.280521362713122e-08, "loss": 0.1789, "step": 10144 }, { "epoch": 4.770745428973277, "grad_norm": 13.875, "learning_rate": 6.417106662417849e-08, "loss": 0.1452, "step": 10176 }, { "epoch": 4.785747773089545, "grad_norm": 10.5625, "learning_rate": 5.607841448280194e-08, "loss": 0.1715, "step": 10208 }, { "epoch": 4.800750117205814, "grad_norm": 6.1875, "learning_rate": 4.852814502338765e-08, "loss": 0.1644, "step": 10240 }, { "epoch": 4.815752461322082, "grad_norm": 9.0625, "learning_rate": 4.1521086563159344e-08, "loss": 0.1696, "step": 10272 }, { "epoch": 4.83075480543835, "grad_norm": 6.53125, "learning_rate": 3.5058007825303774e-08, "loss": 0.1565, "step": 10304 }, { "epoch": 4.845757149554618, "grad_norm": 6.46875, "learning_rate": 2.9139617854639368e-08, "loss": 0.1656, "step": 10336 }, { "epoch": 4.860759493670886, "grad_norm": 13.0625, "learning_rate": 2.3766565939826734e-08, "loss": 0.1673, "step": 10368 }, { "epoch": 4.875761837787154, "grad_norm": 3.703125, "learning_rate": 1.8939441542138448e-08, "loss": 0.1369, "step": 10400 }, { "epoch": 4.890764181903423, "grad_norm": 8.4375, "learning_rate": 1.4658774230789653e-08, "loss": 0.157, "step": 10432 }, { "epoch": 4.905766526019691, "grad_norm": 8.8125, "learning_rate": 1.0925033624842874e-08, "loss": 0.1443, "step": 10464 }, { "epoch": 4.920768870135959, "grad_norm": 14.6875, "learning_rate": 7.73862934168479e-09, "loss": 0.1617, "step": 10496 }, { "epoch": 4.9357712142522265, "grad_norm": 14.5, "learning_rate": 5.099910952091059e-09, "loss": 0.1769, "step": 10528 }, { "epoch": 4.950773558368495, "grad_norm": 9.8125, "learning_rate": 3.0091679418742248e-09, "loss": 0.1684, "step": 10560 }, { "epoch": 4.965775902484763, "grad_norm": 10.625, "learning_rate": 1.4666296801252312e-09, "loss": 0.1523, "step": 10592 }, { "epoch": 4.980778246601031, "grad_norm": 15.375, "learning_rate": 4.724653940513246e-10, "loss": 0.1202, "step": 10624 }, { "epoch": 4.9957805907173, "grad_norm": 11.3125, "learning_rate": 2.6784150408132315e-11, "loss": 0.196, "step": 10656 }, { "epoch": 5.0, "step": 10665, "total_flos": 1.232747073490944e+17, "train_loss": 0.22699634635386978, "train_runtime": 3211.3705, "train_samples_per_second": 3.321, "train_steps_per_second": 3.321 } ], "logging_steps": 32, "max_steps": 10665, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.232747073490944e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }