|
{ |
|
"best_metric": 1.3252594470977783, |
|
"best_model_checkpoint": "4bit_repro_03022025/host17_seed_42_full_det_fp16_no_flash_attn_fix_pad_gemma-2-9b-it-l16-cot-wt-4ep-lr3e04-ws20-bs4-ga4-fp16-13022025/checkpoint-109", |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 218, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009195402298850575, |
|
"grad_norm": 1.025204062461853, |
|
"learning_rate": 1.4999999999999999e-05, |
|
"loss": 2.395, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01839080459770115, |
|
"grad_norm": 0.7824286222457886, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 2.3972, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.027586206896551724, |
|
"grad_norm": 0.9325972199440002, |
|
"learning_rate": 4.4999999999999996e-05, |
|
"loss": 2.2652, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0367816091954023, |
|
"grad_norm": 0.7933842539787292, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 2.1491, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04597701149425287, |
|
"grad_norm": 0.9390926957130432, |
|
"learning_rate": 7.5e-05, |
|
"loss": 2.2175, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05517241379310345, |
|
"grad_norm": 0.8701347708702087, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 2.0785, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06436781609195402, |
|
"grad_norm": 0.48448503017425537, |
|
"learning_rate": 0.00010499999999999999, |
|
"loss": 2.0785, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0735632183908046, |
|
"grad_norm": 0.39611828327178955, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 2.0303, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08275862068965517, |
|
"grad_norm": 0.5025896430015564, |
|
"learning_rate": 0.000135, |
|
"loss": 1.917, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.09195402298850575, |
|
"grad_norm": 0.6268681883811951, |
|
"learning_rate": 0.00015, |
|
"loss": 1.7051, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10114942528735632, |
|
"grad_norm": 0.6085858941078186, |
|
"learning_rate": 0.000165, |
|
"loss": 1.729, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.1103448275862069, |
|
"grad_norm": 0.5290607213973999, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 1.715, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.11954022988505747, |
|
"grad_norm": 0.657960832118988, |
|
"learning_rate": 0.000195, |
|
"loss": 1.5356, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.12873563218390804, |
|
"grad_norm": 0.4407201409339905, |
|
"learning_rate": 0.00020999999999999998, |
|
"loss": 1.6903, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 0.3601807951927185, |
|
"learning_rate": 0.000225, |
|
"loss": 1.6744, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1471264367816092, |
|
"grad_norm": 0.3802438974380493, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 1.8822, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.15632183908045977, |
|
"grad_norm": 0.4443354904651642, |
|
"learning_rate": 0.00025499999999999996, |
|
"loss": 1.3503, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.16551724137931034, |
|
"grad_norm": 0.5189216136932373, |
|
"learning_rate": 0.00027, |
|
"loss": 1.3499, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.17471264367816092, |
|
"grad_norm": 0.3960488438606262, |
|
"learning_rate": 0.000285, |
|
"loss": 1.3832, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.1839080459770115, |
|
"grad_norm": 0.37185606360435486, |
|
"learning_rate": 0.0003, |
|
"loss": 1.575, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.19310344827586207, |
|
"grad_norm": 0.28029191493988037, |
|
"learning_rate": 0.00029927184466019415, |
|
"loss": 1.6438, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.20229885057471264, |
|
"grad_norm": 0.2731279134750366, |
|
"learning_rate": 0.00029854368932038833, |
|
"loss": 1.5843, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.21149425287356322, |
|
"grad_norm": 0.35780686140060425, |
|
"learning_rate": 0.0002978155339805825, |
|
"loss": 1.3945, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.2206896551724138, |
|
"grad_norm": 0.35450395941734314, |
|
"learning_rate": 0.0002970873786407767, |
|
"loss": 1.4894, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.22988505747126436, |
|
"grad_norm": 0.3032964766025543, |
|
"learning_rate": 0.00029635922330097087, |
|
"loss": 1.64, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.23908045977011494, |
|
"grad_norm": 0.3555232584476471, |
|
"learning_rate": 0.00029563106796116505, |
|
"loss": 1.4793, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2482758620689655, |
|
"grad_norm": 0.43719008564949036, |
|
"learning_rate": 0.0002949029126213592, |
|
"loss": 1.4318, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.2574712643678161, |
|
"grad_norm": 0.3937687277793884, |
|
"learning_rate": 0.00029417475728155335, |
|
"loss": 1.3755, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.3995443880558014, |
|
"learning_rate": 0.00029344660194174753, |
|
"loss": 1.6313, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 0.33234909176826477, |
|
"learning_rate": 0.0002927184466019417, |
|
"loss": 1.7548, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2850574712643678, |
|
"grad_norm": 0.3954809010028839, |
|
"learning_rate": 0.0002919902912621359, |
|
"loss": 1.5549, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.2942528735632184, |
|
"grad_norm": 0.3647831976413727, |
|
"learning_rate": 0.00029126213592233006, |
|
"loss": 1.3264, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.30344827586206896, |
|
"grad_norm": 0.4714711308479309, |
|
"learning_rate": 0.00029053398058252424, |
|
"loss": 1.2362, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.31264367816091954, |
|
"grad_norm": 0.4638761878013611, |
|
"learning_rate": 0.0002898058252427184, |
|
"loss": 1.5707, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.3218390804597701, |
|
"grad_norm": 0.43770870566368103, |
|
"learning_rate": 0.0002890776699029126, |
|
"loss": 1.5975, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3310344827586207, |
|
"grad_norm": 0.46125656366348267, |
|
"learning_rate": 0.0002883495145631068, |
|
"loss": 1.4532, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.34022988505747126, |
|
"grad_norm": 0.3735737204551697, |
|
"learning_rate": 0.00028762135922330096, |
|
"loss": 1.4564, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.34942528735632183, |
|
"grad_norm": 0.35823461413383484, |
|
"learning_rate": 0.00028689320388349513, |
|
"loss": 1.5855, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3586206896551724, |
|
"grad_norm": 0.5508543252944946, |
|
"learning_rate": 0.0002861650485436893, |
|
"loss": 1.306, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.367816091954023, |
|
"grad_norm": 0.4099932014942169, |
|
"learning_rate": 0.0002854368932038835, |
|
"loss": 1.5942, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.37701149425287356, |
|
"grad_norm": 0.3676886558532715, |
|
"learning_rate": 0.00028470873786407767, |
|
"loss": 1.3708, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.38620689655172413, |
|
"grad_norm": 0.6290714740753174, |
|
"learning_rate": 0.00028398058252427185, |
|
"loss": 1.2496, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.3954022988505747, |
|
"grad_norm": 0.3946329951286316, |
|
"learning_rate": 0.00028325242718446603, |
|
"loss": 1.3344, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.4045977011494253, |
|
"grad_norm": 0.4511699080467224, |
|
"learning_rate": 0.00028252427184466015, |
|
"loss": 1.356, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 0.5036881566047668, |
|
"learning_rate": 0.00028179611650485433, |
|
"loss": 1.2171, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.42298850574712643, |
|
"grad_norm": 0.4095934331417084, |
|
"learning_rate": 0.0002810679611650485, |
|
"loss": 1.4812, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.432183908045977, |
|
"grad_norm": 0.47633135318756104, |
|
"learning_rate": 0.0002803398058252427, |
|
"loss": 1.3561, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4413793103448276, |
|
"grad_norm": 0.4468563199043274, |
|
"learning_rate": 0.00027961165048543687, |
|
"loss": 1.2434, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.45057471264367815, |
|
"grad_norm": 0.48372191190719604, |
|
"learning_rate": 0.00027888349514563105, |
|
"loss": 1.2266, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.45977011494252873, |
|
"grad_norm": 0.5756326913833618, |
|
"learning_rate": 0.0002781553398058252, |
|
"loss": 1.1512, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4689655172413793, |
|
"grad_norm": 0.4629153907299042, |
|
"learning_rate": 0.0002774271844660194, |
|
"loss": 1.3474, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.4781609195402299, |
|
"grad_norm": 0.42864587903022766, |
|
"learning_rate": 0.0002766990291262136, |
|
"loss": 1.1593, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.48735632183908045, |
|
"grad_norm": 0.5796183943748474, |
|
"learning_rate": 0.00027597087378640776, |
|
"loss": 1.2137, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.496551724137931, |
|
"grad_norm": 0.5870793461799622, |
|
"learning_rate": 0.00027524271844660194, |
|
"loss": 1.1082, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5057471264367817, |
|
"grad_norm": 0.4859938323497772, |
|
"learning_rate": 0.0002745145631067961, |
|
"loss": 1.3229, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5149425287356322, |
|
"grad_norm": 0.5698845386505127, |
|
"learning_rate": 0.0002737864077669903, |
|
"loss": 1.3068, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5241379310344828, |
|
"grad_norm": 0.5284724831581116, |
|
"learning_rate": 0.0002730582524271845, |
|
"loss": 1.0949, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.5468711256980896, |
|
"learning_rate": 0.00027233009708737865, |
|
"loss": 1.2197, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.542528735632184, |
|
"grad_norm": 0.6027315258979797, |
|
"learning_rate": 0.0002716019417475728, |
|
"loss": 1.1874, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 0.5445360541343689, |
|
"learning_rate": 0.00027087378640776696, |
|
"loss": 1.2057, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5609195402298851, |
|
"grad_norm": 0.591551661491394, |
|
"learning_rate": 0.00027014563106796114, |
|
"loss": 1.1251, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.5701149425287356, |
|
"grad_norm": 0.528071939945221, |
|
"learning_rate": 0.0002694174757281553, |
|
"loss": 1.0482, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.5793103448275863, |
|
"grad_norm": 0.691935658454895, |
|
"learning_rate": 0.0002686893203883495, |
|
"loss": 1.0581, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.5885057471264368, |
|
"grad_norm": 0.776759684085846, |
|
"learning_rate": 0.00026796116504854367, |
|
"loss": 1.1077, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.5977011494252874, |
|
"grad_norm": 0.8228328227996826, |
|
"learning_rate": 0.00026723300970873785, |
|
"loss": 1.2629, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6068965517241379, |
|
"grad_norm": 0.5646819472312927, |
|
"learning_rate": 0.00026650485436893203, |
|
"loss": 0.9204, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6160919540229886, |
|
"grad_norm": 0.6202297806739807, |
|
"learning_rate": 0.0002657766990291262, |
|
"loss": 1.1396, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.6252873563218391, |
|
"grad_norm": 0.6260644197463989, |
|
"learning_rate": 0.0002650485436893204, |
|
"loss": 1.0977, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.6344827586206897, |
|
"grad_norm": 0.669505774974823, |
|
"learning_rate": 0.00026432038834951456, |
|
"loss": 1.2014, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.6436781609195402, |
|
"grad_norm": 0.7686023712158203, |
|
"learning_rate": 0.00026359223300970874, |
|
"loss": 1.1332, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6528735632183909, |
|
"grad_norm": 0.7180910110473633, |
|
"learning_rate": 0.0002628640776699029, |
|
"loss": 0.88, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.6620689655172414, |
|
"grad_norm": 0.6693065166473389, |
|
"learning_rate": 0.00026213592233009705, |
|
"loss": 0.9068, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.671264367816092, |
|
"grad_norm": 0.6618425250053406, |
|
"learning_rate": 0.0002614077669902912, |
|
"loss": 0.9885, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.6804597701149425, |
|
"grad_norm": 0.7131378054618835, |
|
"learning_rate": 0.0002606796116504854, |
|
"loss": 1.0587, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 0.9193438291549683, |
|
"learning_rate": 0.0002599514563106796, |
|
"loss": 1.1504, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6988505747126437, |
|
"grad_norm": 1.1682260036468506, |
|
"learning_rate": 0.00025922330097087376, |
|
"loss": 1.1793, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.7080459770114943, |
|
"grad_norm": 0.6184092164039612, |
|
"learning_rate": 0.00025849514563106794, |
|
"loss": 1.2133, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.7172413793103448, |
|
"grad_norm": 0.7343618273735046, |
|
"learning_rate": 0.0002577669902912621, |
|
"loss": 0.9616, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7264367816091954, |
|
"grad_norm": 0.8535470366477966, |
|
"learning_rate": 0.0002570388349514563, |
|
"loss": 1.2857, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.735632183908046, |
|
"grad_norm": 0.6457574367523193, |
|
"learning_rate": 0.0002563106796116505, |
|
"loss": 1.0748, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7448275862068966, |
|
"grad_norm": 0.5693302154541016, |
|
"learning_rate": 0.0002555825242718446, |
|
"loss": 1.0785, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.7540229885057471, |
|
"grad_norm": 0.6433013081550598, |
|
"learning_rate": 0.0002548543689320388, |
|
"loss": 0.9738, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.7632183908045977, |
|
"grad_norm": 1.2133727073669434, |
|
"learning_rate": 0.00025412621359223296, |
|
"loss": 1.2062, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.7724137931034483, |
|
"grad_norm": 0.7277675271034241, |
|
"learning_rate": 0.00025339805825242714, |
|
"loss": 1.1494, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.7816091954022989, |
|
"grad_norm": 0.6444184184074402, |
|
"learning_rate": 0.0002526699029126213, |
|
"loss": 1.1214, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7908045977011494, |
|
"grad_norm": 0.8243492841720581, |
|
"learning_rate": 0.0002519417475728155, |
|
"loss": 0.8145, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6770063042640686, |
|
"learning_rate": 0.00025121359223300967, |
|
"loss": 0.9134, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.8091954022988506, |
|
"grad_norm": 0.6134109497070312, |
|
"learning_rate": 0.00025048543689320385, |
|
"loss": 1.0109, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.8183908045977012, |
|
"grad_norm": 0.5844547748565674, |
|
"learning_rate": 0.00024975728155339803, |
|
"loss": 1.063, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 0.5940524339675903, |
|
"learning_rate": 0.0002490291262135922, |
|
"loss": 0.9952, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8367816091954023, |
|
"grad_norm": 0.7235853672027588, |
|
"learning_rate": 0.0002483009708737864, |
|
"loss": 0.895, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.8459770114942529, |
|
"grad_norm": 0.7243452668190002, |
|
"learning_rate": 0.00024757281553398056, |
|
"loss": 0.7441, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.8551724137931035, |
|
"grad_norm": 0.6366357207298279, |
|
"learning_rate": 0.00024684466019417474, |
|
"loss": 1.0253, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.864367816091954, |
|
"grad_norm": 0.9579809308052063, |
|
"learning_rate": 0.0002461165048543689, |
|
"loss": 0.8995, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.8735632183908046, |
|
"grad_norm": 0.8137032985687256, |
|
"learning_rate": 0.0002453883495145631, |
|
"loss": 0.8475, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.8827586206896552, |
|
"grad_norm": 0.5339512825012207, |
|
"learning_rate": 0.0002446601941747572, |
|
"loss": 0.8167, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.8919540229885058, |
|
"grad_norm": 0.6556524038314819, |
|
"learning_rate": 0.00024393203883495143, |
|
"loss": 1.0225, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.9011494252873563, |
|
"grad_norm": 0.6119419932365417, |
|
"learning_rate": 0.0002432038834951456, |
|
"loss": 1.0889, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.9103448275862069, |
|
"grad_norm": 0.7066159248352051, |
|
"learning_rate": 0.0002424757281553398, |
|
"loss": 0.8548, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.9195402298850575, |
|
"grad_norm": 0.5464254021644592, |
|
"learning_rate": 0.00024174757281553394, |
|
"loss": 0.9283, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9287356321839081, |
|
"grad_norm": 0.825078010559082, |
|
"learning_rate": 0.00024101941747572812, |
|
"loss": 0.8686, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.9379310344827586, |
|
"grad_norm": 1.2080026865005493, |
|
"learning_rate": 0.0002402912621359223, |
|
"loss": 0.8503, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.9471264367816092, |
|
"grad_norm": 0.6597005128860474, |
|
"learning_rate": 0.00023956310679611648, |
|
"loss": 0.9614, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.9563218390804598, |
|
"grad_norm": 0.614787757396698, |
|
"learning_rate": 0.00023883495145631065, |
|
"loss": 0.9684, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 0.6293591856956482, |
|
"learning_rate": 0.00023810679611650483, |
|
"loss": 0.7772, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.9747126436781609, |
|
"grad_norm": 0.5669013857841492, |
|
"learning_rate": 0.000237378640776699, |
|
"loss": 1.1319, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.9839080459770115, |
|
"grad_norm": 0.6458181738853455, |
|
"learning_rate": 0.0002366504854368932, |
|
"loss": 0.9616, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.993103448275862, |
|
"grad_norm": 0.5852652192115784, |
|
"learning_rate": 0.00023592233009708734, |
|
"loss": 0.746, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.8498281836509705, |
|
"learning_rate": 0.00023519417475728152, |
|
"loss": 0.8678, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.3252594470977783, |
|
"eval_runtime": 94.0095, |
|
"eval_samples_per_second": 3.532, |
|
"eval_steps_per_second": 1.766, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.0091954022988505, |
|
"grad_norm": 0.6115343570709229, |
|
"learning_rate": 0.0002344660194174757, |
|
"loss": 0.811, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.018390804597701, |
|
"grad_norm": 0.5486385226249695, |
|
"learning_rate": 0.00023373786407766988, |
|
"loss": 0.6873, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.0275862068965518, |
|
"grad_norm": 0.6083724498748779, |
|
"learning_rate": 0.00023300970873786406, |
|
"loss": 0.75, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.0367816091954023, |
|
"grad_norm": 0.7998746633529663, |
|
"learning_rate": 0.00023228155339805823, |
|
"loss": 0.5876, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.0459770114942528, |
|
"grad_norm": 0.6601845026016235, |
|
"learning_rate": 0.0002315533980582524, |
|
"loss": 0.8747, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.0551724137931036, |
|
"grad_norm": 0.9844085574150085, |
|
"learning_rate": 0.0002308252427184466, |
|
"loss": 0.6241, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.064367816091954, |
|
"grad_norm": 0.6095510125160217, |
|
"learning_rate": 0.00023009708737864074, |
|
"loss": 0.6274, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.0735632183908046, |
|
"grad_norm": 0.9384357929229736, |
|
"learning_rate": 0.00022936893203883492, |
|
"loss": 0.7682, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.0827586206896551, |
|
"grad_norm": 0.5237877368927002, |
|
"learning_rate": 0.0002286407766990291, |
|
"loss": 0.8889, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.0919540229885056, |
|
"grad_norm": 0.5225788354873657, |
|
"learning_rate": 0.00022791262135922328, |
|
"loss": 0.6419, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.1011494252873564, |
|
"grad_norm": 0.6210270524024963, |
|
"learning_rate": 0.00022718446601941746, |
|
"loss": 0.6799, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.110344827586207, |
|
"grad_norm": 0.7171874046325684, |
|
"learning_rate": 0.00022645631067961164, |
|
"loss": 0.5037, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.1195402298850574, |
|
"grad_norm": 0.6145285367965698, |
|
"learning_rate": 0.00022572815533980582, |
|
"loss": 0.4812, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.1287356321839082, |
|
"grad_norm": 0.5306028723716736, |
|
"learning_rate": 0.000225, |
|
"loss": 0.6068, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.1379310344827587, |
|
"grad_norm": 0.6142969131469727, |
|
"learning_rate": 0.00022427184466019415, |
|
"loss": 0.6441, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.1471264367816092, |
|
"grad_norm": 0.5693908333778381, |
|
"learning_rate": 0.00022354368932038832, |
|
"loss": 0.6241, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.1563218390804597, |
|
"grad_norm": 0.8560084104537964, |
|
"learning_rate": 0.0002228155339805825, |
|
"loss": 0.5208, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.1655172413793102, |
|
"grad_norm": 0.9754599928855896, |
|
"learning_rate": 0.00022208737864077668, |
|
"loss": 0.6499, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.174712643678161, |
|
"grad_norm": 0.515574038028717, |
|
"learning_rate": 0.00022135922330097086, |
|
"loss": 0.6246, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.1839080459770115, |
|
"grad_norm": 0.5477547645568848, |
|
"learning_rate": 0.00022063106796116504, |
|
"loss": 0.6331, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.193103448275862, |
|
"grad_norm": 0.445388525724411, |
|
"learning_rate": 0.00021990291262135922, |
|
"loss": 0.5737, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.2022988505747128, |
|
"grad_norm": 0.6278632879257202, |
|
"learning_rate": 0.00021917475728155337, |
|
"loss": 0.5605, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.2114942528735633, |
|
"grad_norm": 0.5176573991775513, |
|
"learning_rate": 0.00021844660194174755, |
|
"loss": 0.6198, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.2206896551724138, |
|
"grad_norm": 0.5394790768623352, |
|
"learning_rate": 0.00021771844660194173, |
|
"loss": 0.743, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.2298850574712643, |
|
"grad_norm": 0.5462550520896912, |
|
"learning_rate": 0.0002169902912621359, |
|
"loss": 0.5505, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.2390804597701148, |
|
"grad_norm": 0.5793837904930115, |
|
"learning_rate": 0.00021626213592233008, |
|
"loss": 0.7134, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.2482758620689656, |
|
"grad_norm": 0.5995808243751526, |
|
"learning_rate": 0.00021553398058252426, |
|
"loss": 0.8151, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.257471264367816, |
|
"grad_norm": 0.6317359805107117, |
|
"learning_rate": 0.00021480582524271844, |
|
"loss": 0.4986, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.2666666666666666, |
|
"grad_norm": 0.9133898019790649, |
|
"learning_rate": 0.00021407766990291262, |
|
"loss": 0.6029, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.2758620689655173, |
|
"grad_norm": 0.7161931991577148, |
|
"learning_rate": 0.00021334951456310677, |
|
"loss": 0.6581, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.2850574712643679, |
|
"grad_norm": 0.5639025568962097, |
|
"learning_rate": 0.00021262135922330095, |
|
"loss": 0.681, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.2942528735632184, |
|
"grad_norm": 0.6325567364692688, |
|
"learning_rate": 0.00021189320388349513, |
|
"loss": 0.7994, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.303448275862069, |
|
"grad_norm": 0.47429075837135315, |
|
"learning_rate": 0.0002111650485436893, |
|
"loss": 0.4164, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.3126436781609194, |
|
"grad_norm": 0.3774986267089844, |
|
"learning_rate": 0.00021043689320388349, |
|
"loss": 0.3792, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.3218390804597702, |
|
"grad_norm": 0.5024625062942505, |
|
"learning_rate": 0.00020970873786407766, |
|
"loss": 0.6999, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.3310344827586207, |
|
"grad_norm": 0.4836028516292572, |
|
"learning_rate": 0.00020898058252427184, |
|
"loss": 0.3536, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3402298850574712, |
|
"grad_norm": 0.4562912881374359, |
|
"learning_rate": 0.00020825242718446602, |
|
"loss": 0.4362, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.349425287356322, |
|
"grad_norm": 0.4715615212917328, |
|
"learning_rate": 0.00020752427184466017, |
|
"loss": 0.4743, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.3586206896551725, |
|
"grad_norm": 0.5050966143608093, |
|
"learning_rate": 0.00020679611650485435, |
|
"loss": 0.6084, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.367816091954023, |
|
"grad_norm": 0.5919803380966187, |
|
"learning_rate": 0.00020606796116504853, |
|
"loss": 0.4208, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.3770114942528735, |
|
"grad_norm": 0.5397422313690186, |
|
"learning_rate": 0.0002053398058252427, |
|
"loss": 0.5182, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.386206896551724, |
|
"grad_norm": 0.604860246181488, |
|
"learning_rate": 0.0002046116504854369, |
|
"loss": 0.6569, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.3954022988505748, |
|
"grad_norm": 0.6743022799491882, |
|
"learning_rate": 0.00020388349514563107, |
|
"loss": 0.6063, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.4045977011494253, |
|
"grad_norm": 0.5582085847854614, |
|
"learning_rate": 0.00020315533980582524, |
|
"loss": 0.8471, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.4137931034482758, |
|
"grad_norm": 0.6764629483222961, |
|
"learning_rate": 0.00020242718446601942, |
|
"loss": 0.4767, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.4229885057471265, |
|
"grad_norm": 0.39126965403556824, |
|
"learning_rate": 0.00020169902912621357, |
|
"loss": 0.4983, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.432183908045977, |
|
"grad_norm": 0.5407236814498901, |
|
"learning_rate": 0.00020097087378640775, |
|
"loss": 0.6467, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.4413793103448276, |
|
"grad_norm": 0.4321889579296112, |
|
"learning_rate": 0.00020024271844660193, |
|
"loss": 0.6037, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.450574712643678, |
|
"grad_norm": 0.3570482134819031, |
|
"learning_rate": 0.0001995145631067961, |
|
"loss": 0.4515, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.4597701149425286, |
|
"grad_norm": 0.5193243622779846, |
|
"learning_rate": 0.0001987864077669903, |
|
"loss": 0.5267, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.4689655172413794, |
|
"grad_norm": 0.8264741897583008, |
|
"learning_rate": 0.00019805825242718447, |
|
"loss": 0.7169, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.4781609195402299, |
|
"grad_norm": 0.6514953374862671, |
|
"learning_rate": 0.00019733009708737865, |
|
"loss": 0.678, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.4873563218390804, |
|
"grad_norm": 0.5475180745124817, |
|
"learning_rate": 0.0001966019417475728, |
|
"loss": 0.5252, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.4965517241379311, |
|
"grad_norm": 0.49964120984077454, |
|
"learning_rate": 0.00019587378640776698, |
|
"loss": 0.4259, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.5057471264367817, |
|
"grad_norm": 0.4474540948867798, |
|
"learning_rate": 0.00019514563106796116, |
|
"loss": 0.4728, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.5149425287356322, |
|
"grad_norm": 0.5726771950721741, |
|
"learning_rate": 0.00019441747572815533, |
|
"loss": 0.6752, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.524137931034483, |
|
"grad_norm": 0.5038064122200012, |
|
"learning_rate": 0.0001936893203883495, |
|
"loss": 0.647, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.5333333333333332, |
|
"grad_norm": 0.4093747138977051, |
|
"learning_rate": 0.0001929611650485437, |
|
"loss": 0.6077, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.542528735632184, |
|
"grad_norm": 0.8166248798370361, |
|
"learning_rate": 0.00019223300970873787, |
|
"loss": 0.5149, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.5517241379310345, |
|
"grad_norm": 0.5660980343818665, |
|
"learning_rate": 0.00019150485436893205, |
|
"loss": 0.3511, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.560919540229885, |
|
"grad_norm": 0.403187096118927, |
|
"learning_rate": 0.0001907766990291262, |
|
"loss": 0.4097, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.5701149425287357, |
|
"grad_norm": 0.5686673521995544, |
|
"learning_rate": 0.00019004854368932038, |
|
"loss": 0.8236, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.5793103448275863, |
|
"grad_norm": 0.4967772662639618, |
|
"learning_rate": 0.00018932038834951456, |
|
"loss": 0.562, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.5885057471264368, |
|
"grad_norm": 0.560854434967041, |
|
"learning_rate": 0.00018859223300970874, |
|
"loss": 0.5852, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.5977011494252875, |
|
"grad_norm": 0.3643392324447632, |
|
"learning_rate": 0.00018786407766990291, |
|
"loss": 0.4042, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.6068965517241378, |
|
"grad_norm": 0.6362044811248779, |
|
"learning_rate": 0.00018713592233009707, |
|
"loss": 0.6533, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.6160919540229886, |
|
"grad_norm": 0.6190036535263062, |
|
"learning_rate": 0.00018640776699029122, |
|
"loss": 0.7651, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.625287356321839, |
|
"grad_norm": 0.3463480472564697, |
|
"learning_rate": 0.0001856796116504854, |
|
"loss": 0.3312, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.6344827586206896, |
|
"grad_norm": 0.2819209098815918, |
|
"learning_rate": 0.00018495145631067957, |
|
"loss": 0.3247, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.6436781609195403, |
|
"grad_norm": 0.5651117563247681, |
|
"learning_rate": 0.00018422330097087375, |
|
"loss": 0.7674, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.6528735632183909, |
|
"grad_norm": 0.4948618412017822, |
|
"learning_rate": 0.00018349514563106793, |
|
"loss": 0.6161, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.6620689655172414, |
|
"grad_norm": 0.43636301159858704, |
|
"learning_rate": 0.0001827669902912621, |
|
"loss": 0.5334, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.6712643678160921, |
|
"grad_norm": 0.4951108694076538, |
|
"learning_rate": 0.0001820388349514563, |
|
"loss": 0.624, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.6804597701149424, |
|
"grad_norm": 0.5951234102249146, |
|
"learning_rate": 0.00018131067961165047, |
|
"loss": 0.5184, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.6896551724137931, |
|
"grad_norm": 0.6109154224395752, |
|
"learning_rate": 0.00018058252427184462, |
|
"loss": 0.7274, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.6988505747126437, |
|
"grad_norm": 0.4492969810962677, |
|
"learning_rate": 0.0001798543689320388, |
|
"loss": 0.6079, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.7080459770114942, |
|
"grad_norm": 0.5195210576057434, |
|
"learning_rate": 0.00017912621359223298, |
|
"loss": 0.4998, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.717241379310345, |
|
"grad_norm": 0.49724170565605164, |
|
"learning_rate": 0.00017839805825242716, |
|
"loss": 0.3856, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.7264367816091954, |
|
"grad_norm": 1.1214869022369385, |
|
"learning_rate": 0.00017766990291262133, |
|
"loss": 0.4785, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.735632183908046, |
|
"grad_norm": 0.5645748376846313, |
|
"learning_rate": 0.0001769417475728155, |
|
"loss": 0.5791, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.7448275862068967, |
|
"grad_norm": 0.46523571014404297, |
|
"learning_rate": 0.0001762135922330097, |
|
"loss": 0.5804, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.754022988505747, |
|
"grad_norm": 0.3765566945075989, |
|
"learning_rate": 0.00017548543689320387, |
|
"loss": 0.5272, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.7632183908045977, |
|
"grad_norm": 0.5119166374206543, |
|
"learning_rate": 0.00017475728155339802, |
|
"loss": 0.6572, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.7724137931034483, |
|
"grad_norm": 0.38700059056282043, |
|
"learning_rate": 0.0001740291262135922, |
|
"loss": 0.5233, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.7816091954022988, |
|
"grad_norm": 0.3980446457862854, |
|
"learning_rate": 0.00017330097087378638, |
|
"loss": 0.4071, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.7908045977011495, |
|
"grad_norm": 0.35074886679649353, |
|
"learning_rate": 0.00017257281553398056, |
|
"loss": 0.4505, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.5284190773963928, |
|
"learning_rate": 0.00017184466019417474, |
|
"loss": 0.5048, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.8091954022988506, |
|
"grad_norm": 0.47595924139022827, |
|
"learning_rate": 0.00017111650485436891, |
|
"loss": 0.541, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.8183908045977013, |
|
"grad_norm": 0.557465672492981, |
|
"learning_rate": 0.0001703883495145631, |
|
"loss": 0.5175, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.8275862068965516, |
|
"grad_norm": 0.4417920410633087, |
|
"learning_rate": 0.00016966019417475724, |
|
"loss": 0.5284, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.8367816091954023, |
|
"grad_norm": 0.34410127997398376, |
|
"learning_rate": 0.00016893203883495142, |
|
"loss": 0.3723, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.8459770114942529, |
|
"grad_norm": 0.3989458680152893, |
|
"learning_rate": 0.0001682038834951456, |
|
"loss": 0.4057, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.8551724137931034, |
|
"grad_norm": 0.3975292444229126, |
|
"learning_rate": 0.00016747572815533978, |
|
"loss": 0.4334, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.8643678160919541, |
|
"grad_norm": 0.5099373459815979, |
|
"learning_rate": 0.00016674757281553396, |
|
"loss": 0.5842, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.8735632183908046, |
|
"grad_norm": 0.4445691406726837, |
|
"learning_rate": 0.00016601941747572814, |
|
"loss": 0.593, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.8827586206896552, |
|
"grad_norm": 0.4758138060569763, |
|
"learning_rate": 0.00016529126213592232, |
|
"loss": 0.4163, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.891954022988506, |
|
"grad_norm": 0.41732391715049744, |
|
"learning_rate": 0.0001645631067961165, |
|
"loss": 0.5405, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.9011494252873562, |
|
"grad_norm": 0.3908286988735199, |
|
"learning_rate": 0.00016383495145631065, |
|
"loss": 0.4363, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.910344827586207, |
|
"grad_norm": 0.5812026858329773, |
|
"learning_rate": 0.00016310679611650483, |
|
"loss": 0.7188, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.9195402298850575, |
|
"grad_norm": 0.4734458327293396, |
|
"learning_rate": 0.000162378640776699, |
|
"loss": 0.569, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.928735632183908, |
|
"grad_norm": 0.4347914457321167, |
|
"learning_rate": 0.00016165048543689318, |
|
"loss": 0.5026, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.9379310344827587, |
|
"grad_norm": 0.3364557921886444, |
|
"learning_rate": 0.00016092233009708736, |
|
"loss": 0.3537, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.9471264367816092, |
|
"grad_norm": 0.44029518961906433, |
|
"learning_rate": 0.00016019417475728154, |
|
"loss": 0.5733, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.9563218390804598, |
|
"grad_norm": 0.33010566234588623, |
|
"learning_rate": 0.00015946601941747572, |
|
"loss": 0.3838, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.9655172413793105, |
|
"grad_norm": 0.3367745876312256, |
|
"learning_rate": 0.0001587378640776699, |
|
"loss": 0.4212, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.9747126436781608, |
|
"grad_norm": 0.5834444165229797, |
|
"learning_rate": 0.00015800970873786405, |
|
"loss": 0.833, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.9839080459770115, |
|
"grad_norm": 0.5451297163963318, |
|
"learning_rate": 0.00015728155339805823, |
|
"loss": 0.7753, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.993103448275862, |
|
"grad_norm": 0.4916711747646332, |
|
"learning_rate": 0.0001565533980582524, |
|
"loss": 0.6183, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.7540034651756287, |
|
"learning_rate": 0.00015582524271844658, |
|
"loss": 0.8286, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.4519011974334717, |
|
"eval_runtime": 93.8599, |
|
"eval_samples_per_second": 3.537, |
|
"eval_steps_per_second": 1.769, |
|
"step": 218 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 432, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.9491566965342618e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|