{
  "best_metric": 1.4196490049362183,
  "best_model_checkpoint": "4bit_repro_03022025/host14_seed_42_full_det_fp16_no_flash_attn_fix_pad_gemma-2-9b-instruct-l16-no-cot-4ep-lr3e04-ws20-bs4-ga4-fp16-11022025/checkpoint-109",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 327,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009195402298850575,
      "grad_norm": 1.6409969329833984,
      "learning_rate": 1.4999999999999999e-05,
      "loss": 2.9018,
      "step": 1
    },
    {
      "epoch": 0.01839080459770115,
      "grad_norm": 0.8910855054855347,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 2.5407,
      "step": 2
    },
    {
      "epoch": 0.027586206896551724,
      "grad_norm": 1.2834181785583496,
      "learning_rate": 4.4999999999999996e-05,
      "loss": 2.5948,
      "step": 3
    },
    {
      "epoch": 0.0367816091954023,
      "grad_norm": 0.9639408588409424,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 2.3316,
      "step": 4
    },
    {
      "epoch": 0.04597701149425287,
      "grad_norm": 1.4458935260772705,
      "learning_rate": 7.5e-05,
      "loss": 2.6124,
      "step": 5
    },
    {
      "epoch": 0.05517241379310345,
      "grad_norm": 1.3878850936889648,
      "learning_rate": 8.999999999999999e-05,
      "loss": 2.3947,
      "step": 6
    },
    {
      "epoch": 0.06436781609195402,
      "grad_norm": 0.5613430142402649,
      "learning_rate": 0.00010499999999999999,
      "loss": 2.2874,
      "step": 7
    },
    {
      "epoch": 0.0735632183908046,
      "grad_norm": 0.4661317467689514,
      "learning_rate": 0.00011999999999999999,
      "loss": 2.1525,
      "step": 8
    },
    {
      "epoch": 0.08275862068965517,
      "grad_norm": 0.6441327929496765,
      "learning_rate": 0.000135,
      "loss": 1.9578,
      "step": 9
    },
    {
      "epoch": 0.09195402298850575,
      "grad_norm": 0.9715940952301025,
      "learning_rate": 0.00015,
      "loss": 1.8896,
      "step": 10
    },
    {
      "epoch": 0.10114942528735632,
      "grad_norm": 0.7227729558944702,
      "learning_rate": 0.000165,
      "loss": 1.8532,
      "step": 11
    },
    {
      "epoch": 0.1103448275862069,
      "grad_norm": 0.6046324372291565,
      "learning_rate": 0.00017999999999999998,
      "loss": 1.7605,
      "step": 12
    },
    {
      "epoch": 0.11954022988505747,
      "grad_norm": 0.801392138004303,
      "learning_rate": 0.000195,
      "loss": 1.4759,
      "step": 13
    },
    {
      "epoch": 0.12873563218390804,
      "grad_norm": 0.5491551160812378,
      "learning_rate": 0.00020999999999999998,
      "loss": 1.6874,
      "step": 14
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 0.4392535388469696,
      "learning_rate": 0.000225,
      "loss": 1.7641,
      "step": 15
    },
    {
      "epoch": 0.1471264367816092,
      "grad_norm": 0.41002196073532104,
      "learning_rate": 0.00023999999999999998,
      "loss": 1.9998,
      "step": 16
    },
    {
      "epoch": 0.15632183908045977,
      "grad_norm": 0.5483619570732117,
      "learning_rate": 0.00025499999999999996,
      "loss": 1.3601,
      "step": 17
    },
    {
      "epoch": 0.16551724137931034,
      "grad_norm": 0.5035637021064758,
      "learning_rate": 0.00027,
      "loss": 1.2777,
      "step": 18
    },
    {
      "epoch": 0.17471264367816092,
      "grad_norm": 0.48088592290878296,
      "learning_rate": 0.000285,
      "loss": 1.462,
      "step": 19
    },
    {
      "epoch": 0.1839080459770115,
      "grad_norm": 0.42464756965637207,
      "learning_rate": 0.0003,
      "loss": 1.6768,
      "step": 20
    },
    {
      "epoch": 0.19310344827586207,
      "grad_norm": 0.3142748773097992,
      "learning_rate": 0.00029927184466019415,
      "loss": 1.6529,
      "step": 21
    },
    {
      "epoch": 0.20229885057471264,
      "grad_norm": 0.300471693277359,
      "learning_rate": 0.00029854368932038833,
      "loss": 1.6068,
      "step": 22
    },
    {
      "epoch": 0.21149425287356322,
      "grad_norm": 0.3858422636985779,
      "learning_rate": 0.0002978155339805825,
      "loss": 1.371,
      "step": 23
    },
    {
      "epoch": 0.2206896551724138,
      "grad_norm": 0.37815675139427185,
      "learning_rate": 0.0002970873786407767,
      "loss": 1.54,
      "step": 24
    },
    {
      "epoch": 0.22988505747126436,
      "grad_norm": 0.327928751707077,
      "learning_rate": 0.00029635922330097087,
      "loss": 1.7743,
      "step": 25
    },
    {
      "epoch": 0.23908045977011494,
      "grad_norm": 0.45471611618995667,
      "learning_rate": 0.00029563106796116505,
      "loss": 1.5857,
      "step": 26
    },
    {
      "epoch": 0.2482758620689655,
      "grad_norm": 0.5154709219932556,
      "learning_rate": 0.0002949029126213592,
      "loss": 1.5444,
      "step": 27
    },
    {
      "epoch": 0.2574712643678161,
      "grad_norm": 0.40547341108322144,
      "learning_rate": 0.00029417475728155335,
      "loss": 1.3692,
      "step": 28
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 0.4016611874103546,
      "learning_rate": 0.00029344660194174753,
      "loss": 1.7185,
      "step": 29
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 0.35080596804618835,
      "learning_rate": 0.0002927184466019417,
      "loss": 1.8895,
      "step": 30
    },
    {
      "epoch": 0.2850574712643678,
      "grad_norm": 0.4109053611755371,
      "learning_rate": 0.0002919902912621359,
      "loss": 1.6865,
      "step": 31
    },
    {
      "epoch": 0.2942528735632184,
      "grad_norm": 0.3945920467376709,
      "learning_rate": 0.00029126213592233006,
      "loss": 1.3734,
      "step": 32
    },
    {
      "epoch": 0.30344827586206896,
      "grad_norm": 0.5358554124832153,
      "learning_rate": 0.00029053398058252424,
      "loss": 1.2796,
      "step": 33
    },
    {
      "epoch": 0.31264367816091954,
      "grad_norm": 0.5781538486480713,
      "learning_rate": 0.0002898058252427184,
      "loss": 1.6769,
      "step": 34
    },
    {
      "epoch": 0.3218390804597701,
      "grad_norm": 0.4976755380630493,
      "learning_rate": 0.0002890776699029126,
      "loss": 1.6693,
      "step": 35
    },
    {
      "epoch": 0.3310344827586207,
      "grad_norm": 0.5537866353988647,
      "learning_rate": 0.0002883495145631068,
      "loss": 1.4935,
      "step": 36
    },
    {
      "epoch": 0.34022988505747126,
      "grad_norm": 0.41387081146240234,
      "learning_rate": 0.00028762135922330096,
      "loss": 1.5668,
      "step": 37
    },
    {
      "epoch": 0.34942528735632183,
      "grad_norm": 0.38216930627822876,
      "learning_rate": 0.00028689320388349513,
      "loss": 1.667,
      "step": 38
    },
    {
      "epoch": 0.3586206896551724,
      "grad_norm": 0.5784114599227905,
      "learning_rate": 0.0002861650485436893,
      "loss": 1.2776,
      "step": 39
    },
    {
      "epoch": 0.367816091954023,
      "grad_norm": 0.417309045791626,
      "learning_rate": 0.0002854368932038835,
      "loss": 1.6741,
      "step": 40
    },
    {
      "epoch": 0.37701149425287356,
      "grad_norm": 0.39875927567481995,
      "learning_rate": 0.00028470873786407767,
      "loss": 1.4101,
      "step": 41
    },
    {
      "epoch": 0.38620689655172413,
      "grad_norm": 0.7550377249717712,
      "learning_rate": 0.00028398058252427185,
      "loss": 1.1991,
      "step": 42
    },
    {
      "epoch": 0.3954022988505747,
      "grad_norm": 0.4652000963687897,
      "learning_rate": 0.00028325242718446603,
      "loss": 1.4474,
      "step": 43
    },
    {
      "epoch": 0.4045977011494253,
      "grad_norm": 0.5465714931488037,
      "learning_rate": 0.00028252427184466015,
      "loss": 1.4219,
      "step": 44
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 0.5987188816070557,
      "learning_rate": 0.00028179611650485433,
      "loss": 1.2155,
      "step": 45
    },
    {
      "epoch": 0.42298850574712643,
      "grad_norm": 0.5176146626472473,
      "learning_rate": 0.0002810679611650485,
      "loss": 1.6139,
      "step": 46
    },
    {
      "epoch": 0.432183908045977,
      "grad_norm": 0.5650622248649597,
      "learning_rate": 0.0002803398058252427,
      "loss": 1.3885,
      "step": 47
    },
    {
      "epoch": 0.4413793103448276,
      "grad_norm": 0.5218377709388733,
      "learning_rate": 0.00027961165048543687,
      "loss": 1.2143,
      "step": 48
    },
    {
      "epoch": 0.45057471264367815,
      "grad_norm": 0.8088476657867432,
      "learning_rate": 0.00027888349514563105,
      "loss": 1.1332,
      "step": 49
    },
    {
      "epoch": 0.45977011494252873,
      "grad_norm": 0.6791206002235413,
      "learning_rate": 0.0002781553398058252,
      "loss": 1.1016,
      "step": 50
    },
    {
      "epoch": 0.4689655172413793,
      "grad_norm": 0.528010368347168,
      "learning_rate": 0.0002774271844660194,
      "loss": 1.3407,
      "step": 51
    },
    {
      "epoch": 0.4781609195402299,
      "grad_norm": 0.5152159929275513,
      "learning_rate": 0.0002766990291262136,
      "loss": 1.0526,
      "step": 52
    },
    {
      "epoch": 0.48735632183908045,
      "grad_norm": 0.8332945704460144,
      "learning_rate": 0.00027597087378640776,
      "loss": 1.2991,
      "step": 53
    },
    {
      "epoch": 0.496551724137931,
      "grad_norm": 0.7987570762634277,
      "learning_rate": 0.00027524271844660194,
      "loss": 1.008,
      "step": 54
    },
    {
      "epoch": 0.5057471264367817,
      "grad_norm": 0.6524537205696106,
      "learning_rate": 0.0002745145631067961,
      "loss": 1.2895,
      "step": 55
    },
    {
      "epoch": 0.5149425287356322,
      "grad_norm": 0.7703379988670349,
      "learning_rate": 0.0002737864077669903,
      "loss": 1.2739,
      "step": 56
    },
    {
      "epoch": 0.5241379310344828,
      "grad_norm": 0.6915059089660645,
      "learning_rate": 0.0002730582524271845,
      "loss": 0.9791,
      "step": 57
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 0.7275146842002869,
      "learning_rate": 0.00027233009708737865,
      "loss": 1.1958,
      "step": 58
    },
    {
      "epoch": 0.542528735632184,
      "grad_norm": 0.7504586577415466,
      "learning_rate": 0.0002716019417475728,
      "loss": 1.0989,
      "step": 59
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.9296619892120361,
      "learning_rate": 0.00027087378640776696,
      "loss": 1.2009,
      "step": 60
    },
    {
      "epoch": 0.5609195402298851,
      "grad_norm": 0.7090461850166321,
      "learning_rate": 0.00027014563106796114,
      "loss": 1.0826,
      "step": 61
    },
    {
      "epoch": 0.5701149425287356,
      "grad_norm": 0.7167800068855286,
      "learning_rate": 0.0002694174757281553,
      "loss": 0.9676,
      "step": 62
    },
    {
      "epoch": 0.5793103448275863,
      "grad_norm": 0.9599812030792236,
      "learning_rate": 0.0002686893203883495,
      "loss": 1.0113,
      "step": 63
    },
    {
      "epoch": 0.5885057471264368,
      "grad_norm": 0.8795760273933411,
      "learning_rate": 0.00026796116504854367,
      "loss": 0.9343,
      "step": 64
    },
    {
      "epoch": 0.5977011494252874,
      "grad_norm": 1.689482569694519,
      "learning_rate": 0.00026723300970873785,
      "loss": 1.4378,
      "step": 65
    },
    {
      "epoch": 0.6068965517241379,
      "grad_norm": 0.7943350672721863,
      "learning_rate": 0.00026650485436893203,
      "loss": 0.8399,
      "step": 66
    },
    {
      "epoch": 0.6160919540229886,
      "grad_norm": 0.862583577632904,
      "learning_rate": 0.0002657766990291262,
      "loss": 1.0277,
      "step": 67
    },
    {
      "epoch": 0.6252873563218391,
      "grad_norm": 0.8344200253486633,
      "learning_rate": 0.0002650485436893204,
      "loss": 1.0991,
      "step": 68
    },
    {
      "epoch": 0.6344827586206897,
      "grad_norm": 0.7979349493980408,
      "learning_rate": 0.00026432038834951456,
      "loss": 1.2579,
      "step": 69
    },
    {
      "epoch": 0.6436781609195402,
      "grad_norm": 0.7641604542732239,
      "learning_rate": 0.00026359223300970874,
      "loss": 1.0108,
      "step": 70
    },
    {
      "epoch": 0.6528735632183909,
      "grad_norm": 0.8905412554740906,
      "learning_rate": 0.0002628640776699029,
      "loss": 0.7594,
      "step": 71
    },
    {
      "epoch": 0.6620689655172414,
      "grad_norm": 0.8589284420013428,
      "learning_rate": 0.00026213592233009705,
      "loss": 0.8535,
      "step": 72
    },
    {
      "epoch": 0.671264367816092,
      "grad_norm": 0.8410593867301941,
      "learning_rate": 0.0002614077669902912,
      "loss": 0.8691,
      "step": 73
    },
    {
      "epoch": 0.6804597701149425,
      "grad_norm": 1.0918798446655273,
      "learning_rate": 0.0002606796116504854,
      "loss": 0.984,
      "step": 74
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 1.3453165292739868,
      "learning_rate": 0.0002599514563106796,
      "loss": 1.1908,
      "step": 75
    },
    {
      "epoch": 0.6988505747126437,
      "grad_norm": 1.7153398990631104,
      "learning_rate": 0.00025922330097087376,
      "loss": 1.1151,
      "step": 76
    },
    {
      "epoch": 0.7080459770114943,
      "grad_norm": 0.7578756809234619,
      "learning_rate": 0.00025849514563106794,
      "loss": 1.0327,
      "step": 77
    },
    {
      "epoch": 0.7172413793103448,
      "grad_norm": 0.8445453643798828,
      "learning_rate": 0.0002577669902912621,
      "loss": 0.7871,
      "step": 78
    },
    {
      "epoch": 0.7264367816091954,
      "grad_norm": 1.0929808616638184,
      "learning_rate": 0.0002570388349514563,
      "loss": 1.2769,
      "step": 79
    },
    {
      "epoch": 0.735632183908046,
      "grad_norm": 0.8767411708831787,
      "learning_rate": 0.0002563106796116505,
      "loss": 0.9978,
      "step": 80
    },
    {
      "epoch": 0.7448275862068966,
      "grad_norm": 0.7253080606460571,
      "learning_rate": 0.0002555825242718446,
      "loss": 1.0843,
      "step": 81
    },
    {
      "epoch": 0.7540229885057471,
      "grad_norm": 0.7607583999633789,
      "learning_rate": 0.0002548543689320388,
      "loss": 0.8945,
      "step": 82
    },
    {
      "epoch": 0.7632183908045977,
      "grad_norm": 1.163414716720581,
      "learning_rate": 0.00025412621359223296,
      "loss": 1.0853,
      "step": 83
    },
    {
      "epoch": 0.7724137931034483,
      "grad_norm": 0.9414466023445129,
      "learning_rate": 0.00025339805825242714,
      "loss": 1.0867,
      "step": 84
    },
    {
      "epoch": 0.7816091954022989,
      "grad_norm": 0.873741090297699,
      "learning_rate": 0.0002526699029126213,
      "loss": 1.1912,
      "step": 85
    },
    {
      "epoch": 0.7908045977011494,
      "grad_norm": 0.8017745614051819,
      "learning_rate": 0.0002519417475728155,
      "loss": 0.6812,
      "step": 86
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.8693607449531555,
      "learning_rate": 0.00025121359223300967,
      "loss": 0.8669,
      "step": 87
    },
    {
      "epoch": 0.8091954022988506,
      "grad_norm": 0.7914007902145386,
      "learning_rate": 0.00025048543689320385,
      "loss": 0.9684,
      "step": 88
    },
    {
      "epoch": 0.8183908045977012,
      "grad_norm": 0.7322354912757874,
      "learning_rate": 0.00024975728155339803,
      "loss": 1.0616,
      "step": 89
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 0.6482452154159546,
      "learning_rate": 0.0002490291262135922,
      "loss": 0.8636,
      "step": 90
    },
    {
      "epoch": 0.8367816091954023,
      "grad_norm": 0.8021015524864197,
      "learning_rate": 0.0002483009708737864,
      "loss": 0.7627,
      "step": 91
    },
    {
      "epoch": 0.8459770114942529,
      "grad_norm": 0.8987912535667419,
      "learning_rate": 0.00024757281553398056,
      "loss": 0.5836,
      "step": 92
    },
    {
      "epoch": 0.8551724137931035,
      "grad_norm": 0.7898840308189392,
      "learning_rate": 0.00024684466019417474,
      "loss": 1.068,
      "step": 93
    },
    {
      "epoch": 0.864367816091954,
      "grad_norm": 1.0533185005187988,
      "learning_rate": 0.0002461165048543689,
      "loss": 0.77,
      "step": 94
    },
    {
      "epoch": 0.8735632183908046,
      "grad_norm": 0.9064627289772034,
      "learning_rate": 0.0002453883495145631,
      "loss": 0.6817,
      "step": 95
    },
    {
      "epoch": 0.8827586206896552,
      "grad_norm": 0.8030613660812378,
      "learning_rate": 0.0002446601941747572,
      "loss": 0.7652,
      "step": 96
    },
    {
      "epoch": 0.8919540229885058,
      "grad_norm": 0.828834593296051,
      "learning_rate": 0.00024393203883495143,
      "loss": 0.8619,
      "step": 97
    },
    {
      "epoch": 0.9011494252873563,
      "grad_norm": 0.9028300642967224,
      "learning_rate": 0.0002432038834951456,
      "loss": 0.9762,
      "step": 98
    },
    {
      "epoch": 0.9103448275862069,
      "grad_norm": 1.261749505996704,
      "learning_rate": 0.0002424757281553398,
      "loss": 0.7234,
      "step": 99
    },
    {
      "epoch": 0.9195402298850575,
      "grad_norm": 0.7746110558509827,
      "learning_rate": 0.00024174757281553394,
      "loss": 0.8761,
      "step": 100
    },
    {
      "epoch": 0.9287356321839081,
      "grad_norm": 0.8339172005653381,
      "learning_rate": 0.00024101941747572812,
      "loss": 0.699,
      "step": 101
    },
    {
      "epoch": 0.9379310344827586,
      "grad_norm": 1.1835957765579224,
      "learning_rate": 0.0002402912621359223,
      "loss": 0.6597,
      "step": 102
    },
    {
      "epoch": 0.9471264367816092,
      "grad_norm": 0.7516824007034302,
      "learning_rate": 0.00023956310679611648,
      "loss": 0.9335,
      "step": 103
    },
    {
      "epoch": 0.9563218390804598,
      "grad_norm": 0.7029162049293518,
      "learning_rate": 0.00023883495145631065,
      "loss": 0.8889,
      "step": 104
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 0.8920773267745972,
      "learning_rate": 0.00023810679611650483,
      "loss": 0.6555,
      "step": 105
    },
    {
      "epoch": 0.9747126436781609,
      "grad_norm": 0.7414918541908264,
      "learning_rate": 0.000237378640776699,
      "loss": 1.0669,
      "step": 106
    },
    {
      "epoch": 0.9839080459770115,
      "grad_norm": 0.8566169738769531,
      "learning_rate": 0.0002366504854368932,
      "loss": 0.8665,
      "step": 107
    },
    {
      "epoch": 0.993103448275862,
      "grad_norm": 0.6569233536720276,
      "learning_rate": 0.00023592233009708734,
      "loss": 0.585,
      "step": 108
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.2873867750167847,
      "learning_rate": 0.00023519417475728152,
      "loss": 0.8267,
      "step": 109
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.4196490049362183,
      "eval_runtime": 76.0927,
      "eval_samples_per_second": 4.363,
      "eval_steps_per_second": 2.182,
      "step": 109
    },
    {
      "epoch": 1.0091954022988505,
      "grad_norm": 0.820888340473175,
      "learning_rate": 0.0002344660194174757,
      "loss": 0.677,
      "step": 110
    },
    {
      "epoch": 1.018390804597701,
      "grad_norm": 0.7585135698318481,
      "learning_rate": 0.00023373786407766988,
      "loss": 0.6585,
      "step": 111
    },
    {
      "epoch": 1.0275862068965518,
      "grad_norm": 0.8374487161636353,
      "learning_rate": 0.00023300970873786406,
      "loss": 0.7201,
      "step": 112
    },
    {
      "epoch": 1.0367816091954023,
      "grad_norm": 0.9571560621261597,
      "learning_rate": 0.00023228155339805823,
      "loss": 0.4576,
      "step": 113
    },
    {
      "epoch": 1.0459770114942528,
      "grad_norm": 0.6560165882110596,
      "learning_rate": 0.0002315533980582524,
      "loss": 0.8126,
      "step": 114
    },
    {
      "epoch": 1.0551724137931036,
      "grad_norm": 1.02234947681427,
      "learning_rate": 0.0002308252427184466,
      "loss": 0.5351,
      "step": 115
    },
    {
      "epoch": 1.064367816091954,
      "grad_norm": 0.5752732753753662,
      "learning_rate": 0.00023009708737864074,
      "loss": 0.4256,
      "step": 116
    },
    {
      "epoch": 1.0735632183908046,
      "grad_norm": 1.0608015060424805,
      "learning_rate": 0.00022936893203883492,
      "loss": 0.6285,
      "step": 117
    },
    {
      "epoch": 1.0827586206896551,
      "grad_norm": 0.6491764187812805,
      "learning_rate": 0.0002286407766990291,
      "loss": 0.8877,
      "step": 118
    },
    {
      "epoch": 1.0919540229885056,
      "grad_norm": 0.6474234461784363,
      "learning_rate": 0.00022791262135922328,
      "loss": 0.5508,
      "step": 119
    },
    {
      "epoch": 1.1011494252873564,
      "grad_norm": 0.8234098553657532,
      "learning_rate": 0.00022718446601941746,
      "loss": 0.5686,
      "step": 120
    },
    {
      "epoch": 1.110344827586207,
      "grad_norm": 0.7973845601081848,
      "learning_rate": 0.00022645631067961164,
      "loss": 0.3965,
      "step": 121
    },
    {
      "epoch": 1.1195402298850574,
      "grad_norm": 0.7338511943817139,
      "learning_rate": 0.00022572815533980582,
      "loss": 0.3424,
      "step": 122
    },
    {
      "epoch": 1.1287356321839082,
      "grad_norm": 0.6569823026657104,
      "learning_rate": 0.000225,
      "loss": 0.5071,
      "step": 123
    },
    {
      "epoch": 1.1379310344827587,
      "grad_norm": 0.768832802772522,
      "learning_rate": 0.00022427184466019415,
      "loss": 0.5674,
      "step": 124
    },
    {
      "epoch": 1.1471264367816092,
      "grad_norm": 0.5685952305793762,
      "learning_rate": 0.00022354368932038832,
      "loss": 0.4932,
      "step": 125
    },
    {
      "epoch": 1.1563218390804597,
      "grad_norm": 0.7012789249420166,
      "learning_rate": 0.0002228155339805825,
      "loss": 0.4092,
      "step": 126
    },
    {
      "epoch": 1.1655172413793102,
      "grad_norm": 1.0596917867660522,
      "learning_rate": 0.00022208737864077668,
      "loss": 0.5964,
      "step": 127
    },
    {
      "epoch": 1.174712643678161,
      "grad_norm": 0.5177541375160217,
      "learning_rate": 0.00022135922330097086,
      "loss": 0.5093,
      "step": 128
    },
    {
      "epoch": 1.1839080459770115,
      "grad_norm": 0.6834475994110107,
      "learning_rate": 0.00022063106796116504,
      "loss": 0.5799,
      "step": 129
    },
    {
      "epoch": 1.193103448275862,
      "grad_norm": 0.46928611397743225,
      "learning_rate": 0.00021990291262135922,
      "loss": 0.424,
      "step": 130
    },
    {
      "epoch": 1.2022988505747128,
      "grad_norm": 0.9471850991249084,
      "learning_rate": 0.00021917475728155337,
      "loss": 0.5087,
      "step": 131
    },
    {
      "epoch": 1.2114942528735633,
      "grad_norm": 0.6638930439949036,
      "learning_rate": 0.00021844660194174755,
      "loss": 0.5133,
      "step": 132
    },
    {
      "epoch": 1.2206896551724138,
      "grad_norm": 0.554904580116272,
      "learning_rate": 0.00021771844660194173,
      "loss": 0.6236,
      "step": 133
    },
    {
      "epoch": 1.2298850574712643,
      "grad_norm": 0.6292574405670166,
      "learning_rate": 0.0002169902912621359,
      "loss": 0.3756,
      "step": 134
    },
    {
      "epoch": 1.2390804597701148,
      "grad_norm": 0.7311149835586548,
      "learning_rate": 0.00021626213592233008,
      "loss": 0.5629,
      "step": 135
    },
    {
      "epoch": 1.2482758620689656,
      "grad_norm": 0.6303699016571045,
      "learning_rate": 0.00021553398058252426,
      "loss": 0.7889,
      "step": 136
    },
    {
      "epoch": 1.257471264367816,
      "grad_norm": 0.8203924894332886,
      "learning_rate": 0.00021480582524271844,
      "loss": 0.3686,
      "step": 137
    },
    {
      "epoch": 1.2666666666666666,
      "grad_norm": 1.10085928440094,
      "learning_rate": 0.00021407766990291262,
      "loss": 0.4527,
      "step": 138
    },
    {
      "epoch": 1.2758620689655173,
      "grad_norm": 0.8229245543479919,
      "learning_rate": 0.00021334951456310677,
      "loss": 0.526,
      "step": 139
    },
    {
      "epoch": 1.2850574712643679,
      "grad_norm": 0.7513696551322937,
      "learning_rate": 0.00021262135922330095,
      "loss": 0.5633,
      "step": 140
    },
    {
      "epoch": 1.2942528735632184,
      "grad_norm": 0.8141852021217346,
      "learning_rate": 0.00021189320388349513,
      "loss": 0.751,
      "step": 141
    },
    {
      "epoch": 1.303448275862069,
      "grad_norm": 0.5154571533203125,
      "learning_rate": 0.0002111650485436893,
      "loss": 0.3195,
      "step": 142
    },
    {
      "epoch": 1.3126436781609194,
      "grad_norm": 0.41179782152175903,
      "learning_rate": 0.00021043689320388349,
      "loss": 0.265,
      "step": 143
    },
    {
      "epoch": 1.3218390804597702,
      "grad_norm": 0.5113994479179382,
      "learning_rate": 0.00020970873786407766,
      "loss": 0.565,
      "step": 144
    },
    {
      "epoch": 1.3310344827586207,
      "grad_norm": 0.5308792591094971,
      "learning_rate": 0.00020898058252427184,
      "loss": 0.2585,
      "step": 145
    },
    {
      "epoch": 1.3402298850574712,
      "grad_norm": 0.5127933025360107,
      "learning_rate": 0.00020825242718446602,
      "loss": 0.3584,
      "step": 146
    },
    {
      "epoch": 1.349425287356322,
      "grad_norm": 0.4476073384284973,
      "learning_rate": 0.00020752427184466017,
      "loss": 0.2942,
      "step": 147
    },
    {
      "epoch": 1.3586206896551725,
      "grad_norm": 0.5431134104728699,
      "learning_rate": 0.00020679611650485435,
      "loss": 0.4839,
      "step": 148
    },
    {
      "epoch": 1.367816091954023,
      "grad_norm": 0.6306489109992981,
      "learning_rate": 0.00020606796116504853,
      "loss": 0.3573,
      "step": 149
    },
    {
      "epoch": 1.3770114942528735,
      "grad_norm": 0.593370795249939,
      "learning_rate": 0.0002053398058252427,
      "loss": 0.4681,
      "step": 150
    },
    {
      "epoch": 1.386206896551724,
      "grad_norm": 0.733506977558136,
      "learning_rate": 0.0002046116504854369,
      "loss": 0.6061,
      "step": 151
    },
    {
      "epoch": 1.3954022988505748,
      "grad_norm": 0.8149594664573669,
      "learning_rate": 0.00020388349514563107,
      "loss": 0.5411,
      "step": 152
    },
    {
      "epoch": 1.4045977011494253,
      "grad_norm": 0.7658489942550659,
      "learning_rate": 0.00020315533980582524,
      "loss": 0.9712,
      "step": 153
    },
    {
      "epoch": 1.4137931034482758,
      "grad_norm": 0.7406046986579895,
      "learning_rate": 0.00020242718446601942,
      "loss": 0.3591,
      "step": 154
    },
    {
      "epoch": 1.4229885057471265,
      "grad_norm": 0.41466066241264343,
      "learning_rate": 0.00020169902912621357,
      "loss": 0.3579,
      "step": 155
    },
    {
      "epoch": 1.432183908045977,
      "grad_norm": 0.5462120175361633,
      "learning_rate": 0.00020097087378640775,
      "loss": 0.512,
      "step": 156
    },
    {
      "epoch": 1.4413793103448276,
      "grad_norm": 0.4008376896381378,
      "learning_rate": 0.00020024271844660193,
      "loss": 0.4698,
      "step": 157
    },
    {
      "epoch": 1.450574712643678,
      "grad_norm": 0.3887282609939575,
      "learning_rate": 0.0001995145631067961,
      "loss": 0.3842,
      "step": 158
    },
    {
      "epoch": 1.4597701149425286,
      "grad_norm": 0.5248357653617859,
      "learning_rate": 0.0001987864077669903,
      "loss": 0.3748,
      "step": 159
    },
    {
      "epoch": 1.4689655172413794,
      "grad_norm": 0.8764130473136902,
      "learning_rate": 0.00019805825242718447,
      "loss": 0.5248,
      "step": 160
    },
    {
      "epoch": 1.4781609195402299,
      "grad_norm": 0.631163477897644,
      "learning_rate": 0.00019733009708737865,
      "loss": 0.6423,
      "step": 161
    },
    {
      "epoch": 1.4873563218390804,
      "grad_norm": 0.5436691641807556,
      "learning_rate": 0.0001966019417475728,
      "loss": 0.4372,
      "step": 162
    },
    {
      "epoch": 1.4965517241379311,
      "grad_norm": 0.6378396153450012,
      "learning_rate": 0.00019587378640776698,
      "loss": 0.3392,
      "step": 163
    },
    {
      "epoch": 1.5057471264367817,
      "grad_norm": 0.41411787271499634,
      "learning_rate": 0.00019514563106796116,
      "loss": 0.3925,
      "step": 164
    },
    {
      "epoch": 1.5149425287356322,
      "grad_norm": 0.7309579849243164,
      "learning_rate": 0.00019441747572815533,
      "loss": 0.6451,
      "step": 165
    },
    {
      "epoch": 1.524137931034483,
      "grad_norm": 0.6073735356330872,
      "learning_rate": 0.0001936893203883495,
      "loss": 0.6802,
      "step": 166
    },
    {
      "epoch": 1.5333333333333332,
      "grad_norm": 0.37571704387664795,
      "learning_rate": 0.0001929611650485437,
      "loss": 0.5077,
      "step": 167
    },
    {
      "epoch": 1.542528735632184,
      "grad_norm": 0.9433541893959045,
      "learning_rate": 0.00019223300970873787,
      "loss": 0.3427,
      "step": 168
    },
    {
      "epoch": 1.5517241379310345,
      "grad_norm": 0.5408298373222351,
      "learning_rate": 0.00019150485436893205,
      "loss": 0.2311,
      "step": 169
    },
    {
      "epoch": 1.560919540229885,
      "grad_norm": 0.39010676741600037,
      "learning_rate": 0.0001907766990291262,
      "loss": 0.2743,
      "step": 170
    },
    {
      "epoch": 1.5701149425287357,
      "grad_norm": 0.6753641963005066,
      "learning_rate": 0.00019004854368932038,
      "loss": 0.8422,
      "step": 171
    },
    {
      "epoch": 1.5793103448275863,
      "grad_norm": 0.5095095634460449,
      "learning_rate": 0.00018932038834951456,
      "loss": 0.3418,
      "step": 172
    },
    {
      "epoch": 1.5885057471264368,
      "grad_norm": 0.6390260457992554,
      "learning_rate": 0.00018859223300970874,
      "loss": 0.4438,
      "step": 173
    },
    {
      "epoch": 1.5977011494252875,
      "grad_norm": 0.2993685007095337,
      "learning_rate": 0.00018786407766990291,
      "loss": 0.2958,
      "step": 174
    },
    {
      "epoch": 1.6068965517241378,
      "grad_norm": 0.7637106776237488,
      "learning_rate": 0.00018713592233009707,
      "loss": 0.6225,
      "step": 175
    },
    {
      "epoch": 1.6160919540229886,
      "grad_norm": 0.7003872990608215,
      "learning_rate": 0.00018640776699029122,
      "loss": 0.6526,
      "step": 176
    },
    {
      "epoch": 1.625287356321839,
      "grad_norm": 0.3579877018928528,
      "learning_rate": 0.0001856796116504854,
      "loss": 0.1985,
      "step": 177
    },
    {
      "epoch": 1.6344827586206896,
      "grad_norm": 0.2747999429702759,
      "learning_rate": 0.00018495145631067957,
      "loss": 0.2124,
      "step": 178
    },
    {
      "epoch": 1.6436781609195403,
      "grad_norm": 0.6537378430366516,
      "learning_rate": 0.00018422330097087375,
      "loss": 0.7906,
      "step": 179
    },
    {
      "epoch": 1.6528735632183909,
      "grad_norm": 0.4569692313671112,
      "learning_rate": 0.00018349514563106793,
      "loss": 0.4379,
      "step": 180
    },
    {
      "epoch": 1.6620689655172414,
      "grad_norm": 0.4810560941696167,
      "learning_rate": 0.0001827669902912621,
      "loss": 0.4129,
      "step": 181
    },
    {
      "epoch": 1.6712643678160921,
      "grad_norm": 0.5723876357078552,
      "learning_rate": 0.0001820388349514563,
      "loss": 0.3974,
      "step": 182
    },
    {
      "epoch": 1.6804597701149424,
      "grad_norm": 0.5330012440681458,
      "learning_rate": 0.00018131067961165047,
      "loss": 0.3204,
      "step": 183
    },
    {
      "epoch": 1.6896551724137931,
      "grad_norm": 0.6080510020256042,
      "learning_rate": 0.00018058252427184462,
      "loss": 0.7068,
      "step": 184
    },
    {
      "epoch": 1.6988505747126437,
      "grad_norm": 0.4264465570449829,
      "learning_rate": 0.0001798543689320388,
      "loss": 0.4423,
      "step": 185
    },
    {
      "epoch": 1.7080459770114942,
      "grad_norm": 0.5706063508987427,
      "learning_rate": 0.00017912621359223298,
      "loss": 0.3923,
      "step": 186
    },
    {
      "epoch": 1.717241379310345,
      "grad_norm": 0.45583376288414,
      "learning_rate": 0.00017839805825242716,
      "loss": 0.2542,
      "step": 187
    },
    {
      "epoch": 1.7264367816091954,
      "grad_norm": 0.5898862481117249,
      "learning_rate": 0.00017766990291262133,
      "loss": 0.3475,
      "step": 188
    },
    {
      "epoch": 1.735632183908046,
      "grad_norm": 0.6169423460960388,
      "learning_rate": 0.0001769417475728155,
      "loss": 0.5062,
      "step": 189
    },
    {
      "epoch": 1.7448275862068967,
      "grad_norm": 0.5277095437049866,
      "learning_rate": 0.0001762135922330097,
      "loss": 0.5095,
      "step": 190
    },
    {
      "epoch": 1.754022988505747,
      "grad_norm": 0.3574261963367462,
      "learning_rate": 0.00017548543689320387,
      "loss": 0.4086,
      "step": 191
    },
    {
      "epoch": 1.7632183908045977,
      "grad_norm": 0.4890098571777344,
      "learning_rate": 0.00017475728155339802,
      "loss": 0.5313,
      "step": 192
    },
    {
      "epoch": 1.7724137931034483,
      "grad_norm": 0.3213779628276825,
      "learning_rate": 0.0001740291262135922,
      "loss": 0.4056,
      "step": 193
    },
    {
      "epoch": 1.7816091954022988,
      "grad_norm": 0.3488066494464874,
      "learning_rate": 0.00017330097087378638,
      "loss": 0.2805,
      "step": 194
    },
    {
      "epoch": 1.7908045977011495,
      "grad_norm": 0.32463303208351135,
      "learning_rate": 0.00017257281553398056,
      "loss": 0.3157,
      "step": 195
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.5792247653007507,
      "learning_rate": 0.00017184466019417474,
      "loss": 0.4088,
      "step": 196
    },
    {
      "epoch": 1.8091954022988506,
      "grad_norm": 0.6414899230003357,
      "learning_rate": 0.00017111650485436891,
      "loss": 0.3401,
      "step": 197
    },
    {
      "epoch": 1.8183908045977013,
      "grad_norm": 0.5633428692817688,
      "learning_rate": 0.0001703883495145631,
      "loss": 0.3967,
      "step": 198
    },
    {
      "epoch": 1.8275862068965516,
      "grad_norm": 0.3799390196800232,
      "learning_rate": 0.00016966019417475724,
      "loss": 0.3257,
      "step": 199
    },
    {
      "epoch": 1.8367816091954023,
      "grad_norm": 0.26283133029937744,
      "learning_rate": 0.00016893203883495142,
      "loss": 0.2517,
      "step": 200
    },
    {
      "epoch": 1.8459770114942529,
      "grad_norm": 0.3688882887363434,
      "learning_rate": 0.0001682038834951456,
      "loss": 0.2523,
      "step": 201
    },
    {
      "epoch": 1.8551724137931034,
      "grad_norm": 0.3409745991230011,
      "learning_rate": 0.00016747572815533978,
      "loss": 0.327,
      "step": 202
    },
    {
      "epoch": 1.8643678160919541,
      "grad_norm": 0.4516524076461792,
      "learning_rate": 0.00016674757281553396,
      "loss": 0.4782,
      "step": 203
    },
    {
      "epoch": 1.8735632183908046,
      "grad_norm": 0.4798257350921631,
      "learning_rate": 0.00016601941747572814,
      "loss": 0.4515,
      "step": 204
    },
    {
      "epoch": 1.8827586206896552,
      "grad_norm": 0.49878618121147156,
      "learning_rate": 0.00016529126213592232,
      "loss": 0.2599,
      "step": 205
    },
    {
      "epoch": 1.891954022988506,
      "grad_norm": 0.42895984649658203,
      "learning_rate": 0.0001645631067961165,
      "loss": 0.4502,
      "step": 206
    },
    {
      "epoch": 1.9011494252873562,
      "grad_norm": 0.391862154006958,
      "learning_rate": 0.00016383495145631065,
      "loss": 0.3478,
      "step": 207
    },
    {
      "epoch": 1.910344827586207,
      "grad_norm": 0.6885610818862915,
      "learning_rate": 0.00016310679611650483,
      "loss": 0.6568,
      "step": 208
    },
    {
      "epoch": 1.9195402298850575,
      "grad_norm": 0.579025149345398,
      "learning_rate": 0.000162378640776699,
      "loss": 0.3724,
      "step": 209
    },
    {
      "epoch": 1.928735632183908,
      "grad_norm": 0.36686283349990845,
      "learning_rate": 0.00016165048543689318,
      "loss": 0.3496,
      "step": 210
    },
    {
      "epoch": 1.9379310344827587,
      "grad_norm": 0.3137669265270233,
      "learning_rate": 0.00016092233009708736,
      "loss": 0.1906,
      "step": 211
    },
    {
      "epoch": 1.9471264367816092,
      "grad_norm": 0.4134412407875061,
      "learning_rate": 0.00016019417475728154,
      "loss": 0.4461,
      "step": 212
    },
    {
      "epoch": 1.9563218390804598,
      "grad_norm": 0.3085227608680725,
      "learning_rate": 0.00015946601941747572,
      "loss": 0.2951,
      "step": 213
    },
    {
      "epoch": 1.9655172413793105,
      "grad_norm": 0.2577720284461975,
      "learning_rate": 0.0001587378640776699,
      "loss": 0.2708,
      "step": 214
    },
    {
      "epoch": 1.9747126436781608,
      "grad_norm": 0.6653198003768921,
      "learning_rate": 0.00015800970873786405,
      "loss": 0.9269,
      "step": 215
    },
    {
      "epoch": 1.9839080459770115,
      "grad_norm": 0.6150757670402527,
      "learning_rate": 0.00015728155339805823,
      "loss": 0.692,
      "step": 216
    },
    {
      "epoch": 1.993103448275862,
      "grad_norm": 0.44791334867477417,
      "learning_rate": 0.0001565533980582524,
      "loss": 0.4351,
      "step": 217
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.7618283629417419,
      "learning_rate": 0.00015582524271844658,
      "loss": 0.7963,
      "step": 218
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.5562227964401245,
      "eval_runtime": 76.4203,
      "eval_samples_per_second": 4.344,
      "eval_steps_per_second": 2.172,
      "step": 218
    },
    {
      "epoch": 2.0091954022988507,
      "grad_norm": 0.32502397894859314,
      "learning_rate": 0.00015509708737864076,
      "loss": 0.2374,
      "step": 219
    },
    {
      "epoch": 2.018390804597701,
      "grad_norm": 0.3833533525466919,
      "learning_rate": 0.00015436893203883494,
      "loss": 0.3454,
      "step": 220
    },
    {
      "epoch": 2.027586206896552,
      "grad_norm": 0.3064585328102112,
      "learning_rate": 0.00015364077669902912,
      "loss": 0.2924,
      "step": 221
    },
    {
      "epoch": 2.036781609195402,
      "grad_norm": 0.402799516916275,
      "learning_rate": 0.0001529126213592233,
      "loss": 0.3828,
      "step": 222
    },
    {
      "epoch": 2.045977011494253,
      "grad_norm": 0.30839601159095764,
      "learning_rate": 0.00015218446601941745,
      "loss": 0.2421,
      "step": 223
    },
    {
      "epoch": 2.0551724137931036,
      "grad_norm": 0.4161797761917114,
      "learning_rate": 0.00015145631067961163,
      "loss": 0.2989,
      "step": 224
    },
    {
      "epoch": 2.064367816091954,
      "grad_norm": 0.4436909556388855,
      "learning_rate": 0.0001507281553398058,
      "loss": 0.3328,
      "step": 225
    },
    {
      "epoch": 2.0735632183908046,
      "grad_norm": 0.5570560693740845,
      "learning_rate": 0.00015,
      "loss": 0.3132,
      "step": 226
    },
    {
      "epoch": 2.0827586206896553,
      "grad_norm": 0.7697573304176331,
      "learning_rate": 0.00014927184466019417,
      "loss": 0.5465,
      "step": 227
    },
    {
      "epoch": 2.0919540229885056,
      "grad_norm": 0.44799014925956726,
      "learning_rate": 0.00014854368932038834,
      "loss": 0.2612,
      "step": 228
    },
    {
      "epoch": 2.1011494252873564,
      "grad_norm": 0.4220842719078064,
      "learning_rate": 0.00014781553398058252,
      "loss": 0.239,
      "step": 229
    },
    {
      "epoch": 2.110344827586207,
      "grad_norm": 0.480135053396225,
      "learning_rate": 0.00014708737864077667,
      "loss": 0.339,
      "step": 230
    },
    {
      "epoch": 2.1195402298850574,
      "grad_norm": 0.46936216950416565,
      "learning_rate": 0.00014635922330097085,
      "loss": 0.2952,
      "step": 231
    },
    {
      "epoch": 2.128735632183908,
      "grad_norm": 0.6709425449371338,
      "learning_rate": 0.00014563106796116503,
      "loss": 0.3496,
      "step": 232
    },
    {
      "epoch": 2.1379310344827585,
      "grad_norm": 0.37677356600761414,
      "learning_rate": 0.0001449029126213592,
      "loss": 0.3361,
      "step": 233
    },
    {
      "epoch": 2.147126436781609,
      "grad_norm": 1.1358637809753418,
      "learning_rate": 0.0001441747572815534,
      "loss": 0.5066,
      "step": 234
    },
    {
      "epoch": 2.15632183908046,
      "grad_norm": 0.5674331188201904,
      "learning_rate": 0.00014344660194174757,
      "loss": 0.3828,
      "step": 235
    },
    {
      "epoch": 2.1655172413793102,
      "grad_norm": 0.473661333322525,
      "learning_rate": 0.00014271844660194175,
      "loss": 0.2879,
      "step": 236
    },
    {
      "epoch": 2.174712643678161,
      "grad_norm": 0.48269397020339966,
      "learning_rate": 0.00014199029126213592,
      "loss": 0.3322,
      "step": 237
    },
    {
      "epoch": 2.1839080459770113,
      "grad_norm": 0.6089305281639099,
      "learning_rate": 0.00014126213592233008,
      "loss": 0.3218,
      "step": 238
    },
    {
      "epoch": 2.193103448275862,
      "grad_norm": 0.3989170789718628,
      "learning_rate": 0.00014053398058252425,
      "loss": 0.2518,
      "step": 239
    },
    {
      "epoch": 2.2022988505747128,
      "grad_norm": 0.3296070992946625,
      "learning_rate": 0.00013980582524271843,
      "loss": 0.2045,
      "step": 240
    },
    {
      "epoch": 2.211494252873563,
      "grad_norm": 0.8743225336074829,
      "learning_rate": 0.0001390776699029126,
      "loss": 0.6216,
      "step": 241
    },
    {
      "epoch": 2.220689655172414,
      "grad_norm": 0.36944878101348877,
      "learning_rate": 0.0001383495145631068,
      "loss": 0.1719,
      "step": 242
    },
    {
      "epoch": 2.2298850574712645,
      "grad_norm": 0.33532336354255676,
      "learning_rate": 0.00013762135922330097,
      "loss": 0.2154,
      "step": 243
    },
    {
      "epoch": 2.239080459770115,
      "grad_norm": 0.7817918658256531,
      "learning_rate": 0.00013689320388349515,
      "loss": 0.48,
      "step": 244
    },
    {
      "epoch": 2.2482758620689656,
      "grad_norm": 0.8702794313430786,
      "learning_rate": 0.00013616504854368933,
      "loss": 0.6025,
      "step": 245
    },
    {
      "epoch": 2.2574712643678163,
      "grad_norm": 0.3685101866722107,
      "learning_rate": 0.00013543689320388348,
      "loss": 0.164,
      "step": 246
    },
    {
      "epoch": 2.2666666666666666,
      "grad_norm": 0.7941250801086426,
      "learning_rate": 0.00013470873786407766,
      "loss": 0.508,
      "step": 247
    },
    {
      "epoch": 2.2758620689655173,
      "grad_norm": 0.41759851574897766,
      "learning_rate": 0.00013398058252427184,
      "loss": 0.1994,
      "step": 248
    },
    {
      "epoch": 2.2850574712643676,
      "grad_norm": 0.653108537197113,
      "learning_rate": 0.00013325242718446601,
      "loss": 0.3635,
      "step": 249
    },
    {
      "epoch": 2.2942528735632184,
      "grad_norm": 0.7277186512947083,
      "learning_rate": 0.0001325242718446602,
      "loss": 0.4847,
      "step": 250
    },
    {
      "epoch": 2.303448275862069,
      "grad_norm": 0.4302230775356293,
      "learning_rate": 0.00013179611650485437,
      "loss": 0.296,
      "step": 251
    },
    {
      "epoch": 2.3126436781609194,
      "grad_norm": 0.3574405312538147,
      "learning_rate": 0.00013106796116504852,
      "loss": 0.1844,
      "step": 252
    },
    {
      "epoch": 2.32183908045977,
      "grad_norm": 0.5645660161972046,
      "learning_rate": 0.0001303398058252427,
      "loss": 0.3868,
      "step": 253
    },
    {
      "epoch": 2.3310344827586205,
      "grad_norm": 0.4044601619243622,
      "learning_rate": 0.00012961165048543688,
      "loss": 0.2447,
      "step": 254
    },
    {
      "epoch": 2.340229885057471,
      "grad_norm": 0.41919222474098206,
      "learning_rate": 0.00012888349514563106,
      "loss": 0.258,
      "step": 255
    },
    {
      "epoch": 2.349425287356322,
      "grad_norm": 0.5476058721542358,
      "learning_rate": 0.00012815533980582524,
      "loss": 0.3241,
      "step": 256
    },
    {
      "epoch": 2.3586206896551722,
      "grad_norm": 0.8460379838943481,
      "learning_rate": 0.0001274271844660194,
      "loss": 0.4723,
      "step": 257
    },
    {
      "epoch": 2.367816091954023,
      "grad_norm": 0.5093741416931152,
      "learning_rate": 0.00012669902912621357,
      "loss": 0.2634,
      "step": 258
    },
    {
      "epoch": 2.3770114942528737,
      "grad_norm": 0.6395347714424133,
      "learning_rate": 0.00012597087378640775,
      "loss": 0.4046,
      "step": 259
    },
    {
      "epoch": 2.386206896551724,
      "grad_norm": 0.7581174373626709,
      "learning_rate": 0.00012524271844660192,
      "loss": 0.5078,
      "step": 260
    },
    {
      "epoch": 2.3954022988505748,
      "grad_norm": 0.49910837411880493,
      "learning_rate": 0.0001245145631067961,
      "loss": 0.2991,
      "step": 261
    },
    {
      "epoch": 2.4045977011494255,
      "grad_norm": 0.34560054540634155,
      "learning_rate": 0.00012378640776699028,
      "loss": 0.2364,
      "step": 262
    },
    {
      "epoch": 2.413793103448276,
      "grad_norm": 0.3754998743534088,
      "learning_rate": 0.00012305825242718446,
      "loss": 0.2107,
      "step": 263
    },
    {
      "epoch": 2.4229885057471265,
      "grad_norm": 0.33737894892692566,
      "learning_rate": 0.0001223300970873786,
      "loss": 0.1509,
      "step": 264
    },
    {
      "epoch": 2.432183908045977,
      "grad_norm": 0.4702773690223694,
      "learning_rate": 0.0001216019417475728,
      "loss": 0.2477,
      "step": 265
    },
    {
      "epoch": 2.4413793103448276,
      "grad_norm": 0.6840780973434448,
      "learning_rate": 0.00012087378640776697,
      "loss": 0.521,
      "step": 266
    },
    {
      "epoch": 2.4505747126436783,
      "grad_norm": 0.4670672118663788,
      "learning_rate": 0.00012014563106796115,
      "loss": 0.3054,
      "step": 267
    },
    {
      "epoch": 2.4597701149425286,
      "grad_norm": 0.34090566635131836,
      "learning_rate": 0.00011941747572815533,
      "loss": 0.2215,
      "step": 268
    },
    {
      "epoch": 2.4689655172413794,
      "grad_norm": 0.32637739181518555,
      "learning_rate": 0.0001186893203883495,
      "loss": 0.211,
      "step": 269
    },
    {
      "epoch": 2.4781609195402297,
      "grad_norm": 0.325579971075058,
      "learning_rate": 0.00011796116504854367,
      "loss": 0.1892,
      "step": 270
    },
    {
      "epoch": 2.4873563218390804,
      "grad_norm": 0.2964633107185364,
      "learning_rate": 0.00011723300970873785,
      "loss": 0.1838,
      "step": 271
    },
    {
      "epoch": 2.496551724137931,
      "grad_norm": 0.6292815804481506,
      "learning_rate": 0.00011650485436893203,
      "loss": 0.5632,
      "step": 272
    },
    {
      "epoch": 2.5057471264367814,
      "grad_norm": 0.3210340738296509,
      "learning_rate": 0.0001157766990291262,
      "loss": 0.1643,
      "step": 273
    },
    {
      "epoch": 2.514942528735632,
      "grad_norm": 0.35492682456970215,
      "learning_rate": 0.00011504854368932037,
      "loss": 0.2607,
      "step": 274
    },
    {
      "epoch": 2.524137931034483,
      "grad_norm": 0.4605109393596649,
      "learning_rate": 0.00011432038834951455,
      "loss": 0.3006,
      "step": 275
    },
    {
      "epoch": 2.533333333333333,
      "grad_norm": 0.21065670251846313,
      "learning_rate": 0.00011359223300970873,
      "loss": 0.0945,
      "step": 276
    },
    {
      "epoch": 2.542528735632184,
      "grad_norm": 0.4777117669582367,
      "learning_rate": 0.00011286407766990291,
      "loss": 0.2718,
      "step": 277
    },
    {
      "epoch": 2.5517241379310347,
      "grad_norm": 0.4873185157775879,
      "learning_rate": 0.00011213592233009707,
      "loss": 0.3336,
      "step": 278
    },
    {
      "epoch": 2.560919540229885,
      "grad_norm": 0.3319976329803467,
      "learning_rate": 0.00011140776699029125,
      "loss": 0.2026,
      "step": 279
    },
    {
      "epoch": 2.5701149425287357,
      "grad_norm": 0.3072815239429474,
      "learning_rate": 0.00011067961165048543,
      "loss": 0.187,
      "step": 280
    },
    {
      "epoch": 2.5793103448275865,
      "grad_norm": 0.3600887656211853,
      "learning_rate": 0.00010995145631067961,
      "loss": 0.1711,
      "step": 281
    },
    {
      "epoch": 2.5885057471264368,
      "grad_norm": 0.40916576981544495,
      "learning_rate": 0.00010922330097087377,
      "loss": 0.2618,
      "step": 282
    },
    {
      "epoch": 2.5977011494252875,
      "grad_norm": 0.6749149560928345,
      "learning_rate": 0.00010849514563106795,
      "loss": 0.4636,
      "step": 283
    },
    {
      "epoch": 2.606896551724138,
      "grad_norm": 0.29242533445358276,
      "learning_rate": 0.00010776699029126213,
      "loss": 0.1577,
      "step": 284
    },
    {
      "epoch": 2.6160919540229886,
      "grad_norm": 0.3938550651073456,
      "learning_rate": 0.00010703883495145631,
      "loss": 0.1937,
      "step": 285
    },
    {
      "epoch": 2.625287356321839,
      "grad_norm": 0.3914635479450226,
      "learning_rate": 0.00010631067961165047,
      "loss": 0.2525,
      "step": 286
    },
    {
      "epoch": 2.6344827586206896,
      "grad_norm": 0.3480489253997803,
      "learning_rate": 0.00010558252427184465,
      "loss": 0.178,
      "step": 287
    },
    {
      "epoch": 2.6436781609195403,
      "grad_norm": 0.230756476521492,
      "learning_rate": 0.00010485436893203883,
      "loss": 0.1148,
      "step": 288
    },
    {
      "epoch": 2.6528735632183906,
      "grad_norm": 0.44225171208381653,
      "learning_rate": 0.00010412621359223301,
      "loss": 0.3141,
      "step": 289
    },
    {
      "epoch": 2.6620689655172414,
      "grad_norm": 0.4487713873386383,
      "learning_rate": 0.00010339805825242718,
      "loss": 0.2956,
      "step": 290
    },
    {
      "epoch": 2.671264367816092,
      "grad_norm": 0.499773234128952,
      "learning_rate": 0.00010266990291262135,
      "loss": 0.3489,
      "step": 291
    },
    {
      "epoch": 2.6804597701149424,
      "grad_norm": 0.40772294998168945,
      "learning_rate": 0.00010194174757281553,
      "loss": 0.2208,
      "step": 292
    },
    {
      "epoch": 2.689655172413793,
      "grad_norm": 0.5451008081436157,
      "learning_rate": 0.00010121359223300971,
      "loss": 0.3616,
      "step": 293
    },
    {
      "epoch": 2.698850574712644,
      "grad_norm": 0.4137297570705414,
      "learning_rate": 0.00010048543689320388,
      "loss": 0.2938,
      "step": 294
    },
    {
      "epoch": 2.708045977011494,
      "grad_norm": 0.839198648929596,
      "learning_rate": 9.975728155339806e-05,
      "loss": 0.6327,
      "step": 295
    },
    {
      "epoch": 2.717241379310345,
      "grad_norm": 0.6517747044563293,
      "learning_rate": 9.902912621359223e-05,
      "loss": 0.4386,
      "step": 296
    },
    {
      "epoch": 2.7264367816091957,
      "grad_norm": 0.3699157238006592,
      "learning_rate": 9.83009708737864e-05,
      "loss": 0.2534,
      "step": 297
    },
    {
      "epoch": 2.735632183908046,
      "grad_norm": 0.4842491149902344,
      "learning_rate": 9.757281553398058e-05,
      "loss": 0.3054,
      "step": 298
    },
    {
      "epoch": 2.7448275862068967,
      "grad_norm": 0.34171465039253235,
      "learning_rate": 9.684466019417476e-05,
      "loss": 0.2192,
      "step": 299
    },
    {
      "epoch": 2.754022988505747,
      "grad_norm": 0.43019741773605347,
      "learning_rate": 9.611650485436893e-05,
      "loss": 0.2504,
      "step": 300
    },
    {
      "epoch": 2.7632183908045977,
      "grad_norm": 0.42596566677093506,
      "learning_rate": 9.53883495145631e-05,
      "loss": 0.2598,
      "step": 301
    },
    {
      "epoch": 2.772413793103448,
      "grad_norm": 0.675719141960144,
      "learning_rate": 9.466019417475728e-05,
      "loss": 0.3822,
      "step": 302
    },
    {
      "epoch": 2.781609195402299,
      "grad_norm": 0.4107106924057007,
      "learning_rate": 9.393203883495146e-05,
      "loss": 0.2068,
      "step": 303
    },
    {
      "epoch": 2.7908045977011495,
      "grad_norm": 0.6644109487533569,
      "learning_rate": 9.320388349514561e-05,
      "loss": 0.3806,
      "step": 304
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.3467462956905365,
      "learning_rate": 9.247572815533979e-05,
      "loss": 0.1818,
      "step": 305
    },
    {
      "epoch": 2.8091954022988506,
      "grad_norm": 0.26561155915260315,
      "learning_rate": 9.174757281553397e-05,
      "loss": 0.1497,
      "step": 306
    },
    {
      "epoch": 2.8183908045977013,
      "grad_norm": 0.7984749674797058,
      "learning_rate": 9.101941747572814e-05,
      "loss": 0.5195,
      "step": 307
    },
    {
      "epoch": 2.8275862068965516,
      "grad_norm": 0.252616286277771,
      "learning_rate": 9.029126213592231e-05,
      "loss": 0.1348,
      "step": 308
    },
    {
      "epoch": 2.8367816091954023,
      "grad_norm": 0.3006296157836914,
      "learning_rate": 8.956310679611649e-05,
      "loss": 0.1629,
      "step": 309
    },
    {
      "epoch": 2.845977011494253,
      "grad_norm": 0.3508792519569397,
      "learning_rate": 8.883495145631067e-05,
      "loss": 0.2039,
      "step": 310
    },
    {
      "epoch": 2.8551724137931034,
      "grad_norm": 0.2189541608095169,
      "learning_rate": 8.810679611650485e-05,
      "loss": 0.1008,
      "step": 311
    },
    {
      "epoch": 2.864367816091954,
      "grad_norm": 0.7708775997161865,
      "learning_rate": 8.737864077669901e-05,
      "loss": 0.5565,
      "step": 312
    },
    {
      "epoch": 2.873563218390805,
      "grad_norm": 0.3029266595840454,
      "learning_rate": 8.665048543689319e-05,
      "loss": 0.1925,
      "step": 313
    },
    {
      "epoch": 2.882758620689655,
      "grad_norm": 0.29565373063087463,
      "learning_rate": 8.592233009708737e-05,
      "loss": 0.1708,
      "step": 314
    },
    {
      "epoch": 2.891954022988506,
      "grad_norm": 0.66410893201828,
      "learning_rate": 8.519417475728155e-05,
      "loss": 0.4796,
      "step": 315
    },
    {
      "epoch": 2.901149425287356,
      "grad_norm": 0.48440781235694885,
      "learning_rate": 8.446601941747571e-05,
      "loss": 0.3075,
      "step": 316
    },
    {
      "epoch": 2.910344827586207,
      "grad_norm": 0.7811682820320129,
      "learning_rate": 8.373786407766989e-05,
      "loss": 0.5639,
      "step": 317
    },
    {
      "epoch": 2.9195402298850572,
      "grad_norm": 0.3656238913536072,
      "learning_rate": 8.300970873786407e-05,
      "loss": 0.208,
      "step": 318
    },
    {
      "epoch": 2.928735632183908,
      "grad_norm": 0.34803682565689087,
      "learning_rate": 8.228155339805825e-05,
      "loss": 0.1549,
      "step": 319
    },
    {
      "epoch": 2.9379310344827587,
      "grad_norm": 0.7989238500595093,
      "learning_rate": 8.155339805825241e-05,
      "loss": 0.516,
      "step": 320
    },
    {
      "epoch": 2.947126436781609,
      "grad_norm": 0.539603054523468,
      "learning_rate": 8.082524271844659e-05,
      "loss": 0.3175,
      "step": 321
    },
    {
      "epoch": 2.9563218390804598,
      "grad_norm": 0.43805384635925293,
      "learning_rate": 8.009708737864077e-05,
      "loss": 0.2689,
      "step": 322
    },
    {
      "epoch": 2.9655172413793105,
      "grad_norm": 0.3973037302494049,
      "learning_rate": 7.936893203883495e-05,
      "loss": 0.2238,
      "step": 323
    },
    {
      "epoch": 2.974712643678161,
      "grad_norm": 0.555601954460144,
      "learning_rate": 7.864077669902911e-05,
      "loss": 0.3414,
      "step": 324
    },
    {
      "epoch": 2.9839080459770115,
      "grad_norm": 0.7183153033256531,
      "learning_rate": 7.791262135922329e-05,
      "loss": 0.3484,
      "step": 325
    },
    {
      "epoch": 2.9931034482758623,
      "grad_norm": 0.7718084454536438,
      "learning_rate": 7.718446601941747e-05,
      "loss": 0.4512,
      "step": 326
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.1820600032806396,
      "learning_rate": 7.645631067961165e-05,
      "loss": 0.6338,
      "step": 327
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.6946974992752075,
      "eval_runtime": 76.3864,
      "eval_samples_per_second": 4.346,
      "eval_steps_per_second": 2.173,
      "step": 327
    }
  ],
  "logging_steps": 1,
  "max_steps": 432,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.3656641543714816e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}