|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.8023255813953485,
  "eval_steps": 500,
  "global_step": 168,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.03488372093023256, "grad_norm": 43.01280212402344, "learning_rate": 5.0000000000000004e-08, "loss": 2.7496, "step": 1},
    {"epoch": 0.06976744186046512, "grad_norm": 44.429630279541016, "learning_rate": 1.0000000000000001e-07, "loss": 2.7809, "step": 2},
    {"epoch": 0.10465116279069768, "grad_norm": 42.754234313964844, "learning_rate": 1.5000000000000002e-07, "loss": 2.6968, "step": 3},
    {"epoch": 0.13953488372093023, "grad_norm": 45.27891159057617, "learning_rate": 2.0000000000000002e-07, "loss": 2.8076, "step": 4},
    {"epoch": 0.1744186046511628, "grad_norm": 43.920860290527344, "learning_rate": 2.5000000000000004e-07, "loss": 2.7255, "step": 5},
    {"epoch": 0.20930232558139536, "grad_norm": 44.61790466308594, "learning_rate": 3.0000000000000004e-07, "loss": 2.7178, "step": 6},
    {"epoch": 0.2441860465116279, "grad_norm": 44.28205871582031, "learning_rate": 3.5000000000000004e-07, "loss": 2.7193, "step": 7},
    {"epoch": 0.27906976744186046, "grad_norm": 45.1516227722168, "learning_rate": 4.0000000000000003e-07, "loss": 2.6846, "step": 8},
    {"epoch": 0.313953488372093, "grad_norm": 46.8723258972168, "learning_rate": 4.5000000000000003e-07, "loss": 2.67, "step": 9},
    {"epoch": 0.3488372093023256, "grad_norm": 48.33848571777344, "learning_rate": 5.000000000000001e-07, "loss": 2.6368, "step": 10},
    {"epoch": 0.38372093023255816, "grad_norm": 46.772193908691406, "learning_rate": 5.5e-07, "loss": 2.5562, "step": 11},
    {"epoch": 0.4186046511627907, "grad_norm": 48.12417984008789, "learning_rate": 6.000000000000001e-07, "loss": 2.5362, "step": 12},
    {"epoch": 0.45348837209302323, "grad_norm": 49.0787239074707, "learning_rate": 6.5e-07, "loss": 2.3981, "step": 13},
    {"epoch": 0.4883720930232558, "grad_norm": 52.367183685302734, "learning_rate": 7.000000000000001e-07, "loss": 2.3837, "step": 14},
    {"epoch": 0.5232558139534884, "grad_norm": 53.57649230957031, "learning_rate": 7.5e-07, "loss": 2.2885, "step": 15},
    {"epoch": 0.5581395348837209, "grad_norm": 56.58591079711914, "learning_rate": 8.000000000000001e-07, "loss": 2.1447, "step": 16},
    {"epoch": 0.5930232558139535, "grad_norm": 55.064735412597656, "learning_rate": 8.500000000000001e-07, "loss": 2.0242, "step": 17},
    {"epoch": 0.627906976744186, "grad_norm": 52.61149597167969, "learning_rate": 9.000000000000001e-07, "loss": 1.9263, "step": 18},
    {"epoch": 0.6627906976744186, "grad_norm": 44.34925079345703, "learning_rate": 9.500000000000001e-07, "loss": 1.7466, "step": 19},
    {"epoch": 0.6976744186046512, "grad_norm": 36.627296447753906, "learning_rate": 1.0000000000000002e-06, "loss": 1.6084, "step": 20},
    {"epoch": 0.7325581395348837, "grad_norm": 30.89563751220703, "learning_rate": 1.0500000000000001e-06, "loss": 1.4701, "step": 21},
    {"epoch": 0.7674418604651163, "grad_norm": 29.17167091369629, "learning_rate": 1.1e-06, "loss": 1.4218, "step": 22},
    {"epoch": 0.8023255813953488, "grad_norm": 28.237022399902344, "learning_rate": 1.1500000000000002e-06, "loss": 1.3014, "step": 23},
    {"epoch": 0.8372093023255814, "grad_norm": 28.778654098510742, "learning_rate": 1.2000000000000002e-06, "loss": 1.1857, "step": 24},
    {"epoch": 0.872093023255814, "grad_norm": 29.368289947509766, "learning_rate": 1.25e-06, "loss": 1.082, "step": 25},
    {"epoch": 0.9069767441860465, "grad_norm": 28.46448516845703, "learning_rate": 1.3e-06, "loss": 0.8716, "step": 26},
    {"epoch": 0.9418604651162791, "grad_norm": 24.584318161010742, "learning_rate": 1.3500000000000002e-06, "loss": 0.6848, "step": 27},
    {"epoch": 0.9767441860465116, "grad_norm": 21.166847229003906, "learning_rate": 1.4000000000000001e-06, "loss": 0.5438, "step": 28},
    {"epoch": 1.0, "grad_norm": 21.166847229003906, "learning_rate": 1.45e-06, "loss": 0.4084, "step": 29},
    {"epoch": 1.0348837209302326, "grad_norm": 24.098440170288086, "learning_rate": 1.5e-06, "loss": 0.3545, "step": 30},
    {"epoch": 1.069767441860465, "grad_norm": 12.6813325881958, "learning_rate": 1.5500000000000002e-06, "loss": 0.2673, "step": 31},
    {"epoch": 1.1046511627906976, "grad_norm": 7.6524128913879395, "learning_rate": 1.6000000000000001e-06, "loss": 0.2033, "step": 32},
    {"epoch": 1.1395348837209303, "grad_norm": 5.23082160949707, "learning_rate": 1.6500000000000003e-06, "loss": 0.1887, "step": 33},
    {"epoch": 1.1744186046511629, "grad_norm": 4.729929447174072, "learning_rate": 1.7000000000000002e-06, "loss": 0.1683, "step": 34},
    {"epoch": 1.2093023255813953, "grad_norm": 5.831579208374023, "learning_rate": 1.75e-06, "loss": 0.1643, "step": 35},
    {"epoch": 1.244186046511628, "grad_norm": 4.030057430267334, "learning_rate": 1.8000000000000001e-06, "loss": 0.1528, "step": 36},
    {"epoch": 1.2790697674418605, "grad_norm": 3.560523509979248, "learning_rate": 1.85e-06, "loss": 0.1422, "step": 37},
    {"epoch": 1.3139534883720931, "grad_norm": 3.3749780654907227, "learning_rate": 1.9000000000000002e-06, "loss": 0.1327, "step": 38},
    {"epoch": 1.3488372093023255, "grad_norm": 2.7184131145477295, "learning_rate": 1.9500000000000004e-06, "loss": 0.1199, "step": 39},
    {"epoch": 1.3837209302325582, "grad_norm": 2.8681583404541016, "learning_rate": 2.0000000000000003e-06, "loss": 0.1107, "step": 40},
    {"epoch": 1.4186046511627908, "grad_norm": 2.8731987476348877, "learning_rate": 2.05e-06, "loss": 0.1063, "step": 41},
    {"epoch": 1.4534883720930232, "grad_norm": 2.9522054195404053, "learning_rate": 2.1000000000000002e-06, "loss": 0.0989, "step": 42},
    {"epoch": 1.4883720930232558, "grad_norm": 2.689038038253784, "learning_rate": 2.15e-06, "loss": 0.0949, "step": 43},
    {"epoch": 1.5232558139534884, "grad_norm": 2.6328952312469482, "learning_rate": 2.2e-06, "loss": 0.1006, "step": 44},
    {"epoch": 1.558139534883721, "grad_norm": 2.7056033611297607, "learning_rate": 2.25e-06, "loss": 0.0929, "step": 45},
    {"epoch": 1.5930232558139537, "grad_norm": 2.568206310272217, "learning_rate": 2.3000000000000004e-06, "loss": 0.0881, "step": 46},
    {"epoch": 1.627906976744186, "grad_norm": 2.714211940765381, "learning_rate": 2.35e-06, "loss": 0.096, "step": 47},
    {"epoch": 1.6627906976744184, "grad_norm": 2.3650710582733154, "learning_rate": 2.4000000000000003e-06, "loss": 0.086, "step": 48},
    {"epoch": 1.697674418604651, "grad_norm": 2.1763498783111572, "learning_rate": 2.4500000000000003e-06, "loss": 0.0858, "step": 49},
    {"epoch": 1.7325581395348837, "grad_norm": 2.124727487564087, "learning_rate": 2.5e-06, "loss": 0.0843, "step": 50},
    {"epoch": 1.7674418604651163, "grad_norm": 1.641269564628601, "learning_rate": 2.55e-06, "loss": 0.0803, "step": 51},
    {"epoch": 1.802325581395349, "grad_norm": 1.5644842386245728, "learning_rate": 2.6e-06, "loss": 0.0779, "step": 52},
    {"epoch": 1.8372093023255816, "grad_norm": 1.1404681205749512, "learning_rate": 2.6500000000000005e-06, "loss": 0.0765, "step": 53},
    {"epoch": 1.872093023255814, "grad_norm": 0.8386123776435852, "learning_rate": 2.7000000000000004e-06, "loss": 0.0749, "step": 54},
    {"epoch": 1.9069767441860463, "grad_norm": 1.8363338708877563, "learning_rate": 2.7500000000000004e-06, "loss": 0.0737, "step": 55},
    {"epoch": 1.941860465116279, "grad_norm": 1.2105377912521362, "learning_rate": 2.8000000000000003e-06, "loss": 0.0762, "step": 56},
    {"epoch": 1.9767441860465116, "grad_norm": 1.5502218008041382, "learning_rate": 2.85e-06, "loss": 0.07, "step": 57},
    {"epoch": 2.0, "grad_norm": 1.0641449689865112, "learning_rate": 2.9e-06, "loss": 0.0679, "step": 58},
    {"epoch": 2.0348837209302326, "grad_norm": 0.9201306104660034, "learning_rate": 2.95e-06, "loss": 0.0677, "step": 59},
    {"epoch": 2.0697674418604652, "grad_norm": 0.5951386094093323, "learning_rate": 3e-06, "loss": 0.0654, "step": 60},
    {"epoch": 2.104651162790698, "grad_norm": 0.8307608962059021, "learning_rate": 3.05e-06, "loss": 0.0706, "step": 61},
    {"epoch": 2.13953488372093, "grad_norm": 0.7110892534255981, "learning_rate": 3.1000000000000004e-06, "loss": 0.0665, "step": 62},
    {"epoch": 2.1744186046511627, "grad_norm": 0.6766234040260315, "learning_rate": 3.1500000000000003e-06, "loss": 0.0618, "step": 63},
    {"epoch": 2.2093023255813953, "grad_norm": 0.3967410922050476, "learning_rate": 3.2000000000000003e-06, "loss": 0.0644, "step": 64},
    {"epoch": 2.244186046511628, "grad_norm": 0.3713420331478119, "learning_rate": 3.2500000000000002e-06, "loss": 0.0607, "step": 65},
    {"epoch": 2.2790697674418605, "grad_norm": 0.5613359212875366, "learning_rate": 3.3000000000000006e-06, "loss": 0.0623, "step": 66},
    {"epoch": 2.313953488372093, "grad_norm": 0.5458635687828064, "learning_rate": 3.3500000000000005e-06, "loss": 0.0563, "step": 67},
    {"epoch": 2.3488372093023258, "grad_norm": 0.4196176528930664, "learning_rate": 3.4000000000000005e-06, "loss": 0.0547, "step": 68},
    {"epoch": 2.383720930232558, "grad_norm": 0.5757117867469788, "learning_rate": 3.45e-06, "loss": 0.0551, "step": 69},
    {"epoch": 2.4186046511627906, "grad_norm": 0.45777687430381775, "learning_rate": 3.5e-06, "loss": 0.0575, "step": 70},
    {"epoch": 2.453488372093023, "grad_norm": 0.5204553008079529, "learning_rate": 3.5500000000000003e-06, "loss": 0.057, "step": 71},
    {"epoch": 2.488372093023256, "grad_norm": 0.6110821962356567, "learning_rate": 3.6000000000000003e-06, "loss": 0.0556, "step": 72},
    {"epoch": 2.5232558139534884, "grad_norm": 0.45246991515159607, "learning_rate": 3.65e-06, "loss": 0.0564, "step": 73},
    {"epoch": 2.558139534883721, "grad_norm": 0.441976398229599, "learning_rate": 3.7e-06, "loss": 0.0534, "step": 74},
    {"epoch": 2.5930232558139537, "grad_norm": 0.46637651324272156, "learning_rate": 3.7500000000000005e-06, "loss": 0.0498, "step": 75},
    {"epoch": 2.6279069767441863, "grad_norm": 0.482038289308548, "learning_rate": 3.8000000000000005e-06, "loss": 0.0521, "step": 76},
    {"epoch": 2.6627906976744184, "grad_norm": 0.5914385318756104, "learning_rate": 3.85e-06, "loss": 0.0511, "step": 77},
    {"epoch": 2.697674418604651, "grad_norm": 0.34532907605171204, "learning_rate": 3.900000000000001e-06, "loss": 0.0527, "step": 78},
    {"epoch": 2.7325581395348837, "grad_norm": 0.35315006971359253, "learning_rate": 3.95e-06, "loss": 0.0486, "step": 79},
    {"epoch": 2.7674418604651163, "grad_norm": 0.4521324634552002, "learning_rate": 4.000000000000001e-06, "loss": 0.0485, "step": 80},
    {"epoch": 2.802325581395349, "grad_norm": 0.49457868933677673, "learning_rate": 4.05e-06, "loss": 0.0525, "step": 81},
    {"epoch": 2.8372093023255816, "grad_norm": 0.7616601586341858, "learning_rate": 4.1e-06, "loss": 0.0528, "step": 82},
    {"epoch": 2.8720930232558137, "grad_norm": 0.4791123569011688, "learning_rate": 4.15e-06, "loss": 0.0512, "step": 83},
    {"epoch": 2.9069767441860463, "grad_norm": 0.38186997175216675, "learning_rate": 4.2000000000000004e-06, "loss": 0.0442, "step": 84},
    {"epoch": 2.941860465116279, "grad_norm": 0.32648414373397827, "learning_rate": 4.25e-06, "loss": 0.0432, "step": 85},
    {"epoch": 2.9767441860465116, "grad_norm": 0.410888671875, "learning_rate": 4.3e-06, "loss": 0.042, "step": 86},
    {"epoch": 3.0, "grad_norm": 0.4732547104358673, "learning_rate": 4.350000000000001e-06, "loss": 0.0483, "step": 87},
    {"epoch": 3.0348837209302326, "grad_norm": 0.6372231841087341, "learning_rate": 4.4e-06, "loss": 0.0382, "step": 88},
    {"epoch": 3.0697674418604652, "grad_norm": 0.393078088760376, "learning_rate": 4.450000000000001e-06, "loss": 0.0403, "step": 89},
    {"epoch": 3.104651162790698, "grad_norm": 0.4366248846054077, "learning_rate": 4.5e-06, "loss": 0.0367, "step": 90},
    {"epoch": 3.13953488372093, "grad_norm": 0.301724374294281, "learning_rate": 4.5500000000000005e-06, "loss": 0.0351, "step": 91},
    {"epoch": 3.1744186046511627, "grad_norm": 0.37665656208992004, "learning_rate": 4.600000000000001e-06, "loss": 0.03, "step": 92},
    {"epoch": 3.2093023255813953, "grad_norm": 0.581331729888916, "learning_rate": 4.65e-06, "loss": 0.0316, "step": 93},
    {"epoch": 3.244186046511628, "grad_norm": 0.44136878848075867, "learning_rate": 4.7e-06, "loss": 0.0312, "step": 94},
    {"epoch": 3.2790697674418605, "grad_norm": 0.7624006271362305, "learning_rate": 4.75e-06, "loss": 0.0323, "step": 95},
    {"epoch": 3.313953488372093, "grad_norm": 0.35294386744499207, "learning_rate": 4.800000000000001e-06, "loss": 0.0286, "step": 96},
    {"epoch": 3.3488372093023258, "grad_norm": 0.6240035891532898, "learning_rate": 4.85e-06, "loss": 0.0289, "step": 97},
    {"epoch": 3.383720930232558, "grad_norm": 0.40580570697784424, "learning_rate": 4.9000000000000005e-06, "loss": 0.0259, "step": 98},
    {"epoch": 3.4186046511627906, "grad_norm": 0.2971636652946472, "learning_rate": 4.95e-06, "loss": 0.0251, "step": 99},
    {"epoch": 3.453488372093023, "grad_norm": 0.3758476972579956, "learning_rate": 5e-06, "loss": 0.0244, "step": 100},
    {"epoch": 3.488372093023256, "grad_norm": 0.3845921754837036, "learning_rate": 4.997332437005932e-06, "loss": 0.0242, "step": 101},
    {"epoch": 3.5232558139534884, "grad_norm": 0.48894616961479187, "learning_rate": 4.989335440737587e-06, "loss": 0.0252, "step": 102},
    {"epoch": 3.558139534883721, "grad_norm": 0.36844325065612793, "learning_rate": 4.976026077188013e-06, "loss": 0.0197, "step": 103},
    {"epoch": 3.5930232558139537, "grad_norm": 0.2800253629684448, "learning_rate": 4.957432749209755e-06, "loss": 0.0187, "step": 104},
    {"epoch": 3.6279069767441863, "grad_norm": 0.34042924642562866, "learning_rate": 4.933595135901733e-06, "loss": 0.0182, "step": 105},
    {"epoch": 3.6627906976744184, "grad_norm": 0.33217477798461914, "learning_rate": 4.904564107932048e-06, "loss": 0.0166, "step": 106},
    {"epoch": 3.697674418604651, "grad_norm": 0.32867398858070374, "learning_rate": 4.870401618977415e-06, "loss": 0.017, "step": 107},
    {"epoch": 3.7325581395348837, "grad_norm": 0.24199356138706207, "learning_rate": 4.83118057351089e-06, "loss": 0.0138, "step": 108},
    {"epoch": 3.7674418604651163, "grad_norm": 0.3216392695903778, "learning_rate": 4.786984671220053e-06, "loss": 0.0171, "step": 109},
    {"epoch": 3.802325581395349, "grad_norm": 0.3574189841747284, "learning_rate": 4.737908228387656e-06, "loss": 0.0146, "step": 110},
    {"epoch": 3.8372093023255816, "grad_norm": 0.30931738018989563, "learning_rate": 4.684055976615924e-06, "loss": 0.0128, "step": 111},
    {"epoch": 3.8720930232558137, "grad_norm": 0.2074785828590393, "learning_rate": 4.625542839324036e-06, "loss": 0.0089, "step": 112},
    {"epoch": 3.9069767441860463, "grad_norm": 0.28993067145347595, "learning_rate": 4.562493686495756e-06, "loss": 0.0088, "step": 113},
    {"epoch": 3.941860465116279, "grad_norm": 0.21493716537952423, "learning_rate": 4.4950430682005995e-06, "loss": 0.0092, "step": 114},
    {"epoch": 3.9767441860465116, "grad_norm": 0.4238521158695221, "learning_rate": 4.423334927457198e-06, "loss": 0.0096, "step": 115},
    {"epoch": 4.0, "grad_norm": 0.4238521158695221, "learning_rate": 4.3475222930516484e-06, "loss": 0.0092, "step": 116},
    {"epoch": 4.034883720930233, "grad_norm": 0.31130537390708923, "learning_rate": 4.267766952966369e-06, "loss": 0.0052, "step": 117},
    {"epoch": 4.069767441860465, "grad_norm": 0.19592078030109406, "learning_rate": 4.184239109116393e-06, "loss": 0.0052, "step": 118},
    {"epoch": 4.104651162790698, "grad_norm": 0.1717974841594696, "learning_rate": 4.097117014129903e-06, "loss": 0.0041, "step": 119},
    {"epoch": 4.1395348837209305, "grad_norm": 0.15367873013019562, "learning_rate": 4.006586590948141e-06, "loss": 0.0061, "step": 120},
    {"epoch": 4.174418604651163, "grad_norm": 0.19444844126701355, "learning_rate": 3.91284103605648e-06, "loss": 0.0072, "step": 121},
    {"epoch": 4.209302325581396, "grad_norm": 0.22090725600719452, "learning_rate": 3.81608040719339e-06, "loss": 0.0041, "step": 122},
    {"epoch": 4.2441860465116275, "grad_norm": 0.13613103330135345, "learning_rate": 3.7165111964171407e-06, "loss": 0.0042, "step": 123},
    {"epoch": 4.27906976744186, "grad_norm": 0.5199090838432312, "learning_rate": 3.6143458894413463e-06, "loss": 0.0056, "step": 124},
    {"epoch": 4.313953488372093, "grad_norm": 0.2258583903312683, "learning_rate": 3.5098025121797375e-06, "loss": 0.004, "step": 125},
    {"epoch": 4.348837209302325, "grad_norm": 0.2459285408258438, "learning_rate": 3.403104165467883e-06, "loss": 0.0045, "step": 126},
    {"epoch": 4.383720930232558, "grad_norm": 0.18142816424369812, "learning_rate": 3.2944785489547544e-06, "loss": 0.0043, "step": 127},
    {"epoch": 4.4186046511627906, "grad_norm": 0.14586737751960754, "learning_rate": 3.184157475180208e-06, "loss": 0.0027, "step": 128},
    {"epoch": 4.453488372093023, "grad_norm": 0.17918141186237335, "learning_rate": 3.0723763748753354e-06, "loss": 0.0048, "step": 129},
    {"epoch": 4.488372093023256, "grad_norm": 0.23981280624866486, "learning_rate": 2.9593737945414264e-06, "loss": 0.0048, "step": 130},
    {"epoch": 4.523255813953488, "grad_norm": 0.20855571329593658, "learning_rate": 2.845390887379706e-06, "loss": 0.002, "step": 131},
    {"epoch": 4.558139534883721, "grad_norm": 0.15569192171096802, "learning_rate": 2.730670898658255e-06, "loss": 0.0053, "step": 132},
    {"epoch": 4.593023255813954, "grad_norm": 0.17529034614562988, "learning_rate": 2.6154586466143495e-06, "loss": 0.0037, "step": 133},
    {"epoch": 4.627906976744186, "grad_norm": 0.2721562683582306, "learning_rate": 2.5e-06, "loss": 0.0047, "step": 134},
    {"epoch": 4.662790697674419, "grad_norm": 0.15811768174171448, "learning_rate": 2.3845413533856517e-06, "loss": 0.0031, "step": 135},
    {"epoch": 4.6976744186046515, "grad_norm": 0.3956407904624939, "learning_rate": 2.269329101341745e-06, "loss": 0.0048, "step": 136},
    {"epoch": 4.732558139534884, "grad_norm": 1.2278478145599365, "learning_rate": 2.1546091126202955e-06, "loss": 0.0036, "step": 137},
    {"epoch": 4.767441860465116, "grad_norm": 0.16240213811397552, "learning_rate": 2.040626205458574e-06, "loss": 0.0032, "step": 138},
    {"epoch": 4.8023255813953485, "grad_norm": 0.15741980075836182, "learning_rate": 1.9276236251246655e-06, "loss": 0.0027, "step": 139},
    {"epoch": 4.837209302325581, "grad_norm": 0.15352095663547516, "learning_rate": 1.8158425248197931e-06, "loss": 0.0021, "step": 140},
    {"epoch": 4.872093023255814, "grad_norm": 0.16562862694263458, "learning_rate": 1.7055214510452462e-06, "loss": 0.0017, "step": 141},
    {"epoch": 4.906976744186046, "grad_norm": 0.07386556267738342, "learning_rate": 1.5968958345321178e-06, "loss": 0.0028, "step": 142},
    {"epoch": 4.941860465116279, "grad_norm": 0.11063098907470703, "learning_rate": 1.490197487820263e-06, "loss": 0.0022, "step": 143},
    {"epoch": 4.976744186046512, "grad_norm": 0.1346900910139084, "learning_rate": 1.3856541105586545e-06, "loss": 0.0021, "step": 144},
    {"epoch": 5.0, "grad_norm": 0.11798722296953201, "learning_rate": 1.2834888035828597e-06, "loss": 0.0031, "step": 145},
    {"epoch": 5.034883720930233, "grad_norm": 0.18651635944843292, "learning_rate": 1.1839195928066101e-06, "loss": 0.002, "step": 146},
    {"epoch": 5.069767441860465, "grad_norm": 0.10228271782398224, "learning_rate": 1.0871589639435204e-06, "loss": 0.0019, "step": 147},
    {"epoch": 5.104651162790698, "grad_norm": 0.12258665263652802, "learning_rate": 9.934134090518593e-07, "loss": 0.002, "step": 148},
    {"epoch": 5.1395348837209305, "grad_norm": 0.0909138098359108, "learning_rate": 9.028829858700974e-07, "loss": 0.0019, "step": 149},
    {"epoch": 5.174418604651163, "grad_norm": 0.1315290331840515, "learning_rate": 8.157608908836071e-07, "loss": 0.0013, "step": 150},
    {"epoch": 5.209302325581396, "grad_norm": 0.0621444433927536, "learning_rate": 7.322330470336314e-07, "loss": 0.0017, "step": 151},
    {"epoch": 5.2441860465116275, "grad_norm": 0.11794009059667587, "learning_rate": 6.524777069483526e-07, "loss": 0.0016, "step": 152},
    {"epoch": 5.27906976744186, "grad_norm": 0.07475598901510239, "learning_rate": 5.766650725428027e-07, "loss": 0.0024, "step": 153},
    {"epoch": 5.313953488372093, "grad_norm": 0.12244551628828049, "learning_rate": 5.049569317994013e-07, "loss": 0.0021, "step": 154},
    {"epoch": 5.348837209302325, "grad_norm": 0.12658093869686127, "learning_rate": 4.3750631350424456e-07, "loss": 0.0015, "step": 155},
    {"epoch": 5.383720930232558, "grad_norm": 0.13053877651691437, "learning_rate": 3.7445716067596506e-07, "loss": 0.0019, "step": 156},
    {"epoch": 5.4186046511627906, "grad_norm": 0.07347141206264496, "learning_rate": 3.1594402338407633e-07, "loss": 0.0014, "step": 157},
    {"epoch": 5.453488372093023, "grad_norm": 0.09592650085687637, "learning_rate": 2.620917716123444e-07, "loss": 0.0026, "step": 158},
    {"epoch": 5.488372093023256, "grad_norm": 0.10409682989120483, "learning_rate": 2.1301532877994747e-07, "loss": 0.0014, "step": 159},
    {"epoch": 5.523255813953488, "grad_norm": 0.06076663359999657, "learning_rate": 1.6881942648911077e-07, "loss": 0.002, "step": 160},
    {"epoch": 5.558139534883721, "grad_norm": 0.0755394697189331, "learning_rate": 1.2959838102258537e-07, "loss": 0.0023, "step": 161},
    {"epoch": 5.593023255813954, "grad_norm": 0.05655893310904503, "learning_rate": 9.54358920679524e-08, "loss": 0.001, "step": 162},
    {"epoch": 5.627906976744186, "grad_norm": 0.06078566238284111, "learning_rate": 6.640486409826785e-08, "loss": 0.0014, "step": 163},
    {"epoch": 5.662790697674419, "grad_norm": 0.09118495136499405, "learning_rate": 4.256725079024554e-08, "loss": 0.0017, "step": 164},
    {"epoch": 5.6976744186046515, "grad_norm": 0.10137756168842316, "learning_rate": 2.3973922811987295e-08, "loss": 0.0015, "step": 165},
    {"epoch": 5.732558139534884, "grad_norm": 0.12966589629650116, "learning_rate": 1.0664559262413831e-08, "loss": 0.0034, "step": 166},
    {"epoch": 5.767441860465116, "grad_norm": 0.1168215349316597, "learning_rate": 2.6675629940689508e-09, "loss": 0.0012, "step": 167},
    {"epoch": 5.8023255813953485, "grad_norm": 0.08311488479375839, "learning_rate": 0.0, "loss": 0.0017, "step": 168}
  ],
"logging_steps": 1, |
|
"max_steps": 168, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 28, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.4267371372895273e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|