{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 500, |
|
"global_step": 252, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015873015873015872, |
|
"grad_norm": 2.3323949793109238, |
|
"learning_rate": 0.0, |
|
"loss": 1.0469, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.031746031746031744, |
|
"grad_norm": 2.317201200688066, |
|
"learning_rate": 3.125e-07, |
|
"loss": 0.9931, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.047619047619047616, |
|
"grad_norm": 2.603689956679125, |
|
"learning_rate": 6.25e-07, |
|
"loss": 1.0188, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.06349206349206349, |
|
"grad_norm": 2.2583787301898592, |
|
"learning_rate": 9.375000000000001e-07, |
|
"loss": 0.9097, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.07936507936507936, |
|
"grad_norm": 2.197466891038096, |
|
"learning_rate": 1.25e-06, |
|
"loss": 1.0459, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09523809523809523, |
|
"grad_norm": 2.1259963361099747, |
|
"learning_rate": 1.5625e-06, |
|
"loss": 0.9986, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1111111111111111, |
|
"grad_norm": 2.0707820881041, |
|
"learning_rate": 1.8750000000000003e-06, |
|
"loss": 0.9555, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.12698412698412698, |
|
"grad_norm": 1.870407527874291, |
|
"learning_rate": 2.1875000000000002e-06, |
|
"loss": 0.952, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 1.8578085390534953, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.9993, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.15873015873015872, |
|
"grad_norm": 1.881148688458384, |
|
"learning_rate": 2.8125e-06, |
|
"loss": 0.9373, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1746031746031746, |
|
"grad_norm": 1.6917769845914787, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.8839, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.19047619047619047, |
|
"grad_norm": 1.2541345576396532, |
|
"learning_rate": 3.4375e-06, |
|
"loss": 0.9909, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.20634920634920634, |
|
"grad_norm": 1.4038335670152517, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.9322, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 1.363468897891553, |
|
"learning_rate": 4.0625000000000005e-06, |
|
"loss": 1.0934, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 1.1331989679866032, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 0.977, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.25396825396825395, |
|
"grad_norm": 0.9899834287202586, |
|
"learning_rate": 4.6875000000000004e-06, |
|
"loss": 1.0443, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.2698412698412698, |
|
"grad_norm": 1.155920523517074, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9483, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 1.2715867938274161, |
|
"learning_rate": 5.3125e-06, |
|
"loss": 1.0096, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.30158730158730157, |
|
"grad_norm": 0.9922231339593638, |
|
"learning_rate": 5.625e-06, |
|
"loss": 0.7463, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.31746031746031744, |
|
"grad_norm": 1.2551959582539625, |
|
"learning_rate": 5.9375e-06, |
|
"loss": 0.9226, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.892951024999124, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.988, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.3492063492063492, |
|
"grad_norm": 1.4360539096520086, |
|
"learning_rate": 6.5625e-06, |
|
"loss": 1.0509, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.36507936507936506, |
|
"grad_norm": 1.1100051374669628, |
|
"learning_rate": 6.875e-06, |
|
"loss": 0.8728, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.38095238095238093, |
|
"grad_norm": 0.9630208551024003, |
|
"learning_rate": 7.1875e-06, |
|
"loss": 0.8352, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.3968253968253968, |
|
"grad_norm": 1.109963225007402, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 1.0289, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.4126984126984127, |
|
"grad_norm": 0.842175710243708, |
|
"learning_rate": 7.8125e-06, |
|
"loss": 0.8616, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.8255762742603932, |
|
"learning_rate": 8.125000000000001e-06, |
|
"loss": 0.7234, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.8274507712792363, |
|
"learning_rate": 8.4375e-06, |
|
"loss": 0.9758, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.4603174603174603, |
|
"grad_norm": 0.7834224887700044, |
|
"learning_rate": 8.750000000000001e-06, |
|
"loss": 0.9056, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 1.187020605300137, |
|
"learning_rate": 9.0625e-06, |
|
"loss": 0.9481, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.49206349206349204, |
|
"grad_norm": 1.0233176856791018, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 0.9194, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.5079365079365079, |
|
"grad_norm": 0.848791394024066, |
|
"learning_rate": 9.6875e-06, |
|
"loss": 0.8852, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.5238095238095238, |
|
"grad_norm": 0.8289281876622956, |
|
"learning_rate": 1e-05, |
|
"loss": 1.038, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.5396825396825397, |
|
"grad_norm": 0.7738330911179299, |
|
"learning_rate": 9.999691920767945e-06, |
|
"loss": 0.8374, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 0.65004421093035, |
|
"learning_rate": 9.998767721036901e-06, |
|
"loss": 0.8242, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.718229691257778, |
|
"learning_rate": 9.997227514697568e-06, |
|
"loss": 0.9693, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5873015873015873, |
|
"grad_norm": 0.598178727036991, |
|
"learning_rate": 9.99507149155218e-06, |
|
"loss": 0.9843, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.6031746031746031, |
|
"grad_norm": 0.6896420594948925, |
|
"learning_rate": 9.992299917291118e-06, |
|
"loss": 0.848, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.6190476190476191, |
|
"grad_norm": 0.7218001479564617, |
|
"learning_rate": 9.98891313346017e-06, |
|
"loss": 0.9095, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.6349206349206349, |
|
"grad_norm": 0.673383804041238, |
|
"learning_rate": 9.984911557418444e-06, |
|
"loss": 0.7682, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6507936507936508, |
|
"grad_norm": 0.9044903125501461, |
|
"learning_rate": 9.980295682286924e-06, |
|
"loss": 0.8388, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.6528626470394925, |
|
"learning_rate": 9.97506607688772e-06, |
|
"loss": 0.9107, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6825396825396826, |
|
"grad_norm": 0.5248039585149111, |
|
"learning_rate": 9.969223385673958e-06, |
|
"loss": 0.8307, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.6984126984126984, |
|
"grad_norm": 0.568338771820042, |
|
"learning_rate": 9.962768328650367e-06, |
|
"loss": 0.7523, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.5429855696185105, |
|
"learning_rate": 9.95570170128455e-06, |
|
"loss": 0.8442, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.7301587301587301, |
|
"grad_norm": 0.5098426033492849, |
|
"learning_rate": 9.94802437440896e-06, |
|
"loss": 0.7962, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.746031746031746, |
|
"grad_norm": 0.6078990273192543, |
|
"learning_rate": 9.939737294113585e-06, |
|
"loss": 0.8969, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.7619047619047619, |
|
"grad_norm": 0.4709547244829324, |
|
"learning_rate": 9.930841481629358e-06, |
|
"loss": 0.8885, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.7777777777777778, |
|
"grad_norm": 0.54039591858629, |
|
"learning_rate": 9.92133803320231e-06, |
|
"loss": 0.7818, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.7936507936507936, |
|
"grad_norm": 0.4875170254753124, |
|
"learning_rate": 9.91122811995848e-06, |
|
"loss": 0.8193, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8095238095238095, |
|
"grad_norm": 0.5005396928536703, |
|
"learning_rate": 9.90051298775959e-06, |
|
"loss": 0.8692, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.8253968253968254, |
|
"grad_norm": 0.40245216027036546, |
|
"learning_rate": 9.88919395704952e-06, |
|
"loss": 0.826, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.8412698412698413, |
|
"grad_norm": 0.5389952051377087, |
|
"learning_rate": 9.877272422691583e-06, |
|
"loss": 0.9318, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.5638980417584056, |
|
"learning_rate": 9.864749853796642e-06, |
|
"loss": 0.7985, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.873015873015873, |
|
"grad_norm": 0.5506830661309166, |
|
"learning_rate": 9.85162779354206e-06, |
|
"loss": 0.7291, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.48566023212019677, |
|
"learning_rate": 9.837907858981536e-06, |
|
"loss": 0.8802, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.9047619047619048, |
|
"grad_norm": 0.4725406192484581, |
|
"learning_rate": 9.823591740845831e-06, |
|
"loss": 0.8627, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.9206349206349206, |
|
"grad_norm": 0.5270784935436914, |
|
"learning_rate": 9.808681203334416e-06, |
|
"loss": 0.7976, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.9365079365079365, |
|
"grad_norm": 0.4795159174595573, |
|
"learning_rate": 9.793178083898073e-06, |
|
"loss": 0.8783, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 0.42309628953003137, |
|
"learning_rate": 9.777084293012448e-06, |
|
"loss": 0.842, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9682539682539683, |
|
"grad_norm": 0.464555539059811, |
|
"learning_rate": 9.760401813942641e-06, |
|
"loss": 0.7662, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.9841269841269841, |
|
"grad_norm": 0.5141212041737542, |
|
"learning_rate": 9.743132702498785e-06, |
|
"loss": 0.8688, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5165788253828009, |
|
"learning_rate": 9.725279086782719e-06, |
|
"loss": 0.768, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.0158730158730158, |
|
"grad_norm": 0.576629868282963, |
|
"learning_rate": 9.706843166925733e-06, |
|
"loss": 0.7989, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.0317460317460316, |
|
"grad_norm": 0.4946943998511545, |
|
"learning_rate": 9.687827214817433e-06, |
|
"loss": 0.8261, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.0476190476190477, |
|
"grad_norm": 0.4987216606535057, |
|
"learning_rate": 9.668233573825794e-06, |
|
"loss": 0.8905, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.0634920634920635, |
|
"grad_norm": 0.45688977932466196, |
|
"learning_rate": 9.64806465850836e-06, |
|
"loss": 0.7327, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.0793650793650793, |
|
"grad_norm": 0.5226340006885853, |
|
"learning_rate": 9.62732295431471e-06, |
|
"loss": 0.7311, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.0952380952380953, |
|
"grad_norm": 0.6684025298786129, |
|
"learning_rate": 9.606011017280166e-06, |
|
"loss": 0.8971, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.5147703758608321, |
|
"learning_rate": 9.5841314737108e-06, |
|
"loss": 0.7652, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.126984126984127, |
|
"grad_norm": 0.5417227409614662, |
|
"learning_rate": 9.56168701985981e-06, |
|
"loss": 0.7999, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 0.5016561221704748, |
|
"learning_rate": 9.538680421595236e-06, |
|
"loss": 0.8074, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.1587301587301586, |
|
"grad_norm": 0.4853528793957531, |
|
"learning_rate": 9.515114514059127e-06, |
|
"loss": 0.8135, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.1746031746031746, |
|
"grad_norm": 0.47765415470199357, |
|
"learning_rate": 9.490992201318165e-06, |
|
"loss": 0.7879, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.1904761904761905, |
|
"grad_norm": 0.46535342031003013, |
|
"learning_rate": 9.466316456005783e-06, |
|
"loss": 0.7762, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.2063492063492063, |
|
"grad_norm": 0.5033568814253909, |
|
"learning_rate": 9.441090318955843e-06, |
|
"loss": 0.7022, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.2222222222222223, |
|
"grad_norm": 0.4986643533291915, |
|
"learning_rate": 9.415316898827923e-06, |
|
"loss": 0.7349, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.2380952380952381, |
|
"grad_norm": 0.43657193718859494, |
|
"learning_rate": 9.388999371724212e-06, |
|
"loss": 0.8264, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.253968253968254, |
|
"grad_norm": 0.47617277777848616, |
|
"learning_rate": 9.362140980798127e-06, |
|
"loss": 0.8944, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.2698412698412698, |
|
"grad_norm": 0.4295219607791053, |
|
"learning_rate": 9.334745035854646e-06, |
|
"loss": 0.7588, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.2857142857142856, |
|
"grad_norm": 0.5225987407011279, |
|
"learning_rate": 9.306814912942445e-06, |
|
"loss": 0.8359, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.3015873015873016, |
|
"grad_norm": 0.4173684559568506, |
|
"learning_rate": 9.278354053937848e-06, |
|
"loss": 0.7804, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.3174603174603174, |
|
"grad_norm": 0.5238592049595157, |
|
"learning_rate": 9.249365966120692e-06, |
|
"loss": 0.8564, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.4526393208745273, |
|
"learning_rate": 9.219854221742106e-06, |
|
"loss": 0.8102, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.3492063492063493, |
|
"grad_norm": 0.44471888761912887, |
|
"learning_rate": 9.189822457584311e-06, |
|
"loss": 0.7439, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.3650793650793651, |
|
"grad_norm": 0.43731884433734214, |
|
"learning_rate": 9.159274374512444e-06, |
|
"loss": 0.6592, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.380952380952381, |
|
"grad_norm": 0.4377614076782124, |
|
"learning_rate": 9.128213737018493e-06, |
|
"loss": 0.806, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.3968253968253967, |
|
"grad_norm": 0.4027105033083121, |
|
"learning_rate": 9.096644372757393e-06, |
|
"loss": 0.8855, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.4126984126984126, |
|
"grad_norm": 0.571463019194369, |
|
"learning_rate": 9.064570172075349e-06, |
|
"loss": 0.7979, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.4801097800367482, |
|
"learning_rate": 9.031995087530403e-06, |
|
"loss": 0.7992, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.4444444444444444, |
|
"grad_norm": 0.47255682704462587, |
|
"learning_rate": 8.99892313340537e-06, |
|
"loss": 0.6633, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.4603174603174602, |
|
"grad_norm": 0.4862492507086913, |
|
"learning_rate": 8.96535838521314e-06, |
|
"loss": 0.8033, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.4761904761904763, |
|
"grad_norm": 0.4794987734861929, |
|
"learning_rate": 8.931304979194452e-06, |
|
"loss": 0.8069, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.492063492063492, |
|
"grad_norm": 0.4658669415595415, |
|
"learning_rate": 8.896767111808177e-06, |
|
"loss": 0.7371, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.507936507936508, |
|
"grad_norm": 0.5683125861447418, |
|
"learning_rate": 8.861749039214177e-06, |
|
"loss": 0.9145, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.5238095238095237, |
|
"grad_norm": 0.47857884026171116, |
|
"learning_rate": 8.826255076748823e-06, |
|
"loss": 0.8455, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.5396825396825395, |
|
"grad_norm": 0.429389167302876, |
|
"learning_rate": 8.790289598393186e-06, |
|
"loss": 0.7216, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.5555555555555556, |
|
"grad_norm": 0.522031534882144, |
|
"learning_rate": 8.753857036234055e-06, |
|
"loss": 0.8155, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.5714285714285714, |
|
"grad_norm": 0.5375692580431519, |
|
"learning_rate": 8.716961879917734e-06, |
|
"loss": 0.7373, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.5873015873015874, |
|
"grad_norm": 0.4277716225580266, |
|
"learning_rate": 8.679608676096793e-06, |
|
"loss": 0.8132, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.6031746031746033, |
|
"grad_norm": 0.9709114563751018, |
|
"learning_rate": 8.641802027869774e-06, |
|
"loss": 0.7952, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.619047619047619, |
|
"grad_norm": 0.6722991060253756, |
|
"learning_rate": 8.603546594213935e-06, |
|
"loss": 0.8566, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.6349206349206349, |
|
"grad_norm": 0.48227435877100366, |
|
"learning_rate": 8.564847089411128e-06, |
|
"loss": 0.8292, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.6507936507936507, |
|
"grad_norm": 0.43738769808282163, |
|
"learning_rate": 8.525708282466839e-06, |
|
"loss": 0.8424, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.42758983764847835, |
|
"learning_rate": 8.486134996522502e-06, |
|
"loss": 0.8179, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.6825396825396826, |
|
"grad_norm": 0.6465752665836958, |
|
"learning_rate": 8.446132108261136e-06, |
|
"loss": 0.806, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.6984126984126984, |
|
"grad_norm": 0.5216064305348748, |
|
"learning_rate": 8.405704547306379e-06, |
|
"loss": 0.8041, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 0.46284349128240304, |
|
"learning_rate": 8.364857295615006e-06, |
|
"loss": 0.8924, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.7301587301587302, |
|
"grad_norm": 0.48814352812138595, |
|
"learning_rate": 8.323595386862985e-06, |
|
"loss": 0.7929, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.746031746031746, |
|
"grad_norm": 0.48088506678769916, |
|
"learning_rate": 8.281923905825188e-06, |
|
"loss": 0.7671, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.7619047619047619, |
|
"grad_norm": 0.4594586947272896, |
|
"learning_rate": 8.23984798774876e-06, |
|
"loss": 0.7366, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.4673793179812366, |
|
"learning_rate": 8.197372817720314e-06, |
|
"loss": 0.7397, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.7936507936507935, |
|
"grad_norm": 0.6557346369623661, |
|
"learning_rate": 8.154503630026955e-06, |
|
"loss": 0.7262, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.8095238095238095, |
|
"grad_norm": 0.45128446254113314, |
|
"learning_rate": 8.111245707511253e-06, |
|
"loss": 0.7213, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.8253968253968254, |
|
"grad_norm": 0.41666335434637974, |
|
"learning_rate": 8.067604380920228e-06, |
|
"loss": 0.7952, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.8412698412698414, |
|
"grad_norm": 0.4407610683896587, |
|
"learning_rate": 8.023585028248435e-06, |
|
"loss": 0.8486, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.8571428571428572, |
|
"grad_norm": 0.5501977264080524, |
|
"learning_rate": 7.979193074075216e-06, |
|
"loss": 0.8911, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.873015873015873, |
|
"grad_norm": 0.459940871244406, |
|
"learning_rate": 7.934433988896233e-06, |
|
"loss": 0.6535, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.8888888888888888, |
|
"grad_norm": 0.46949896874504654, |
|
"learning_rate": 7.889313288449323e-06, |
|
"loss": 0.8232, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 0.41110722374315695, |
|
"learning_rate": 7.843836533034784e-06, |
|
"loss": 0.7628, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.9206349206349205, |
|
"grad_norm": 0.47755036946919965, |
|
"learning_rate": 7.798009326830167e-06, |
|
"loss": 0.8003, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.9365079365079365, |
|
"grad_norm": 0.41342145123270885, |
|
"learning_rate": 7.751837317199673e-06, |
|
"loss": 0.8683, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.9523809523809523, |
|
"grad_norm": 0.4479867168170251, |
|
"learning_rate": 7.705326193998207e-06, |
|
"loss": 0.7552, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.9682539682539684, |
|
"grad_norm": 0.4549548876094008, |
|
"learning_rate": 7.658481688870218e-06, |
|
"loss": 0.7587, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.9841269841269842, |
|
"grad_norm": 0.4684989926335189, |
|
"learning_rate": 7.611309574543373e-06, |
|
"loss": 0.7607, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.4367513791425883, |
|
"learning_rate": 7.563815664117173e-06, |
|
"loss": 0.9146, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.015873015873016, |
|
"grad_norm": 0.7927149278076437, |
|
"learning_rate": 7.5160058103465985e-06, |
|
"loss": 0.7131, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 2.0317460317460316, |
|
"grad_norm": 0.5847918647965703, |
|
"learning_rate": 7.467885904920864e-06, |
|
"loss": 0.7578, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.0476190476190474, |
|
"grad_norm": 0.7836046335272314, |
|
"learning_rate": 7.419461877737373e-06, |
|
"loss": 0.8327, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 2.0634920634920633, |
|
"grad_norm": 2.1428241341527117, |
|
"learning_rate": 7.370739696170971e-06, |
|
"loss": 0.7441, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.0793650793650795, |
|
"grad_norm": 0.9566247813485141, |
|
"learning_rate": 7.321725364338566e-06, |
|
"loss": 0.6185, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.0952380952380953, |
|
"grad_norm": 0.5336099004301172, |
|
"learning_rate": 7.272424922359246e-06, |
|
"loss": 0.6455, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.111111111111111, |
|
"grad_norm": 0.7132260718912609, |
|
"learning_rate": 7.222844445609931e-06, |
|
"loss": 0.7834, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 2.126984126984127, |
|
"grad_norm": 0.5749113101610002, |
|
"learning_rate": 7.172990043976703e-06, |
|
"loss": 0.7296, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 0.5366676899164674, |
|
"learning_rate": 7.122867861101868e-06, |
|
"loss": 0.795, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.1587301587301586, |
|
"grad_norm": 0.44931031781346276, |
|
"learning_rate": 7.072484073626872e-06, |
|
"loss": 0.6875, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.1746031746031744, |
|
"grad_norm": 0.6709913679680917, |
|
"learning_rate": 7.021844890431136e-06, |
|
"loss": 0.7669, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 2.1904761904761907, |
|
"grad_norm": 0.5782700607354144, |
|
"learning_rate": 6.970956551866925e-06, |
|
"loss": 0.7273, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.2063492063492065, |
|
"grad_norm": 0.5008612890527109, |
|
"learning_rate": 6.9198253289903515e-06, |
|
"loss": 0.6634, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.5733594756270326, |
|
"learning_rate": 6.868457522788561e-06, |
|
"loss": 0.7358, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.238095238095238, |
|
"grad_norm": 0.48532685396257946, |
|
"learning_rate": 6.816859463403271e-06, |
|
"loss": 0.659, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 2.253968253968254, |
|
"grad_norm": 0.5460096768726493, |
|
"learning_rate": 6.765037509350685e-06, |
|
"loss": 0.7585, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 2.2698412698412698, |
|
"grad_norm": 0.4827715321224934, |
|
"learning_rate": 6.7129980467379265e-06, |
|
"loss": 0.6664, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 2.2857142857142856, |
|
"grad_norm": 0.5417449745700821, |
|
"learning_rate": 6.660747488476066e-06, |
|
"loss": 0.663, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.3015873015873014, |
|
"grad_norm": 0.5672091588208017, |
|
"learning_rate": 6.608292273489851e-06, |
|
"loss": 0.6122, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.317460317460317, |
|
"grad_norm": 0.5264115445856029, |
|
"learning_rate": 6.555638865924221e-06, |
|
"loss": 0.7035, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 2.3333333333333335, |
|
"grad_norm": 0.5168486054014866, |
|
"learning_rate": 6.502793754347721e-06, |
|
"loss": 0.7598, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 2.3492063492063493, |
|
"grad_norm": 0.6085627519823247, |
|
"learning_rate": 6.449763450952912e-06, |
|
"loss": 0.6875, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 2.365079365079365, |
|
"grad_norm": 0.504951049632705, |
|
"learning_rate": 6.396554490753848e-06, |
|
"loss": 0.6839, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 0.42239268629753335, |
|
"learning_rate": 6.343173430780769e-06, |
|
"loss": 0.8396, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.3968253968253967, |
|
"grad_norm": 0.5170870251352963, |
|
"learning_rate": 6.289626849272062e-06, |
|
"loss": 0.8013, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 2.4126984126984126, |
|
"grad_norm": 0.5408561718958109, |
|
"learning_rate": 6.2359213448636104e-06, |
|
"loss": 0.754, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 2.4285714285714284, |
|
"grad_norm": 0.42606389993166277, |
|
"learning_rate": 6.182063535775634e-06, |
|
"loss": 0.7662, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.4444444444444446, |
|
"grad_norm": 0.41021417431281776, |
|
"learning_rate": 6.1280600589971225e-06, |
|
"loss": 0.791, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 2.4603174603174605, |
|
"grad_norm": 0.4068459581892925, |
|
"learning_rate": 6.073917569467934e-06, |
|
"loss": 0.8066, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.4761904761904763, |
|
"grad_norm": 0.40243757072180364, |
|
"learning_rate": 6.0196427392587085e-06, |
|
"loss": 0.7061, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.492063492063492, |
|
"grad_norm": 0.5924677871750427, |
|
"learning_rate": 5.96524225674865e-06, |
|
"loss": 0.744, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 2.507936507936508, |
|
"grad_norm": 0.4344103520994765, |
|
"learning_rate": 5.9107228258013085e-06, |
|
"loss": 0.7076, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 2.5238095238095237, |
|
"grad_norm": 0.4824828219676673, |
|
"learning_rate": 5.856091164938451e-06, |
|
"loss": 0.6534, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.5396825396825395, |
|
"grad_norm": 0.4197375023372333, |
|
"learning_rate": 5.801354006512127e-06, |
|
"loss": 0.6902, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.5555555555555554, |
|
"grad_norm": 0.4523354962317184, |
|
"learning_rate": 5.746518095875033e-06, |
|
"loss": 0.6996, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 0.41073692830700287, |
|
"learning_rate": 5.6915901905492586e-06, |
|
"loss": 0.629, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.5873015873015874, |
|
"grad_norm": 0.5807356357914126, |
|
"learning_rate": 5.6365770593935665e-06, |
|
"loss": 0.5924, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 2.6031746031746033, |
|
"grad_norm": 0.5296154741304107, |
|
"learning_rate": 5.581485481769231e-06, |
|
"loss": 0.7197, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 2.619047619047619, |
|
"grad_norm": 0.4462893254042338, |
|
"learning_rate": 5.526322246704628e-06, |
|
"loss": 0.8007, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.634920634920635, |
|
"grad_norm": 0.3974463949753287, |
|
"learning_rate": 5.471094152058592e-06, |
|
"loss": 0.6822, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 2.6507936507936507, |
|
"grad_norm": 0.46244966479154553, |
|
"learning_rate": 5.415808003682717e-06, |
|
"loss": 0.7318, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.438557400530548, |
|
"learning_rate": 5.360470614582661e-06, |
|
"loss": 0.7147, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.682539682539683, |
|
"grad_norm": 0.5680373876053647, |
|
"learning_rate": 5.305088804078559e-06, |
|
"loss": 0.7357, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 2.6984126984126986, |
|
"grad_norm": 0.4556205137087138, |
|
"learning_rate": 5.249669396964665e-06, |
|
"loss": 0.6361, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.7142857142857144, |
|
"grad_norm": 0.44940699263796485, |
|
"learning_rate": 5.1942192226683385e-06, |
|
"loss": 0.7778, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.7301587301587302, |
|
"grad_norm": 0.47535854965434626, |
|
"learning_rate": 5.138745114408427e-06, |
|
"loss": 0.6008, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 2.746031746031746, |
|
"grad_norm": 0.5020715004802897, |
|
"learning_rate": 5.083253908353193e-06, |
|
"loss": 0.6696, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 2.761904761904762, |
|
"grad_norm": 0.4715489187155987, |
|
"learning_rate": 5.0277524427778986e-06, |
|
"loss": 0.7846, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"grad_norm": 0.44938039077917374, |
|
"learning_rate": 4.972247557222102e-06, |
|
"loss": 0.7187, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.7936507936507935, |
|
"grad_norm": 0.536309868809644, |
|
"learning_rate": 4.916746091646808e-06, |
|
"loss": 0.6818, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 2.8095238095238093, |
|
"grad_norm": 0.4238224566275176, |
|
"learning_rate": 4.8612548855915755e-06, |
|
"loss": 0.7252, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.825396825396825, |
|
"grad_norm": 0.5075369152051689, |
|
"learning_rate": 4.805780777331662e-06, |
|
"loss": 0.7461, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 2.8412698412698414, |
|
"grad_norm": 0.463068134108742, |
|
"learning_rate": 4.750330603035336e-06, |
|
"loss": 0.7141, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.44910366292391646, |
|
"learning_rate": 4.694911195921443e-06, |
|
"loss": 0.7278, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.873015873015873, |
|
"grad_norm": 0.43362119780351166, |
|
"learning_rate": 4.6395293854173395e-06, |
|
"loss": 0.6069, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 2.888888888888889, |
|
"grad_norm": 0.7285135499415637, |
|
"learning_rate": 4.584191996317285e-06, |
|
"loss": 0.6846, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 2.9047619047619047, |
|
"grad_norm": 0.49976201370002465, |
|
"learning_rate": 4.528905847941411e-06, |
|
"loss": 0.843, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 2.9206349206349205, |
|
"grad_norm": 0.47745344638517, |
|
"learning_rate": 4.473677753295375e-06, |
|
"loss": 0.6609, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 2.9365079365079367, |
|
"grad_norm": 0.4075892143069301, |
|
"learning_rate": 4.418514518230769e-06, |
|
"loss": 0.7133, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.9523809523809526, |
|
"grad_norm": 0.490679894902017, |
|
"learning_rate": 4.363422940606435e-06, |
|
"loss": 0.7483, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.9682539682539684, |
|
"grad_norm": 0.507751484260846, |
|
"learning_rate": 4.308409809450742e-06, |
|
"loss": 0.7635, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 2.984126984126984, |
|
"grad_norm": 0.5129728167302848, |
|
"learning_rate": 4.253481904124968e-06, |
|
"loss": 0.7353, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.44280290900369257, |
|
"learning_rate": 4.198645993487872e-06, |
|
"loss": 0.6059, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 3.015873015873016, |
|
"grad_norm": 0.7949393554198322, |
|
"learning_rate": 4.143908835061551e-06, |
|
"loss": 0.6868, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.0317460317460316, |
|
"grad_norm": 0.5012314119268376, |
|
"learning_rate": 4.089277174198694e-06, |
|
"loss": 0.7037, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 3.0476190476190474, |
|
"grad_norm": 0.8765248539640519, |
|
"learning_rate": 4.0347577432513515e-06, |
|
"loss": 0.746, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 3.0634920634920633, |
|
"grad_norm": 0.5276377235611475, |
|
"learning_rate": 3.980357260741293e-06, |
|
"loss": 0.6836, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 3.0793650793650795, |
|
"grad_norm": 0.5739417223582697, |
|
"learning_rate": 3.926082430532067e-06, |
|
"loss": 0.6428, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 3.0952380952380953, |
|
"grad_norm": 0.6325463534989497, |
|
"learning_rate": 3.87193994100288e-06, |
|
"loss": 0.6092, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 3.111111111111111, |
|
"grad_norm": 0.6843617935822326, |
|
"learning_rate": 3.817936464224367e-06, |
|
"loss": 0.6763, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 3.126984126984127, |
|
"grad_norm": 0.5698355849375702, |
|
"learning_rate": 3.764078655136391e-06, |
|
"loss": 0.7472, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 3.142857142857143, |
|
"grad_norm": 0.5699592517012283, |
|
"learning_rate": 3.7103731507279383e-06, |
|
"loss": 0.7029, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 3.1587301587301586, |
|
"grad_norm": 0.4423177821267063, |
|
"learning_rate": 3.656826569219233e-06, |
|
"loss": 0.6717, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 3.1746031746031744, |
|
"grad_norm": 0.5057172241583261, |
|
"learning_rate": 3.603445509246154e-06, |
|
"loss": 0.6429, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.1904761904761907, |
|
"grad_norm": 0.4627898485974749, |
|
"learning_rate": 3.55023654904709e-06, |
|
"loss": 0.7171, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 3.2063492063492065, |
|
"grad_norm": 0.4765018395044146, |
|
"learning_rate": 3.49720624565228e-06, |
|
"loss": 0.5539, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 3.2222222222222223, |
|
"grad_norm": 0.46472301884916256, |
|
"learning_rate": 3.44436113407578e-06, |
|
"loss": 0.6852, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 3.238095238095238, |
|
"grad_norm": 0.7227019117707013, |
|
"learning_rate": 3.3917077265101505e-06, |
|
"loss": 0.751, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 3.253968253968254, |
|
"grad_norm": 0.46124162458293566, |
|
"learning_rate": 3.3392525115239353e-06, |
|
"loss": 0.5753, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 3.2698412698412698, |
|
"grad_norm": 1.150296667678599, |
|
"learning_rate": 3.2870019532620744e-06, |
|
"loss": 0.7116, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 3.2857142857142856, |
|
"grad_norm": 0.45629133977245157, |
|
"learning_rate": 3.2349624906493164e-06, |
|
"loss": 0.6506, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 3.3015873015873014, |
|
"grad_norm": 0.6330635820823547, |
|
"learning_rate": 3.1831405365967315e-06, |
|
"loss": 0.5314, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 3.317460317460317, |
|
"grad_norm": 0.5103783322022635, |
|
"learning_rate": 3.1315424772114404e-06, |
|
"loss": 0.7163, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.5407291227510194, |
|
"learning_rate": 3.0801746710096497e-06, |
|
"loss": 0.5543, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.3492063492063493, |
|
"grad_norm": 0.5148220790962434, |
|
"learning_rate": 3.0290434481330746e-06, |
|
"loss": 0.5885, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 3.365079365079365, |
|
"grad_norm": 0.4436633763550198, |
|
"learning_rate": 2.978155109568864e-06, |
|
"loss": 0.6205, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 3.380952380952381, |
|
"grad_norm": 0.3967936149581789, |
|
"learning_rate": 2.927515926373129e-06, |
|
"loss": 0.6664, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 3.3968253968253967, |
|
"grad_norm": 0.4218431236274798, |
|
"learning_rate": 2.8771321388981334e-06, |
|
"loss": 0.6664, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 3.4126984126984126, |
|
"grad_norm": 0.6850450086103512, |
|
"learning_rate": 2.8270099560232992e-06, |
|
"loss": 0.71, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 3.4285714285714284, |
|
"grad_norm": 0.49624036517991055, |
|
"learning_rate": 2.77715555439007e-06, |
|
"loss": 0.7545, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 3.4444444444444446, |
|
"grad_norm": 0.4389117683902036, |
|
"learning_rate": 2.7275750776407568e-06, |
|
"loss": 0.6014, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 3.4603174603174605, |
|
"grad_norm": 0.44946398322882497, |
|
"learning_rate": 2.6782746356614364e-06, |
|
"loss": 0.5866, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 3.4761904761904763, |
|
"grad_norm": 0.5130278875069821, |
|
"learning_rate": 2.6292603038290306e-06, |
|
"loss": 0.7161, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 3.492063492063492, |
|
"grad_norm": 0.6079510505594462, |
|
"learning_rate": 2.580538122262627e-06, |
|
"loss": 0.6545, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.507936507936508, |
|
"grad_norm": 0.5185432227363381, |
|
"learning_rate": 2.532114095079137e-06, |
|
"loss": 0.5745, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 3.5238095238095237, |
|
"grad_norm": 0.47475284651402894, |
|
"learning_rate": 2.4839941896534027e-06, |
|
"loss": 0.6287, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 3.5396825396825395, |
|
"grad_norm": 0.4721541505351033, |
|
"learning_rate": 2.4361843358828287e-06, |
|
"loss": 0.5891, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 0.6637884613662758, |
|
"learning_rate": 2.388690425456629e-06, |
|
"loss": 0.7191, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 0.49879760044528987, |
|
"learning_rate": 2.341518311129781e-06, |
|
"loss": 0.5703, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 3.5873015873015874, |
|
"grad_norm": 0.42482396593298977, |
|
"learning_rate": 2.2946738060017947e-06, |
|
"loss": 0.706, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 3.6031746031746033, |
|
"grad_norm": 0.5244046208280333, |
|
"learning_rate": 2.24816268280033e-06, |
|
"loss": 0.6567, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 3.619047619047619, |
|
"grad_norm": 0.45713016270372664, |
|
"learning_rate": 2.2019906731698337e-06, |
|
"loss": 0.6519, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 3.634920634920635, |
|
"grad_norm": 0.3877769815934568, |
|
"learning_rate": 2.156163466965218e-06, |
|
"loss": 0.63, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 3.6507936507936507, |
|
"grad_norm": 0.5213212350040638, |
|
"learning_rate": 2.110686711550678e-06, |
|
"loss": 0.7059, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.6666666666666665, |
|
"grad_norm": 0.39785815195503926, |
|
"learning_rate": 2.0655660111037685e-06, |
|
"loss": 0.6371, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 3.682539682539683, |
|
"grad_norm": 0.42984736444835686, |
|
"learning_rate": 2.0208069259247866e-06, |
|
"loss": 0.659, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 3.6984126984126986, |
|
"grad_norm": 0.4602245826690893, |
|
"learning_rate": 1.976414971751568e-06, |
|
"loss": 0.6043, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 3.7142857142857144, |
|
"grad_norm": 0.4841459331133356, |
|
"learning_rate": 1.932395619079771e-06, |
|
"loss": 0.6762, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 3.7301587301587302, |
|
"grad_norm": 0.38677471787487294, |
|
"learning_rate": 1.8887542924887486e-06, |
|
"loss": 0.7034, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 3.746031746031746, |
|
"grad_norm": 0.669293977144537, |
|
"learning_rate": 1.8454963699730471e-06, |
|
"loss": 0.6753, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 3.761904761904762, |
|
"grad_norm": 0.9653712675428361, |
|
"learning_rate": 1.802627182279687e-06, |
|
"loss": 0.5958, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 3.7777777777777777, |
|
"grad_norm": 0.8023679085767069, |
|
"learning_rate": 1.760152012251241e-06, |
|
"loss": 0.5046, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 3.7936507936507935, |
|
"grad_norm": 0.523400494967504, |
|
"learning_rate": 1.7180760941748132e-06, |
|
"loss": 0.6704, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 3.8095238095238093, |
|
"grad_norm": 0.5650557974529034, |
|
"learning_rate": 1.6764046131370142e-06, |
|
"loss": 0.7334, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.825396825396825, |
|
"grad_norm": 0.446176823325039, |
|
"learning_rate": 1.6351427043849955e-06, |
|
"loss": 0.6972, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 3.8412698412698414, |
|
"grad_norm": 0.5687688471627884, |
|
"learning_rate": 1.5942954526936217e-06, |
|
"loss": 0.6563, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 3.857142857142857, |
|
"grad_norm": 0.588174299336183, |
|
"learning_rate": 1.5538678917388638e-06, |
|
"loss": 0.6638, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 3.873015873015873, |
|
"grad_norm": 0.48774660261391006, |
|
"learning_rate": 1.5138650034775004e-06, |
|
"loss": 0.5733, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 3.888888888888889, |
|
"grad_norm": 0.4185611368252772, |
|
"learning_rate": 1.4742917175331644e-06, |
|
"loss": 0.7174, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 3.9047619047619047, |
|
"grad_norm": 0.43603269341453055, |
|
"learning_rate": 1.4351529105888735e-06, |
|
"loss": 0.7672, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 3.9206349206349205, |
|
"grad_norm": 0.4318904871120016, |
|
"learning_rate": 1.3964534057860652e-06, |
|
"loss": 0.5978, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 3.9365079365079367, |
|
"grad_norm": 0.40904640871839104, |
|
"learning_rate": 1.3581979721302286e-06, |
|
"loss": 0.6579, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 3.9523809523809526, |
|
"grad_norm": 0.6005145592007414, |
|
"learning_rate": 1.3203913239032074e-06, |
|
"loss": 0.6694, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 3.9682539682539684, |
|
"grad_norm": 0.472367689533449, |
|
"learning_rate": 1.283038120082268e-06, |
|
"loss": 0.6197, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.984126984126984, |
|
"grad_norm": 0.4356830095251736, |
|
"learning_rate": 1.2461429637659466e-06, |
|
"loss": 0.6213, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.4857139342731584, |
|
"learning_rate": 1.2097104016068146e-06, |
|
"loss": 0.6352, |
|
"step": 252 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 315, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 62903662804992.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |