|
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 315,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015873015873015872,
      "grad_norm": 2.3323949793109238,
      "learning_rate": 0.0,
      "loss": 1.0469,
      "step": 1
    },
    {
      "epoch": 0.031746031746031744,
      "grad_norm": 2.317201200688066,
      "learning_rate": 3.125e-07,
      "loss": 0.9931,
      "step": 2
    },
    {
      "epoch": 0.047619047619047616,
      "grad_norm": 2.603689956679125,
      "learning_rate": 6.25e-07,
      "loss": 1.0188,
      "step": 3
    },
    {
      "epoch": 0.06349206349206349,
      "grad_norm": 2.2583787301898592,
      "learning_rate": 9.375000000000001e-07,
      "loss": 0.9097,
      "step": 4
    },
    {
      "epoch": 0.07936507936507936,
      "grad_norm": 2.197466891038096,
      "learning_rate": 1.25e-06,
      "loss": 1.0459,
      "step": 5
    },
    {
      "epoch": 0.09523809523809523,
      "grad_norm": 2.1259963361099747,
      "learning_rate": 1.5625e-06,
      "loss": 0.9986,
      "step": 6
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 2.0707820881041,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 0.9555,
      "step": 7
    },
    {
      "epoch": 0.12698412698412698,
      "grad_norm": 1.870407527874291,
      "learning_rate": 2.1875000000000002e-06,
      "loss": 0.952,
      "step": 8
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 1.8578085390534953,
      "learning_rate": 2.5e-06,
      "loss": 0.9993,
      "step": 9
    },
    {
      "epoch": 0.15873015873015872,
      "grad_norm": 1.881148688458384,
      "learning_rate": 2.8125e-06,
      "loss": 0.9373,
      "step": 10
    },
    {
      "epoch": 0.1746031746031746,
      "grad_norm": 1.6917769845914787,
      "learning_rate": 3.125e-06,
      "loss": 0.8839,
      "step": 11
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 1.2541345576396532,
      "learning_rate": 3.4375e-06,
      "loss": 0.9909,
      "step": 12
    },
    {
      "epoch": 0.20634920634920634,
      "grad_norm": 1.4038335670152517,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.9322,
      "step": 13
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 1.363468897891553,
      "learning_rate": 4.0625000000000005e-06,
      "loss": 1.0934,
      "step": 14
    },
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 1.1331989679866032,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 0.977,
      "step": 15
    },
    {
      "epoch": 0.25396825396825395,
      "grad_norm": 0.9899834287202586,
      "learning_rate": 4.6875000000000004e-06,
      "loss": 1.0443,
      "step": 16
    },
    {
      "epoch": 0.2698412698412698,
      "grad_norm": 1.155920523517074,
      "learning_rate": 5e-06,
      "loss": 0.9483,
      "step": 17
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 1.2715867938274161,
      "learning_rate": 5.3125e-06,
      "loss": 1.0096,
      "step": 18
    },
    {
      "epoch": 0.30158730158730157,
      "grad_norm": 0.9922231339593638,
      "learning_rate": 5.625e-06,
      "loss": 0.7463,
      "step": 19
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 1.2551959582539625,
      "learning_rate": 5.9375e-06,
      "loss": 0.9226,
      "step": 20
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.892951024999124,
      "learning_rate": 6.25e-06,
      "loss": 0.988,
      "step": 21
    },
    {
      "epoch": 0.3492063492063492,
      "grad_norm": 1.4360539096520086,
      "learning_rate": 6.5625e-06,
      "loss": 1.0509,
      "step": 22
    },
    {
      "epoch": 0.36507936507936506,
      "grad_norm": 1.1100051374669628,
      "learning_rate": 6.875e-06,
      "loss": 0.8728,
      "step": 23
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 0.9630208551024003,
      "learning_rate": 7.1875e-06,
      "loss": 0.8352,
      "step": 24
    },
    {
      "epoch": 0.3968253968253968,
      "grad_norm": 1.109963225007402,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.0289,
      "step": 25
    },
    {
      "epoch": 0.4126984126984127,
      "grad_norm": 0.842175710243708,
      "learning_rate": 7.8125e-06,
      "loss": 0.8616,
      "step": 26
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.8255762742603932,
      "learning_rate": 8.125000000000001e-06,
      "loss": 0.7234,
      "step": 27
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.8274507712792363,
      "learning_rate": 8.4375e-06,
      "loss": 0.9758,
      "step": 28
    },
    {
      "epoch": 0.4603174603174603,
      "grad_norm": 0.7834224887700044,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.9056,
      "step": 29
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 1.187020605300137,
      "learning_rate": 9.0625e-06,
      "loss": 0.9481,
      "step": 30
    },
    {
      "epoch": 0.49206349206349204,
      "grad_norm": 1.0233176856791018,
      "learning_rate": 9.375000000000001e-06,
      "loss": 0.9194,
      "step": 31
    },
    {
      "epoch": 0.5079365079365079,
      "grad_norm": 0.848791394024066,
      "learning_rate": 9.6875e-06,
      "loss": 0.8852,
      "step": 32
    },
    {
      "epoch": 0.5238095238095238,
      "grad_norm": 0.8289281876622956,
      "learning_rate": 1e-05,
      "loss": 1.038,
      "step": 33
    },
    {
      "epoch": 0.5396825396825397,
      "grad_norm": 0.7738330911179299,
      "learning_rate": 9.999691920767945e-06,
      "loss": 0.8374,
      "step": 34
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 0.65004421093035,
      "learning_rate": 9.998767721036901e-06,
      "loss": 0.8242,
      "step": 35
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.718229691257778,
      "learning_rate": 9.997227514697568e-06,
      "loss": 0.9693,
      "step": 36
    },
    {
      "epoch": 0.5873015873015873,
      "grad_norm": 0.598178727036991,
      "learning_rate": 9.99507149155218e-06,
      "loss": 0.9843,
      "step": 37
    },
    {
      "epoch": 0.6031746031746031,
      "grad_norm": 0.6896420594948925,
      "learning_rate": 9.992299917291118e-06,
      "loss": 0.848,
      "step": 38
    },
    {
      "epoch": 0.6190476190476191,
      "grad_norm": 0.7218001479564617,
      "learning_rate": 9.98891313346017e-06,
      "loss": 0.9095,
      "step": 39
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 0.673383804041238,
      "learning_rate": 9.984911557418444e-06,
      "loss": 0.7682,
      "step": 40
    },
    {
      "epoch": 0.6507936507936508,
      "grad_norm": 0.9044903125501461,
      "learning_rate": 9.980295682286924e-06,
      "loss": 0.8388,
      "step": 41
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.6528626470394925,
      "learning_rate": 9.97506607688772e-06,
      "loss": 0.9107,
      "step": 42
    },
    {
      "epoch": 0.6825396825396826,
      "grad_norm": 0.5248039585149111,
      "learning_rate": 9.969223385673958e-06,
      "loss": 0.8307,
      "step": 43
    },
    {
      "epoch": 0.6984126984126984,
      "grad_norm": 0.568338771820042,
      "learning_rate": 9.962768328650367e-06,
      "loss": 0.7523,
      "step": 44
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.5429855696185105,
      "learning_rate": 9.95570170128455e-06,
      "loss": 0.8442,
      "step": 45
    },
    {
      "epoch": 0.7301587301587301,
      "grad_norm": 0.5098426033492849,
      "learning_rate": 9.94802437440896e-06,
      "loss": 0.7962,
      "step": 46
    },
    {
      "epoch": 0.746031746031746,
      "grad_norm": 0.6078990273192543,
      "learning_rate": 9.939737294113585e-06,
      "loss": 0.8969,
      "step": 47
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 0.4709547244829324,
      "learning_rate": 9.930841481629358e-06,
      "loss": 0.8885,
      "step": 48
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 0.54039591858629,
      "learning_rate": 9.92133803320231e-06,
      "loss": 0.7818,
      "step": 49
    },
    {
      "epoch": 0.7936507936507936,
      "grad_norm": 0.4875170254753124,
      "learning_rate": 9.91122811995848e-06,
      "loss": 0.8193,
      "step": 50
    },
    {
      "epoch": 0.8095238095238095,
      "grad_norm": 0.5005396928536703,
      "learning_rate": 9.90051298775959e-06,
      "loss": 0.8692,
      "step": 51
    },
    {
      "epoch": 0.8253968253968254,
      "grad_norm": 0.40245216027036546,
      "learning_rate": 9.88919395704952e-06,
      "loss": 0.826,
      "step": 52
    },
    {
      "epoch": 0.8412698412698413,
      "grad_norm": 0.5389952051377087,
      "learning_rate": 9.877272422691583e-06,
      "loss": 0.9318,
      "step": 53
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.5638980417584056,
      "learning_rate": 9.864749853796642e-06,
      "loss": 0.7985,
      "step": 54
    },
    {
      "epoch": 0.873015873015873,
      "grad_norm": 0.5506830661309166,
      "learning_rate": 9.85162779354206e-06,
      "loss": 0.7291,
      "step": 55
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.48566023212019677,
      "learning_rate": 9.837907858981536e-06,
      "loss": 0.8802,
      "step": 56
    },
    {
      "epoch": 0.9047619047619048,
      "grad_norm": 0.4725406192484581,
      "learning_rate": 9.823591740845831e-06,
      "loss": 0.8627,
      "step": 57
    },
    {
      "epoch": 0.9206349206349206,
      "grad_norm": 0.5270784935436914,
      "learning_rate": 9.808681203334416e-06,
      "loss": 0.7976,
      "step": 58
    },
    {
      "epoch": 0.9365079365079365,
      "grad_norm": 0.4795159174595573,
      "learning_rate": 9.793178083898073e-06,
      "loss": 0.8783,
      "step": 59
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.42309628953003137,
      "learning_rate": 9.777084293012448e-06,
      "loss": 0.842,
      "step": 60
    },
    {
      "epoch": 0.9682539682539683,
      "grad_norm": 0.464555539059811,
      "learning_rate": 9.760401813942641e-06,
      "loss": 0.7662,
      "step": 61
    },
    {
      "epoch": 0.9841269841269841,
      "grad_norm": 0.5141212041737542,
      "learning_rate": 9.743132702498785e-06,
      "loss": 0.8688,
      "step": 62
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.5165788253828009,
      "learning_rate": 9.725279086782719e-06,
      "loss": 0.768,
      "step": 63
    },
    {
      "epoch": 1.0158730158730158,
      "grad_norm": 0.576629868282963,
      "learning_rate": 9.706843166925733e-06,
      "loss": 0.7989,
      "step": 64
    },
    {
      "epoch": 1.0317460317460316,
      "grad_norm": 0.4946943998511545,
      "learning_rate": 9.687827214817433e-06,
      "loss": 0.8261,
      "step": 65
    },
    {
      "epoch": 1.0476190476190477,
      "grad_norm": 0.4987216606535057,
      "learning_rate": 9.668233573825794e-06,
      "loss": 0.8905,
      "step": 66
    },
    {
      "epoch": 1.0634920634920635,
      "grad_norm": 0.45688977932466196,
      "learning_rate": 9.64806465850836e-06,
      "loss": 0.7327,
      "step": 67
    },
    {
      "epoch": 1.0793650793650793,
      "grad_norm": 0.5226340006885853,
      "learning_rate": 9.62732295431471e-06,
      "loss": 0.7311,
      "step": 68
    },
    {
      "epoch": 1.0952380952380953,
      "grad_norm": 0.6684025298786129,
      "learning_rate": 9.606011017280166e-06,
      "loss": 0.8971,
      "step": 69
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 0.5147703758608321,
      "learning_rate": 9.5841314737108e-06,
      "loss": 0.7652,
      "step": 70
    },
    {
      "epoch": 1.126984126984127,
      "grad_norm": 0.5417227409614662,
      "learning_rate": 9.56168701985981e-06,
      "loss": 0.7999,
      "step": 71
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.5016561221704748,
      "learning_rate": 9.538680421595236e-06,
      "loss": 0.8074,
      "step": 72
    },
    {
      "epoch": 1.1587301587301586,
      "grad_norm": 0.4853528793957531,
      "learning_rate": 9.515114514059127e-06,
      "loss": 0.8135,
      "step": 73
    },
    {
      "epoch": 1.1746031746031746,
      "grad_norm": 0.47765415470199357,
      "learning_rate": 9.490992201318165e-06,
      "loss": 0.7879,
      "step": 74
    },
    {
      "epoch": 1.1904761904761905,
      "grad_norm": 0.46535342031003013,
      "learning_rate": 9.466316456005783e-06,
      "loss": 0.7762,
      "step": 75
    },
    {
      "epoch": 1.2063492063492063,
      "grad_norm": 0.5033568814253909,
      "learning_rate": 9.441090318955843e-06,
      "loss": 0.7022,
      "step": 76
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 0.4986643533291915,
      "learning_rate": 9.415316898827923e-06,
      "loss": 0.7349,
      "step": 77
    },
    {
      "epoch": 1.2380952380952381,
      "grad_norm": 0.43657193718859494,
      "learning_rate": 9.388999371724212e-06,
      "loss": 0.8264,
      "step": 78
    },
    {
      "epoch": 1.253968253968254,
      "grad_norm": 0.47617277777848616,
      "learning_rate": 9.362140980798127e-06,
      "loss": 0.8944,
      "step": 79
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 0.4295219607791053,
      "learning_rate": 9.334745035854646e-06,
      "loss": 0.7588,
      "step": 80
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.5225987407011279,
      "learning_rate": 9.306814912942445e-06,
      "loss": 0.8359,
      "step": 81
    },
    {
      "epoch": 1.3015873015873016,
      "grad_norm": 0.4173684559568506,
      "learning_rate": 9.278354053937848e-06,
      "loss": 0.7804,
      "step": 82
    },
    {
      "epoch": 1.3174603174603174,
      "grad_norm": 0.5238592049595157,
      "learning_rate": 9.249365966120692e-06,
      "loss": 0.8564,
      "step": 83
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.4526393208745273,
      "learning_rate": 9.219854221742106e-06,
      "loss": 0.8102,
      "step": 84
    },
    {
      "epoch": 1.3492063492063493,
      "grad_norm": 0.44471888761912887,
      "learning_rate": 9.189822457584311e-06,
      "loss": 0.7439,
      "step": 85
    },
    {
      "epoch": 1.3650793650793651,
      "grad_norm": 0.43731884433734214,
      "learning_rate": 9.159274374512444e-06,
      "loss": 0.6592,
      "step": 86
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 0.4377614076782124,
      "learning_rate": 9.128213737018493e-06,
      "loss": 0.806,
      "step": 87
    },
    {
      "epoch": 1.3968253968253967,
      "grad_norm": 0.4027105033083121,
      "learning_rate": 9.096644372757393e-06,
      "loss": 0.8855,
      "step": 88
    },
    {
      "epoch": 1.4126984126984126,
      "grad_norm": 0.571463019194369,
      "learning_rate": 9.064570172075349e-06,
      "loss": 0.7979,
      "step": 89
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.4801097800367482,
      "learning_rate": 9.031995087530403e-06,
      "loss": 0.7992,
      "step": 90
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 0.47255682704462587,
      "learning_rate": 8.99892313340537e-06,
      "loss": 0.6633,
      "step": 91
    },
    {
      "epoch": 1.4603174603174602,
      "grad_norm": 0.4862492507086913,
      "learning_rate": 8.96535838521314e-06,
      "loss": 0.8033,
      "step": 92
    },
    {
      "epoch": 1.4761904761904763,
      "grad_norm": 0.4794987734861929,
      "learning_rate": 8.931304979194452e-06,
      "loss": 0.8069,
      "step": 93
    },
    {
      "epoch": 1.492063492063492,
      "grad_norm": 0.4658669415595415,
      "learning_rate": 8.896767111808177e-06,
      "loss": 0.7371,
      "step": 94
    },
    {
      "epoch": 1.507936507936508,
      "grad_norm": 0.5683125861447418,
      "learning_rate": 8.861749039214177e-06,
      "loss": 0.9145,
      "step": 95
    },
    {
      "epoch": 1.5238095238095237,
      "grad_norm": 0.47857884026171116,
      "learning_rate": 8.826255076748823e-06,
      "loss": 0.8455,
      "step": 96
    },
    {
      "epoch": 1.5396825396825395,
      "grad_norm": 0.429389167302876,
      "learning_rate": 8.790289598393186e-06,
      "loss": 0.7216,
      "step": 97
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 0.522031534882144,
      "learning_rate": 8.753857036234055e-06,
      "loss": 0.8155,
      "step": 98
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.5375692580431519,
      "learning_rate": 8.716961879917734e-06,
      "loss": 0.7373,
      "step": 99
    },
    {
      "epoch": 1.5873015873015874,
      "grad_norm": 0.4277716225580266,
      "learning_rate": 8.679608676096793e-06,
      "loss": 0.8132,
      "step": 100
    },
    {
      "epoch": 1.6031746031746033,
      "grad_norm": 0.9709114563751018,
      "learning_rate": 8.641802027869774e-06,
      "loss": 0.7952,
      "step": 101
    },
    {
      "epoch": 1.619047619047619,
      "grad_norm": 0.6722991060253756,
      "learning_rate": 8.603546594213935e-06,
      "loss": 0.8566,
      "step": 102
    },
    {
      "epoch": 1.6349206349206349,
      "grad_norm": 0.48227435877100366,
      "learning_rate": 8.564847089411128e-06,
      "loss": 0.8292,
      "step": 103
    },
    {
      "epoch": 1.6507936507936507,
      "grad_norm": 0.43738769808282163,
      "learning_rate": 8.525708282466839e-06,
      "loss": 0.8424,
      "step": 104
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.42758983764847835,
      "learning_rate": 8.486134996522502e-06,
      "loss": 0.8179,
      "step": 105
    },
    {
      "epoch": 1.6825396825396826,
      "grad_norm": 0.6465752665836958,
      "learning_rate": 8.446132108261136e-06,
      "loss": 0.806,
      "step": 106
    },
    {
      "epoch": 1.6984126984126984,
      "grad_norm": 0.5216064305348748,
      "learning_rate": 8.405704547306379e-06,
      "loss": 0.8041,
      "step": 107
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.46284349128240304,
      "learning_rate": 8.364857295615006e-06,
      "loss": 0.8924,
      "step": 108
    },
    {
      "epoch": 1.7301587301587302,
      "grad_norm": 0.48814352812138595,
      "learning_rate": 8.323595386862985e-06,
      "loss": 0.7929,
      "step": 109
    },
    {
      "epoch": 1.746031746031746,
      "grad_norm": 0.48088506678769916,
      "learning_rate": 8.281923905825188e-06,
      "loss": 0.7671,
      "step": 110
    },
    {
      "epoch": 1.7619047619047619,
      "grad_norm": 0.4594586947272896,
      "learning_rate": 8.23984798774876e-06,
      "loss": 0.7366,
      "step": 111
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.4673793179812366,
      "learning_rate": 8.197372817720314e-06,
      "loss": 0.7397,
      "step": 112
    },
    {
      "epoch": 1.7936507936507935,
      "grad_norm": 0.6557346369623661,
      "learning_rate": 8.154503630026955e-06,
      "loss": 0.7262,
      "step": 113
    },
    {
      "epoch": 1.8095238095238095,
      "grad_norm": 0.45128446254113314,
      "learning_rate": 8.111245707511253e-06,
      "loss": 0.7213,
      "step": 114
    },
    {
      "epoch": 1.8253968253968254,
      "grad_norm": 0.41666335434637974,
      "learning_rate": 8.067604380920228e-06,
      "loss": 0.7952,
      "step": 115
    },
    {
      "epoch": 1.8412698412698414,
      "grad_norm": 0.4407610683896587,
      "learning_rate": 8.023585028248435e-06,
      "loss": 0.8486,
      "step": 116
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.5501977264080524,
      "learning_rate": 7.979193074075216e-06,
      "loss": 0.8911,
      "step": 117
    },
    {
      "epoch": 1.873015873015873,
      "grad_norm": 0.459940871244406,
      "learning_rate": 7.934433988896233e-06,
      "loss": 0.6535,
      "step": 118
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.46949896874504654,
      "learning_rate": 7.889313288449323e-06,
      "loss": 0.8232,
      "step": 119
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.41110722374315695,
      "learning_rate": 7.843836533034784e-06,
      "loss": 0.7628,
      "step": 120
    },
    {
      "epoch": 1.9206349206349205,
      "grad_norm": 0.47755036946919965,
      "learning_rate": 7.798009326830167e-06,
      "loss": 0.8003,
      "step": 121
    },
    {
      "epoch": 1.9365079365079365,
      "grad_norm": 0.41342145123270885,
      "learning_rate": 7.751837317199673e-06,
      "loss": 0.8683,
      "step": 122
    },
    {
      "epoch": 1.9523809523809523,
      "grad_norm": 0.4479867168170251,
      "learning_rate": 7.705326193998207e-06,
      "loss": 0.7552,
      "step": 123
    },
    {
      "epoch": 1.9682539682539684,
      "grad_norm": 0.4549548876094008,
      "learning_rate": 7.658481688870218e-06,
      "loss": 0.7587,
      "step": 124
    },
    {
      "epoch": 1.9841269841269842,
      "grad_norm": 0.4684989926335189,
      "learning_rate": 7.611309574543373e-06,
      "loss": 0.7607,
      "step": 125
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.4367513791425883,
      "learning_rate": 7.563815664117173e-06,
      "loss": 0.9146,
      "step": 126
    },
    {
      "epoch": 2.015873015873016,
      "grad_norm": 0.7927149278076437,
      "learning_rate": 7.5160058103465985e-06,
      "loss": 0.7131,
      "step": 127
    },
    {
      "epoch": 2.0317460317460316,
      "grad_norm": 0.5847918647965703,
      "learning_rate": 7.467885904920864e-06,
      "loss": 0.7578,
      "step": 128
    },
    {
      "epoch": 2.0476190476190474,
      "grad_norm": 0.7836046335272314,
      "learning_rate": 7.419461877737373e-06,
      "loss": 0.8327,
      "step": 129
    },
    {
      "epoch": 2.0634920634920633,
      "grad_norm": 2.1428241341527117,
      "learning_rate": 7.370739696170971e-06,
      "loss": 0.7441,
      "step": 130
    },
    {
      "epoch": 2.0793650793650795,
      "grad_norm": 0.9566247813485141,
      "learning_rate": 7.321725364338566e-06,
      "loss": 0.6185,
      "step": 131
    },
    {
      "epoch": 2.0952380952380953,
      "grad_norm": 0.5336099004301172,
      "learning_rate": 7.272424922359246e-06,
      "loss": 0.6455,
      "step": 132
    },
    {
      "epoch": 2.111111111111111,
      "grad_norm": 0.7132260718912609,
      "learning_rate": 7.222844445609931e-06,
      "loss": 0.7834,
      "step": 133
    },
    {
      "epoch": 2.126984126984127,
      "grad_norm": 0.5749113101610002,
      "learning_rate": 7.172990043976703e-06,
      "loss": 0.7296,
      "step": 134
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.5366676899164674,
      "learning_rate": 7.122867861101868e-06,
      "loss": 0.795,
      "step": 135
    },
    {
      "epoch": 2.1587301587301586,
      "grad_norm": 0.44931031781346276,
      "learning_rate": 7.072484073626872e-06,
      "loss": 0.6875,
      "step": 136
    },
    {
      "epoch": 2.1746031746031744,
      "grad_norm": 0.6709913679680917,
      "learning_rate": 7.021844890431136e-06,
      "loss": 0.7669,
      "step": 137
    },
    {
      "epoch": 2.1904761904761907,
      "grad_norm": 0.5782700607354144,
      "learning_rate": 6.970956551866925e-06,
      "loss": 0.7273,
      "step": 138
    },
    {
      "epoch": 2.2063492063492065,
      "grad_norm": 0.5008612890527109,
      "learning_rate": 6.9198253289903515e-06,
      "loss": 0.6634,
      "step": 139
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.5733594756270326,
      "learning_rate": 6.868457522788561e-06,
      "loss": 0.7358,
      "step": 140
    },
    {
      "epoch": 2.238095238095238,
      "grad_norm": 0.48532685396257946,
      "learning_rate": 6.816859463403271e-06,
      "loss": 0.659,
      "step": 141
    },
    {
      "epoch": 2.253968253968254,
      "grad_norm": 0.5460096768726493,
      "learning_rate": 6.765037509350685e-06,
      "loss": 0.7585,
      "step": 142
    },
    {
      "epoch": 2.2698412698412698,
      "grad_norm": 0.4827715321224934,
      "learning_rate": 6.7129980467379265e-06,
      "loss": 0.6664,
      "step": 143
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.5417449745700821,
      "learning_rate": 6.660747488476066e-06,
      "loss": 0.663,
      "step": 144
    },
    {
      "epoch": 2.3015873015873014,
      "grad_norm": 0.5672091588208017,
      "learning_rate": 6.608292273489851e-06,
      "loss": 0.6122,
      "step": 145
    },
    {
      "epoch": 2.317460317460317,
      "grad_norm": 0.5264115445856029,
      "learning_rate": 6.555638865924221e-06,
      "loss": 0.7035,
      "step": 146
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.5168486054014866,
      "learning_rate": 6.502793754347721e-06,
      "loss": 0.7598,
      "step": 147
    },
    {
      "epoch": 2.3492063492063493,
      "grad_norm": 0.6085627519823247,
      "learning_rate": 6.449763450952912e-06,
      "loss": 0.6875,
      "step": 148
    },
    {
      "epoch": 2.365079365079365,
      "grad_norm": 0.504951049632705,
      "learning_rate": 6.396554490753848e-06,
      "loss": 0.6839,
      "step": 149
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.42239268629753335,
      "learning_rate": 6.343173430780769e-06,
      "loss": 0.8396,
      "step": 150
    },
    {
      "epoch": 2.3968253968253967,
      "grad_norm": 0.5170870251352963,
      "learning_rate": 6.289626849272062e-06,
      "loss": 0.8013,
      "step": 151
    },
    {
      "epoch": 2.4126984126984126,
      "grad_norm": 0.5408561718958109,
      "learning_rate": 6.2359213448636104e-06,
      "loss": 0.754,
      "step": 152
    },
    {
      "epoch": 2.4285714285714284,
      "grad_norm": 0.42606389993166277,
      "learning_rate": 6.182063535775634e-06,
      "loss": 0.7662,
      "step": 153
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 0.41021417431281776,
      "learning_rate": 6.1280600589971225e-06,
      "loss": 0.791,
      "step": 154
    },
    {
      "epoch": 2.4603174603174605,
      "grad_norm": 0.4068459581892925,
      "learning_rate": 6.073917569467934e-06,
      "loss": 0.8066,
      "step": 155
    },
    {
      "epoch": 2.4761904761904763,
      "grad_norm": 0.40243757072180364,
      "learning_rate": 6.0196427392587085e-06,
      "loss": 0.7061,
      "step": 156
    },
    {
      "epoch": 2.492063492063492,
      "grad_norm": 0.5924677871750427,
      "learning_rate": 5.96524225674865e-06,
      "loss": 0.744,
      "step": 157
    },
    {
      "epoch": 2.507936507936508,
      "grad_norm": 0.4344103520994765,
      "learning_rate": 5.9107228258013085e-06,
      "loss": 0.7076,
      "step": 158
    },
    {
      "epoch": 2.5238095238095237,
      "grad_norm": 0.4824828219676673,
      "learning_rate": 5.856091164938451e-06,
      "loss": 0.6534,
      "step": 159
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 0.4197375023372333,
      "learning_rate": 5.801354006512127e-06,
      "loss": 0.6902,
      "step": 160
    },
    {
      "epoch": 2.5555555555555554,
      "grad_norm": 0.4523354962317184,
      "learning_rate": 5.746518095875033e-06,
      "loss": 0.6996,
      "step": 161
    },
    {
      "epoch": 2.571428571428571,
      "grad_norm": 0.41073692830700287,
      "learning_rate": 5.6915901905492586e-06,
      "loss": 0.629,
      "step": 162
    },
    {
      "epoch": 2.5873015873015874,
      "grad_norm": 0.5807356357914126,
      "learning_rate": 5.6365770593935665e-06,
      "loss": 0.5924,
      "step": 163
    },
    {
      "epoch": 2.6031746031746033,
      "grad_norm": 0.5296154741304107,
      "learning_rate": 5.581485481769231e-06,
      "loss": 0.7197,
      "step": 164
    },
    {
      "epoch": 2.619047619047619,
      "grad_norm": 0.4462893254042338,
      "learning_rate": 5.526322246704628e-06,
      "loss": 0.8007,
      "step": 165
    },
    {
      "epoch": 2.634920634920635,
      "grad_norm": 0.3974463949753287,
      "learning_rate": 5.471094152058592e-06,
      "loss": 0.6822,
      "step": 166
    },
    {
      "epoch": 2.6507936507936507,
      "grad_norm": 0.46244966479154553,
      "learning_rate": 5.415808003682717e-06,
      "loss": 0.7318,
      "step": 167
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.438557400530548,
      "learning_rate": 5.360470614582661e-06,
      "loss": 0.7147,
      "step": 168
    },
    {
      "epoch": 2.682539682539683,
      "grad_norm": 0.5680373876053647,
      "learning_rate": 5.305088804078559e-06,
      "loss": 0.7357,
      "step": 169
    },
    {
      "epoch": 2.6984126984126986,
      "grad_norm": 0.4556205137087138,
      "learning_rate": 5.249669396964665e-06,
      "loss": 0.6361,
      "step": 170
    },
    {
      "epoch": 2.7142857142857144,
      "grad_norm": 0.44940699263796485,
      "learning_rate": 5.1942192226683385e-06,
      "loss": 0.7778,
      "step": 171
    },
    {
      "epoch": 2.7301587301587302,
      "grad_norm": 0.47535854965434626,
      "learning_rate": 5.138745114408427e-06,
      "loss": 0.6008,
      "step": 172
    },
    {
      "epoch": 2.746031746031746,
      "grad_norm": 0.5020715004802897,
      "learning_rate": 5.083253908353193e-06,
      "loss": 0.6696,
      "step": 173
    },
    {
      "epoch": 2.761904761904762,
      "grad_norm": 0.4715489187155987,
      "learning_rate": 5.0277524427778986e-06,
      "loss": 0.7846,
      "step": 174
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 0.44938039077917374,
      "learning_rate": 4.972247557222102e-06,
      "loss": 0.7187,
      "step": 175
    },
    {
      "epoch": 2.7936507936507935,
      "grad_norm": 0.536309868809644,
      "learning_rate": 4.916746091646808e-06,
      "loss": 0.6818,
      "step": 176
    },
    {
      "epoch": 2.8095238095238093,
      "grad_norm": 0.4238224566275176,
      "learning_rate": 4.8612548855915755e-06,
      "loss": 0.7252,
      "step": 177
    },
    {
      "epoch": 2.825396825396825,
      "grad_norm": 0.5075369152051689,
      "learning_rate": 4.805780777331662e-06,
      "loss": 0.7461,
      "step": 178
    },
    {
      "epoch": 2.8412698412698414,
      "grad_norm": 0.463068134108742,
      "learning_rate": 4.750330603035336e-06,
      "loss": 0.7141,
      "step": 179
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.44910366292391646,
      "learning_rate": 4.694911195921443e-06,
      "loss": 0.7278,
      "step": 180
    },
    {
      "epoch": 2.873015873015873,
      "grad_norm": 0.43362119780351166,
      "learning_rate": 4.6395293854173395e-06,
      "loss": 0.6069,
      "step": 181
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.7285135499415637,
      "learning_rate": 4.584191996317285e-06,
      "loss": 0.6846,
      "step": 182
    },
    {
      "epoch": 2.9047619047619047,
      "grad_norm": 0.49976201370002465,
      "learning_rate": 4.528905847941411e-06,
      "loss": 0.843,
      "step": 183
    },
    {
      "epoch": 2.9206349206349205,
      "grad_norm": 0.47745344638517,
      "learning_rate": 4.473677753295375e-06,
      "loss": 0.6609,
      "step": 184
    },
    {
      "epoch": 2.9365079365079367,
      "grad_norm": 0.4075892143069301,
      "learning_rate": 4.418514518230769e-06,
      "loss": 0.7133,
      "step": 185
    },
    {
      "epoch": 2.9523809523809526,
      "grad_norm": 0.490679894902017,
      "learning_rate": 4.363422940606435e-06,
      "loss": 0.7483,
      "step": 186
    },
    {
      "epoch": 2.9682539682539684,
      "grad_norm": 0.507751484260846,
      "learning_rate": 4.308409809450742e-06,
      "loss": 0.7635,
      "step": 187
    },
    {
      "epoch": 2.984126984126984,
      "grad_norm": 0.5129728167302848,
      "learning_rate": 4.253481904124968e-06,
      "loss": 0.7353,
      "step": 188
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.44280290900369257,
      "learning_rate": 4.198645993487872e-06,
      "loss": 0.6059,
      "step": 189
    },
    {
      "epoch": 3.015873015873016,
      "grad_norm": 0.7949393554198322,
      "learning_rate": 4.143908835061551e-06,
      "loss": 0.6868,
      "step": 190
    },
    {
      "epoch": 3.0317460317460316,
      "grad_norm": 0.5012314119268376,
      "learning_rate": 4.089277174198694e-06,
      "loss": 0.7037,
      "step": 191
    },
    {
      "epoch": 3.0476190476190474,
      "grad_norm": 0.8765248539640519,
      "learning_rate": 4.0347577432513515e-06,
      "loss": 0.746,
      "step": 192
    },
    {
      "epoch": 3.0634920634920633,
      "grad_norm": 0.5276377235611475,
      "learning_rate": 3.980357260741293e-06,
      "loss": 0.6836,
      "step": 193
    },
    {
      "epoch": 3.0793650793650795,
      "grad_norm": 0.5739417223582697,
      "learning_rate": 3.926082430532067e-06,
      "loss": 0.6428,
      "step": 194
    },
    {
      "epoch": 3.0952380952380953,
      "grad_norm": 0.6325463534989497,
      "learning_rate": 3.87193994100288e-06,
      "loss": 0.6092,
      "step": 195
    },
    {
      "epoch": 3.111111111111111,
      "grad_norm": 0.6843617935822326,
      "learning_rate": 3.817936464224367e-06,
      "loss": 0.6763,
      "step": 196
    },
    {
      "epoch": 3.126984126984127,
      "grad_norm": 0.5698355849375702,
      "learning_rate": 3.764078655136391e-06,
      "loss": 0.7472,
      "step": 197
    },
    {
      "epoch": 3.142857142857143,
      "grad_norm": 0.5699592517012283,
      "learning_rate": 3.7103731507279383e-06,
      "loss": 0.7029,
      "step": 198
    },
    {
      "epoch": 3.1587301587301586,
      "grad_norm": 0.4423177821267063,
      "learning_rate": 3.656826569219233e-06,
      "loss": 0.6717,
      "step": 199
    },
    {
      "epoch": 3.1746031746031744,
      "grad_norm": 0.5057172241583261,
      "learning_rate": 3.603445509246154e-06,
      "loss": 0.6429,
      "step": 200
    },
    {
      "epoch": 3.1904761904761907,
      "grad_norm": 0.4627898485974749,
      "learning_rate": 3.55023654904709e-06,
      "loss": 0.7171,
      "step": 201
    },
    {
      "epoch": 3.2063492063492065,
      "grad_norm": 0.4765018395044146,
      "learning_rate": 3.49720624565228e-06,
      "loss": 0.5539,
      "step": 202
    },
    {
      "epoch": 3.2222222222222223,
      "grad_norm": 0.46472301884916256,
      "learning_rate": 3.44436113407578e-06,
      "loss": 0.6852,
      "step": 203
    },
    {
      "epoch": 3.238095238095238,
      "grad_norm": 0.7227019117707013,
      "learning_rate": 3.3917077265101505e-06,
      "loss": 0.751,
      "step": 204
    },
    {
      "epoch": 3.253968253968254,
      "grad_norm": 0.46124162458293566,
      "learning_rate": 3.3392525115239353e-06,
      "loss": 0.5753,
      "step": 205
    },
    {
      "epoch": 3.2698412698412698,
      "grad_norm": 1.150296667678599,
      "learning_rate": 3.2870019532620744e-06,
      "loss": 0.7116,
      "step": 206
    },
    {
      "epoch": 3.2857142857142856,
      "grad_norm": 0.45629133977245157,
      "learning_rate": 3.2349624906493164e-06,
      "loss": 0.6506,
      "step": 207
    },
    {
      "epoch": 3.3015873015873014,
      "grad_norm": 0.6330635820823547,
      "learning_rate": 3.1831405365967315e-06,
      "loss": 0.5314,
      "step": 208
    },
    {
      "epoch": 3.317460317460317,
      "grad_norm": 0.5103783322022635,
      "learning_rate": 3.1315424772114404e-06,
      "loss": 0.7163,
      "step": 209
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.5407291227510194,
      "learning_rate": 3.0801746710096497e-06,
      "loss": 0.5543,
      "step": 210
    },
    {
      "epoch": 3.3492063492063493,
      "grad_norm": 0.5148220790962434,
      "learning_rate": 3.0290434481330746e-06,
      "loss": 0.5885,
      "step": 211
    },
    {
      "epoch": 3.365079365079365,
      "grad_norm": 0.4436633763550198,
      "learning_rate": 2.978155109568864e-06,
      "loss": 0.6205,
      "step": 212
    },
    {
      "epoch": 3.380952380952381,
      "grad_norm": 0.3967936149581789,
      "learning_rate": 2.927515926373129e-06,
      "loss": 0.6664,
      "step": 213
    },
    {
      "epoch": 3.3968253968253967,
      "grad_norm": 0.4218431236274798,
      "learning_rate": 2.8771321388981334e-06,
      "loss": 0.6664,
      "step": 214
    },
    {
      "epoch": 3.4126984126984126,
      "grad_norm": 0.6850450086103512,
      "learning_rate": 2.8270099560232992e-06,
      "loss": 0.71,
      "step": 215
    },
    {
      "epoch": 3.4285714285714284,
      "grad_norm": 0.49624036517991055,
      "learning_rate": 2.77715555439007e-06,
      "loss": 0.7545,
      "step": 216
    },
    {
      "epoch": 3.4444444444444446,
      "grad_norm": 0.4389117683902036,
      "learning_rate": 2.7275750776407568e-06,
      "loss": 0.6014,
      "step": 217
    },
    {
      "epoch": 3.4603174603174605,
      "grad_norm": 0.44946398322882497,
      "learning_rate": 2.6782746356614364e-06,
      "loss": 0.5866,
      "step": 218
    },
    {
      "epoch": 3.4761904761904763,
      "grad_norm": 0.5130278875069821,
      "learning_rate": 2.6292603038290306e-06,
      "loss": 0.7161,
      "step": 219
    },
    {
      "epoch": 3.492063492063492,
      "grad_norm": 0.6079510505594462,
      "learning_rate": 2.580538122262627e-06,
      "loss": 0.6545,
      "step": 220
    },
    {
      "epoch": 3.507936507936508,
      "grad_norm": 0.5185432227363381,
      "learning_rate": 2.532114095079137e-06,
      "loss": 0.5745,
      "step": 221
    },
    {
      "epoch": 3.5238095238095237,
      "grad_norm": 0.47475284651402894,
      "learning_rate": 2.4839941896534027e-06,
      "loss": 0.6287,
      "step": 222
    },
    {
      "epoch": 3.5396825396825395,
      "grad_norm": 0.4721541505351033,
      "learning_rate": 2.4361843358828287e-06,
      "loss": 0.5891,
      "step": 223
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 0.6637884613662758,
      "learning_rate": 2.388690425456629e-06,
      "loss": 0.7191,
      "step": 224
    },
    {
      "epoch": 3.571428571428571,
      "grad_norm": 0.49879760044528987,
      "learning_rate": 2.341518311129781e-06,
      "loss": 0.5703,
      "step": 225
    },
    {
      "epoch": 3.5873015873015874,
      "grad_norm": 0.42482396593298977,
      "learning_rate": 2.2946738060017947e-06,
      "loss": 0.706,
      "step": 226
    },
    {
      "epoch": 3.6031746031746033,
      "grad_norm": 0.5244046208280333,
      "learning_rate": 2.24816268280033e-06,
      "loss": 0.6567,
      "step": 227
    },
    {
      "epoch": 3.619047619047619,
      "grad_norm": 0.45713016270372664,
      "learning_rate": 2.2019906731698337e-06,
      "loss": 0.6519,
      "step": 228
    },
    {
      "epoch": 3.634920634920635,
      "grad_norm": 0.3877769815934568,
      "learning_rate": 2.156163466965218e-06,
      "loss": 0.63,
      "step": 229
    },
    {
      "epoch": 3.6507936507936507,
      "grad_norm": 0.5213212350040638,
      "learning_rate": 2.110686711550678e-06,
      "loss": 0.7059,
      "step": 230
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 0.39785815195503926,
      "learning_rate": 2.0655660111037685e-06,
      "loss": 0.6371,
      "step": 231
    },
    {
      "epoch": 3.682539682539683,
      "grad_norm": 0.42984736444835686,
      "learning_rate": 2.0208069259247866e-06,
      "loss": 0.659,
      "step": 232
    },
    {
      "epoch": 3.6984126984126986,
      "grad_norm": 0.4602245826690893,
      "learning_rate": 1.976414971751568e-06,
      "loss": 0.6043,
      "step": 233
    },
    {
      "epoch": 3.7142857142857144,
      "grad_norm": 0.4841459331133356,
      "learning_rate": 1.932395619079771e-06,
      "loss": 0.6762,
      "step": 234
    },
    {
      "epoch": 3.7301587301587302,
      "grad_norm": 0.38677471787487294,
      "learning_rate": 1.8887542924887486e-06,
      "loss": 0.7034,
      "step": 235
    },
    {
      "epoch": 3.746031746031746,
      "grad_norm": 0.669293977144537,
      "learning_rate": 1.8454963699730471e-06,
      "loss": 0.6753,
      "step": 236
    },
    {
      "epoch": 3.761904761904762,
      "grad_norm": 0.9653712675428361,
      "learning_rate": 1.802627182279687e-06,
      "loss": 0.5958,
      "step": 237
    },
    {
      "epoch": 3.7777777777777777,
      "grad_norm": 0.8023679085767069,
      "learning_rate": 1.760152012251241e-06,
      "loss": 0.5046,
      "step": 238
    },
    {
      "epoch": 3.7936507936507935,
      "grad_norm": 0.523400494967504,
      "learning_rate": 1.7180760941748132e-06,
      "loss": 0.6704,
      "step": 239
    },
    {
      "epoch": 3.8095238095238093,
      "grad_norm": 0.5650557974529034,
      "learning_rate": 1.6764046131370142e-06,
      "loss": 0.7334,
      "step": 240
    },
    {
      "epoch": 3.825396825396825,
      "grad_norm": 0.446176823325039,
      "learning_rate": 1.6351427043849955e-06,
      "loss": 0.6972,
      "step": 241
    },
    {
      "epoch": 3.8412698412698414,
      "grad_norm": 0.5687688471627884,
      "learning_rate": 1.5942954526936217e-06,
      "loss": 0.6563,
      "step": 242
    },
    {
      "epoch": 3.857142857142857,
      "grad_norm": 0.588174299336183,
      "learning_rate": 1.5538678917388638e-06,
      "loss": 0.6638,
      "step": 243
    },
    {
      "epoch": 3.873015873015873,
      "grad_norm": 0.48774660261391006,
      "learning_rate": 1.5138650034775004e-06,
      "loss": 0.5733,
      "step": 244
    },
    {
      "epoch": 3.888888888888889,
      "grad_norm": 0.4185611368252772,
      "learning_rate": 1.4742917175331644e-06,
      "loss": 0.7174,
      "step": 245
    },
    {
      "epoch": 3.9047619047619047,
      "grad_norm": 0.43603269341453055,
      "learning_rate": 1.4351529105888735e-06,
      "loss": 0.7672,
      "step": 246
    },
    {
      "epoch": 3.9206349206349205,
      "grad_norm": 0.4318904871120016,
      "learning_rate": 1.3964534057860652e-06,
      "loss": 0.5978,
      "step": 247
    },
    {
      "epoch": 3.9365079365079367,
      "grad_norm": 0.40904640871839104,
      "learning_rate": 1.3581979721302286e-06,
      "loss": 0.6579,
      "step": 248
    },
    {
      "epoch": 3.9523809523809526,
      "grad_norm": 0.6005145592007414,
      "learning_rate": 1.3203913239032074e-06,
      "loss": 0.6694,
      "step": 249
    },
    {
      "epoch": 3.9682539682539684,
      "grad_norm": 0.472367689533449,
      "learning_rate": 1.283038120082268e-06,
      "loss": 0.6197,
      "step": 250
    },
    {
      "epoch": 3.984126984126984,
      "grad_norm": 0.4356830095251736,
      "learning_rate": 1.2461429637659466e-06,
      "loss": 0.6213,
      "step": 251
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.4857139342731584,
      "learning_rate": 1.2097104016068146e-06,
      "loss": 0.6352,
      "step": 252
    },
    {
      "epoch": 4.015873015873016,
      "grad_norm": 0.7237535323689852,
      "learning_rate": 1.1737449232511799e-06,
      "loss": 0.6382,
      "step": 253
    },
    {
      "epoch": 4.031746031746032,
      "grad_norm": 0.46436683787098876,
      "learning_rate": 1.1382509607858233e-06,
      "loss": 0.681,
      "step": 254
    },
    {
      "epoch": 4.0476190476190474,
      "grad_norm": 0.38871205852451385,
      "learning_rate": 1.1032328881918237e-06,
      "loss": 0.6655,
      "step": 255
    },
    {
      "epoch": 4.063492063492063,
      "grad_norm": 0.5245733396531106,
      "learning_rate": 1.0686950208055486e-06,
      "loss": 0.6977,
      "step": 256
    },
    {
      "epoch": 4.079365079365079,
      "grad_norm": 0.7180379448497728,
      "learning_rate": 1.034641614786862e-06,
      "loss": 0.6271,
      "step": 257
    },
    {
      "epoch": 4.095238095238095,
      "grad_norm": 0.47735389131691536,
      "learning_rate": 1.0010768665946309e-06,
      "loss": 0.6079,
      "step": 258
    },
    {
      "epoch": 4.111111111111111,
      "grad_norm": 0.5783859241207984,
      "learning_rate": 9.680049124695973e-07,
      "loss": 0.6364,
      "step": 259
    },
    {
      "epoch": 4.1269841269841265,
      "grad_norm": 0.47172528206140724,
      "learning_rate": 9.35429827924652e-07,
      "loss": 0.6471,
      "step": 260
    },
    {
      "epoch": 4.142857142857143,
      "grad_norm": 0.7730664217116625,
      "learning_rate": 9.033556272426075e-07,
      "loss": 0.5769,
      "step": 261
    },
    {
      "epoch": 4.158730158730159,
      "grad_norm": 0.6533050547442746,
      "learning_rate": 8.717862629815099e-07,
      "loss": 0.6638,
      "step": 262
    },
    {
      "epoch": 4.174603174603175,
      "grad_norm": 0.5126950213106886,
      "learning_rate": 8.407256254875573e-07,
      "loss": 0.5556,
      "step": 263
    },
    {
      "epoch": 4.190476190476191,
      "grad_norm": 0.4249288916316267,
      "learning_rate": 8.101775424156888e-07,
      "loss": 0.7416,
      "step": 264
    },
    {
      "epoch": 4.2063492063492065,
      "grad_norm": 0.4999911716251449,
      "learning_rate": 7.801457782578947e-07,
      "loss": 0.5759,
      "step": 265
    },
    {
      "epoch": 4.222222222222222,
      "grad_norm": 0.44804880194019553,
      "learning_rate": 7.506340338793111e-07,
      "loss": 0.7019,
      "step": 266
    },
    {
      "epoch": 4.238095238095238,
      "grad_norm": 0.502803217299879,
      "learning_rate": 7.216459460621528e-07,
      "loss": 0.569,
      "step": 267
    },
    {
      "epoch": 4.253968253968254,
      "grad_norm": 0.6816091836503904,
      "learning_rate": 6.931850870575563e-07,
      "loss": 0.607,
      "step": 268
    },
    {
      "epoch": 4.26984126984127,
      "grad_norm": 0.7355066410111105,
      "learning_rate": 6.652549641453543e-07,
      "loss": 0.6546,
      "step": 269
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 0.6616759448500391,
      "learning_rate": 6.378590192018752e-07,
      "loss": 0.5275,
      "step": 270
    },
    {
      "epoch": 4.301587301587301,
      "grad_norm": 0.5266225510350064,
      "learning_rate": 6.110006282757897e-07,
      "loss": 0.6357,
      "step": 271
    },
    {
      "epoch": 4.317460317460317,
      "grad_norm": 0.5556516253636915,
      "learning_rate": 5.846831011720789e-07,
      "loss": 0.5667,
      "step": 272
    },
    {
      "epoch": 4.333333333333333,
      "grad_norm": 0.45705890325071213,
      "learning_rate": 5.589096810441574e-07,
      "loss": 0.5398,
      "step": 273
    },
    {
      "epoch": 4.349206349206349,
      "grad_norm": 0.4272112657516473,
      "learning_rate": 5.3368354399422e-07,
      "loss": 0.6608,
      "step": 274
    },
    {
      "epoch": 4.365079365079365,
      "grad_norm": 0.49431513901379337,
      "learning_rate": 5.090077986818365e-07,
      "loss": 0.5874,
      "step": 275
    },
    {
      "epoch": 4.380952380952381,
      "grad_norm": 0.5019371137242049,
      "learning_rate": 4.848854859408731e-07,
      "loss": 0.6658,
      "step": 276
    },
    {
      "epoch": 4.396825396825397,
      "grad_norm": 0.4921839831510061,
      "learning_rate": 4.613195784047653e-07,
      "loss": 0.5992,
      "step": 277
    },
    {
      "epoch": 4.412698412698413,
      "grad_norm": 0.4913329983646245,
      "learning_rate": 4.3831298014019144e-07,
      "loss": 0.6414,
      "step": 278
    },
    {
      "epoch": 4.428571428571429,
      "grad_norm": 0.4591709887480562,
      "learning_rate": 4.1586852628920095e-07,
      "loss": 0.5581,
      "step": 279
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.49202921355042367,
      "learning_rate": 3.939889827198362e-07,
      "loss": 0.4977,
      "step": 280
    },
    {
      "epoch": 4.4603174603174605,
      "grad_norm": 0.4454959373647045,
      "learning_rate": 3.7267704568529015e-07,
      "loss": 0.5291,
      "step": 281
    },
    {
      "epoch": 4.476190476190476,
      "grad_norm": 0.5229427416726549,
      "learning_rate": 3.519353414916404e-07,
      "loss": 0.6844,
      "step": 282
    },
    {
      "epoch": 4.492063492063492,
      "grad_norm": 0.4117177933501312,
      "learning_rate": 3.3176642617420817e-07,
      "loss": 0.6441,
      "step": 283
    },
    {
      "epoch": 4.507936507936508,
      "grad_norm": 0.5207909494470314,
      "learning_rate": 3.1217278518256844e-07,
      "loss": 0.6815,
      "step": 284
    },
    {
      "epoch": 4.523809523809524,
      "grad_norm": 0.47298038451032376,
      "learning_rate": 2.93156833074269e-07,
      "loss": 0.7125,
      "step": 285
    },
    {
      "epoch": 4.5396825396825395,
      "grad_norm": 0.4559655191725887,
      "learning_rate": 2.7472091321728067e-07,
      "loss": 0.5207,
      "step": 286
    },
    {
      "epoch": 4.555555555555555,
      "grad_norm": 0.48538261262399124,
      "learning_rate": 2.568672975012154e-07,
      "loss": 0.5553,
      "step": 287
    },
    {
      "epoch": 4.571428571428571,
      "grad_norm": 0.42805849682825836,
      "learning_rate": 2.3959818605736095e-07,
      "loss": 0.5694,
      "step": 288
    },
    {
      "epoch": 4.587301587301587,
      "grad_norm": 0.46054414700724183,
      "learning_rate": 2.229157069875537e-07,
      "loss": 0.6352,
      "step": 289
    },
    {
      "epoch": 4.603174603174603,
      "grad_norm": 0.4170725725245928,
      "learning_rate": 2.068219161019297e-07,
      "loss": 0.493,
      "step": 290
    },
    {
      "epoch": 4.619047619047619,
      "grad_norm": 0.5643812716552056,
      "learning_rate": 1.9131879666558385e-07,
      "loss": 0.6324,
      "step": 291
    },
    {
      "epoch": 4.634920634920634,
      "grad_norm": 0.462594946066143,
      "learning_rate": 1.7640825915416994e-07,
      "loss": 0.5406,
      "step": 292
    },
    {
      "epoch": 4.650793650793651,
      "grad_norm": 0.38882576034876637,
      "learning_rate": 1.6209214101846394e-07,
      "loss": 0.5732,
      "step": 293
    },
    {
      "epoch": 4.666666666666667,
      "grad_norm": 0.4703648367216305,
      "learning_rate": 1.4837220645793905e-07,
      "loss": 0.6893,
      "step": 294
    },
    {
      "epoch": 4.682539682539683,
      "grad_norm": 0.4272728817767105,
      "learning_rate": 1.3525014620335786e-07,
      "loss": 0.6755,
      "step": 295
    },
    {
      "epoch": 4.698412698412699,
      "grad_norm": 0.612286386647566,
      "learning_rate": 1.2272757730841744e-07,
      "loss": 0.7234,
      "step": 296
    },
    {
      "epoch": 4.714285714285714,
      "grad_norm": 0.3683204109754377,
      "learning_rate": 1.1080604295048203e-07,
      "loss": 0.5681,
      "step": 297
    },
    {
      "epoch": 4.73015873015873,
      "grad_norm": 0.4198898648516418,
      "learning_rate": 9.948701224041124e-08,
      "loss": 0.682,
      "step": 298
    },
    {
      "epoch": 4.746031746031746,
      "grad_norm": 0.4161451987722102,
      "learning_rate": 8.877188004152104e-08,
      "loss": 0.6787,
      "step": 299
    },
    {
      "epoch": 4.761904761904762,
      "grad_norm": 0.45897732699104254,
      "learning_rate": 7.866196679768956e-08,
      "loss": 0.589,
      "step": 300
    },
    {
      "epoch": 4.777777777777778,
      "grad_norm": 0.40150992469071173,
      "learning_rate": 6.91585183706428e-08,
      "loss": 0.5974,
      "step": 301
    },
    {
      "epoch": 4.7936507936507935,
      "grad_norm": 0.43359782320810974,
      "learning_rate": 6.02627058864158e-08,
      "loss": 0.6319,
      "step": 302
    },
    {
      "epoch": 4.809523809523809,
      "grad_norm": 0.44506261105820183,
      "learning_rate": 5.19756255910403e-08,
      "loss": 0.6191,
      "step": 303
    },
    {
      "epoch": 4.825396825396825,
      "grad_norm": 0.5066033533058674,
      "learning_rate": 4.429829871545055e-08,
      "loss": 0.6192,
      "step": 304
    },
    {
      "epoch": 4.841269841269841,
      "grad_norm": 0.408870784528967,
      "learning_rate": 3.7231671349634015e-08,
      "loss": 0.5396,
      "step": 305
    },
    {
      "epoch": 4.857142857142857,
      "grad_norm": 0.41548539048694993,
      "learning_rate": 3.077661432604184e-08,
      "loss": 0.573,
      "step": 306
    },
    {
      "epoch": 4.8730158730158735,
      "grad_norm": 0.41079087097410333,
      "learning_rate": 2.4933923112279712e-08,
      "loss": 0.6776,
      "step": 307
    },
    {
      "epoch": 4.888888888888889,
      "grad_norm": 0.4175105640706622,
      "learning_rate": 1.9704317713076236e-08,
      "loss": 0.7029,
      "step": 308
    },
    {
      "epoch": 4.904761904761905,
      "grad_norm": 0.4780736273310837,
      "learning_rate": 1.508844258155728e-08,
      "loss": 0.6435,
      "step": 309
    },
    {
      "epoch": 4.920634920634921,
      "grad_norm": 0.4004152110528723,
      "learning_rate": 1.1086866539830044e-08,
      "loss": 0.6868,
      "step": 310
    },
    {
      "epoch": 4.936507936507937,
      "grad_norm": 0.4479777279190579,
      "learning_rate": 7.700082708883006e-09,
      "loss": 0.6421,
      "step": 311
    },
    {
      "epoch": 4.9523809523809526,
      "grad_norm": 0.44065610989172405,
      "learning_rate": 4.928508447821223e-09,
      "loss": 0.617,
      "step": 312
    },
    {
      "epoch": 4.968253968253968,
      "grad_norm": 0.4640148914737005,
      "learning_rate": 2.7724853024324594e-09,
      "loss": 0.5754,
      "step": 313
    },
    {
      "epoch": 4.984126984126984,
      "grad_norm": 0.5313400670046616,
      "learning_rate": 1.2322789630997422e-09,
      "loss": 0.6552,
      "step": 314
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.4643036864898895,
      "learning_rate": 3.080792320564463e-10,
      "loss": 0.5647,
      "step": 315
    },
    {
      "epoch": 5.0,
      "step": 315,
      "total_flos": 78480301031424.0,
      "train_loss": 0.7375902401076423,
      "train_runtime": 10558.2121,
      "train_samples_per_second": 0.474,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 1,
  "max_steps": 315,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 78480301031424.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|