{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 189,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015873015873015872,
      "grad_norm": 2.3323949793109238,
      "learning_rate": 0.0,
      "loss": 1.0469,
      "step": 1
    },
    {
      "epoch": 0.031746031746031744,
      "grad_norm": 2.317201200688066,
      "learning_rate": 3.125e-07,
      "loss": 0.9931,
      "step": 2
    },
    {
      "epoch": 0.047619047619047616,
      "grad_norm": 2.603689956679125,
      "learning_rate": 6.25e-07,
      "loss": 1.0188,
      "step": 3
    },
    {
      "epoch": 0.06349206349206349,
      "grad_norm": 2.2583787301898592,
      "learning_rate": 9.375000000000001e-07,
      "loss": 0.9097,
      "step": 4
    },
    {
      "epoch": 0.07936507936507936,
      "grad_norm": 2.197466891038096,
      "learning_rate": 1.25e-06,
      "loss": 1.0459,
      "step": 5
    },
    {
      "epoch": 0.09523809523809523,
      "grad_norm": 2.1259963361099747,
      "learning_rate": 1.5625e-06,
      "loss": 0.9986,
      "step": 6
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 2.0707820881041,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 0.9555,
      "step": 7
    },
    {
      "epoch": 0.12698412698412698,
      "grad_norm": 1.870407527874291,
      "learning_rate": 2.1875000000000002e-06,
      "loss": 0.952,
      "step": 8
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 1.8578085390534953,
      "learning_rate": 2.5e-06,
      "loss": 0.9993,
      "step": 9
    },
    {
      "epoch": 0.15873015873015872,
      "grad_norm": 1.881148688458384,
      "learning_rate": 2.8125e-06,
      "loss": 0.9373,
      "step": 10
    },
    {
      "epoch": 0.1746031746031746,
      "grad_norm": 1.6917769845914787,
      "learning_rate": 3.125e-06,
      "loss": 0.8839,
      "step": 11
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 1.2541345576396532,
      "learning_rate": 3.4375e-06,
      "loss": 0.9909,
      "step": 12
    },
    {
      "epoch": 0.20634920634920634,
      "grad_norm": 1.4038335670152517,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.9322,
      "step": 13
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 1.363468897891553,
      "learning_rate": 4.0625000000000005e-06,
      "loss": 1.0934,
      "step": 14
    },
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 1.1331989679866032,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 0.977,
      "step": 15
    },
    {
      "epoch": 0.25396825396825395,
      "grad_norm": 0.9899834287202586,
      "learning_rate": 4.6875000000000004e-06,
      "loss": 1.0443,
      "step": 16
    },
    {
      "epoch": 0.2698412698412698,
      "grad_norm": 1.155920523517074,
      "learning_rate": 5e-06,
      "loss": 0.9483,
      "step": 17
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 1.2715867938274161,
      "learning_rate": 5.3125e-06,
      "loss": 1.0096,
      "step": 18
    },
    {
      "epoch": 0.30158730158730157,
      "grad_norm": 0.9922231339593638,
      "learning_rate": 5.625e-06,
      "loss": 0.7463,
      "step": 19
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 1.2551959582539625,
      "learning_rate": 5.9375e-06,
      "loss": 0.9226,
      "step": 20
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.892951024999124,
      "learning_rate": 6.25e-06,
      "loss": 0.988,
      "step": 21
    },
    {
      "epoch": 0.3492063492063492,
      "grad_norm": 1.4360539096520086,
      "learning_rate": 6.5625e-06,
      "loss": 1.0509,
      "step": 22
    },
    {
      "epoch": 0.36507936507936506,
      "grad_norm": 1.1100051374669628,
      "learning_rate": 6.875e-06,
      "loss": 0.8728,
      "step": 23
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 0.9630208551024003,
      "learning_rate": 7.1875e-06,
      "loss": 0.8352,
      "step": 24
    },
    {
      "epoch": 0.3968253968253968,
      "grad_norm": 1.109963225007402,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.0289,
      "step": 25
    },
    {
      "epoch": 0.4126984126984127,
      "grad_norm": 0.842175710243708,
      "learning_rate": 7.8125e-06,
      "loss": 0.8616,
      "step": 26
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.8255762742603932,
      "learning_rate": 8.125000000000001e-06,
      "loss": 0.7234,
      "step": 27
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.8274507712792363,
      "learning_rate": 8.4375e-06,
      "loss": 0.9758,
      "step": 28
    },
    {
      "epoch": 0.4603174603174603,
      "grad_norm": 0.7834224887700044,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.9056,
      "step": 29
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 1.187020605300137,
      "learning_rate": 9.0625e-06,
      "loss": 0.9481,
      "step": 30
    },
    {
      "epoch": 0.49206349206349204,
      "grad_norm": 1.0233176856791018,
      "learning_rate": 9.375000000000001e-06,
      "loss": 0.9194,
      "step": 31
    },
    {
      "epoch": 0.5079365079365079,
      "grad_norm": 0.848791394024066,
      "learning_rate": 9.6875e-06,
      "loss": 0.8852,
      "step": 32
    },
    {
      "epoch": 0.5238095238095238,
      "grad_norm": 0.8289281876622956,
      "learning_rate": 1e-05,
      "loss": 1.038,
      "step": 33
    },
    {
      "epoch": 0.5396825396825397,
      "grad_norm": 0.7738330911179299,
      "learning_rate": 9.999691920767945e-06,
      "loss": 0.8374,
      "step": 34
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 0.65004421093035,
      "learning_rate": 9.998767721036901e-06,
      "loss": 0.8242,
      "step": 35
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.718229691257778,
      "learning_rate": 9.997227514697568e-06,
      "loss": 0.9693,
      "step": 36
    },
    {
      "epoch": 0.5873015873015873,
      "grad_norm": 0.598178727036991,
      "learning_rate": 9.99507149155218e-06,
      "loss": 0.9843,
      "step": 37
    },
    {
      "epoch": 0.6031746031746031,
      "grad_norm": 0.6896420594948925,
      "learning_rate": 9.992299917291118e-06,
      "loss": 0.848,
      "step": 38
    },
    {
      "epoch": 0.6190476190476191,
      "grad_norm": 0.7218001479564617,
      "learning_rate": 9.98891313346017e-06,
      "loss": 0.9095,
      "step": 39
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 0.673383804041238,
      "learning_rate": 9.984911557418444e-06,
      "loss": 0.7682,
      "step": 40
    },
    {
      "epoch": 0.6507936507936508,
      "grad_norm": 0.9044903125501461,
      "learning_rate": 9.980295682286924e-06,
      "loss": 0.8388,
      "step": 41
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.6528626470394925,
      "learning_rate": 9.97506607688772e-06,
      "loss": 0.9107,
      "step": 42
    },
    {
      "epoch": 0.6825396825396826,
      "grad_norm": 0.5248039585149111,
      "learning_rate": 9.969223385673958e-06,
      "loss": 0.8307,
      "step": 43
    },
    {
      "epoch": 0.6984126984126984,
      "grad_norm": 0.568338771820042,
      "learning_rate": 9.962768328650367e-06,
      "loss": 0.7523,
      "step": 44
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.5429855696185105,
      "learning_rate": 9.95570170128455e-06,
      "loss": 0.8442,
      "step": 45
    },
    {
      "epoch": 0.7301587301587301,
      "grad_norm": 0.5098426033492849,
      "learning_rate": 9.94802437440896e-06,
      "loss": 0.7962,
      "step": 46
    },
    {
      "epoch": 0.746031746031746,
      "grad_norm": 0.6078990273192543,
      "learning_rate": 9.939737294113585e-06,
      "loss": 0.8969,
      "step": 47
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 0.4709547244829324,
      "learning_rate": 9.930841481629358e-06,
      "loss": 0.8885,
      "step": 48
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 0.54039591858629,
      "learning_rate": 9.92133803320231e-06,
      "loss": 0.7818,
      "step": 49
    },
    {
      "epoch": 0.7936507936507936,
      "grad_norm": 0.4875170254753124,
      "learning_rate": 9.91122811995848e-06,
      "loss": 0.8193,
      "step": 50
    },
    {
      "epoch": 0.8095238095238095,
      "grad_norm": 0.5005396928536703,
      "learning_rate": 9.90051298775959e-06,
      "loss": 0.8692,
      "step": 51
    },
    {
      "epoch": 0.8253968253968254,
      "grad_norm": 0.40245216027036546,
      "learning_rate": 9.88919395704952e-06,
      "loss": 0.826,
      "step": 52
    },
    {
      "epoch": 0.8412698412698413,
      "grad_norm": 0.5389952051377087,
      "learning_rate": 9.877272422691583e-06,
      "loss": 0.9318,
      "step": 53
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.5638980417584056,
      "learning_rate": 9.864749853796642e-06,
      "loss": 0.7985,
      "step": 54
    },
    {
      "epoch": 0.873015873015873,
      "grad_norm": 0.5506830661309166,
      "learning_rate": 9.85162779354206e-06,
      "loss": 0.7291,
      "step": 55
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.48566023212019677,
      "learning_rate": 9.837907858981536e-06,
      "loss": 0.8802,
      "step": 56
    },
    {
      "epoch": 0.9047619047619048,
      "grad_norm": 0.4725406192484581,
      "learning_rate": 9.823591740845831e-06,
      "loss": 0.8627,
      "step": 57
    },
    {
      "epoch": 0.9206349206349206,
      "grad_norm": 0.5270784935436914,
      "learning_rate": 9.808681203334416e-06,
      "loss": 0.7976,
      "step": 58
    },
    {
      "epoch": 0.9365079365079365,
      "grad_norm": 0.4795159174595573,
      "learning_rate": 9.793178083898073e-06,
      "loss": 0.8783,
      "step": 59
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.42309628953003137,
      "learning_rate": 9.777084293012448e-06,
      "loss": 0.842,
      "step": 60
    },
    {
      "epoch": 0.9682539682539683,
      "grad_norm": 0.464555539059811,
      "learning_rate": 9.760401813942641e-06,
      "loss": 0.7662,
      "step": 61
    },
    {
      "epoch": 0.9841269841269841,
      "grad_norm": 0.5141212041737542,
      "learning_rate": 9.743132702498785e-06,
      "loss": 0.8688,
      "step": 62
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.5165788253828009,
      "learning_rate": 9.725279086782719e-06,
      "loss": 0.768,
      "step": 63
    },
    {
      "epoch": 1.0158730158730158,
      "grad_norm": 0.576629868282963,
      "learning_rate": 9.706843166925733e-06,
      "loss": 0.7989,
      "step": 64
    },
    {
      "epoch": 1.0317460317460316,
      "grad_norm": 0.4946943998511545,
      "learning_rate": 9.687827214817433e-06,
      "loss": 0.8261,
      "step": 65
    },
    {
      "epoch": 1.0476190476190477,
      "grad_norm": 0.4987216606535057,
      "learning_rate": 9.668233573825794e-06,
      "loss": 0.8905,
      "step": 66
    },
    {
      "epoch": 1.0634920634920635,
      "grad_norm": 0.45688977932466196,
      "learning_rate": 9.64806465850836e-06,
      "loss": 0.7327,
      "step": 67
    },
    {
      "epoch": 1.0793650793650793,
      "grad_norm": 0.5226340006885853,
      "learning_rate": 9.62732295431471e-06,
      "loss": 0.7311,
      "step": 68
    },
    {
      "epoch": 1.0952380952380953,
      "grad_norm": 0.6684025298786129,
      "learning_rate": 9.606011017280166e-06,
      "loss": 0.8971,
      "step": 69
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 0.5147703758608321,
      "learning_rate": 9.5841314737108e-06,
      "loss": 0.7652,
      "step": 70
    },
    {
      "epoch": 1.126984126984127,
      "grad_norm": 0.5417227409614662,
      "learning_rate": 9.56168701985981e-06,
      "loss": 0.7999,
      "step": 71
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.5016561221704748,
      "learning_rate": 9.538680421595236e-06,
      "loss": 0.8074,
      "step": 72
    },
    {
      "epoch": 1.1587301587301586,
      "grad_norm": 0.4853528793957531,
      "learning_rate": 9.515114514059127e-06,
      "loss": 0.8135,
      "step": 73
    },
    {
      "epoch": 1.1746031746031746,
      "grad_norm": 0.47765415470199357,
      "learning_rate": 9.490992201318165e-06,
      "loss": 0.7879,
      "step": 74
    },
    {
      "epoch": 1.1904761904761905,
      "grad_norm": 0.46535342031003013,
      "learning_rate": 9.466316456005783e-06,
      "loss": 0.7762,
      "step": 75
    },
    {
      "epoch": 1.2063492063492063,
      "grad_norm": 0.5033568814253909,
      "learning_rate": 9.441090318955843e-06,
      "loss": 0.7022,
      "step": 76
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 0.4986643533291915,
      "learning_rate": 9.415316898827923e-06,
      "loss": 0.7349,
      "step": 77
    },
    {
      "epoch": 1.2380952380952381,
      "grad_norm": 0.43657193718859494,
      "learning_rate": 9.388999371724212e-06,
      "loss": 0.8264,
      "step": 78
    },
    {
      "epoch": 1.253968253968254,
      "grad_norm": 0.47617277777848616,
      "learning_rate": 9.362140980798127e-06,
      "loss": 0.8944,
      "step": 79
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 0.4295219607791053,
      "learning_rate": 9.334745035854646e-06,
      "loss": 0.7588,
      "step": 80
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.5225987407011279,
      "learning_rate": 9.306814912942445e-06,
      "loss": 0.8359,
      "step": 81
    },
    {
      "epoch": 1.3015873015873016,
      "grad_norm": 0.4173684559568506,
      "learning_rate": 9.278354053937848e-06,
      "loss": 0.7804,
      "step": 82
    },
    {
      "epoch": 1.3174603174603174,
      "grad_norm": 0.5238592049595157,
      "learning_rate": 9.249365966120692e-06,
      "loss": 0.8564,
      "step": 83
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.4526393208745273,
      "learning_rate": 9.219854221742106e-06,
      "loss": 0.8102,
      "step": 84
    },
    {
      "epoch": 1.3492063492063493,
      "grad_norm": 0.44471888761912887,
      "learning_rate": 9.189822457584311e-06,
      "loss": 0.7439,
      "step": 85
    },
    {
      "epoch": 1.3650793650793651,
      "grad_norm": 0.43731884433734214,
      "learning_rate": 9.159274374512444e-06,
      "loss": 0.6592,
      "step": 86
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 0.4377614076782124,
      "learning_rate": 9.128213737018493e-06,
      "loss": 0.806,
      "step": 87
    },
    {
      "epoch": 1.3968253968253967,
      "grad_norm": 0.4027105033083121,
      "learning_rate": 9.096644372757393e-06,
      "loss": 0.8855,
      "step": 88
    },
    {
      "epoch": 1.4126984126984126,
      "grad_norm": 0.571463019194369,
      "learning_rate": 9.064570172075349e-06,
      "loss": 0.7979,
      "step": 89
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.4801097800367482,
      "learning_rate": 9.031995087530403e-06,
      "loss": 0.7992,
      "step": 90
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 0.47255682704462587,
      "learning_rate": 8.99892313340537e-06,
      "loss": 0.6633,
      "step": 91
    },
    {
      "epoch": 1.4603174603174602,
      "grad_norm": 0.4862492507086913,
      "learning_rate": 8.96535838521314e-06,
      "loss": 0.8033,
      "step": 92
    },
    {
      "epoch": 1.4761904761904763,
      "grad_norm": 0.4794987734861929,
      "learning_rate": 8.931304979194452e-06,
      "loss": 0.8069,
      "step": 93
    },
    {
      "epoch": 1.492063492063492,
      "grad_norm": 0.4658669415595415,
      "learning_rate": 8.896767111808177e-06,
      "loss": 0.7371,
      "step": 94
    },
    {
      "epoch": 1.507936507936508,
      "grad_norm": 0.5683125861447418,
      "learning_rate": 8.861749039214177e-06,
      "loss": 0.9145,
      "step": 95
    },
    {
      "epoch": 1.5238095238095237,
      "grad_norm": 0.47857884026171116,
      "learning_rate": 8.826255076748823e-06,
      "loss": 0.8455,
      "step": 96
    },
    {
      "epoch": 1.5396825396825395,
      "grad_norm": 0.429389167302876,
      "learning_rate": 8.790289598393186e-06,
      "loss": 0.7216,
      "step": 97
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 0.522031534882144,
      "learning_rate": 8.753857036234055e-06,
      "loss": 0.8155,
      "step": 98
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.5375692580431519,
      "learning_rate": 8.716961879917734e-06,
      "loss": 0.7373,
      "step": 99
    },
    {
      "epoch": 1.5873015873015874,
      "grad_norm": 0.4277716225580266,
      "learning_rate": 8.679608676096793e-06,
      "loss": 0.8132,
      "step": 100
    },
    {
      "epoch": 1.6031746031746033,
      "grad_norm": 0.9709114563751018,
      "learning_rate": 8.641802027869774e-06,
      "loss": 0.7952,
      "step": 101
    },
    {
      "epoch": 1.619047619047619,
      "grad_norm": 0.6722991060253756,
      "learning_rate": 8.603546594213935e-06,
      "loss": 0.8566,
      "step": 102
    },
    {
      "epoch": 1.6349206349206349,
      "grad_norm": 0.48227435877100366,
      "learning_rate": 8.564847089411128e-06,
      "loss": 0.8292,
      "step": 103
    },
    {
      "epoch": 1.6507936507936507,
      "grad_norm": 0.43738769808282163,
      "learning_rate": 8.525708282466839e-06,
      "loss": 0.8424,
      "step": 104
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.42758983764847835,
      "learning_rate": 8.486134996522502e-06,
      "loss": 0.8179,
      "step": 105
    },
    {
      "epoch": 1.6825396825396826,
      "grad_norm": 0.6465752665836958,
      "learning_rate": 8.446132108261136e-06,
      "loss": 0.806,
      "step": 106
    },
    {
      "epoch": 1.6984126984126984,
      "grad_norm": 0.5216064305348748,
      "learning_rate": 8.405704547306379e-06,
      "loss": 0.8041,
      "step": 107
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.46284349128240304,
      "learning_rate": 8.364857295615006e-06,
      "loss": 0.8924,
      "step": 108
    },
    {
      "epoch": 1.7301587301587302,
      "grad_norm": 0.48814352812138595,
      "learning_rate": 8.323595386862985e-06,
      "loss": 0.7929,
      "step": 109
    },
    {
      "epoch": 1.746031746031746,
      "grad_norm": 0.48088506678769916,
      "learning_rate": 8.281923905825188e-06,
      "loss": 0.7671,
      "step": 110
    },
    {
      "epoch": 1.7619047619047619,
      "grad_norm": 0.4594586947272896,
      "learning_rate": 8.23984798774876e-06,
      "loss": 0.7366,
      "step": 111
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.4673793179812366,
      "learning_rate": 8.197372817720314e-06,
      "loss": 0.7397,
      "step": 112
    },
    {
      "epoch": 1.7936507936507935,
      "grad_norm": 0.6557346369623661,
      "learning_rate": 8.154503630026955e-06,
      "loss": 0.7262,
      "step": 113
    },
    {
      "epoch": 1.8095238095238095,
      "grad_norm": 0.45128446254113314,
      "learning_rate": 8.111245707511253e-06,
      "loss": 0.7213,
      "step": 114
    },
    {
      "epoch": 1.8253968253968254,
      "grad_norm": 0.41666335434637974,
      "learning_rate": 8.067604380920228e-06,
      "loss": 0.7952,
      "step": 115
    },
    {
      "epoch": 1.8412698412698414,
      "grad_norm": 0.4407610683896587,
      "learning_rate": 8.023585028248435e-06,
      "loss": 0.8486,
      "step": 116
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.5501977264080524,
      "learning_rate": 7.979193074075216e-06,
      "loss": 0.8911,
      "step": 117
    },
    {
      "epoch": 1.873015873015873,
      "grad_norm": 0.459940871244406,
      "learning_rate": 7.934433988896233e-06,
      "loss": 0.6535,
      "step": 118
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.46949896874504654,
      "learning_rate": 7.889313288449323e-06,
      "loss": 0.8232,
      "step": 119
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.41110722374315695,
      "learning_rate": 7.843836533034784e-06,
      "loss": 0.7628,
      "step": 120
    },
    {
      "epoch": 1.9206349206349205,
      "grad_norm": 0.47755036946919965,
      "learning_rate": 7.798009326830167e-06,
      "loss": 0.8003,
      "step": 121
    },
    {
      "epoch": 1.9365079365079365,
      "grad_norm": 0.41342145123270885,
      "learning_rate": 7.751837317199673e-06,
      "loss": 0.8683,
      "step": 122
    },
    {
      "epoch": 1.9523809523809523,
      "grad_norm": 0.4479867168170251,
      "learning_rate": 7.705326193998207e-06,
      "loss": 0.7552,
      "step": 123
    },
    {
      "epoch": 1.9682539682539684,
      "grad_norm": 0.4549548876094008,
      "learning_rate": 7.658481688870218e-06,
      "loss": 0.7587,
      "step": 124
    },
    {
      "epoch": 1.9841269841269842,
      "grad_norm": 0.4684989926335189,
      "learning_rate": 7.611309574543373e-06,
      "loss": 0.7607,
      "step": 125
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.4367513791425883,
      "learning_rate": 7.563815664117173e-06,
      "loss": 0.9146,
      "step": 126
    },
    {
      "epoch": 2.015873015873016,
      "grad_norm": 0.7927149278076437,
      "learning_rate": 7.5160058103465985e-06,
      "loss": 0.7131,
      "step": 127
    },
    {
      "epoch": 2.0317460317460316,
      "grad_norm": 0.5847918647965703,
      "learning_rate": 7.467885904920864e-06,
      "loss": 0.7578,
      "step": 128
    },
    {
      "epoch": 2.0476190476190474,
      "grad_norm": 0.7836046335272314,
      "learning_rate": 7.419461877737373e-06,
      "loss": 0.8327,
      "step": 129
    },
    {
      "epoch": 2.0634920634920633,
      "grad_norm": 2.1428241341527117,
      "learning_rate": 7.370739696170971e-06,
      "loss": 0.7441,
      "step": 130
    },
    {
      "epoch": 2.0793650793650795,
      "grad_norm": 0.9566247813485141,
      "learning_rate": 7.321725364338566e-06,
      "loss": 0.6185,
      "step": 131
    },
    {
      "epoch": 2.0952380952380953,
      "grad_norm": 0.5336099004301172,
      "learning_rate": 7.272424922359246e-06,
      "loss": 0.6455,
      "step": 132
    },
    {
      "epoch": 2.111111111111111,
      "grad_norm": 0.7132260718912609,
      "learning_rate": 7.222844445609931e-06,
      "loss": 0.7834,
      "step": 133
    },
    {
      "epoch": 2.126984126984127,
      "grad_norm": 0.5749113101610002,
      "learning_rate": 7.172990043976703e-06,
      "loss": 0.7296,
      "step": 134
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.5366676899164674,
      "learning_rate": 7.122867861101868e-06,
      "loss": 0.795,
      "step": 135
    },
    {
      "epoch": 2.1587301587301586,
      "grad_norm": 0.44931031781346276,
      "learning_rate": 7.072484073626872e-06,
      "loss": 0.6875,
      "step": 136
    },
    {
      "epoch": 2.1746031746031744,
      "grad_norm": 0.6709913679680917,
      "learning_rate": 7.021844890431136e-06,
      "loss": 0.7669,
      "step": 137
    },
    {
      "epoch": 2.1904761904761907,
      "grad_norm": 0.5782700607354144,
      "learning_rate": 6.970956551866925e-06,
      "loss": 0.7273,
      "step": 138
    },
    {
      "epoch": 2.2063492063492065,
      "grad_norm": 0.5008612890527109,
      "learning_rate": 6.9198253289903515e-06,
      "loss": 0.6634,
      "step": 139
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.5733594756270326,
      "learning_rate": 6.868457522788561e-06,
      "loss": 0.7358,
      "step": 140
    },
    {
      "epoch": 2.238095238095238,
      "grad_norm": 0.48532685396257946,
      "learning_rate": 6.816859463403271e-06,
      "loss": 0.659,
      "step": 141
    },
    {
      "epoch": 2.253968253968254,
      "grad_norm": 0.5460096768726493,
      "learning_rate": 6.765037509350685e-06,
      "loss": 0.7585,
      "step": 142
    },
    {
      "epoch": 2.2698412698412698,
      "grad_norm": 0.4827715321224934,
      "learning_rate": 6.7129980467379265e-06,
      "loss": 0.6664,
      "step": 143
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.5417449745700821,
      "learning_rate": 6.660747488476066e-06,
      "loss": 0.663,
      "step": 144
    },
    {
      "epoch": 2.3015873015873014,
      "grad_norm": 0.5672091588208017,
      "learning_rate": 6.608292273489851e-06,
      "loss": 0.6122,
      "step": 145
    },
    {
      "epoch": 2.317460317460317,
      "grad_norm": 0.5264115445856029,
      "learning_rate": 6.555638865924221e-06,
      "loss": 0.7035,
      "step": 146
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.5168486054014866,
      "learning_rate": 6.502793754347721e-06,
      "loss": 0.7598,
      "step": 147
    },
    {
      "epoch": 2.3492063492063493,
      "grad_norm": 0.6085627519823247,
      "learning_rate": 6.449763450952912e-06,
      "loss": 0.6875,
      "step": 148
    },
    {
      "epoch": 2.365079365079365,
      "grad_norm": 0.504951049632705,
      "learning_rate": 6.396554490753848e-06,
      "loss": 0.6839,
      "step": 149
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.42239268629753335,
      "learning_rate": 6.343173430780769e-06,
      "loss": 0.8396,
      "step": 150
    },
    {
      "epoch": 2.3968253968253967,
      "grad_norm": 0.5170870251352963,
      "learning_rate": 6.289626849272062e-06,
      "loss": 0.8013,
      "step": 151
    },
    {
      "epoch": 2.4126984126984126,
      "grad_norm": 0.5408561718958109,
      "learning_rate": 6.2359213448636104e-06,
      "loss": 0.754,
      "step": 152
    },
    {
      "epoch": 2.4285714285714284,
      "grad_norm": 0.42606389993166277,
      "learning_rate": 6.182063535775634e-06,
      "loss": 0.7662,
      "step": 153
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 0.41021417431281776,
      "learning_rate": 6.1280600589971225e-06,
      "loss": 0.791,
      "step": 154
    },
    {
      "epoch": 2.4603174603174605,
      "grad_norm": 0.4068459581892925,
      "learning_rate": 6.073917569467934e-06,
      "loss": 0.8066,
      "step": 155
    },
    {
      "epoch": 2.4761904761904763,
      "grad_norm": 0.40243757072180364,
      "learning_rate": 6.0196427392587085e-06,
      "loss": 0.7061,
      "step": 156
    },
    {
      "epoch": 2.492063492063492,
      "grad_norm": 0.5924677871750427,
      "learning_rate": 5.96524225674865e-06,
      "loss": 0.744,
      "step": 157
    },
    {
      "epoch": 2.507936507936508,
      "grad_norm": 0.4344103520994765,
      "learning_rate": 5.9107228258013085e-06,
      "loss": 0.7076,
      "step": 158
    },
    {
      "epoch": 2.5238095238095237,
      "grad_norm": 0.4824828219676673,
      "learning_rate": 5.856091164938451e-06,
      "loss": 0.6534,
      "step": 159
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 0.4197375023372333,
      "learning_rate": 5.801354006512127e-06,
      "loss": 0.6902,
      "step": 160
    },
    {
      "epoch": 2.5555555555555554,
      "grad_norm": 0.4523354962317184,
      "learning_rate": 5.746518095875033e-06,
      "loss": 0.6996,
      "step": 161
    },
    {
      "epoch": 2.571428571428571,
      "grad_norm": 0.41073692830700287,
      "learning_rate": 5.6915901905492586e-06,
      "loss": 0.629,
      "step": 162
    },
    {
      "epoch": 2.5873015873015874,
      "grad_norm": 0.5807356357914126,
      "learning_rate": 5.6365770593935665e-06,
      "loss": 0.5924,
      "step": 163
    },
    {
      "epoch": 2.6031746031746033,
      "grad_norm": 0.5296154741304107,
      "learning_rate": 5.581485481769231e-06,
      "loss": 0.7197,
      "step": 164
    },
    {
      "epoch": 2.619047619047619,
      "grad_norm": 0.4462893254042338,
      "learning_rate": 5.526322246704628e-06,
      "loss": 0.8007,
      "step": 165
    },
    {
      "epoch": 2.634920634920635,
      "grad_norm": 0.3974463949753287,
      "learning_rate": 5.471094152058592e-06,
      "loss": 0.6822,
      "step": 166
    },
    {
      "epoch": 2.6507936507936507,
      "grad_norm": 0.46244966479154553,
      "learning_rate": 5.415808003682717e-06,
      "loss": 0.7318,
      "step": 167
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.438557400530548,
      "learning_rate": 5.360470614582661e-06,
      "loss": 0.7147,
      "step": 168
    },
    {
      "epoch": 2.682539682539683,
      "grad_norm": 0.5680373876053647,
      "learning_rate": 5.305088804078559e-06,
      "loss": 0.7357,
      "step": 169
    },
    {
      "epoch": 2.6984126984126986,
      "grad_norm": 0.4556205137087138,
      "learning_rate": 5.249669396964665e-06,
      "loss": 0.6361,
      "step": 170
    },
    {
      "epoch": 2.7142857142857144,
      "grad_norm": 0.44940699263796485,
      "learning_rate": 5.1942192226683385e-06,
      "loss": 0.7778,
      "step": 171
    },
    {
      "epoch": 2.7301587301587302,
      "grad_norm": 0.47535854965434626,
      "learning_rate": 5.138745114408427e-06,
      "loss": 0.6008,
      "step": 172
    },
    {
      "epoch": 2.746031746031746,
      "grad_norm": 0.5020715004802897,
      "learning_rate": 5.083253908353193e-06,
      "loss": 0.6696,
      "step": 173
    },
    {
      "epoch": 2.761904761904762,
      "grad_norm": 0.4715489187155987,
      "learning_rate": 5.0277524427778986e-06,
      "loss": 0.7846,
      "step": 174
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 0.44938039077917374,
      "learning_rate": 4.972247557222102e-06,
      "loss": 0.7187,
      "step": 175
    },
    {
      "epoch": 2.7936507936507935,
      "grad_norm": 0.536309868809644,
      "learning_rate": 4.916746091646808e-06,
      "loss": 0.6818,
      "step": 176
    },
    {
      "epoch": 2.8095238095238093,
      "grad_norm": 0.4238224566275176,
      "learning_rate": 4.8612548855915755e-06,
      "loss": 0.7252,
      "step": 177
    },
    {
      "epoch": 2.825396825396825,
      "grad_norm": 0.5075369152051689,
      "learning_rate": 4.805780777331662e-06,
      "loss": 0.7461,
      "step": 178
    },
    {
      "epoch": 2.8412698412698414,
      "grad_norm": 0.463068134108742,
      "learning_rate": 4.750330603035336e-06,
      "loss": 0.7141,
      "step": 179
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.44910366292391646,
      "learning_rate": 4.694911195921443e-06,
      "loss": 0.7278,
      "step": 180
    },
    {
      "epoch": 2.873015873015873,
      "grad_norm": 0.43362119780351166,
      "learning_rate": 4.6395293854173395e-06,
      "loss": 0.6069,
      "step": 181
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.7285135499415637,
      "learning_rate": 4.584191996317285e-06,
      "loss": 0.6846,
      "step": 182
    },
    {
      "epoch": 2.9047619047619047,
      "grad_norm": 0.49976201370002465,
      "learning_rate": 4.528905847941411e-06,
      "loss": 0.843,
      "step": 183
    },
    {
      "epoch": 2.9206349206349205,
      "grad_norm": 0.47745344638517,
      "learning_rate": 4.473677753295375e-06,
      "loss": 0.6609,
      "step": 184
    },
    {
      "epoch": 2.9365079365079367,
      "grad_norm": 0.4075892143069301,
      "learning_rate": 4.418514518230769e-06,
      "loss": 0.7133,
      "step": 185
    },
    {
      "epoch": 2.9523809523809526,
      "grad_norm": 0.490679894902017,
      "learning_rate": 4.363422940606435e-06,
      "loss": 0.7483,
      "step": 186
    },
    {
      "epoch": 2.9682539682539684,
      "grad_norm": 0.507751484260846,
      "learning_rate": 4.308409809450742e-06,
      "loss": 0.7635,
      "step": 187
    },
    {
      "epoch": 2.984126984126984,
      "grad_norm": 0.5129728167302848,
      "learning_rate": 4.253481904124968e-06,
      "loss": 0.7353,
      "step": 188
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.44280290900369257,
      "learning_rate": 4.198645993487872e-06,
      "loss": 0.6059,
      "step": 189
    }
  ],
  "logging_steps": 1,
  "max_steps": 315,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 47138450767872.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}