|
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 189,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015873015873015872,
      "grad_norm": 2.3559653063882835,
      "learning_rate": 0.0,
      "loss": 1.0469,
      "step": 1
    },
    {
      "epoch": 0.031746031746031744,
      "grad_norm": 2.29659253609106,
      "learning_rate": 3.125e-07,
      "loss": 0.9929,
      "step": 2
    },
    {
      "epoch": 0.047619047619047616,
      "grad_norm": 2.8899785663629123,
      "learning_rate": 6.25e-07,
      "loss": 1.0189,
      "step": 3
    },
    {
      "epoch": 0.06349206349206349,
      "grad_norm": 2.2527917765154153,
      "learning_rate": 9.375000000000001e-07,
      "loss": 0.9098,
      "step": 4
    },
    {
      "epoch": 0.07936507936507936,
      "grad_norm": 2.2029116064708907,
      "learning_rate": 1.25e-06,
      "loss": 1.0462,
      "step": 5
    },
    {
      "epoch": 0.09523809523809523,
      "grad_norm": 2.122312207060731,
      "learning_rate": 1.5625e-06,
      "loss": 0.9986,
      "step": 6
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 2.045608321522422,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 0.9554,
      "step": 7
    },
    {
      "epoch": 0.12698412698412698,
      "grad_norm": 1.8619152723479657,
      "learning_rate": 2.1875000000000002e-06,
      "loss": 0.9522,
      "step": 8
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 1.8663709940802706,
      "learning_rate": 2.5e-06,
      "loss": 0.9994,
      "step": 9
    },
    {
      "epoch": 0.15873015873015872,
      "grad_norm": 1.756651050051264,
      "learning_rate": 2.8125e-06,
      "loss": 0.9373,
      "step": 10
    },
    {
      "epoch": 0.1746031746031746,
      "grad_norm": 1.6604489437599113,
      "learning_rate": 3.125e-06,
      "loss": 0.8839,
      "step": 11
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 1.292095856227553,
      "learning_rate": 3.4375e-06,
      "loss": 0.9907,
      "step": 12
    },
    {
      "epoch": 0.20634920634920634,
      "grad_norm": 1.2693344466908103,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.9322,
      "step": 13
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 1.6344663181288221,
      "learning_rate": 4.0625000000000005e-06,
      "loss": 1.0934,
      "step": 14
    },
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 1.1511973531225708,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 0.9771,
      "step": 15
    },
    {
      "epoch": 0.25396825396825395,
      "grad_norm": 1.1464903643465947,
      "learning_rate": 4.6875000000000004e-06,
      "loss": 1.0442,
      "step": 16
    },
    {
      "epoch": 0.2698412698412698,
      "grad_norm": 1.1549964376534243,
      "learning_rate": 5e-06,
      "loss": 0.949,
      "step": 17
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 1.2909225755543452,
      "learning_rate": 5.3125e-06,
      "loss": 1.0098,
      "step": 18
    },
    {
      "epoch": 0.30158730158730157,
      "grad_norm": 1.0413894105842352,
      "learning_rate": 5.625e-06,
      "loss": 0.7468,
      "step": 19
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 1.2863204827157997,
      "learning_rate": 5.9375e-06,
      "loss": 0.9232,
      "step": 20
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.901486908411037,
      "learning_rate": 6.25e-06,
      "loss": 0.9885,
      "step": 21
    },
    {
      "epoch": 0.3492063492063492,
      "grad_norm": 1.0229681930848715,
      "learning_rate": 6.5625e-06,
      "loss": 1.0508,
      "step": 22
    },
    {
      "epoch": 0.36507936507936506,
      "grad_norm": 1.0772270927236638,
      "learning_rate": 6.875e-06,
      "loss": 0.8728,
      "step": 23
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 0.9393952588410857,
      "learning_rate": 7.1875e-06,
      "loss": 0.8349,
      "step": 24
    },
    {
      "epoch": 0.3968253968253968,
      "grad_norm": 1.0822345499912303,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.0283,
      "step": 25
    },
    {
      "epoch": 0.4126984126984127,
      "grad_norm": 0.8124841375138875,
      "learning_rate": 7.8125e-06,
      "loss": 0.8612,
      "step": 26
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.7868328056966778,
      "learning_rate": 8.125000000000001e-06,
      "loss": 0.7232,
      "step": 27
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.8283002452965974,
      "learning_rate": 8.4375e-06,
      "loss": 0.9752,
      "step": 28
    },
    {
      "epoch": 0.4603174603174603,
      "grad_norm": 0.8117454313345658,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.9053,
      "step": 29
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 0.8594519052279771,
      "learning_rate": 9.0625e-06,
      "loss": 0.9479,
      "step": 30
    },
    {
      "epoch": 0.49206349206349204,
      "grad_norm": 0.9958158956912483,
      "learning_rate": 9.375000000000001e-06,
      "loss": 0.9188,
      "step": 31
    },
    {
      "epoch": 0.5079365079365079,
      "grad_norm": 0.8918575253813723,
      "learning_rate": 9.6875e-06,
      "loss": 0.8847,
      "step": 32
    },
    {
      "epoch": 0.5238095238095238,
      "grad_norm": 0.8125329449215294,
      "learning_rate": 1e-05,
      "loss": 1.0379,
      "step": 33
    },
    {
      "epoch": 0.5396825396825397,
      "grad_norm": 0.7108806956906407,
      "learning_rate": 9.999691920767945e-06,
      "loss": 0.8376,
      "step": 34
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 0.641926463787557,
      "learning_rate": 9.998767721036901e-06,
      "loss": 0.8241,
      "step": 35
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.7048636529194373,
      "learning_rate": 9.997227514697568e-06,
      "loss": 0.9693,
      "step": 36
    },
    {
      "epoch": 0.5873015873015873,
      "grad_norm": 0.6041864409794199,
      "learning_rate": 9.99507149155218e-06,
      "loss": 0.9839,
      "step": 37
    },
    {
      "epoch": 0.6031746031746031,
      "grad_norm": 0.6529381186048961,
      "learning_rate": 9.992299917291118e-06,
      "loss": 0.8479,
      "step": 38
    },
    {
      "epoch": 0.6190476190476191,
      "grad_norm": 0.7457758141141355,
      "learning_rate": 9.98891313346017e-06,
      "loss": 0.9095,
      "step": 39
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 0.6700791615416641,
      "learning_rate": 9.984911557418444e-06,
      "loss": 0.7685,
      "step": 40
    },
    {
      "epoch": 0.6507936507936508,
      "grad_norm": 0.6202447937301818,
      "learning_rate": 9.980295682286924e-06,
      "loss": 0.8387,
      "step": 41
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.6888680420644837,
      "learning_rate": 9.97506607688772e-06,
      "loss": 0.9107,
      "step": 42
    },
    {
      "epoch": 0.6825396825396826,
      "grad_norm": 0.5229452850388104,
      "learning_rate": 9.969223385673958e-06,
      "loss": 0.8308,
      "step": 43
    },
    {
      "epoch": 0.6984126984126984,
      "grad_norm": 0.5679326043532053,
      "learning_rate": 9.962768328650367e-06,
      "loss": 0.7516,
      "step": 44
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.5234412349262514,
      "learning_rate": 9.95570170128455e-06,
      "loss": 0.8443,
      "step": 45
    },
    {
      "epoch": 0.7301587301587301,
      "grad_norm": 0.5148736685750067,
      "learning_rate": 9.94802437440896e-06,
      "loss": 0.7959,
      "step": 46
    },
    {
      "epoch": 0.746031746031746,
      "grad_norm": 0.6223703419413371,
      "learning_rate": 9.939737294113585e-06,
      "loss": 0.8964,
      "step": 47
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 0.4712938980573866,
      "learning_rate": 9.930841481629358e-06,
      "loss": 0.8884,
      "step": 48
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 0.6385581101993485,
      "learning_rate": 9.92133803320231e-06,
      "loss": 0.7817,
      "step": 49
    },
    {
      "epoch": 0.7936507936507936,
      "grad_norm": 0.47528095545287,
      "learning_rate": 9.91122811995848e-06,
      "loss": 0.819,
      "step": 50
    },
    {
      "epoch": 0.8095238095238095,
      "grad_norm": 0.5522186664203698,
      "learning_rate": 9.90051298775959e-06,
      "loss": 0.8691,
      "step": 51
    },
    {
      "epoch": 0.8253968253968254,
      "grad_norm": 0.3924890188917555,
      "learning_rate": 9.88919395704952e-06,
      "loss": 0.8259,
      "step": 52
    },
    {
      "epoch": 0.8412698412698413,
      "grad_norm": 0.5584015479821739,
      "learning_rate": 9.877272422691583e-06,
      "loss": 0.9318,
      "step": 53
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.5472693893320031,
      "learning_rate": 9.864749853796642e-06,
      "loss": 0.7983,
      "step": 54
    },
    {
      "epoch": 0.873015873015873,
      "grad_norm": 0.5011856989250408,
      "learning_rate": 9.85162779354206e-06,
      "loss": 0.7289,
      "step": 55
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.48176520075987733,
      "learning_rate": 9.837907858981536e-06,
      "loss": 0.8795,
      "step": 56
    },
    {
      "epoch": 0.9047619047619048,
      "grad_norm": 0.4693619944653085,
      "learning_rate": 9.823591740845831e-06,
      "loss": 0.8625,
      "step": 57
    },
    {
      "epoch": 0.9206349206349206,
      "grad_norm": 0.5158078748351012,
      "learning_rate": 9.808681203334416e-06,
      "loss": 0.7975,
      "step": 58
    },
    {
      "epoch": 0.9365079365079365,
      "grad_norm": 0.467299048377056,
      "learning_rate": 9.793178083898073e-06,
      "loss": 0.878,
      "step": 59
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.4360100853426926,
      "learning_rate": 9.777084293012448e-06,
      "loss": 0.842,
      "step": 60
    },
    {
      "epoch": 0.9682539682539683,
      "grad_norm": 0.4999196363033725,
      "learning_rate": 9.760401813942641e-06,
      "loss": 0.7661,
      "step": 61
    },
    {
      "epoch": 0.9841269841269841,
      "grad_norm": 0.49451715958225617,
      "learning_rate": 9.743132702498785e-06,
      "loss": 0.8685,
      "step": 62
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.51449429417728,
      "learning_rate": 9.725279086782719e-06,
      "loss": 0.7676,
      "step": 63
    },
    {
      "epoch": 1.0158730158730158,
      "grad_norm": 0.5392465569053122,
      "learning_rate": 9.706843166925733e-06,
      "loss": 0.7978,
      "step": 64
    },
    {
      "epoch": 1.0317460317460316,
      "grad_norm": 0.49426185655546884,
      "learning_rate": 9.687827214817433e-06,
      "loss": 0.8264,
      "step": 65
    },
    {
      "epoch": 1.0476190476190477,
      "grad_norm": 0.5050909892528982,
      "learning_rate": 9.668233573825794e-06,
      "loss": 0.8898,
      "step": 66
    },
    {
      "epoch": 1.0634920634920635,
      "grad_norm": 0.45134127922296613,
      "learning_rate": 9.64806465850836e-06,
      "loss": 0.7317,
      "step": 67
    },
    {
      "epoch": 1.0793650793650793,
      "grad_norm": 0.5413266326970981,
      "learning_rate": 9.62732295431471e-06,
      "loss": 0.7307,
      "step": 68
    },
    {
      "epoch": 1.0952380952380953,
      "grad_norm": 0.4781316290575908,
      "learning_rate": 9.606011017280166e-06,
      "loss": 0.8977,
      "step": 69
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 0.5064140744904799,
      "learning_rate": 9.5841314737108e-06,
      "loss": 0.7648,
      "step": 70
    },
    {
      "epoch": 1.126984126984127,
      "grad_norm": 0.5543523877170532,
      "learning_rate": 9.56168701985981e-06,
      "loss": 0.7995,
      "step": 71
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.4891764300467825,
      "learning_rate": 9.538680421595236e-06,
      "loss": 0.8072,
      "step": 72
    },
    {
      "epoch": 1.1587301587301586,
      "grad_norm": 0.48203192054287314,
      "learning_rate": 9.515114514059127e-06,
      "loss": 0.8128,
      "step": 73
    },
    {
      "epoch": 1.1746031746031746,
      "grad_norm": 0.499915788005329,
      "learning_rate": 9.490992201318165e-06,
      "loss": 0.7876,
      "step": 74
    },
    {
      "epoch": 1.1904761904761905,
      "grad_norm": 0.43129451868532453,
      "learning_rate": 9.466316456005783e-06,
      "loss": 0.7755,
      "step": 75
    },
    {
      "epoch": 1.2063492063492063,
      "grad_norm": 0.49436944947590167,
      "learning_rate": 9.441090318955843e-06,
      "loss": 0.7015,
      "step": 76
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 0.5018080177691097,
      "learning_rate": 9.415316898827923e-06,
      "loss": 0.7346,
      "step": 77
    },
    {
      "epoch": 1.2380952380952381,
      "grad_norm": 0.42117192002428844,
      "learning_rate": 9.388999371724212e-06,
      "loss": 0.8242,
      "step": 78
    },
    {
      "epoch": 1.253968253968254,
      "grad_norm": 0.47397540901194374,
      "learning_rate": 9.362140980798127e-06,
      "loss": 0.8928,
      "step": 79
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 0.48823131897505534,
      "learning_rate": 9.334745035854646e-06,
      "loss": 0.7581,
      "step": 80
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.5170460810325518,
      "learning_rate": 9.306814912942445e-06,
      "loss": 0.8361,
      "step": 81
    },
    {
      "epoch": 1.3015873015873016,
      "grad_norm": 0.41118521047488926,
      "learning_rate": 9.278354053937848e-06,
      "loss": 0.7794,
      "step": 82
    },
    {
      "epoch": 1.3174603174603174,
      "grad_norm": 0.4827654705693697,
      "learning_rate": 9.249365966120692e-06,
      "loss": 0.8542,
      "step": 83
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.45176873751511454,
      "learning_rate": 9.219854221742106e-06,
      "loss": 0.8101,
      "step": 84
    },
    {
      "epoch": 1.3492063492063493,
      "grad_norm": 0.44526540495239475,
      "learning_rate": 9.189822457584311e-06,
      "loss": 0.7419,
      "step": 85
    },
    {
      "epoch": 1.3650793650793651,
      "grad_norm": 0.41133066066087726,
      "learning_rate": 9.159274374512444e-06,
      "loss": 0.6576,
      "step": 86
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 0.4500027229237173,
      "learning_rate": 9.128213737018493e-06,
      "loss": 0.8058,
      "step": 87
    },
    {
      "epoch": 1.3968253968253967,
      "grad_norm": 0.40834920107678924,
      "learning_rate": 9.096644372757393e-06,
      "loss": 0.8849,
      "step": 88
    },
    {
      "epoch": 1.4126984126984126,
      "grad_norm": 0.5843795042717066,
      "learning_rate": 9.064570172075349e-06,
      "loss": 0.7969,
      "step": 89
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.5139681695756663,
      "learning_rate": 9.031995087530403e-06,
      "loss": 0.7983,
      "step": 90
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 0.47799160571848326,
      "learning_rate": 8.99892313340537e-06,
      "loss": 0.6612,
      "step": 91
    },
    {
      "epoch": 1.4603174603174602,
      "grad_norm": 0.48090290795792257,
      "learning_rate": 8.96535838521314e-06,
      "loss": 0.8026,
      "step": 92
    },
    {
      "epoch": 1.4761904761904763,
      "grad_norm": 0.48955363216016506,
      "learning_rate": 8.931304979194452e-06,
      "loss": 0.8051,
      "step": 93
    },
    {
      "epoch": 1.492063492063492,
      "grad_norm": 0.47949685756309185,
      "learning_rate": 8.896767111808177e-06,
      "loss": 0.7354,
      "step": 94
    },
    {
      "epoch": 1.507936507936508,
      "grad_norm": 0.5732670061875946,
      "learning_rate": 8.861749039214177e-06,
      "loss": 0.9129,
      "step": 95
    },
    {
      "epoch": 1.5238095238095237,
      "grad_norm": 0.48050508555262206,
      "learning_rate": 8.826255076748823e-06,
      "loss": 0.8445,
      "step": 96
    },
    {
      "epoch": 1.5396825396825395,
      "grad_norm": 0.4329532952395629,
      "learning_rate": 8.790289598393186e-06,
      "loss": 0.7212,
      "step": 97
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 0.522751486773223,
      "learning_rate": 8.753857036234055e-06,
      "loss": 0.8149,
      "step": 98
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.4570961856172299,
      "learning_rate": 8.716961879917734e-06,
      "loss": 0.7365,
      "step": 99
    },
    {
      "epoch": 1.5873015873015874,
      "grad_norm": 0.4363179134183329,
      "learning_rate": 8.679608676096793e-06,
      "loss": 0.8131,
      "step": 100
    },
    {
      "epoch": 1.6031746031746033,
      "grad_norm": 0.4655541415571893,
      "learning_rate": 8.641802027869774e-06,
      "loss": 0.7946,
      "step": 101
    },
    {
      "epoch": 1.619047619047619,
      "grad_norm": 0.5743139418639736,
      "learning_rate": 8.603546594213935e-06,
      "loss": 0.8574,
      "step": 102
    },
    {
      "epoch": 1.6349206349206349,
      "grad_norm": 0.5267570867681096,
      "learning_rate": 8.564847089411128e-06,
      "loss": 0.8286,
      "step": 103
    },
    {
      "epoch": 1.6507936507936507,
      "grad_norm": 0.40799736834923667,
      "learning_rate": 8.525708282466839e-06,
      "loss": 0.8412,
      "step": 104
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.4236881481332967,
      "learning_rate": 8.486134996522502e-06,
      "loss": 0.8172,
      "step": 105
    },
    {
      "epoch": 1.6825396825396826,
      "grad_norm": 0.5593679767726464,
      "learning_rate": 8.446132108261136e-06,
      "loss": 0.8058,
      "step": 106
    },
    {
      "epoch": 1.6984126984126984,
      "grad_norm": 0.5031166228419733,
      "learning_rate": 8.405704547306379e-06,
      "loss": 0.8031,
      "step": 107
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.45322610730579044,
      "learning_rate": 8.364857295615006e-06,
      "loss": 0.8903,
      "step": 108
    },
    {
      "epoch": 1.7301587301587302,
      "grad_norm": 0.5335556769284883,
      "learning_rate": 8.323595386862985e-06,
      "loss": 0.7925,
      "step": 109
    },
    {
      "epoch": 1.746031746031746,
      "grad_norm": 0.4699718024263939,
      "learning_rate": 8.281923905825188e-06,
      "loss": 0.7664,
      "step": 110
    },
    {
      "epoch": 1.7619047619047619,
      "grad_norm": 0.47207237316096745,
      "learning_rate": 8.23984798774876e-06,
      "loss": 0.7347,
      "step": 111
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.4532509556288616,
      "learning_rate": 8.197372817720314e-06,
      "loss": 0.7369,
      "step": 112
    },
    {
      "epoch": 1.7936507936507935,
      "grad_norm": 0.5443221798521994,
      "learning_rate": 8.154503630026955e-06,
      "loss": 0.7261,
      "step": 113
    },
    {
      "epoch": 1.8095238095238095,
      "grad_norm": 0.4456098920838456,
      "learning_rate": 8.111245707511253e-06,
      "loss": 0.7194,
      "step": 114
    },
    {
      "epoch": 1.8253968253968254,
      "grad_norm": 0.4159654938486175,
      "learning_rate": 8.067604380920228e-06,
      "loss": 0.7945,
      "step": 115
    },
    {
      "epoch": 1.8412698412698414,
      "grad_norm": 0.4706342532274064,
      "learning_rate": 8.023585028248435e-06,
      "loss": 0.8487,
      "step": 116
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.5701232470412769,
      "learning_rate": 7.979193074075216e-06,
      "loss": 0.8887,
      "step": 117
    },
    {
      "epoch": 1.873015873015873,
      "grad_norm": 0.530430629054239,
      "learning_rate": 7.934433988896233e-06,
      "loss": 0.6534,
      "step": 118
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.48414840419963984,
      "learning_rate": 7.889313288449323e-06,
      "loss": 0.8214,
      "step": 119
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.4200926363513126,
      "learning_rate": 7.843836533034784e-06,
      "loss": 0.7614,
      "step": 120
    },
    {
      "epoch": 1.9206349206349205,
      "grad_norm": 0.4941849127950555,
      "learning_rate": 7.798009326830167e-06,
      "loss": 0.7996,
      "step": 121
    },
    {
      "epoch": 1.9365079365079365,
      "grad_norm": 0.41647477043231534,
      "learning_rate": 7.751837317199673e-06,
      "loss": 0.867,
      "step": 122
    },
    {
      "epoch": 1.9523809523809523,
      "grad_norm": 0.4462896414872465,
      "learning_rate": 7.705326193998207e-06,
      "loss": 0.7547,
      "step": 123
    },
    {
      "epoch": 1.9682539682539684,
      "grad_norm": 0.46366747032871125,
      "learning_rate": 7.658481688870218e-06,
      "loss": 0.7582,
      "step": 124
    },
    {
      "epoch": 1.9841269841269842,
      "grad_norm": 0.4714130206121814,
      "learning_rate": 7.611309574543373e-06,
      "loss": 0.7606,
      "step": 125
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.46690869317456135,
      "learning_rate": 7.563815664117173e-06,
      "loss": 0.9121,
      "step": 126
    },
    {
      "epoch": 2.015873015873016,
      "grad_norm": 0.8060769356732992,
      "learning_rate": 7.5160058103465985e-06,
      "loss": 0.7122,
      "step": 127
    },
    {
      "epoch": 2.0317460317460316,
      "grad_norm": 0.5953210710991091,
      "learning_rate": 7.467885904920864e-06,
      "loss": 0.7567,
      "step": 128
    },
    {
      "epoch": 2.0476190476190474,
      "grad_norm": 0.7560871980312371,
      "learning_rate": 7.419461877737373e-06,
      "loss": 0.8318,
      "step": 129
    },
    {
      "epoch": 2.0634920634920633,
      "grad_norm": 2.1011598702400667,
      "learning_rate": 7.370739696170971e-06,
      "loss": 0.7428,
      "step": 130
    },
    {
      "epoch": 2.0793650793650795,
      "grad_norm": 0.940827314326734,
      "learning_rate": 7.321725364338566e-06,
      "loss": 0.6161,
      "step": 131
    },
    {
      "epoch": 2.0952380952380953,
      "grad_norm": 0.5381465457966281,
      "learning_rate": 7.272424922359246e-06,
      "loss": 0.6432,
      "step": 132
    },
    {
      "epoch": 2.111111111111111,
      "grad_norm": 0.7212158226191104,
      "learning_rate": 7.222844445609931e-06,
      "loss": 0.7817,
      "step": 133
    },
    {
      "epoch": 2.126984126984127,
      "grad_norm": 0.6031927565028607,
      "learning_rate": 7.172990043976703e-06,
      "loss": 0.7291,
      "step": 134
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.5554913039306149,
      "learning_rate": 7.122867861101868e-06,
      "loss": 0.7928,
      "step": 135
    },
    {
      "epoch": 2.1587301587301586,
      "grad_norm": 0.4440614086169425,
      "learning_rate": 7.072484073626872e-06,
      "loss": 0.6864,
      "step": 136
    },
    {
      "epoch": 2.1746031746031744,
      "grad_norm": 2.875286251781212,
      "learning_rate": 7.021844890431136e-06,
      "loss": 0.7627,
      "step": 137
    },
    {
      "epoch": 2.1904761904761907,
      "grad_norm": 0.7248206082063566,
      "learning_rate": 6.970956551866925e-06,
      "loss": 0.728,
      "step": 138
    },
    {
      "epoch": 2.2063492063492065,
      "grad_norm": 0.5344769870855947,
      "learning_rate": 6.9198253289903515e-06,
      "loss": 0.6621,
      "step": 139
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.5334409779130068,
      "learning_rate": 6.868457522788561e-06,
      "loss": 0.7351,
      "step": 140
    },
    {
      "epoch": 2.238095238095238,
      "grad_norm": 0.4791675678917909,
      "learning_rate": 6.816859463403271e-06,
      "loss": 0.6568,
      "step": 141
    },
    {
      "epoch": 2.253968253968254,
      "grad_norm": 0.5667962259074942,
      "learning_rate": 6.765037509350685e-06,
      "loss": 0.758,
      "step": 142
    },
    {
      "epoch": 2.2698412698412698,
      "grad_norm": 0.523154654898243,
      "learning_rate": 6.7129980467379265e-06,
      "loss": 0.6657,
      "step": 143
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.45239550513741295,
      "learning_rate": 6.660747488476066e-06,
      "loss": 0.6615,
      "step": 144
    },
    {
      "epoch": 2.3015873015873014,
      "grad_norm": 0.6580127713752147,
      "learning_rate": 6.608292273489851e-06,
      "loss": 0.6112,
      "step": 145
    },
    {
      "epoch": 2.317460317460317,
      "grad_norm": 0.6033248382665617,
      "learning_rate": 6.555638865924221e-06,
      "loss": 0.7033,
      "step": 146
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.5578461067365529,
      "learning_rate": 6.502793754347721e-06,
      "loss": 0.7578,
      "step": 147
    },
    {
      "epoch": 2.3492063492063493,
      "grad_norm": 0.5451762654132818,
      "learning_rate": 6.449763450952912e-06,
      "loss": 0.6863,
      "step": 148
    },
    {
      "epoch": 2.365079365079365,
      "grad_norm": 0.4667833185680937,
      "learning_rate": 6.396554490753848e-06,
      "loss": 0.6825,
      "step": 149
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.4209933154088852,
      "learning_rate": 6.343173430780769e-06,
      "loss": 0.836,
      "step": 150
    },
    {
      "epoch": 2.3968253968253967,
      "grad_norm": 0.46876037251704294,
      "learning_rate": 6.289626849272062e-06,
      "loss": 0.7981,
      "step": 151
    },
    {
      "epoch": 2.4126984126984126,
      "grad_norm": 0.47367833829704725,
      "learning_rate": 6.2359213448636104e-06,
      "loss": 0.751,
      "step": 152
    },
    {
      "epoch": 2.4285714285714284,
      "grad_norm": 0.43459439089398605,
      "learning_rate": 6.182063535775634e-06,
      "loss": 0.7654,
      "step": 153
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 0.39767398947957067,
      "learning_rate": 6.1280600589971225e-06,
      "loss": 0.7896,
      "step": 154
    },
    {
      "epoch": 2.4603174603174605,
      "grad_norm": 0.4231324131775063,
      "learning_rate": 6.073917569467934e-06,
      "loss": 0.8051,
      "step": 155
    },
    {
      "epoch": 2.4761904761904763,
      "grad_norm": 0.3983830637612639,
      "learning_rate": 6.0196427392587085e-06,
      "loss": 0.7038,
      "step": 156
    },
    {
      "epoch": 2.492063492063492,
      "grad_norm": 0.4585701856768339,
      "learning_rate": 5.96524225674865e-06,
      "loss": 0.7422,
      "step": 157
    },
    {
      "epoch": 2.507936507936508,
      "grad_norm": 0.4299692751487169,
      "learning_rate": 5.9107228258013085e-06,
      "loss": 0.7053,
      "step": 158
    },
    {
      "epoch": 2.5238095238095237,
      "grad_norm": 0.42827944956580943,
      "learning_rate": 5.856091164938451e-06,
      "loss": 0.6523,
      "step": 159
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 0.4098750307712162,
      "learning_rate": 5.801354006512127e-06,
      "loss": 0.6895,
      "step": 160
    },
    {
      "epoch": 2.5555555555555554,
      "grad_norm": 0.45624383692077836,
      "learning_rate": 5.746518095875033e-06,
      "loss": 0.6973,
      "step": 161
    },
    {
      "epoch": 2.571428571428571,
      "grad_norm": 0.40961695420487504,
      "learning_rate": 5.6915901905492586e-06,
      "loss": 0.6285,
      "step": 162
    },
    {
      "epoch": 2.5873015873015874,
      "grad_norm": 0.5946218628280344,
      "learning_rate": 5.6365770593935665e-06,
      "loss": 0.5907,
      "step": 163
    },
    {
      "epoch": 2.6031746031746033,
      "grad_norm": 0.5401440035651196,
      "learning_rate": 5.581485481769231e-06,
      "loss": 0.7181,
      "step": 164
    },
    {
      "epoch": 2.619047619047619,
      "grad_norm": 0.4378876946579892,
      "learning_rate": 5.526322246704628e-06,
      "loss": 0.7978,
      "step": 165
    },
    {
      "epoch": 2.634920634920635,
      "grad_norm": 0.40853074862176036,
      "learning_rate": 5.471094152058592e-06,
      "loss": 0.681,
      "step": 166
    },
    {
      "epoch": 2.6507936507936507,
      "grad_norm": 0.46389651051528763,
      "learning_rate": 5.415808003682717e-06,
      "loss": 0.7308,
      "step": 167
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.42992484133372394,
      "learning_rate": 5.360470614582661e-06,
      "loss": 0.7136,
      "step": 168
    },
    {
      "epoch": 2.682539682539683,
      "grad_norm": 0.557870344379466,
      "learning_rate": 5.305088804078559e-06,
      "loss": 0.7333,
      "step": 169
    },
    {
      "epoch": 2.6984126984126986,
      "grad_norm": 0.45453618737081114,
      "learning_rate": 5.249669396964665e-06,
      "loss": 0.6349,
      "step": 170
    },
    {
      "epoch": 2.7142857142857144,
      "grad_norm": 0.4511080452383348,
      "learning_rate": 5.1942192226683385e-06,
      "loss": 0.776,
      "step": 171
    },
    {
      "epoch": 2.7301587301587302,
      "grad_norm": 0.4900595891663431,
      "learning_rate": 5.138745114408427e-06,
      "loss": 0.5998,
      "step": 172
    },
    {
      "epoch": 2.746031746031746,
      "grad_norm": 0.46419645481002475,
      "learning_rate": 5.083253908353193e-06,
      "loss": 0.6676,
      "step": 173
    },
    {
      "epoch": 2.761904761904762,
      "grad_norm": 0.4905121964865482,
      "learning_rate": 5.0277524427778986e-06,
      "loss": 0.7831,
      "step": 174
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 0.4525848803424086,
      "learning_rate": 4.972247557222102e-06,
      "loss": 0.7164,
      "step": 175
    },
    {
      "epoch": 2.7936507936507935,
      "grad_norm": 0.5368330661361714,
      "learning_rate": 4.916746091646808e-06,
      "loss": 0.6805,
      "step": 176
    },
    {
      "epoch": 2.8095238095238093,
      "grad_norm": 0.4224136348005534,
      "learning_rate": 4.8612548855915755e-06,
      "loss": 0.724,
      "step": 177
    },
    {
      "epoch": 2.825396825396825,
      "grad_norm": 0.4869146817578471,
      "learning_rate": 4.805780777331662e-06,
      "loss": 0.7446,
      "step": 178
    },
    {
      "epoch": 2.8412698412698414,
      "grad_norm": 0.4511553539717499,
      "learning_rate": 4.750330603035336e-06,
      "loss": 0.7124,
      "step": 179
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.45286461188582156,
      "learning_rate": 4.694911195921443e-06,
      "loss": 0.7252,
      "step": 180
    },
    {
      "epoch": 2.873015873015873,
      "grad_norm": 0.4293856541441545,
      "learning_rate": 4.6395293854173395e-06,
      "loss": 0.6053,
      "step": 181
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.5836445711298119,
      "learning_rate": 4.584191996317285e-06,
      "loss": 0.6828,
      "step": 182
    },
    {
      "epoch": 2.9047619047619047,
      "grad_norm": 0.4710990644177235,
      "learning_rate": 4.528905847941411e-06,
      "loss": 0.8414,
      "step": 183
    },
    {
      "epoch": 2.9206349206349205,
      "grad_norm": 0.4770727006845428,
      "learning_rate": 4.473677753295375e-06,
      "loss": 0.6592,
      "step": 184
    },
    {
      "epoch": 2.9365079365079367,
      "grad_norm": 0.4087196329651188,
      "learning_rate": 4.418514518230769e-06,
      "loss": 0.7122,
      "step": 185
    },
    {
      "epoch": 2.9523809523809526,
      "grad_norm": 0.48530953017287554,
      "learning_rate": 4.363422940606435e-06,
      "loss": 0.7454,
      "step": 186
    },
    {
      "epoch": 2.9682539682539684,
      "grad_norm": 0.4900655757859956,
      "learning_rate": 4.308409809450742e-06,
      "loss": 0.7621,
      "step": 187
    },
    {
      "epoch": 2.984126984126984,
      "grad_norm": 0.49992358594135816,
      "learning_rate": 4.253481904124968e-06,
      "loss": 0.7331,
      "step": 188
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.4385555753140365,
      "learning_rate": 4.198645993487872e-06,
      "loss": 0.604,
      "step": 189
    }
  ],
  "logging_steps": 1,
  "max_steps": 315,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 47138450767872.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}