|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.3255813953488373, |
|
"eval_steps": 10, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011627906976744186, |
|
"grad_norm": 1.630926489830017, |
|
"learning_rate": 0.0, |
|
"loss": 2.6815, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.023255813953488372, |
|
"grad_norm": 1.7017581462860107, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 2.8172, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03488372093023256, |
|
"grad_norm": 1.7061189413070679, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 2.7979, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.046511627906976744, |
|
"grad_norm": 1.735384225845337, |
|
"learning_rate": 2.307692307692308e-05, |
|
"loss": 2.7698, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05813953488372093, |
|
"grad_norm": 1.8297396898269653, |
|
"learning_rate": 3.0769230769230774e-05, |
|
"loss": 2.6846, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06976744186046512, |
|
"grad_norm": 1.9019414186477661, |
|
"learning_rate": 3.846153846153846e-05, |
|
"loss": 2.4865, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.08139534883720931, |
|
"grad_norm": 2.0125694274902344, |
|
"learning_rate": 4.615384615384616e-05, |
|
"loss": 2.3993, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.09302325581395349, |
|
"grad_norm": 2.0356626510620117, |
|
"learning_rate": 5.384615384615385e-05, |
|
"loss": 2.0547, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.10465116279069768, |
|
"grad_norm": 2.1409757137298584, |
|
"learning_rate": 6.153846153846155e-05, |
|
"loss": 1.7189, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.11627906976744186, |
|
"grad_norm": 2.369163751602173, |
|
"learning_rate": 6.923076923076924e-05, |
|
"loss": 1.4047, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11627906976744186, |
|
"eval_loss": 1.1081944704055786, |
|
"eval_runtime": 69.1299, |
|
"eval_samples_per_second": 70.707, |
|
"eval_steps_per_second": 1.114, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.12790697674418605, |
|
"grad_norm": 2.303812265396118, |
|
"learning_rate": 7.692307692307693e-05, |
|
"loss": 1.0561, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.13953488372093023, |
|
"grad_norm": 1.9584007263183594, |
|
"learning_rate": 8.461538461538461e-05, |
|
"loss": 0.7269, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.1511627906976744, |
|
"grad_norm": 1.8204364776611328, |
|
"learning_rate": 9.230769230769232e-05, |
|
"loss": 0.4613, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.16279069767441862, |
|
"grad_norm": 1.328773021697998, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2739, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.1744186046511628, |
|
"grad_norm": 1.0226366519927979, |
|
"learning_rate": 0.0001076923076923077, |
|
"loss": 0.1802, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.18604651162790697, |
|
"grad_norm": 0.5955405831336975, |
|
"learning_rate": 0.00011538461538461538, |
|
"loss": 0.1093, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.19767441860465115, |
|
"grad_norm": 0.2616266906261444, |
|
"learning_rate": 0.0001230769230769231, |
|
"loss": 0.0676, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.20930232558139536, |
|
"grad_norm": 0.12042512744665146, |
|
"learning_rate": 0.00013076923076923077, |
|
"loss": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.22093023255813954, |
|
"grad_norm": 0.12201035767793655, |
|
"learning_rate": 0.00013846153846153847, |
|
"loss": 0.0505, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.23255813953488372, |
|
"grad_norm": 0.09313634783029556, |
|
"learning_rate": 0.00014615384615384615, |
|
"loss": 0.0377, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.23255813953488372, |
|
"eval_loss": 0.04212512448430061, |
|
"eval_runtime": 68.7789, |
|
"eval_samples_per_second": 71.068, |
|
"eval_steps_per_second": 1.12, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2441860465116279, |
|
"grad_norm": 0.11886867135763168, |
|
"learning_rate": 0.00015384615384615385, |
|
"loss": 0.0409, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2558139534883721, |
|
"grad_norm": 0.09348498284816742, |
|
"learning_rate": 0.00016153846153846155, |
|
"loss": 0.0322, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.26744186046511625, |
|
"grad_norm": 0.11308024078607559, |
|
"learning_rate": 0.00016923076923076923, |
|
"loss": 0.0389, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.27906976744186046, |
|
"grad_norm": 0.10123038291931152, |
|
"learning_rate": 0.00017692307692307693, |
|
"loss": 0.0355, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.29069767441860467, |
|
"grad_norm": 0.20477375388145447, |
|
"learning_rate": 0.00018461538461538463, |
|
"loss": 0.0392, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3023255813953488, |
|
"grad_norm": 0.09108395129442215, |
|
"learning_rate": 0.00019230769230769233, |
|
"loss": 0.0311, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.313953488372093, |
|
"grad_norm": 0.10242355614900589, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0311, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.32558139534883723, |
|
"grad_norm": 0.10945220291614532, |
|
"learning_rate": 0.00019999083173529673, |
|
"loss": 0.0304, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.3372093023255814, |
|
"grad_norm": 0.07543787360191345, |
|
"learning_rate": 0.0001999633286223284, |
|
"loss": 0.0295, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.3488372093023256, |
|
"grad_norm": 0.07906319946050644, |
|
"learning_rate": 0.00019991749570421146, |
|
"loss": 0.0309, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3488372093023256, |
|
"eval_loss": 0.03273880109190941, |
|
"eval_runtime": 69.34, |
|
"eval_samples_per_second": 70.493, |
|
"eval_steps_per_second": 1.11, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.36046511627906974, |
|
"grad_norm": 0.07591050863265991, |
|
"learning_rate": 0.00019985334138511237, |
|
"loss": 0.0298, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.37209302325581395, |
|
"grad_norm": 0.07976777851581573, |
|
"learning_rate": 0.0001997708774287068, |
|
"loss": 0.0346, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.38372093023255816, |
|
"grad_norm": 0.06528059393167496, |
|
"learning_rate": 0.0001996701189560223, |
|
"loss": 0.0301, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3953488372093023, |
|
"grad_norm": 0.04785207286477089, |
|
"learning_rate": 0.00019955108444266585, |
|
"loss": 0.0291, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.4069767441860465, |
|
"grad_norm": 0.06423522531986237, |
|
"learning_rate": 0.00019941379571543596, |
|
"loss": 0.0302, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.4186046511627907, |
|
"grad_norm": 0.043477609753608704, |
|
"learning_rate": 0.00019925827794832056, |
|
"loss": 0.0281, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.43023255813953487, |
|
"grad_norm": 0.0563591867685318, |
|
"learning_rate": 0.00019908455965788067, |
|
"loss": 0.0321, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.4418604651162791, |
|
"grad_norm": 0.07481367886066437, |
|
"learning_rate": 0.00019889267269802176, |
|
"loss": 0.0285, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.45348837209302323, |
|
"grad_norm": 0.05782244727015495, |
|
"learning_rate": 0.00019868265225415265, |
|
"loss": 0.0283, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"grad_norm": 0.05342981219291687, |
|
"learning_rate": 0.00019845453683673368, |
|
"loss": 0.0276, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"eval_loss": 0.02999330498278141, |
|
"eval_runtime": 68.8117, |
|
"eval_samples_per_second": 71.034, |
|
"eval_steps_per_second": 1.119, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.47674418604651164, |
|
"grad_norm": 0.07017785310745239, |
|
"learning_rate": 0.0001982083682742156, |
|
"loss": 0.0298, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.4883720930232558, |
|
"grad_norm": 0.04703626409173012, |
|
"learning_rate": 0.00019794419170536916, |
|
"loss": 0.0264, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.04164445772767067, |
|
"learning_rate": 0.00019766205557100868, |
|
"loss": 0.0286, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.5116279069767442, |
|
"grad_norm": 0.04445081949234009, |
|
"learning_rate": 0.00019736201160510931, |
|
"loss": 0.0282, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.5232558139534884, |
|
"grad_norm": 0.03947937488555908, |
|
"learning_rate": 0.00019704411482532116, |
|
"loss": 0.0253, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.5348837209302325, |
|
"grad_norm": 0.04509953781962395, |
|
"learning_rate": 0.0001967084235228807, |
|
"loss": 0.0218, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.5465116279069767, |
|
"grad_norm": 0.0683850422501564, |
|
"learning_rate": 0.0001963549992519223, |
|
"loss": 0.028, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.5581395348837209, |
|
"grad_norm": 0.045640990138053894, |
|
"learning_rate": 0.0001959839068181914, |
|
"loss": 0.0302, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5697674418604651, |
|
"grad_norm": 0.1620291918516159, |
|
"learning_rate": 0.00019559521426716118, |
|
"loss": 0.0242, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5813953488372093, |
|
"grad_norm": 0.04173683002591133, |
|
"learning_rate": 0.00019518899287155556, |
|
"loss": 0.0307, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5813953488372093, |
|
"eval_loss": 0.02883034199476242, |
|
"eval_runtime": 69.0904, |
|
"eval_samples_per_second": 70.748, |
|
"eval_steps_per_second": 1.114, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5930232558139535, |
|
"grad_norm": 0.03936028108000755, |
|
"learning_rate": 0.00019476531711828027, |
|
"loss": 0.0251, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.6046511627906976, |
|
"grad_norm": 0.04982665926218033, |
|
"learning_rate": 0.0001943242646947643, |
|
"loss": 0.0252, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.6162790697674418, |
|
"grad_norm": 0.042471516877412796, |
|
"learning_rate": 0.00019386591647471506, |
|
"loss": 0.0287, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.627906976744186, |
|
"grad_norm": 0.03394132852554321, |
|
"learning_rate": 0.00019339035650328869, |
|
"loss": 0.0278, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.6395348837209303, |
|
"grad_norm": 0.03576912358403206, |
|
"learning_rate": 0.00019289767198167916, |
|
"loss": 0.0259, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.6511627906976745, |
|
"grad_norm": 0.035896990448236465, |
|
"learning_rate": 0.0001923879532511287, |
|
"loss": 0.0252, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.6627906976744186, |
|
"grad_norm": 0.03331352025270462, |
|
"learning_rate": 0.0001918612937763622, |
|
"loss": 0.0274, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.6744186046511628, |
|
"grad_norm": 0.04263336956501007, |
|
"learning_rate": 0.00019131779012844912, |
|
"loss": 0.0231, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.686046511627907, |
|
"grad_norm": 0.04020223766565323, |
|
"learning_rate": 0.00019075754196709572, |
|
"loss": 0.0224, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6976744186046512, |
|
"grad_norm": 0.043280404061079025, |
|
"learning_rate": 0.00019018065202237083, |
|
"loss": 0.0266, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6976744186046512, |
|
"eval_loss": 0.028244854882359505, |
|
"eval_runtime": 68.7936, |
|
"eval_samples_per_second": 71.053, |
|
"eval_steps_per_second": 1.119, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7093023255813954, |
|
"grad_norm": 0.03325851261615753, |
|
"learning_rate": 0.0001895872260758688, |
|
"loss": 0.024, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.7209302325581395, |
|
"grad_norm": 0.02916116639971733, |
|
"learning_rate": 0.00018897737294131284, |
|
"loss": 0.0237, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.7325581395348837, |
|
"grad_norm": 0.033790189772844315, |
|
"learning_rate": 0.0001883512044446023, |
|
"loss": 0.0261, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.7441860465116279, |
|
"grad_norm": 0.03239690884947777, |
|
"learning_rate": 0.0001877088354033077, |
|
"loss": 0.0272, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.7558139534883721, |
|
"grad_norm": 0.03925548493862152, |
|
"learning_rate": 0.0001870503836056172, |
|
"loss": 0.0246, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.7674418604651163, |
|
"grad_norm": 0.03802689164876938, |
|
"learning_rate": 0.00018637596978873835, |
|
"loss": 0.0294, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.7790697674418605, |
|
"grad_norm": 0.03799804300069809, |
|
"learning_rate": 0.00018568571761675893, |
|
"loss": 0.025, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.7906976744186046, |
|
"grad_norm": 0.034483980387449265, |
|
"learning_rate": 0.0001849797536579715, |
|
"loss": 0.0261, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.8023255813953488, |
|
"grad_norm": 0.049156103283166885, |
|
"learning_rate": 0.0001842582073616649, |
|
"loss": 0.024, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.813953488372093, |
|
"grad_norm": 0.03340472653508186, |
|
"learning_rate": 0.000183521211034388, |
|
"loss": 0.0293, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.813953488372093, |
|
"eval_loss": 0.02760264091193676, |
|
"eval_runtime": 68.781, |
|
"eval_samples_per_second": 71.066, |
|
"eval_steps_per_second": 1.119, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.8255813953488372, |
|
"grad_norm": 0.030273284763097763, |
|
"learning_rate": 0.00018276889981568906, |
|
"loss": 0.026, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.8372093023255814, |
|
"grad_norm": 0.02898152358829975, |
|
"learning_rate": 0.0001820014116533359, |
|
"loss": 0.0247, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.8488372093023255, |
|
"grad_norm": 0.03946846351027489, |
|
"learning_rate": 0.00018121888727802113, |
|
"loss": 0.0259, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.8604651162790697, |
|
"grad_norm": 0.035859089344739914, |
|
"learning_rate": 0.0001804214701775569, |
|
"loss": 0.0276, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.872093023255814, |
|
"grad_norm": 0.03241611272096634, |
|
"learning_rate": 0.00017960930657056438, |
|
"loss": 0.0229, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.8837209302325582, |
|
"grad_norm": 0.025535929948091507, |
|
"learning_rate": 0.00017878254537966216, |
|
"loss": 0.0202, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.8953488372093024, |
|
"grad_norm": 0.03790373355150223, |
|
"learning_rate": 0.00017794133820415916, |
|
"loss": 0.026, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.9069767441860465, |
|
"grad_norm": 0.0408620722591877, |
|
"learning_rate": 0.0001770858392922565, |
|
"loss": 0.0253, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.9186046511627907, |
|
"grad_norm": 0.033651720732450485, |
|
"learning_rate": 0.00017621620551276366, |
|
"loss": 0.0227, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 0.03782816231250763, |
|
"learning_rate": 0.00017533259632633442, |
|
"loss": 0.0254, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"eval_loss": 0.026909608393907547, |
|
"eval_runtime": 68.7973, |
|
"eval_samples_per_second": 71.049, |
|
"eval_steps_per_second": 1.119, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.9418604651162791, |
|
"grad_norm": 0.03370513767004013, |
|
"learning_rate": 0.00017443517375622704, |
|
"loss": 0.0261, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.9534883720930233, |
|
"grad_norm": 0.03856685757637024, |
|
"learning_rate": 0.00017352410235859503, |
|
"loss": 0.0256, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.9651162790697675, |
|
"grad_norm": 0.04497801512479782, |
|
"learning_rate": 0.0001725995491923131, |
|
"loss": 0.0262, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.9767441860465116, |
|
"grad_norm": 0.031994592398405075, |
|
"learning_rate": 0.00017166168378834448, |
|
"loss": 0.0251, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.9883720930232558, |
|
"grad_norm": 0.035724181681871414, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 0.0254, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.029222311452031136, |
|
"learning_rate": 0.00016974670656467824, |
|
"loss": 0.0238, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.0116279069767442, |
|
"grad_norm": 0.039559703320264816, |
|
"learning_rate": 0.00016876994588534234, |
|
"loss": 0.0296, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.0232558139534884, |
|
"grad_norm": 0.031729090958833694, |
|
"learning_rate": 0.0001677805751846563, |
|
"loss": 0.018, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.0348837209302326, |
|
"grad_norm": 0.029029319062829018, |
|
"learning_rate": 0.00016677877587886956, |
|
"loss": 0.0244, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.0465116279069768, |
|
"grad_norm": 0.025509672239422798, |
|
"learning_rate": 0.00016576473166320644, |
|
"loss": 0.0215, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.0465116279069768, |
|
"eval_loss": 0.026453962549567223, |
|
"eval_runtime": 68.7872, |
|
"eval_samples_per_second": 71.06, |
|
"eval_steps_per_second": 1.119, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.058139534883721, |
|
"grad_norm": 0.027732428163290024, |
|
"learning_rate": 0.00016473862847818277, |
|
"loss": 0.0251, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.069767441860465, |
|
"grad_norm": 0.023567862808704376, |
|
"learning_rate": 0.00016370065447551078, |
|
"loss": 0.0224, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.0813953488372092, |
|
"grad_norm": 0.030995313078165054, |
|
"learning_rate": 0.00016265099998359866, |
|
"loss": 0.0236, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.0930232558139534, |
|
"grad_norm": 0.03294675052165985, |
|
"learning_rate": 0.00016158985747265108, |
|
"loss": 0.0253, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.1046511627906976, |
|
"grad_norm": 0.030441010370850563, |
|
"learning_rate": 0.00016051742151937655, |
|
"loss": 0.0228, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.1162790697674418, |
|
"grad_norm": 0.029724519699811935, |
|
"learning_rate": 0.000159433888771309, |
|
"loss": 0.0193, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.127906976744186, |
|
"grad_norm": 0.04900391027331352, |
|
"learning_rate": 0.00015833945791074943, |
|
"loss": 0.0206, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.1395348837209303, |
|
"grad_norm": 0.0280914343893528, |
|
"learning_rate": 0.0001572343296183344, |
|
"loss": 0.0189, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.1511627906976745, |
|
"grad_norm": 0.031953178346157074, |
|
"learning_rate": 0.00015611870653623825, |
|
"loss": 0.0226, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.1627906976744187, |
|
"grad_norm": 0.02610064297914505, |
|
"learning_rate": 0.0001549927932310155, |
|
"loss": 0.0176, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.1627906976744187, |
|
"eval_loss": 0.026211915537714958, |
|
"eval_runtime": 68.7868, |
|
"eval_samples_per_second": 71.06, |
|
"eval_steps_per_second": 1.119, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.1744186046511629, |
|
"grad_norm": 0.04419023171067238, |
|
"learning_rate": 0.00015385679615609042, |
|
"loss": 0.0269, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.1860465116279069, |
|
"grad_norm": 0.09231790900230408, |
|
"learning_rate": 0.00015271092361390077, |
|
"loss": 0.0258, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.197674418604651, |
|
"grad_norm": 0.034355148673057556, |
|
"learning_rate": 0.00015155538571770218, |
|
"loss": 0.0244, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.2093023255813953, |
|
"grad_norm": 0.03240971267223358, |
|
"learning_rate": 0.00015039039435304078, |
|
"loss": 0.0235, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.2209302325581395, |
|
"grad_norm": 0.02766534686088562, |
|
"learning_rate": 0.00014921616313890072, |
|
"loss": 0.021, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.2325581395348837, |
|
"grad_norm": 0.030099626630544662, |
|
"learning_rate": 0.00014803290738853395, |
|
"loss": 0.0218, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.244186046511628, |
|
"grad_norm": 0.030833614990115166, |
|
"learning_rate": 0.00014684084406997903, |
|
"loss": 0.0197, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.255813953488372, |
|
"grad_norm": 0.02916071005165577, |
|
"learning_rate": 0.0001456401917662769, |
|
"loss": 0.022, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.2674418604651163, |
|
"grad_norm": 0.024599241092801094, |
|
"learning_rate": 0.00014443117063539038, |
|
"loss": 0.0249, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.2790697674418605, |
|
"grad_norm": 0.04152291268110275, |
|
"learning_rate": 0.00014321400236983457, |
|
"loss": 0.0227, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.2790697674418605, |
|
"eval_loss": 0.025697337463498116, |
|
"eval_runtime": 68.7913, |
|
"eval_samples_per_second": 71.055, |
|
"eval_steps_per_second": 1.119, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.2906976744186047, |
|
"grad_norm": 0.026202471926808357, |
|
"learning_rate": 0.00014198891015602646, |
|
"loss": 0.0225, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.302325581395349, |
|
"grad_norm": 0.026729293167591095, |
|
"learning_rate": 0.0001407561186333601, |
|
"loss": 0.0231, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.3139534883720931, |
|
"grad_norm": 0.03199277073144913, |
|
"learning_rate": 0.00013951585385301555, |
|
"loss": 0.0187, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.3255813953488373, |
|
"grad_norm": 0.030409252271056175, |
|
"learning_rate": 0.000138268343236509, |
|
"loss": 0.0251, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.3372093023255813, |
|
"grad_norm": 0.029044533148407936, |
|
"learning_rate": 0.00013701381553399145, |
|
"loss": 0.0206, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.3488372093023255, |
|
"grad_norm": 0.0352545827627182, |
|
"learning_rate": 0.000135752500782304, |
|
"loss": 0.0195, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.3604651162790697, |
|
"grad_norm": 0.03767949342727661, |
|
"learning_rate": 0.00013448463026279704, |
|
"loss": 0.0253, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.372093023255814, |
|
"grad_norm": 0.02688649669289589, |
|
"learning_rate": 0.0001332104364589212, |
|
"loss": 0.0196, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.3837209302325582, |
|
"grad_norm": 0.03161188215017319, |
|
"learning_rate": 0.000131930153013598, |
|
"loss": 0.0219, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.3953488372093024, |
|
"grad_norm": 0.03074447624385357, |
|
"learning_rate": 0.00013064401468637792, |
|
"loss": 0.0198, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.3953488372093024, |
|
"eval_loss": 0.025383805856108665, |
|
"eval_runtime": 68.781, |
|
"eval_samples_per_second": 71.066, |
|
"eval_steps_per_second": 1.119, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.4069767441860466, |
|
"grad_norm": 0.03676707297563553, |
|
"learning_rate": 0.00012935225731039348, |
|
"loss": 0.0268, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.4186046511627908, |
|
"grad_norm": 0.04459831491112709, |
|
"learning_rate": 0.00012805511774911584, |
|
"loss": 0.0233, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.4302325581395348, |
|
"grad_norm": 0.03590243309736252, |
|
"learning_rate": 0.00012675283385292212, |
|
"loss": 0.0222, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.441860465116279, |
|
"grad_norm": 0.036192964762449265, |
|
"learning_rate": 0.00012544564441548182, |
|
"loss": 0.0251, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.4534883720930232, |
|
"grad_norm": 0.03172110393643379, |
|
"learning_rate": 0.00012413378912997058, |
|
"loss": 0.0202, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.4651162790697674, |
|
"grad_norm": 0.032995227724313736, |
|
"learning_rate": 0.0001228175085451186, |
|
"loss": 0.0219, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.4767441860465116, |
|
"grad_norm": 0.02672835998237133, |
|
"learning_rate": 0.00012149704402110243, |
|
"loss": 0.0185, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.4883720930232558, |
|
"grad_norm": 0.03171510249376297, |
|
"learning_rate": 0.00012017263768528775, |
|
"loss": 0.0196, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.03766058757901192, |
|
"learning_rate": 0.00011884453238783185, |
|
"loss": 0.0223, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.5116279069767442, |
|
"grad_norm": 0.038156960159540176, |
|
"learning_rate": 0.00011751297165715309, |
|
"loss": 0.0245, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.5116279069767442, |
|
"eval_loss": 0.025349650532007217, |
|
"eval_runtime": 68.7837, |
|
"eval_samples_per_second": 71.063, |
|
"eval_steps_per_second": 1.119, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.5232558139534884, |
|
"grad_norm": 0.03054482489824295, |
|
"learning_rate": 0.0001161781996552765, |
|
"loss": 0.0217, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.5348837209302326, |
|
"grad_norm": 0.026866618543863297, |
|
"learning_rate": 0.00011484046113306262, |
|
"loss": 0.0196, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.5465116279069768, |
|
"grad_norm": 0.035294584929943085, |
|
"learning_rate": 0.00011350000138532902, |
|
"loss": 0.0237, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.558139534883721, |
|
"grad_norm": 0.02969173528254032, |
|
"learning_rate": 0.00011215706620587149, |
|
"loss": 0.0203, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.5697674418604652, |
|
"grad_norm": 0.031717926263809204, |
|
"learning_rate": 0.00011081190184239419, |
|
"loss": 0.0192, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.5813953488372094, |
|
"grad_norm": 0.03667771443724632, |
|
"learning_rate": 0.0001094647549513561, |
|
"loss": 0.0268, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.5930232558139537, |
|
"grad_norm": 0.0326247438788414, |
|
"learning_rate": 0.00010811587255274313, |
|
"loss": 0.0213, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.6046511627906976, |
|
"grad_norm": 0.03693091496825218, |
|
"learning_rate": 0.00010676550198477293, |
|
"loss": 0.0203, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.6162790697674418, |
|
"grad_norm": 0.037649210542440414, |
|
"learning_rate": 0.00010541389085854176, |
|
"loss": 0.0255, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.627906976744186, |
|
"grad_norm": 0.03507932275533676, |
|
"learning_rate": 0.00010406128701262128, |
|
"loss": 0.0217, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.627906976744186, |
|
"eval_loss": 0.025237275287508965, |
|
"eval_runtime": 69.013, |
|
"eval_samples_per_second": 70.827, |
|
"eval_steps_per_second": 1.116, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.6395348837209303, |
|
"grad_norm": 0.02867533639073372, |
|
"learning_rate": 0.00010270793846761347, |
|
"loss": 0.022, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.6511627906976745, |
|
"grad_norm": 0.02936953864991665, |
|
"learning_rate": 0.00010135409338067219, |
|
"loss": 0.0208, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.6627906976744184, |
|
"grad_norm": 0.02879083901643753, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0226, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.6744186046511627, |
|
"grad_norm": 0.029482927173376083, |
|
"learning_rate": 9.864590661932783e-05, |
|
"loss": 0.0216, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.6860465116279069, |
|
"grad_norm": 0.033599238842725754, |
|
"learning_rate": 9.729206153238657e-05, |
|
"loss": 0.0255, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.697674418604651, |
|
"grad_norm": 0.03904499486088753, |
|
"learning_rate": 9.59387129873787e-05, |
|
"loss": 0.0273, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.7093023255813953, |
|
"grad_norm": 0.028350962325930595, |
|
"learning_rate": 9.458610914145826e-05, |
|
"loss": 0.0206, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.7209302325581395, |
|
"grad_norm": 0.030183596536517143, |
|
"learning_rate": 9.323449801522709e-05, |
|
"loss": 0.0194, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.7325581395348837, |
|
"grad_norm": 0.030208786949515343, |
|
"learning_rate": 9.18841274472569e-05, |
|
"loss": 0.0184, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.744186046511628, |
|
"grad_norm": 0.033600978553295135, |
|
"learning_rate": 9.05352450486439e-05, |
|
"loss": 0.0209, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.744186046511628, |
|
"eval_loss": 0.024917516857385635, |
|
"eval_runtime": 68.8307, |
|
"eval_samples_per_second": 71.015, |
|
"eval_steps_per_second": 1.119, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.755813953488372, |
|
"grad_norm": 0.030587706714868546, |
|
"learning_rate": 8.918809815760585e-05, |
|
"loss": 0.0193, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.7674418604651163, |
|
"grad_norm": 0.03076143190264702, |
|
"learning_rate": 8.78429337941285e-05, |
|
"loss": 0.0234, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.7790697674418605, |
|
"grad_norm": 0.031419869512319565, |
|
"learning_rate": 8.649999861467099e-05, |
|
"loss": 0.0213, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.7906976744186047, |
|
"grad_norm": 0.03213745728135109, |
|
"learning_rate": 8.515953886693739e-05, |
|
"loss": 0.02, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.802325581395349, |
|
"grad_norm": 0.035864025354385376, |
|
"learning_rate": 8.382180034472353e-05, |
|
"loss": 0.0199, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.8139534883720931, |
|
"grad_norm": 0.029758954420685768, |
|
"learning_rate": 8.248702834284693e-05, |
|
"loss": 0.0227, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.8255813953488373, |
|
"grad_norm": 0.02980395406484604, |
|
"learning_rate": 8.115546761216822e-05, |
|
"loss": 0.0168, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.8372093023255816, |
|
"grad_norm": 0.04690724238753319, |
|
"learning_rate": 7.982736231471224e-05, |
|
"loss": 0.022, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.8488372093023255, |
|
"grad_norm": 0.035520877689123154, |
|
"learning_rate": 7.85029559788976e-05, |
|
"loss": 0.0221, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.8604651162790697, |
|
"grad_norm": 0.032926399260759354, |
|
"learning_rate": 7.718249145488142e-05, |
|
"loss": 0.0227, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.8604651162790697, |
|
"eval_loss": 0.024740872904658318, |
|
"eval_runtime": 69.0656, |
|
"eval_samples_per_second": 70.773, |
|
"eval_steps_per_second": 1.115, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.872093023255814, |
|
"grad_norm": 0.030970241874456406, |
|
"learning_rate": 7.586621087002945e-05, |
|
"loss": 0.0192, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.8837209302325582, |
|
"grad_norm": 0.03738875314593315, |
|
"learning_rate": 7.455435558451823e-05, |
|
"loss": 0.0213, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.8953488372093024, |
|
"grad_norm": 0.043416742235422134, |
|
"learning_rate": 7.324716614707793e-05, |
|
"loss": 0.0212, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.9069767441860463, |
|
"grad_norm": 0.029200483113527298, |
|
"learning_rate": 7.194488225088417e-05, |
|
"loss": 0.0172, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.9186046511627906, |
|
"grad_norm": 0.03626865893602371, |
|
"learning_rate": 7.064774268960653e-05, |
|
"loss": 0.0218, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.9302325581395348, |
|
"grad_norm": 0.03200054168701172, |
|
"learning_rate": 6.93559853136221e-05, |
|
"loss": 0.02, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.941860465116279, |
|
"grad_norm": 0.04698159173130989, |
|
"learning_rate": 6.806984698640202e-05, |
|
"loss": 0.0245, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.9534883720930232, |
|
"grad_norm": 0.03742319345474243, |
|
"learning_rate": 6.678956354107882e-05, |
|
"loss": 0.025, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.9651162790697674, |
|
"grad_norm": 0.033966902643442154, |
|
"learning_rate": 6.551536973720298e-05, |
|
"loss": 0.0174, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.9767441860465116, |
|
"grad_norm": 0.03295022249221802, |
|
"learning_rate": 6.4247499217696e-05, |
|
"loss": 0.0195, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.9767441860465116, |
|
"eval_loss": 0.02444678172469139, |
|
"eval_runtime": 69.9558, |
|
"eval_samples_per_second": 69.873, |
|
"eval_steps_per_second": 1.101, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.9883720930232558, |
|
"grad_norm": 0.031102096661925316, |
|
"learning_rate": 6.298618446600856e-05, |
|
"loss": 0.02, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.03594454750418663, |
|
"learning_rate": 6.173165676349103e-05, |
|
"loss": 0.0211, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 2.011627906976744, |
|
"grad_norm": 0.02976617030799389, |
|
"learning_rate": 6.048414614698448e-05, |
|
"loss": 0.0205, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 2.0232558139534884, |
|
"grad_norm": 0.03257077932357788, |
|
"learning_rate": 5.924388136663992e-05, |
|
"loss": 0.0187, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.0348837209302326, |
|
"grad_norm": 0.027616139501333237, |
|
"learning_rate": 5.801108984397354e-05, |
|
"loss": 0.0153, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.046511627906977, |
|
"grad_norm": 0.029210377484560013, |
|
"learning_rate": 5.6785997630165435e-05, |
|
"loss": 0.0192, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 2.058139534883721, |
|
"grad_norm": 0.029252031818032265, |
|
"learning_rate": 5.5568829364609664e-05, |
|
"loss": 0.0171, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.0697674418604652, |
|
"grad_norm": 0.029388127848505974, |
|
"learning_rate": 5.435980823372311e-05, |
|
"loss": 0.0184, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 2.0813953488372094, |
|
"grad_norm": 0.028690453618764877, |
|
"learning_rate": 5.3159155930021e-05, |
|
"loss": 0.0191, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 2.0930232558139537, |
|
"grad_norm": 0.027052663266658783, |
|
"learning_rate": 5.196709261146606e-05, |
|
"loss": 0.0174, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.0930232558139537, |
|
"eval_loss": 0.024556750431656837, |
|
"eval_runtime": 68.786, |
|
"eval_samples_per_second": 71.061, |
|
"eval_steps_per_second": 1.119, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.104651162790698, |
|
"grad_norm": 0.027682358399033546, |
|
"learning_rate": 5.078383686109926e-05, |
|
"loss": 0.0192, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 2.116279069767442, |
|
"grad_norm": 0.030152970924973488, |
|
"learning_rate": 4.9609605646959226e-05, |
|
"loss": 0.0182, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 2.1279069767441863, |
|
"grad_norm": 0.030966833233833313, |
|
"learning_rate": 4.844461428229782e-05, |
|
"loss": 0.0168, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 2.13953488372093, |
|
"grad_norm": 0.02938106097280979, |
|
"learning_rate": 4.728907638609925e-05, |
|
"loss": 0.0209, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 2.1511627906976742, |
|
"grad_norm": 0.03003690205514431, |
|
"learning_rate": 4.614320384390959e-05, |
|
"loss": 0.0171, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.1627906976744184, |
|
"grad_norm": 0.03510993719100952, |
|
"learning_rate": 4.500720676898452e-05, |
|
"loss": 0.0196, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.1744186046511627, |
|
"grad_norm": 0.028933702036738396, |
|
"learning_rate": 4.388129346376178e-05, |
|
"loss": 0.0154, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 2.186046511627907, |
|
"grad_norm": 0.040435321629047394, |
|
"learning_rate": 4.276567038166563e-05, |
|
"loss": 0.0214, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 2.197674418604651, |
|
"grad_norm": 0.03420122340321541, |
|
"learning_rate": 4.16605420892506e-05, |
|
"loss": 0.0162, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.2093023255813953, |
|
"grad_norm": 0.033222515136003494, |
|
"learning_rate": 4.0566111228691064e-05, |
|
"loss": 0.018, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.2093023255813953, |
|
"eval_loss": 0.02473200112581253, |
|
"eval_runtime": 68.7641, |
|
"eval_samples_per_second": 71.084, |
|
"eval_steps_per_second": 1.12, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.2209302325581395, |
|
"grad_norm": 0.03317669406533241, |
|
"learning_rate": 3.948257848062351e-05, |
|
"loss": 0.0169, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 2.2325581395348837, |
|
"grad_norm": 0.03263445943593979, |
|
"learning_rate": 3.841014252734896e-05, |
|
"loss": 0.0179, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.244186046511628, |
|
"grad_norm": 0.030220864340662956, |
|
"learning_rate": 3.734900001640135e-05, |
|
"loss": 0.0185, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 2.255813953488372, |
|
"grad_norm": 0.033804602921009064, |
|
"learning_rate": 3.629934552448925e-05, |
|
"loss": 0.0192, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.2674418604651163, |
|
"grad_norm": 0.03279354050755501, |
|
"learning_rate": 3.5261371521817244e-05, |
|
"loss": 0.0211, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.2790697674418605, |
|
"grad_norm": 0.03980812057852745, |
|
"learning_rate": 3.423526833679355e-05, |
|
"loss": 0.0187, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 2.2906976744186047, |
|
"grad_norm": 0.03647474944591522, |
|
"learning_rate": 3.322122412113047e-05, |
|
"loss": 0.0187, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 2.302325581395349, |
|
"grad_norm": 0.0309713426977396, |
|
"learning_rate": 3.2219424815343735e-05, |
|
"loss": 0.0175, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.313953488372093, |
|
"grad_norm": 0.030178574845194817, |
|
"learning_rate": 3.123005411465766e-05, |
|
"loss": 0.0174, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 2.3255813953488373, |
|
"grad_norm": 0.03233598917722702, |
|
"learning_rate": 3.0253293435321793e-05, |
|
"loss": 0.0176, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.3255813953488373, |
|
"eval_loss": 0.024754056707024574, |
|
"eval_runtime": 68.7907, |
|
"eval_samples_per_second": 71.056, |
|
"eval_steps_per_second": 1.119, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 258, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.5918719337187246e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|