{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 250,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004,
      "grad_norm": 1.954319715499878,
      "learning_rate": 1e-05,
      "loss": 2.411,
      "step": 1
    },
    {
      "epoch": 0.008,
      "grad_norm": 1.7826515436172485,
      "learning_rate": 9.960000000000001e-06,
      "loss": 2.2813,
      "step": 2
    },
    {
      "epoch": 0.012,
      "grad_norm": 1.8655924797058105,
      "learning_rate": 9.920000000000002e-06,
      "loss": 2.2412,
      "step": 3
    },
    {
      "epoch": 0.016,
      "grad_norm": 1.745436668395996,
      "learning_rate": 9.88e-06,
      "loss": 2.449,
      "step": 4
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.694860816001892,
      "learning_rate": 9.84e-06,
      "loss": 2.3644,
      "step": 5
    },
    {
      "epoch": 0.024,
      "grad_norm": 1.674206018447876,
      "learning_rate": 9.800000000000001e-06,
      "loss": 2.3375,
      "step": 6
    },
    {
      "epoch": 0.028,
      "grad_norm": 1.4836173057556152,
      "learning_rate": 9.760000000000001e-06,
      "loss": 2.2718,
      "step": 7
    },
    {
      "epoch": 0.032,
      "grad_norm": 1.6038180589675903,
      "learning_rate": 9.72e-06,
      "loss": 2.32,
      "step": 8
    },
    {
      "epoch": 0.036,
      "grad_norm": 1.4479186534881592,
      "learning_rate": 9.68e-06,
      "loss": 2.316,
      "step": 9
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.392088770866394,
      "learning_rate": 9.640000000000001e-06,
      "loss": 2.2769,
      "step": 10
    },
    {
      "epoch": 0.044,
      "grad_norm": 1.4886748790740967,
      "learning_rate": 9.600000000000001e-06,
      "loss": 2.1541,
      "step": 11
    },
    {
      "epoch": 0.048,
      "grad_norm": 1.4993741512298584,
      "learning_rate": 9.56e-06,
      "loss": 2.2039,
      "step": 12
    },
    {
      "epoch": 0.052,
      "grad_norm": 1.4934766292572021,
      "learning_rate": 9.52e-06,
      "loss": 2.192,
      "step": 13
    },
    {
      "epoch": 0.056,
      "grad_norm": 1.4041507244110107,
      "learning_rate": 9.48e-06,
      "loss": 2.1539,
      "step": 14
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.3733670711517334,
      "learning_rate": 9.440000000000001e-06,
      "loss": 2.0952,
      "step": 15
    },
    {
      "epoch": 0.064,
      "grad_norm": 1.3157683610916138,
      "learning_rate": 9.4e-06,
      "loss": 2.1532,
      "step": 16
    },
    {
      "epoch": 0.068,
      "grad_norm": 1.329785943031311,
      "learning_rate": 9.360000000000002e-06,
      "loss": 2.2528,
      "step": 17
    },
    {
      "epoch": 0.072,
      "grad_norm": 1.3144254684448242,
      "learning_rate": 9.32e-06,
      "loss": 2.1163,
      "step": 18
    },
    {
      "epoch": 0.076,
      "grad_norm": 1.3946971893310547,
      "learning_rate": 9.280000000000001e-06,
      "loss": 2.1822,
      "step": 19
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.2859883308410645,
      "learning_rate": 9.240000000000001e-06,
      "loss": 2.1111,
      "step": 20
    },
    {
      "epoch": 0.084,
      "grad_norm": 1.2906239032745361,
      "learning_rate": 9.200000000000002e-06,
      "loss": 2.1035,
      "step": 21
    },
    {
      "epoch": 0.088,
      "grad_norm": 1.3045598268508911,
      "learning_rate": 9.16e-06,
      "loss": 2.0066,
      "step": 22
    },
    {
      "epoch": 0.092,
      "grad_norm": 1.3252729177474976,
      "learning_rate": 9.12e-06,
      "loss": 2.1423,
      "step": 23
    },
    {
      "epoch": 0.096,
      "grad_norm": 1.444812297821045,
      "learning_rate": 9.080000000000001e-06,
      "loss": 1.9729,
      "step": 24
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.274637222290039,
      "learning_rate": 9.040000000000002e-06,
      "loss": 1.9568,
      "step": 25
    },
    {
      "epoch": 0.104,
      "grad_norm": 1.3475300073623657,
      "learning_rate": 9e-06,
      "loss": 2.0642,
      "step": 26
    },
    {
      "epoch": 0.108,
      "grad_norm": 1.1989250183105469,
      "learning_rate": 8.96e-06,
      "loss": 1.9397,
      "step": 27
    },
    {
      "epoch": 0.112,
      "grad_norm": 1.2486416101455688,
      "learning_rate": 8.920000000000001e-06,
      "loss": 2.0845,
      "step": 28
    },
    {
      "epoch": 0.116,
      "grad_norm": 1.170324683189392,
      "learning_rate": 8.880000000000001e-06,
      "loss": 1.9076,
      "step": 29
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.1869423389434814,
      "learning_rate": 8.84e-06,
      "loss": 1.9553,
      "step": 30
    },
    {
      "epoch": 0.124,
      "grad_norm": 1.3021348714828491,
      "learning_rate": 8.8e-06,
      "loss": 2.0035,
      "step": 31
    },
    {
      "epoch": 0.128,
      "grad_norm": 1.2392538785934448,
      "learning_rate": 8.76e-06,
      "loss": 1.9291,
      "step": 32
    },
    {
      "epoch": 0.132,
      "grad_norm": 1.1870639324188232,
      "learning_rate": 8.720000000000001e-06,
      "loss": 1.8562,
      "step": 33
    },
    {
      "epoch": 0.136,
      "grad_norm": 1.2068073749542236,
      "learning_rate": 8.68e-06,
      "loss": 1.8835,
      "step": 34
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.216802716255188,
      "learning_rate": 8.64e-06,
      "loss": 1.9927,
      "step": 35
    },
    {
      "epoch": 0.144,
      "grad_norm": 1.2364400625228882,
      "learning_rate": 8.6e-06,
      "loss": 1.9287,
      "step": 36
    },
    {
      "epoch": 0.148,
      "grad_norm": 1.190116047859192,
      "learning_rate": 8.560000000000001e-06,
      "loss": 1.8866,
      "step": 37
    },
    {
      "epoch": 0.152,
      "grad_norm": 1.259717583656311,
      "learning_rate": 8.52e-06,
      "loss": 1.9072,
      "step": 38
    },
    {
      "epoch": 0.156,
      "grad_norm": 1.1407251358032227,
      "learning_rate": 8.48e-06,
      "loss": 1.8916,
      "step": 39
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.2043702602386475,
      "learning_rate": 8.44e-06,
      "loss": 1.9885,
      "step": 40
    },
    {
      "epoch": 0.164,
      "grad_norm": 1.1877334117889404,
      "learning_rate": 8.400000000000001e-06,
      "loss": 1.8913,
      "step": 41
    },
    {
      "epoch": 0.168,
      "grad_norm": 1.1940908432006836,
      "learning_rate": 8.36e-06,
      "loss": 1.9593,
      "step": 42
    },
    {
      "epoch": 0.172,
      "grad_norm": 1.2312895059585571,
      "learning_rate": 8.32e-06,
      "loss": 1.853,
      "step": 43
    },
    {
      "epoch": 0.176,
      "grad_norm": 1.228988528251648,
      "learning_rate": 8.28e-06,
      "loss": 1.8425,
      "step": 44
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.1870458126068115,
      "learning_rate": 8.24e-06,
      "loss": 1.8729,
      "step": 45
    },
    {
      "epoch": 0.184,
      "grad_norm": 1.1809366941452026,
      "learning_rate": 8.2e-06,
      "loss": 1.869,
      "step": 46
    },
    {
      "epoch": 0.188,
      "grad_norm": 1.2264615297317505,
      "learning_rate": 8.16e-06,
      "loss": 1.709,
      "step": 47
    },
    {
      "epoch": 0.192,
      "grad_norm": 1.2154760360717773,
      "learning_rate": 8.120000000000002e-06,
      "loss": 1.8711,
      "step": 48
    },
    {
      "epoch": 0.196,
      "grad_norm": 1.2827329635620117,
      "learning_rate": 8.08e-06,
      "loss": 1.7218,
      "step": 49
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.321017861366272,
      "learning_rate": 8.040000000000001e-06,
      "loss": 1.8414,
      "step": 50
    },
    {
      "epoch": 0.204,
      "grad_norm": 1.2692463397979736,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.8819,
      "step": 51
    },
    {
      "epoch": 0.208,
      "grad_norm": 1.2398897409439087,
      "learning_rate": 7.960000000000002e-06,
      "loss": 1.8504,
      "step": 52
    },
    {
      "epoch": 0.212,
      "grad_norm": 1.1805490255355835,
      "learning_rate": 7.92e-06,
      "loss": 1.7266,
      "step": 53
    },
    {
      "epoch": 0.216,
      "grad_norm": 1.3034391403198242,
      "learning_rate": 7.88e-06,
      "loss": 1.9034,
      "step": 54
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.2424261569976807,
      "learning_rate": 7.840000000000001e-06,
      "loss": 1.8717,
      "step": 55
    },
    {
      "epoch": 0.224,
      "grad_norm": 1.2809594869613647,
      "learning_rate": 7.800000000000002e-06,
      "loss": 1.842,
      "step": 56
    },
    {
      "epoch": 0.228,
      "grad_norm": 1.2217411994934082,
      "learning_rate": 7.76e-06,
      "loss": 1.8669,
      "step": 57
    },
    {
      "epoch": 0.232,
      "grad_norm": 1.245792269706726,
      "learning_rate": 7.72e-06,
      "loss": 1.7957,
      "step": 58
    },
    {
      "epoch": 0.236,
      "grad_norm": 1.4041963815689087,
      "learning_rate": 7.680000000000001e-06,
      "loss": 1.8658,
      "step": 59
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.2613027095794678,
      "learning_rate": 7.640000000000001e-06,
      "loss": 1.7077,
      "step": 60
    },
    {
      "epoch": 0.244,
      "grad_norm": 1.2692285776138306,
      "learning_rate": 7.600000000000001e-06,
      "loss": 1.8825,
      "step": 61
    },
    {
      "epoch": 0.248,
      "grad_norm": 1.2766786813735962,
      "learning_rate": 7.5600000000000005e-06,
      "loss": 1.7079,
      "step": 62
    },
    {
      "epoch": 0.252,
      "grad_norm": 1.2243366241455078,
      "learning_rate": 7.520000000000001e-06,
      "loss": 1.6271,
      "step": 63
    },
    {
      "epoch": 0.256,
      "grad_norm": 1.1690571308135986,
      "learning_rate": 7.48e-06,
      "loss": 1.658,
      "step": 64
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.285218596458435,
      "learning_rate": 7.440000000000001e-06,
      "loss": 1.8225,
      "step": 65
    },
    {
      "epoch": 0.264,
      "grad_norm": 1.2221527099609375,
      "learning_rate": 7.4e-06,
      "loss": 1.7738,
      "step": 66
    },
    {
      "epoch": 0.268,
      "grad_norm": 1.266739010810852,
      "learning_rate": 7.360000000000001e-06,
      "loss": 1.8241,
      "step": 67
    },
    {
      "epoch": 0.272,
      "grad_norm": 1.1950359344482422,
      "learning_rate": 7.32e-06,
      "loss": 1.707,
      "step": 68
    },
    {
      "epoch": 0.276,
      "grad_norm": 1.2839792966842651,
      "learning_rate": 7.280000000000001e-06,
      "loss": 1.6992,
      "step": 69
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.2620984315872192,
      "learning_rate": 7.24e-06,
      "loss": 1.774,
      "step": 70
    },
    {
      "epoch": 0.284,
      "grad_norm": 1.3442317247390747,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 1.6569,
      "step": 71
    },
    {
      "epoch": 0.288,
      "grad_norm": 1.1748533248901367,
      "learning_rate": 7.16e-06,
      "loss": 1.6679,
      "step": 72
    },
    {
      "epoch": 0.292,
      "grad_norm": 1.2641892433166504,
      "learning_rate": 7.1200000000000004e-06,
      "loss": 1.7939,
      "step": 73
    },
    {
      "epoch": 0.296,
      "grad_norm": 1.3012604713439941,
      "learning_rate": 7.08e-06,
      "loss": 1.8023,
      "step": 74
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.231961965560913,
      "learning_rate": 7.04e-06,
      "loss": 1.6496,
      "step": 75
    },
    {
      "epoch": 0.304,
      "grad_norm": 1.1863617897033691,
      "learning_rate": 7e-06,
      "loss": 1.724,
      "step": 76
    },
    {
      "epoch": 0.308,
      "grad_norm": 1.3211452960968018,
      "learning_rate": 6.96e-06,
      "loss": 1.774,
      "step": 77
    },
    {
      "epoch": 0.312,
      "grad_norm": 1.2411987781524658,
      "learning_rate": 6.92e-06,
      "loss": 1.747,
      "step": 78
    },
    {
      "epoch": 0.316,
      "grad_norm": 1.2533900737762451,
      "learning_rate": 6.88e-06,
      "loss": 1.6799,
      "step": 79
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.239896297454834,
      "learning_rate": 6.8400000000000014e-06,
      "loss": 1.7044,
      "step": 80
    },
    {
      "epoch": 0.324,
      "grad_norm": 1.3009883165359497,
      "learning_rate": 6.800000000000001e-06,
      "loss": 1.7974,
      "step": 81
    },
    {
      "epoch": 0.328,
      "grad_norm": 1.3433125019073486,
      "learning_rate": 6.760000000000001e-06,
      "loss": 1.7764,
      "step": 82
    },
    {
      "epoch": 0.332,
      "grad_norm": 1.2757320404052734,
      "learning_rate": 6.720000000000001e-06,
      "loss": 1.8176,
      "step": 83
    },
    {
      "epoch": 0.336,
      "grad_norm": 1.4214569330215454,
      "learning_rate": 6.680000000000001e-06,
      "loss": 1.7098,
      "step": 84
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.3198415040969849,
      "learning_rate": 6.640000000000001e-06,
      "loss": 1.7805,
      "step": 85
    },
    {
      "epoch": 0.344,
      "grad_norm": 1.2718086242675781,
      "learning_rate": 6.600000000000001e-06,
      "loss": 1.7335,
      "step": 86
    },
    {
      "epoch": 0.348,
      "grad_norm": 1.220406174659729,
      "learning_rate": 6.560000000000001e-06,
      "loss": 1.7863,
      "step": 87
    },
    {
      "epoch": 0.352,
      "grad_norm": 1.254685878753662,
      "learning_rate": 6.520000000000001e-06,
      "loss": 1.6722,
      "step": 88
    },
    {
      "epoch": 0.356,
      "grad_norm": 1.4736641645431519,
      "learning_rate": 6.480000000000001e-06,
      "loss": 1.8828,
      "step": 89
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.276246428489685,
      "learning_rate": 6.440000000000001e-06,
      "loss": 1.6934,
      "step": 90
    },
    {
      "epoch": 0.364,
      "grad_norm": 1.2888675928115845,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 1.7737,
      "step": 91
    },
    {
      "epoch": 0.368,
      "grad_norm": 1.3010715246200562,
      "learning_rate": 6.360000000000001e-06,
      "loss": 1.7787,
      "step": 92
    },
    {
      "epoch": 0.372,
      "grad_norm": 1.3261510133743286,
      "learning_rate": 6.3200000000000005e-06,
      "loss": 1.6233,
      "step": 93
    },
    {
      "epoch": 0.376,
      "grad_norm": 1.306959867477417,
      "learning_rate": 6.280000000000001e-06,
      "loss": 1.6648,
      "step": 94
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.3729215860366821,
      "learning_rate": 6.24e-06,
      "loss": 1.7446,
      "step": 95
    },
    {
      "epoch": 0.384,
      "grad_norm": 1.442742943763733,
      "learning_rate": 6.200000000000001e-06,
      "loss": 1.705,
      "step": 96
    },
    {
      "epoch": 0.388,
      "grad_norm": 1.2511957883834839,
      "learning_rate": 6.16e-06,
      "loss": 1.6899,
      "step": 97
    },
    {
      "epoch": 0.392,
      "grad_norm": 1.3324456214904785,
      "learning_rate": 6.120000000000001e-06,
      "loss": 1.6001,
      "step": 98
    },
    {
      "epoch": 0.396,
      "grad_norm": 1.2704485654830933,
      "learning_rate": 6.08e-06,
      "loss": 1.6467,
      "step": 99
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.3349323272705078,
      "learning_rate": 6.040000000000001e-06,
      "loss": 1.9362,
      "step": 100
    },
    {
      "epoch": 0.404,
      "grad_norm": 1.3596888780593872,
      "learning_rate": 6e-06,
      "loss": 1.6858,
      "step": 101
    },
    {
      "epoch": 0.408,
      "grad_norm": 1.2084500789642334,
      "learning_rate": 5.9600000000000005e-06,
      "loss": 1.6745,
      "step": 102
    },
    {
      "epoch": 0.412,
      "grad_norm": 1.3298012018203735,
      "learning_rate": 5.92e-06,
      "loss": 1.6407,
      "step": 103
    },
    {
      "epoch": 0.416,
      "grad_norm": 1.2951877117156982,
      "learning_rate": 5.8800000000000005e-06,
      "loss": 1.6904,
      "step": 104
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.3563202619552612,
      "learning_rate": 5.84e-06,
      "loss": 1.6854,
      "step": 105
    },
    {
      "epoch": 0.424,
      "grad_norm": 1.3635480403900146,
      "learning_rate": 5.8e-06,
      "loss": 1.6669,
      "step": 106
    },
    {
      "epoch": 0.428,
      "grad_norm": 1.2384741306304932,
      "learning_rate": 5.76e-06,
      "loss": 1.676,
      "step": 107
    },
    {
      "epoch": 0.432,
      "grad_norm": 1.3344753980636597,
      "learning_rate": 5.72e-06,
      "loss": 1.7118,
      "step": 108
    },
    {
      "epoch": 0.436,
      "grad_norm": 1.2847405672073364,
      "learning_rate": 5.68e-06,
      "loss": 1.644,
      "step": 109
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.3353115320205688,
      "learning_rate": 5.64e-06,
      "loss": 1.783,
      "step": 110
    },
    {
      "epoch": 0.444,
      "grad_norm": 1.2775030136108398,
      "learning_rate": 5.600000000000001e-06,
      "loss": 1.6135,
      "step": 111
    },
    {
      "epoch": 0.448,
      "grad_norm": 1.4320266246795654,
      "learning_rate": 5.560000000000001e-06,
      "loss": 1.5901,
      "step": 112
    },
    {
      "epoch": 0.452,
      "grad_norm": 1.2850722074508667,
      "learning_rate": 5.5200000000000005e-06,
      "loss": 1.5889,
      "step": 113
    },
    {
      "epoch": 0.456,
      "grad_norm": 1.4107635021209717,
      "learning_rate": 5.480000000000001e-06,
      "loss": 1.7243,
      "step": 114
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.3406022787094116,
      "learning_rate": 5.4400000000000004e-06,
      "loss": 1.7539,
      "step": 115
    },
    {
      "epoch": 0.464,
      "grad_norm": 1.4117956161499023,
      "learning_rate": 5.400000000000001e-06,
      "loss": 1.7506,
      "step": 116
    },
    {
      "epoch": 0.468,
      "grad_norm": 1.3653512001037598,
      "learning_rate": 5.36e-06,
      "loss": 1.6322,
      "step": 117
    },
    {
      "epoch": 0.472,
      "grad_norm": 1.3492501974105835,
      "learning_rate": 5.320000000000001e-06,
      "loss": 1.7073,
      "step": 118
    },
    {
      "epoch": 0.476,
      "grad_norm": 1.3545348644256592,
      "learning_rate": 5.28e-06,
      "loss": 1.7101,
      "step": 119
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.2072592973709106,
      "learning_rate": 5.240000000000001e-06,
      "loss": 1.6332,
      "step": 120
    },
    {
      "epoch": 0.484,
      "grad_norm": 1.3392893075942993,
      "learning_rate": 5.2e-06,
      "loss": 1.6766,
      "step": 121
    },
    {
      "epoch": 0.488,
      "grad_norm": 1.4758343696594238,
      "learning_rate": 5.1600000000000006e-06,
      "loss": 1.7023,
      "step": 122
    },
    {
      "epoch": 0.492,
      "grad_norm": 1.2837586402893066,
      "learning_rate": 5.12e-06,
      "loss": 1.5553,
      "step": 123
    },
    {
      "epoch": 0.496,
      "grad_norm": 1.3341403007507324,
      "learning_rate": 5.0800000000000005e-06,
      "loss": 1.5714,
      "step": 124
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.4433372020721436,
      "learning_rate": 5.04e-06,
      "loss": 1.758,
      "step": 125
    },
    {
      "epoch": 0.504,
      "grad_norm": 1.3581253290176392,
      "learning_rate": 5e-06,
      "loss": 1.5949,
      "step": 126
    },
    {
      "epoch": 0.508,
      "grad_norm": 1.378012776374817,
      "learning_rate": 4.960000000000001e-06,
      "loss": 1.722,
      "step": 127
    },
    {
      "epoch": 0.512,
      "grad_norm": 1.400965929031372,
      "learning_rate": 4.92e-06,
      "loss": 1.7045,
      "step": 128
    },
    {
      "epoch": 0.516,
      "grad_norm": 1.4368276596069336,
      "learning_rate": 4.880000000000001e-06,
      "loss": 1.5844,
      "step": 129
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.3200892210006714,
      "learning_rate": 4.84e-06,
      "loss": 1.6877,
      "step": 130
    },
    {
      "epoch": 0.524,
      "grad_norm": 1.4182710647583008,
      "learning_rate": 4.800000000000001e-06,
      "loss": 1.5604,
      "step": 131
    },
    {
      "epoch": 0.528,
      "grad_norm": 1.2968662977218628,
      "learning_rate": 4.76e-06,
      "loss": 1.6281,
      "step": 132
    },
    {
      "epoch": 0.532,
      "grad_norm": 1.2928485870361328,
      "learning_rate": 4.7200000000000005e-06,
      "loss": 1.621,
      "step": 133
    },
    {
      "epoch": 0.536,
      "grad_norm": 1.3902649879455566,
      "learning_rate": 4.680000000000001e-06,
      "loss": 1.6301,
      "step": 134
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.3287855386734009,
      "learning_rate": 4.6400000000000005e-06,
      "loss": 1.6373,
      "step": 135
    },
    {
      "epoch": 0.544,
      "grad_norm": 1.374890685081482,
      "learning_rate": 4.600000000000001e-06,
      "loss": 1.6926,
      "step": 136
    },
    {
      "epoch": 0.548,
      "grad_norm": 1.2487736940383911,
      "learning_rate": 4.56e-06,
      "loss": 1.611,
      "step": 137
    },
    {
      "epoch": 0.552,
      "grad_norm": 1.3028439283370972,
      "learning_rate": 4.520000000000001e-06,
      "loss": 1.6176,
      "step": 138
    },
    {
      "epoch": 0.556,
      "grad_norm": 1.3260806798934937,
      "learning_rate": 4.48e-06,
      "loss": 1.6203,
      "step": 139
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.3496484756469727,
      "learning_rate": 4.440000000000001e-06,
      "loss": 1.7806,
      "step": 140
    },
    {
      "epoch": 0.564,
      "grad_norm": 1.2870110273361206,
      "learning_rate": 4.4e-06,
      "loss": 1.6348,
      "step": 141
    },
    {
      "epoch": 0.568,
      "grad_norm": 1.481292963027954,
      "learning_rate": 4.360000000000001e-06,
      "loss": 1.6622,
      "step": 142
    },
    {
      "epoch": 0.572,
      "grad_norm": 1.426375389099121,
      "learning_rate": 4.32e-06,
      "loss": 1.6248,
      "step": 143
    },
    {
      "epoch": 0.576,
      "grad_norm": 1.3465576171875,
      "learning_rate": 4.2800000000000005e-06,
      "loss": 1.5603,
      "step": 144
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.3932470083236694,
      "learning_rate": 4.24e-06,
      "loss": 1.6432,
      "step": 145
    },
    {
      "epoch": 0.584,
      "grad_norm": 1.2891041040420532,
      "learning_rate": 4.2000000000000004e-06,
      "loss": 1.562,
      "step": 146
    },
    {
      "epoch": 0.588,
      "grad_norm": 1.3476612567901611,
      "learning_rate": 4.16e-06,
      "loss": 1.6593,
      "step": 147
    },
    {
      "epoch": 0.592,
      "grad_norm": 1.3233203887939453,
      "learning_rate": 4.12e-06,
      "loss": 1.5696,
      "step": 148
    },
    {
      "epoch": 0.596,
      "grad_norm": 1.3516829013824463,
      "learning_rate": 4.08e-06,
      "loss": 1.6378,
      "step": 149
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.2899479866027832,
      "learning_rate": 4.04e-06,
      "loss": 1.6015,
      "step": 150
    },
    {
      "epoch": 0.604,
      "grad_norm": 1.4173705577850342,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.681,
      "step": 151
    },
    {
      "epoch": 0.608,
      "grad_norm": 1.3100056648254395,
      "learning_rate": 3.96e-06,
      "loss": 1.5746,
      "step": 152
    },
    {
      "epoch": 0.612,
      "grad_norm": 1.4304648637771606,
      "learning_rate": 3.920000000000001e-06,
      "loss": 1.5667,
      "step": 153
    },
    {
      "epoch": 0.616,
      "grad_norm": 1.4166439771652222,
      "learning_rate": 3.88e-06,
      "loss": 1.6874,
      "step": 154
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.3523306846618652,
      "learning_rate": 3.8400000000000005e-06,
      "loss": 1.6085,
      "step": 155
    },
    {
      "epoch": 0.624,
      "grad_norm": 1.4200947284698486,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 1.6213,
      "step": 156
    },
    {
      "epoch": 0.628,
      "grad_norm": 1.3680000305175781,
      "learning_rate": 3.7600000000000004e-06,
      "loss": 1.6323,
      "step": 157
    },
    {
      "epoch": 0.632,
      "grad_norm": 1.2544093132019043,
      "learning_rate": 3.7200000000000004e-06,
      "loss": 1.5722,
      "step": 158
    },
    {
      "epoch": 0.636,
      "grad_norm": 1.284311056137085,
      "learning_rate": 3.6800000000000003e-06,
      "loss": 1.538,
      "step": 159
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.3962759971618652,
      "learning_rate": 3.6400000000000003e-06,
      "loss": 1.6742,
      "step": 160
    },
    {
      "epoch": 0.644,
      "grad_norm": 1.3595976829528809,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 1.6504,
      "step": 161
    },
    {
      "epoch": 0.648,
      "grad_norm": 1.4007598161697388,
      "learning_rate": 3.5600000000000002e-06,
      "loss": 1.6943,
      "step": 162
    },
    {
      "epoch": 0.652,
      "grad_norm": 1.404727578163147,
      "learning_rate": 3.52e-06,
      "loss": 1.6085,
      "step": 163
    },
    {
      "epoch": 0.656,
      "grad_norm": 1.4727367162704468,
      "learning_rate": 3.48e-06,
      "loss": 1.5892,
      "step": 164
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.340226411819458,
      "learning_rate": 3.44e-06,
      "loss": 1.7069,
      "step": 165
    },
    {
      "epoch": 0.664,
      "grad_norm": 1.3784148693084717,
      "learning_rate": 3.4000000000000005e-06,
      "loss": 1.5993,
      "step": 166
    },
    {
      "epoch": 0.668,
      "grad_norm": 1.386871099472046,
      "learning_rate": 3.3600000000000004e-06,
      "loss": 1.5739,
      "step": 167
    },
    {
      "epoch": 0.672,
      "grad_norm": 1.4597012996673584,
      "learning_rate": 3.3200000000000004e-06,
      "loss": 1.7895,
      "step": 168
    },
    {
      "epoch": 0.676,
      "grad_norm": 1.5027787685394287,
      "learning_rate": 3.2800000000000004e-06,
      "loss": 1.6557,
      "step": 169
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.4223504066467285,
      "learning_rate": 3.2400000000000003e-06,
      "loss": 1.6052,
      "step": 170
    },
    {
      "epoch": 0.684,
      "grad_norm": 1.4609079360961914,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 1.7185,
      "step": 171
    },
    {
      "epoch": 0.688,
      "grad_norm": 1.4065134525299072,
      "learning_rate": 3.1600000000000002e-06,
      "loss": 1.6011,
      "step": 172
    },
    {
      "epoch": 0.692,
      "grad_norm": 1.3931232690811157,
      "learning_rate": 3.12e-06,
      "loss": 1.5843,
      "step": 173
    },
    {
      "epoch": 0.696,
      "grad_norm": 1.4128152132034302,
      "learning_rate": 3.08e-06,
      "loss": 1.6525,
      "step": 174
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.4391388893127441,
      "learning_rate": 3.04e-06,
      "loss": 1.6026,
      "step": 175
    },
    {
      "epoch": 0.704,
      "grad_norm": 1.4180413484573364,
      "learning_rate": 3e-06,
      "loss": 1.5877,
      "step": 176
    },
    {
      "epoch": 0.708,
      "grad_norm": 1.3166316747665405,
      "learning_rate": 2.96e-06,
      "loss": 1.6256,
      "step": 177
    },
    {
      "epoch": 0.712,
      "grad_norm": 1.3945341110229492,
      "learning_rate": 2.92e-06,
      "loss": 1.5426,
      "step": 178
    },
    {
      "epoch": 0.716,
      "grad_norm": 1.4100629091262817,
      "learning_rate": 2.88e-06,
      "loss": 1.4844,
      "step": 179
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.5133711099624634,
      "learning_rate": 2.84e-06,
      "loss": 1.6189,
      "step": 180
    },
    {
      "epoch": 0.724,
      "grad_norm": 1.3762165307998657,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 1.6345,
      "step": 181
    },
    {
      "epoch": 0.728,
      "grad_norm": 1.4408543109893799,
      "learning_rate": 2.7600000000000003e-06,
      "loss": 1.7058,
      "step": 182
    },
    {
      "epoch": 0.732,
      "grad_norm": 1.4354008436203003,
      "learning_rate": 2.7200000000000002e-06,
      "loss": 1.7113,
      "step": 183
    },
    {
      "epoch": 0.736,
      "grad_norm": 1.3285670280456543,
      "learning_rate": 2.68e-06,
      "loss": 1.5993,
      "step": 184
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.3333245515823364,
      "learning_rate": 2.64e-06,
      "loss": 1.5739,
      "step": 185
    },
    {
      "epoch": 0.744,
      "grad_norm": 1.5411490201950073,
      "learning_rate": 2.6e-06,
      "loss": 1.6264,
      "step": 186
    },
    {
      "epoch": 0.748,
      "grad_norm": 1.3106950521469116,
      "learning_rate": 2.56e-06,
      "loss": 1.555,
      "step": 187
    },
    {
      "epoch": 0.752,
      "grad_norm": 1.3158583641052246,
      "learning_rate": 2.52e-06,
      "loss": 1.5946,
      "step": 188
    },
    {
      "epoch": 0.756,
      "grad_norm": 1.4075291156768799,
      "learning_rate": 2.4800000000000004e-06,
      "loss": 1.565,
      "step": 189
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.310943603515625,
      "learning_rate": 2.4400000000000004e-06,
      "loss": 1.5143,
      "step": 190
    },
    {
      "epoch": 0.764,
      "grad_norm": 1.434760570526123,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 1.659,
      "step": 191
    },
    {
      "epoch": 0.768,
      "grad_norm": 1.3629409074783325,
      "learning_rate": 2.3600000000000003e-06,
      "loss": 1.607,
      "step": 192
    },
    {
      "epoch": 0.772,
      "grad_norm": 1.3260722160339355,
      "learning_rate": 2.3200000000000002e-06,
      "loss": 1.4663,
      "step": 193
    },
    {
      "epoch": 0.776,
      "grad_norm": 1.4491920471191406,
      "learning_rate": 2.28e-06,
      "loss": 1.594,
      "step": 194
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.4487640857696533,
      "learning_rate": 2.24e-06,
      "loss": 1.5447,
      "step": 195
    },
    {
      "epoch": 0.784,
      "grad_norm": 1.5706510543823242,
      "learning_rate": 2.2e-06,
      "loss": 1.6544,
      "step": 196
    },
    {
      "epoch": 0.788,
      "grad_norm": 1.368772268295288,
      "learning_rate": 2.16e-06,
      "loss": 1.515,
      "step": 197
    },
    {
      "epoch": 0.792,
      "grad_norm": 1.4399783611297607,
      "learning_rate": 2.12e-06,
      "loss": 1.6139,
      "step": 198
    },
    {
      "epoch": 0.796,
      "grad_norm": 1.3945013284683228,
      "learning_rate": 2.08e-06,
      "loss": 1.4373,
      "step": 199
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.3934296369552612,
      "learning_rate": 2.04e-06,
      "loss": 1.6531,
      "step": 200
    },
    {
      "epoch": 0.804,
      "grad_norm": 1.404791235923767,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.5855,
      "step": 201
    },
    {
      "epoch": 0.808,
      "grad_norm": 1.3408340215682983,
      "learning_rate": 1.9600000000000003e-06,
      "loss": 1.4074,
      "step": 202
    },
    {
      "epoch": 0.812,
      "grad_norm": 1.465818166732788,
      "learning_rate": 1.9200000000000003e-06,
      "loss": 1.6122,
      "step": 203
    },
    {
      "epoch": 0.816,
      "grad_norm": 1.3890620470046997,
      "learning_rate": 1.8800000000000002e-06,
      "loss": 1.6707,
      "step": 204
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.3814433813095093,
      "learning_rate": 1.8400000000000002e-06,
      "loss": 1.5917,
      "step": 205
    },
    {
      "epoch": 0.824,
      "grad_norm": 1.4209574460983276,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 1.6357,
      "step": 206
    },
    {
      "epoch": 0.828,
      "grad_norm": 1.2836953401565552,
      "learning_rate": 1.76e-06,
      "loss": 1.5397,
      "step": 207
    },
    {
      "epoch": 0.832,
      "grad_norm": 1.5015844106674194,
      "learning_rate": 1.72e-06,
      "loss": 1.6291,
      "step": 208
    },
    {
      "epoch": 0.836,
      "grad_norm": 1.3769359588623047,
      "learning_rate": 1.6800000000000002e-06,
      "loss": 1.5962,
      "step": 209
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.430960774421692,
      "learning_rate": 1.6400000000000002e-06,
      "loss": 1.6396,
      "step": 210
    },
    {
      "epoch": 0.844,
      "grad_norm": 1.3863214254379272,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 1.6696,
      "step": 211
    },
    {
      "epoch": 0.848,
      "grad_norm": 1.364683985710144,
      "learning_rate": 1.56e-06,
      "loss": 1.4985,
      "step": 212
    },
    {
      "epoch": 0.852,
      "grad_norm": 1.4108574390411377,
      "learning_rate": 1.52e-06,
      "loss": 1.5946,
      "step": 213
    },
    {
      "epoch": 0.856,
      "grad_norm": 1.4607338905334473,
      "learning_rate": 1.48e-06,
      "loss": 1.585,
      "step": 214
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.480090856552124,
      "learning_rate": 1.44e-06,
      "loss": 1.5559,
      "step": 215
    },
    {
      "epoch": 0.864,
      "grad_norm": 1.4284939765930176,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 1.5759,
      "step": 216
    },
    {
      "epoch": 0.868,
      "grad_norm": 1.5613925457000732,
      "learning_rate": 1.3600000000000001e-06,
      "loss": 1.7214,
      "step": 217
    },
    {
      "epoch": 0.872,
      "grad_norm": 1.457170009613037,
      "learning_rate": 1.32e-06,
      "loss": 1.5808,
      "step": 218
    },
    {
      "epoch": 0.876,
      "grad_norm": 1.364310383796692,
      "learning_rate": 1.28e-06,
      "loss": 1.5733,
      "step": 219
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.4274003505706787,
      "learning_rate": 1.2400000000000002e-06,
      "loss": 1.5918,
      "step": 220
    },
    {
      "epoch": 0.884,
      "grad_norm": 1.4450329542160034,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 1.574,
      "step": 221
    },
    {
      "epoch": 0.888,
      "grad_norm": 1.4518463611602783,
      "learning_rate": 1.1600000000000001e-06,
      "loss": 1.5178,
      "step": 222
    },
    {
      "epoch": 0.892,
      "grad_norm": 1.3471320867538452,
      "learning_rate": 1.12e-06,
      "loss": 1.5961,
      "step": 223
    },
    {
      "epoch": 0.896,
      "grad_norm": 1.5115176439285278,
      "learning_rate": 1.08e-06,
      "loss": 1.5624,
      "step": 224
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.3841177225112915,
      "learning_rate": 1.04e-06,
      "loss": 1.5607,
      "step": 225
    },
    {
      "epoch": 0.904,
      "grad_norm": 1.419216275215149,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.6114,
      "step": 226
    },
    {
      "epoch": 0.908,
      "grad_norm": 1.4189485311508179,
      "learning_rate": 9.600000000000001e-07,
      "loss": 1.5479,
      "step": 227
    },
    {
      "epoch": 0.912,
      "grad_norm": 1.3959101438522339,
      "learning_rate": 9.200000000000001e-07,
      "loss": 1.6098,
      "step": 228
    },
    {
      "epoch": 0.916,
      "grad_norm": 1.460270643234253,
      "learning_rate": 8.8e-07,
      "loss": 1.5402,
      "step": 229
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.4265800714492798,
      "learning_rate": 8.400000000000001e-07,
      "loss": 1.5805,
      "step": 230
    },
    {
      "epoch": 0.924,
      "grad_norm": 1.356236457824707,
      "learning_rate": 8.000000000000001e-07,
      "loss": 1.6396,
      "step": 231
    },
    {
      "epoch": 0.928,
      "grad_norm": 1.4282077550888062,
      "learning_rate": 7.6e-07,
      "loss": 1.6725,
      "step": 232
    },
    {
      "epoch": 0.932,
      "grad_norm": 1.5110477209091187,
      "learning_rate": 7.2e-07,
      "loss": 1.6143,
      "step": 233
    },
    {
      "epoch": 0.936,
      "grad_norm": 1.4182950258255005,
      "learning_rate": 6.800000000000001e-07,
      "loss": 1.5751,
      "step": 234
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.4530798196792603,
      "learning_rate": 6.4e-07,
      "loss": 1.634,
      "step": 235
    },
    {
      "epoch": 0.944,
      "grad_norm": 1.4195228815078735,
      "learning_rate": 6.000000000000001e-07,
      "loss": 1.5055,
      "step": 236
    },
    {
      "epoch": 0.948,
      "grad_norm": 1.4488164186477661,
      "learning_rate": 5.6e-07,
      "loss": 1.61,
      "step": 237
    },
    {
      "epoch": 0.952,
      "grad_norm": 1.4380940198898315,
      "learning_rate": 5.2e-07,
      "loss": 1.5302,
      "step": 238
    },
    {
      "epoch": 0.956,
      "grad_norm": 1.4326223134994507,
      "learning_rate": 4.800000000000001e-07,
      "loss": 1.6128,
      "step": 239
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.437170386314392,
      "learning_rate": 4.4e-07,
      "loss": 1.6482,
      "step": 240
    },
    {
      "epoch": 0.964,
      "grad_norm": 1.435649037361145,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 1.5889,
      "step": 241
    },
    {
      "epoch": 0.968,
      "grad_norm": 1.5094983577728271,
      "learning_rate": 3.6e-07,
      "loss": 1.6749,
      "step": 242
    },
    {
      "epoch": 0.972,
      "grad_norm": 1.35372793674469,
      "learning_rate": 3.2e-07,
      "loss": 1.5732,
      "step": 243
    },
    {
      "epoch": 0.976,
      "grad_norm": 1.4316813945770264,
      "learning_rate": 2.8e-07,
      "loss": 1.569,
      "step": 244
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.419772982597351,
      "learning_rate": 2.4000000000000003e-07,
      "loss": 1.6188,
      "step": 245
    },
    {
      "epoch": 0.984,
      "grad_norm": 1.3961282968521118,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 1.5361,
      "step": 246
    },
    {
      "epoch": 0.988,
      "grad_norm": 1.4259065389633179,
      "learning_rate": 1.6e-07,
      "loss": 1.6011,
      "step": 247
    },
    {
      "epoch": 0.992,
      "grad_norm": 1.5136369466781616,
      "learning_rate": 1.2000000000000002e-07,
      "loss": 1.6597,
      "step": 248
    },
    {
      "epoch": 0.996,
      "grad_norm": 1.4152812957763672,
      "learning_rate": 8e-08,
      "loss": 1.6362,
      "step": 249
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.4813553094863892,
      "learning_rate": 4e-08,
      "loss": 1.5665,
      "step": 250
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.6056512594223022,
      "eval_runtime": 73.6777,
      "eval_samples_per_second": 6.786,
      "eval_steps_per_second": 0.855,
      "step": 250
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.0019312328704e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}