{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.6510416666666666,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0013020833333333333, "grad_norm": 0.6808765530586243, "learning_rate": 2.02e-06, "loss": 1.5302, "step": 1 },
    { "epoch": 0.0026041666666666665, "grad_norm": 0.7672109007835388, "learning_rate": 4.04e-06, "loss": 1.5499, "step": 2 },
    { "epoch": 0.00390625, "grad_norm": 0.7675047516822815, "learning_rate": 6.06e-06, "loss": 1.6097, "step": 3 },
    { "epoch": 0.005208333333333333, "grad_norm": 0.6575904488563538, "learning_rate": 8.08e-06, "loss": 1.5117, "step": 4 },
    { "epoch": 0.006510416666666667, "grad_norm": 0.7006857991218567, "learning_rate": 1.0100000000000002e-05, "loss": 1.618, "step": 5 },
    { "epoch": 0.0078125, "grad_norm": 0.8015255928039551, "learning_rate": 1.212e-05, "loss": 1.538, "step": 6 },
    { "epoch": 0.009114583333333334, "grad_norm": 0.7574004530906677, "learning_rate": 1.4140000000000002e-05, "loss": 1.5707, "step": 7 },
    { "epoch": 0.010416666666666666, "grad_norm": 0.7140095233917236, "learning_rate": 1.616e-05, "loss": 1.4702, "step": 8 },
    { "epoch": 0.01171875, "grad_norm": 0.770279049873352, "learning_rate": 1.818e-05, "loss": 1.5559, "step": 9 },
    { "epoch": 0.013020833333333334, "grad_norm": 0.7535017728805542, "learning_rate": 2.0200000000000003e-05, "loss": 1.5051, "step": 10 },
    { "epoch": 0.014322916666666666, "grad_norm": 0.7053647041320801, "learning_rate": 2.222e-05, "loss": 1.4195, "step": 11 },
    { "epoch": 0.015625, "grad_norm": 0.8053725361824036, "learning_rate": 2.424e-05, "loss": 1.4431, "step": 12 },
    { "epoch": 0.016927083333333332, "grad_norm": 0.7332000136375427, "learning_rate": 2.6260000000000003e-05, "loss": 1.3888, "step": 13 },
    { "epoch": 0.018229166666666668, "grad_norm": 0.7415592670440674, "learning_rate": 2.8280000000000004e-05, "loss": 1.3524, "step": 14 },
    { "epoch": 0.01953125, "grad_norm": 0.8849363327026367, "learning_rate": 3.0299999999999998e-05, "loss": 1.4205, "step": 15 },
    { "epoch": 0.020833333333333332, "grad_norm": 0.8366559147834778, "learning_rate": 3.232e-05, "loss": 1.3502, "step": 16 },
    { "epoch": 0.022135416666666668, "grad_norm": 0.8472776412963867, "learning_rate": 3.434e-05, "loss": 1.3127, "step": 17 },
    { "epoch": 0.0234375, "grad_norm": 0.9324319958686829, "learning_rate": 3.636e-05, "loss": 1.3929, "step": 18 },
    { "epoch": 0.024739583333333332, "grad_norm": 0.7191261649131775, "learning_rate": 3.838e-05, "loss": 1.2803, "step": 19 },
    { "epoch": 0.026041666666666668, "grad_norm": 0.9117175936698914, "learning_rate": 4.0400000000000006e-05, "loss": 1.2158, "step": 20 },
    { "epoch": 0.02734375, "grad_norm": 0.7863415479660034, "learning_rate": 4.242e-05, "loss": 1.273, "step": 21 },
    { "epoch": 0.028645833333333332, "grad_norm": 0.7084434032440186, "learning_rate": 4.444e-05, "loss": 1.2066, "step": 22 },
    { "epoch": 0.029947916666666668, "grad_norm": 0.695151686668396, "learning_rate": 4.6460000000000006e-05, "loss": 1.1198, "step": 23 },
    { "epoch": 0.03125, "grad_norm": 0.49544116854667664, "learning_rate": 4.848e-05, "loss": 1.1062, "step": 24 },
    { "epoch": 0.032552083333333336, "grad_norm": 0.47292080521583557, "learning_rate": 5.05e-05, "loss": 1.0123, "step": 25 },
    { "epoch": 0.033854166666666664, "grad_norm": 0.5713898539543152, "learning_rate": 5.2520000000000005e-05, "loss": 1.057, "step": 26 },
    { "epoch": 0.03515625, "grad_norm": 0.6084786653518677, "learning_rate": 5.454e-05, "loss": 1.0196, "step": 27 },
    { "epoch": 0.036458333333333336, "grad_norm": 0.6218786239624023, "learning_rate": 5.656000000000001e-05, "loss": 1.0767, "step": 28 },
    { "epoch": 0.037760416666666664, "grad_norm": 0.6100573539733887, "learning_rate": 5.858e-05, "loss": 1.0825, "step": 29 },
    { "epoch": 0.0390625, "grad_norm": 0.5810414552688599, "learning_rate": 6.0599999999999996e-05, "loss": 1.0459, "step": 30 },
    { "epoch": 0.040364583333333336, "grad_norm": 0.48789265751838684, "learning_rate": 6.262000000000001e-05, "loss": 0.9882, "step": 31 },
    { "epoch": 0.041666666666666664, "grad_norm": 0.46344229578971863, "learning_rate": 6.464e-05, "loss": 1.0365, "step": 32 },
    { "epoch": 0.04296875, "grad_norm": 0.5264561772346497, "learning_rate": 6.666e-05, "loss": 1.0536, "step": 33 },
    { "epoch": 0.044270833333333336, "grad_norm": 0.5107188820838928, "learning_rate": 6.868e-05, "loss": 1.072, "step": 34 },
    { "epoch": 0.045572916666666664, "grad_norm": 0.49732786417007446, "learning_rate": 7.07e-05, "loss": 1.0793, "step": 35 },
    { "epoch": 0.046875, "grad_norm": 0.4549235999584198, "learning_rate": 7.272e-05, "loss": 0.9733, "step": 36 },
    { "epoch": 0.048177083333333336, "grad_norm": 0.4460265636444092, "learning_rate": 7.474e-05, "loss": 1.0077, "step": 37 },
    { "epoch": 0.049479166666666664, "grad_norm": 0.47252264618873596, "learning_rate": 7.676e-05, "loss": 1.0423, "step": 38 },
    { "epoch": 0.05078125, "grad_norm": 0.3926396071910858, "learning_rate": 7.878e-05, "loss": 0.9915, "step": 39 },
    { "epoch": 0.052083333333333336, "grad_norm": 0.4565116763114929, "learning_rate": 8.080000000000001e-05, "loss": 1.0371, "step": 40 },
    { "epoch": 0.053385416666666664, "grad_norm": 0.36098912358283997, "learning_rate": 8.282e-05, "loss": 0.9437, "step": 41 },
    { "epoch": 0.0546875, "grad_norm": 0.3953251242637634, "learning_rate": 8.484e-05, "loss": 0.9441, "step": 42 },
    { "epoch": 0.055989583333333336, "grad_norm": 0.4209212064743042, "learning_rate": 8.686e-05, "loss": 1.0181, "step": 43 },
    { "epoch": 0.057291666666666664, "grad_norm": 0.3656007647514343, "learning_rate": 8.888e-05, "loss": 1.0204, "step": 44 },
    { "epoch": 0.05859375, "grad_norm": 0.4179205894470215, "learning_rate": 9.09e-05, "loss": 0.9894, "step": 45 },
    { "epoch": 0.059895833333333336, "grad_norm": 0.4047180116176605, "learning_rate": 9.292000000000001e-05, "loss": 0.9743, "step": 46 },
    { "epoch": 0.061197916666666664, "grad_norm": 0.36672013998031616, "learning_rate": 9.494e-05, "loss": 0.9355, "step": 47 },
    { "epoch": 0.0625, "grad_norm": 0.4091084599494934, "learning_rate": 9.696e-05, "loss": 1.0145, "step": 48 },
    { "epoch": 0.06380208333333333, "grad_norm": 0.4033145606517792, "learning_rate": 9.898e-05, "loss": 0.8795, "step": 49 },
    { "epoch": 0.06510416666666667, "grad_norm": 0.4093867838382721, "learning_rate": 0.000101, "loss": 0.9118, "step": 50 },
    { "epoch": 0.06640625, "grad_norm": 0.4064587354660034, "learning_rate": 0.00010302, "loss": 0.9846, "step": 51 },
    { "epoch": 0.06770833333333333, "grad_norm": 0.4279826283454895, "learning_rate": 0.00010504000000000001, "loss": 0.9459, "step": 52 },
    { "epoch": 0.06901041666666667, "grad_norm": 0.3995688855648041, "learning_rate": 0.00010706000000000001, "loss": 0.8842, "step": 53 },
    { "epoch": 0.0703125, "grad_norm": 0.37477144598960876, "learning_rate": 0.00010908, "loss": 0.9676, "step": 54 },
    { "epoch": 0.07161458333333333, "grad_norm": 0.4394771456718445, "learning_rate": 0.00011110000000000002, "loss": 1.0441, "step": 55 },
    { "epoch": 0.07291666666666667, "grad_norm": 0.5129756331443787, "learning_rate": 0.00011312000000000001, "loss": 0.9511, "step": 56 },
    { "epoch": 0.07421875, "grad_norm": 0.3840573728084564, "learning_rate": 0.00011514, "loss": 0.9381, "step": 57 },
    { "epoch": 0.07552083333333333, "grad_norm": 0.3669443726539612, "learning_rate": 0.00011716, "loss": 0.9452, "step": 58 },
    { "epoch": 0.07682291666666667, "grad_norm": 0.4047168791294098, "learning_rate": 0.00011918, "loss": 0.9662, "step": 59 },
    { "epoch": 0.078125, "grad_norm": 0.43623411655426025, "learning_rate": 0.00012119999999999999, "loss": 0.9399, "step": 60 },
    { "epoch": 0.07942708333333333, "grad_norm": 0.45791196823120117, "learning_rate": 0.00012322, "loss": 0.9579, "step": 61 },
    { "epoch": 0.08072916666666667, "grad_norm": 0.3631390929222107, "learning_rate": 0.00012524000000000001, "loss": 0.9537, "step": 62 },
    { "epoch": 0.08203125, "grad_norm": 0.44578367471694946, "learning_rate": 0.00012726, "loss": 0.969, "step": 63 },
    { "epoch": 0.08333333333333333, "grad_norm": 0.3980722427368164, "learning_rate": 0.00012928, "loss": 0.9296, "step": 64 },
    { "epoch": 0.08463541666666667, "grad_norm": 0.415637731552124, "learning_rate": 0.00013130000000000002, "loss": 0.871, "step": 65 },
    { "epoch": 0.0859375, "grad_norm": 0.4139644503593445, "learning_rate": 0.00013332, "loss": 0.9521, "step": 66 },
    { "epoch": 0.08723958333333333, "grad_norm": 0.38350746035575867, "learning_rate": 0.00013534000000000002, "loss": 0.8741, "step": 67 },
    { "epoch": 0.08854166666666667, "grad_norm": 0.3766394555568695, "learning_rate": 0.00013736, "loss": 0.9806, "step": 68 },
    { "epoch": 0.08984375, "grad_norm": 0.3952779769897461, "learning_rate": 0.00013937999999999998, "loss": 0.9444, "step": 69 },
    { "epoch": 0.09114583333333333, "grad_norm": 0.3924815058708191, "learning_rate": 0.0001414, "loss": 0.9173, "step": 70 },
    { "epoch": 0.09244791666666667, "grad_norm": 0.37496528029441833, "learning_rate": 0.00014342, "loss": 0.884, "step": 71 },
    { "epoch": 0.09375, "grad_norm": 0.3437505066394806, "learning_rate": 0.00014544, "loss": 0.8915, "step": 72 },
    { "epoch": 0.09505208333333333, "grad_norm": 0.4203055202960968, "learning_rate": 0.00014746, "loss": 0.8764, "step": 73 },
    { "epoch": 0.09635416666666667, "grad_norm": 0.37117621302604675, "learning_rate": 0.00014948, "loss": 0.9337, "step": 74 },
    { "epoch": 0.09765625, "grad_norm": 0.3810153305530548, "learning_rate": 0.0001515, "loss": 0.8872, "step": 75 },
    { "epoch": 0.09895833333333333, "grad_norm": 0.3892213702201843, "learning_rate": 0.00015352, "loss": 0.8906, "step": 76 },
    { "epoch": 0.10026041666666667, "grad_norm": 0.37222960591316223, "learning_rate": 0.00015554000000000002, "loss": 0.8289, "step": 77 },
    { "epoch": 0.1015625, "grad_norm": 0.37449032068252563, "learning_rate": 0.00015756, "loss": 0.9172, "step": 78 },
    { "epoch": 0.10286458333333333, "grad_norm": 0.4078482687473297, "learning_rate": 0.00015958000000000001, "loss": 0.8892, "step": 79 },
    { "epoch": 0.10416666666666667, "grad_norm": 0.3846503794193268, "learning_rate": 0.00016160000000000002, "loss": 0.8464, "step": 80 },
    { "epoch": 0.10546875, "grad_norm": 0.39846357703208923, "learning_rate": 0.00016362, "loss": 0.9439, "step": 81 },
    { "epoch": 0.10677083333333333, "grad_norm": 0.3736095726490021, "learning_rate": 0.00016564, "loss": 0.9479, "step": 82 },
    { "epoch": 0.10807291666666667, "grad_norm": 0.38269343972206116, "learning_rate": 0.00016766, "loss": 0.8499, "step": 83 },
    { "epoch": 0.109375, "grad_norm": 0.4197899103164673, "learning_rate": 0.00016968, "loss": 0.9199, "step": 84 },
    { "epoch": 0.11067708333333333, "grad_norm": 0.3637363016605377, "learning_rate": 0.0001717, "loss": 0.8569, "step": 85 },
    { "epoch": 0.11197916666666667, "grad_norm": 0.36869698762893677, "learning_rate": 0.00017372, "loss": 0.9584, "step": 86 },
    { "epoch": 0.11328125, "grad_norm": 0.35092616081237793, "learning_rate": 0.00017574, "loss": 0.9374, "step": 87 },
    { "epoch": 0.11458333333333333, "grad_norm": 0.39582520723342896, "learning_rate": 0.00017776, "loss": 0.9239, "step": 88 },
    { "epoch": 0.11588541666666667, "grad_norm": 0.3559093773365021, "learning_rate": 0.00017978000000000002, "loss": 0.8901, "step": 89 },
    { "epoch": 0.1171875, "grad_norm": 0.3778141140937805, "learning_rate": 0.0001818, "loss": 0.8903, "step": 90 },
    { "epoch": 0.11848958333333333, "grad_norm": 0.36067256331443787, "learning_rate": 0.00018382, "loss": 0.8905, "step": 91 },
    { "epoch": 0.11979166666666667, "grad_norm": 0.36430442333221436, "learning_rate": 0.00018584000000000002, "loss": 0.8941, "step": 92 },
    { "epoch": 0.12109375, "grad_norm": 0.40255868434906006, "learning_rate": 0.00018786, "loss": 0.9192, "step": 93 },
    { "epoch": 0.12239583333333333, "grad_norm": 0.38338273763656616, "learning_rate": 0.00018988, "loss": 0.8976, "step": 94 },
    { "epoch": 0.12369791666666667, "grad_norm": 0.3768159747123718, "learning_rate": 0.0001919, "loss": 0.9337, "step": 95 },
    { "epoch": 0.125, "grad_norm": 0.3879520297050476, "learning_rate": 0.00019392, "loss": 0.8761, "step": 96 },
    { "epoch": 0.12630208333333334, "grad_norm": 0.3825957477092743, "learning_rate": 0.00019594, "loss": 0.9051, "step": 97 },
    { "epoch": 0.12760416666666666, "grad_norm": 0.3993157148361206, "learning_rate": 0.00019796, "loss": 0.8613, "step": 98 },
    { "epoch": 0.12890625, "grad_norm": 0.3906169831752777, "learning_rate": 0.00019998, "loss": 0.9948, "step": 99 },
    { "epoch": 0.13020833333333334, "grad_norm": 0.38024818897247314, "learning_rate": 0.000202, "loss": 0.981, "step": 100 },
    { "epoch": 0.13151041666666666, "grad_norm": 0.3382592499256134, "learning_rate": 0.00020199688492212377, "loss": 0.8662, "step": 101 },
    { "epoch": 0.1328125, "grad_norm": 0.42753666639328003, "learning_rate": 0.00020198753988064772, "loss": 0.858, "step": 102 },
    { "epoch": 0.13411458333333334, "grad_norm": 0.36199378967285156, "learning_rate": 0.00020197196545201806, "loss": 0.863, "step": 103 },
    { "epoch": 0.13541666666666666, "grad_norm": 0.30593058466911316, "learning_rate": 0.0002019501625969389, "loss": 0.9211, "step": 104 },
    { "epoch": 0.13671875, "grad_norm": 0.40363338589668274, "learning_rate": 0.00020192213266031304, "loss": 0.9512, "step": 105 },
    { "epoch": 0.13802083333333334, "grad_norm": 0.34124210476875305, "learning_rate": 0.00020188787737115897, "loss": 0.8821, "step": 106 },
    { "epoch": 0.13932291666666666, "grad_norm": 0.324048787355423, "learning_rate": 0.00020184739884250436, "loss": 0.9053, "step": 107 },
    { "epoch": 0.140625, "grad_norm": 0.32893842458724976, "learning_rate": 0.00020180069957125544, "loss": 0.8581, "step": 108 },
    { "epoch": 0.14192708333333334, "grad_norm": 0.3708108067512512, "learning_rate": 0.0002017477824380433, "loss": 0.908, "step": 109 },
    { "epoch": 0.14322916666666666, "grad_norm": 0.3627496361732483, "learning_rate": 0.00020168865070704594, "loss": 0.8493, "step": 110 },
    { "epoch": 0.14453125, "grad_norm": 0.38643914461135864, "learning_rate": 0.00020162330802578706, "loss": 0.9305, "step": 111 },
    { "epoch": 0.14583333333333334, "grad_norm": 0.3313356041908264, "learning_rate": 0.00020155175842491107, "loss": 0.9209, "step": 112 },
    { "epoch": 0.14713541666666666, "grad_norm": 0.36708641052246094, "learning_rate": 0.0002014740063179344, "loss": 0.908, "step": 113 },
    { "epoch": 0.1484375, "grad_norm": 0.3595336973667145, "learning_rate": 0.00020139005650097317, "loss": 0.8212, "step": 114 },
    { "epoch": 0.14973958333333334, "grad_norm": 0.43490394949913025, "learning_rate": 0.00020129991415244756, "loss": 0.8863, "step": 115 },
    { "epoch": 0.15104166666666666, "grad_norm": 0.35051169991493225, "learning_rate": 0.00020120358483276227, "loss": 0.9196, "step": 116 },
    { "epoch": 0.15234375, "grad_norm": 0.36855900287628174, "learning_rate": 0.00020110107448396346, "loss": 0.8889, "step": 117 },
    { "epoch": 0.15364583333333334, "grad_norm": 0.32522818446159363, "learning_rate": 0.0002009923894293723, "loss": 0.9072, "step": 118 },
    { "epoch": 0.15494791666666666, "grad_norm": 0.3217287063598633, "learning_rate": 0.00020087753637319499, "loss": 0.8859, "step": 119 },
    { "epoch": 0.15625, "grad_norm": 0.4293065667152405, "learning_rate": 0.00020075652240010892, "loss": 0.8551, "step": 120 },
    { "epoch": 0.15755208333333334, "grad_norm": 0.3320338726043701, "learning_rate": 0.00020062935497482606, "loss": 0.9491, "step": 121 },
    { "epoch": 0.15885416666666666, "grad_norm": 0.3622731864452362, "learning_rate": 0.00020049604194163217, "loss": 0.9246, "step": 122 },
    { "epoch": 0.16015625, "grad_norm": 0.33044755458831787, "learning_rate": 0.00020035659152390313, "loss": 0.8979, "step": 123 },
    { "epoch": 0.16145833333333334, "grad_norm": 0.3863113522529602, "learning_rate": 0.00020021101232359757, "loss": 0.8701, "step": 124 },
    { "epoch": 0.16276041666666666, "grad_norm": 0.39442121982574463, "learning_rate": 0.0002000593133207263, "loss": 0.8516, "step": 125 },
    { "epoch": 0.1640625, "grad_norm": 0.3625333309173584, "learning_rate": 0.00019990150387279835, "loss": 0.887, "step": 126 },
    { "epoch": 0.16536458333333334, "grad_norm": 0.3347029983997345, "learning_rate": 0.00019973759371424388, "loss": 0.9712, "step": 127 },
    { "epoch": 0.16666666666666666, "grad_norm": 0.34266117215156555, "learning_rate": 0.0001995675929558135, "loss": 0.9359, "step": 128 },
    { "epoch": 0.16796875, "grad_norm": 0.34118279814720154, "learning_rate": 0.0001993915120839548, "loss": 0.9324, "step": 129 },
    { "epoch": 0.16927083333333334, "grad_norm": 0.3674456477165222, "learning_rate": 0.00019920936196016534, "loss": 0.8567, "step": 130 },
    { "epoch": 0.17057291666666666, "grad_norm": 0.3397194445133209, "learning_rate": 0.0001990211538203228, "loss": 0.9137, "step": 131 },
    { "epoch": 0.171875, "grad_norm": 0.32985278964042664, "learning_rate": 0.00019882689927399174, "loss": 0.9193, "step": 132 },
    { "epoch": 0.17317708333333334, "grad_norm": 0.3857951760292053, "learning_rate": 0.00019862661030370764, "loss": 0.896, "step": 133 },
    { "epoch": 0.17447916666666666, "grad_norm": 0.3355730175971985, "learning_rate": 0.00019842029926423762, "loss": 0.9673, "step": 134 },
    { "epoch": 0.17578125, "grad_norm": 0.3015749156475067, "learning_rate": 0.00019820797888181837, "loss": 0.9244, "step": 135 },
    { "epoch": 0.17708333333333334, "grad_norm": 0.3197927474975586, "learning_rate": 0.00019798966225337126, "loss": 0.9298, "step": 136 },
    { "epoch": 0.17838541666666666, "grad_norm": 0.31796562671661377, "learning_rate": 0.00019776536284569425, "loss": 0.9274, "step": 137 },
    { "epoch": 0.1796875, "grad_norm": 0.37680956721305847, "learning_rate": 0.00019753509449463134, "loss": 0.8858, "step": 138 },
    { "epoch": 0.18098958333333334, "grad_norm": 0.33456477522850037, "learning_rate": 0.00019729887140421912, "loss": 0.8654, "step": 139 },
    { "epoch": 0.18229166666666666, "grad_norm": 0.3346468508243561, "learning_rate": 0.00019705670814581052, "loss": 0.7735, "step": 140 },
    { "epoch": 0.18359375, "grad_norm": 0.33176180720329285, "learning_rate": 0.00019680861965717597, "loss": 0.938, "step": 141 },
    { "epoch": 0.18489583333333334, "grad_norm": 0.36904606223106384, "learning_rate": 0.0001965546212415821, "loss": 0.9044, "step": 142 },
    { "epoch": 0.18619791666666666, "grad_norm": 0.37553444504737854, "learning_rate": 0.00019629472856684755, "loss": 0.8976, "step": 143 },
    { "epoch": 0.1875, "grad_norm": 0.30502942204475403, "learning_rate": 0.00019602895766437678, "loss": 0.8745, "step": 144 },
    { "epoch": 0.18880208333333334, "grad_norm": 0.39606553316116333, "learning_rate": 0.00019575732492817092, "loss": 0.8426, "step": 145 },
    { "epoch": 0.19010416666666666, "grad_norm": 0.3282630741596222, "learning_rate": 0.00019547984711381662, "loss": 0.8231, "step": 146 },
    { "epoch": 0.19140625, "grad_norm": 0.3370037376880646, "learning_rate": 0.0001951965413374525, "loss": 0.7844, "step": 147 },
    { "epoch": 0.19270833333333334, "grad_norm": 0.30234625935554504, "learning_rate": 0.00019490742507471338, "loss": 0.8674, "step": 148 },
    { "epoch": 0.19401041666666666, "grad_norm": 0.34176871180534363, "learning_rate": 0.0001946125161596522, "loss": 0.9853, "step": 149 },
    { "epoch": 0.1953125, "grad_norm": 0.322853684425354, "learning_rate": 0.00019431183278363997, "loss": 0.9021, "step": 150 },
    { "epoch": 0.19661458333333334, "grad_norm": 0.32170554995536804, "learning_rate": 0.00019400539349424367, "loss": 0.8872, "step": 151 },
    { "epoch": 0.19791666666666666, "grad_norm": 0.3576914072036743, "learning_rate": 0.0001936932171940821, "loss": 0.9812, "step": 152 },
    { "epoch": 0.19921875, "grad_norm": 0.3470019996166229, "learning_rate": 0.00019337532313966, "loss": 0.9353, "step": 153 },
    { "epoch": 0.20052083333333334, "grad_norm": 0.2939004600048065, "learning_rate": 0.00019305173094017996, "loss": 0.8564, "step": 154 },
    { "epoch": 0.20182291666666666, "grad_norm": 0.353512167930603, "learning_rate": 0.0001927224605563332, "loss": 0.8534, "step": 155 },
    { "epoch": 0.203125, "grad_norm": 0.330636203289032, "learning_rate": 0.00019238753229906797, "loss": 0.8046, "step": 156 },
    { "epoch": 0.20442708333333334, "grad_norm": 0.2957157492637634, "learning_rate": 0.00019204696682833682, "loss": 0.8823, "step": 157 },
    { "epoch": 0.20572916666666666, "grad_norm": 0.3297777473926544, "learning_rate": 0.00019170078515182216, "loss": 0.8739, "step": 158 },
    { "epoch": 0.20703125, "grad_norm": 0.37313520908355713, "learning_rate": 0.00019134900862364054, "loss": 0.7445, "step": 159 },
    { "epoch": 0.20833333333333334, "grad_norm": 0.3856740891933441, "learning_rate": 0.00019099165894302515, "loss": 0.9027, "step": 160 },
    { "epoch": 0.20963541666666666, "grad_norm": 0.3400990664958954, "learning_rate": 0.00019062875815298763, "loss": 0.8591, "step": 161 },
    { "epoch": 0.2109375, "grad_norm": 0.33788663148880005, "learning_rate": 0.00019026032863895805, "loss": 0.8587, "step": 162 },
    { "epoch": 0.21223958333333334, "grad_norm": 0.31548887491226196, "learning_rate": 0.00018988639312740433, "loss": 0.8769, "step": 163 },
    { "epoch": 0.21354166666666666, "grad_norm": 0.319621205329895, "learning_rate": 0.0001895069746844302, "loss": 0.8355, "step": 164 },
    { "epoch": 0.21484375, "grad_norm": 0.32653823494911194, "learning_rate": 0.00018912209671435252, "loss": 0.9207, "step": 165 },
    { "epoch": 0.21614583333333334, "grad_norm": 0.3283182978630066, "learning_rate": 0.00018873178295825732, "loss": 0.8737, "step": 166 },
    { "epoch": 0.21744791666666666, "grad_norm": 0.32019880414009094, "learning_rate": 0.00018833605749253566, "loss": 0.9572, "step": 167 },
    { "epoch": 0.21875, "grad_norm": 0.34993138909339905, "learning_rate": 0.00018793494472739831, "loss": 0.849, "step": 168 },
    { "epoch": 0.22005208333333334, "grad_norm": 0.33055248856544495, "learning_rate": 0.00018752846940537003, "loss": 0.8668, "step": 169 },
    { "epoch": 0.22135416666666666, "grad_norm": 0.31973016262054443, "learning_rate": 0.0001871166565997633, "loss": 0.8875, "step": 170 },
    { "epoch": 0.22265625, "grad_norm": 0.3294101357460022, "learning_rate": 0.00018669953171313188, "loss": 0.9007, "step": 171 },
    { "epoch": 0.22395833333333334, "grad_norm": 0.30089470744132996, "learning_rate": 0.00018627712047570352, "loss": 0.9276, "step": 172 },
    { "epoch": 0.22526041666666666, "grad_norm": 0.3125169575214386, "learning_rate": 0.0001858494489437931, "loss": 0.8886, "step": 173 },
    { "epoch": 0.2265625, "grad_norm": 0.34174057841300964, "learning_rate": 0.0001854165434981953, "loss": 0.8538, "step": 174 },
    { "epoch": 0.22786458333333334, "grad_norm": 0.31229665875434875, "learning_rate": 0.00018497843084255708, "loss": 0.8942, "step": 175 },
    { "epoch": 0.22916666666666666, "grad_norm": 0.35159239172935486, "learning_rate": 0.00018453513800173072, "loss": 0.8556, "step": 176 },
    { "epoch": 0.23046875, "grad_norm": 0.3651125431060791, "learning_rate": 0.00018408669232010684, "loss": 0.8867, "step": 177 },
    { "epoch": 0.23177083333333334, "grad_norm": 0.34238573908805847, "learning_rate": 0.00018363312145992737, "loss": 0.8788, "step": 178 },
    { "epoch": 0.23307291666666666, "grad_norm": 0.3748724162578583, "learning_rate": 0.0001831744533995795, "loss": 0.8432, "step": 179 },
    { "epoch": 0.234375, "grad_norm": 0.34040236473083496, "learning_rate": 0.00018271071643186968, "loss": 0.8855, "step": 180 },
    { "epoch": 0.23567708333333334, "grad_norm": 0.3309285342693329, "learning_rate": 0.00018224193916227852, "loss": 0.7903, "step": 181 },
    { "epoch": 0.23697916666666666, "grad_norm": 0.3496496379375458, "learning_rate": 0.00018176815050719615, "loss": 0.8447, "step": 182 },
    { "epoch": 0.23828125, "grad_norm": 0.3104117810726166, "learning_rate": 0.00018128937969213852, "loss": 0.9041, "step": 183 },
    { "epoch": 0.23958333333333334, "grad_norm": 0.35145995020866394, "learning_rate": 0.00018080565624994474, "loss": 0.8347, "step": 184 },
    { "epoch": 0.24088541666666666, "grad_norm": 0.33385100960731506, "learning_rate": 0.00018031701001895524, "loss": 0.8578, "step": 185 },
    { "epoch": 0.2421875, "grad_norm": 0.35553544759750366, "learning_rate": 0.0001798234711411713, "loss": 0.8204, "step": 186 },
    { "epoch": 0.24348958333333334, "grad_norm": 0.365278422832489, "learning_rate": 0.00017932507006039567, "loss": 0.8957, "step": 187 },
    { "epoch": 0.24479166666666666, "grad_norm": 0.3690166771411896, "learning_rate": 0.0001788218375203547, "loss": 0.906, "step": 188 },
    { "epoch": 0.24609375, "grad_norm": 0.30934587121009827, "learning_rate": 0.00017831380456280192, "loss": 0.8963, "step": 189 },
    { "epoch": 0.24739583333333334, "grad_norm": 0.3289198875427246, "learning_rate": 0.00017780100252560313, "loss": 0.9016, "step": 190 },
    { "epoch": 0.24869791666666666, "grad_norm": 0.3520403206348419, "learning_rate": 0.00017728346304080357, "loss": 0.9407, "step": 191 },
    { "epoch": 0.25, "grad_norm": 0.3425486981868744, "learning_rate": 0.0001767612180326764, "loss": 0.8207, "step": 192 },
    { "epoch": 0.2513020833333333, "grad_norm": 0.36947768926620483, "learning_rate": 0.00017623429971575384, "loss": 0.9397, "step": 193 },
    { "epoch": 0.2526041666666667, "grad_norm": 0.3345178961753845, "learning_rate": 0.0001757027405928396, "loss": 0.9446, "step": 194 },
    { "epoch": 0.25390625, "grad_norm": 0.35662034153938293, "learning_rate": 0.00017516657345300425, "loss": 0.8349, "step": 195 },
    { "epoch": 0.2552083333333333, "grad_norm": 0.32738953828811646, "learning_rate": 0.00017462583136956258, "loss": 0.8485, "step": 196 },
    { "epoch": 0.2565104166666667, "grad_norm": 0.35002079606056213, "learning_rate": 0.00017408054769803337, "loss": 0.8226, "step": 197 },
    { "epoch": 0.2578125, "grad_norm": 0.3495853543281555, "learning_rate": 0.00017353075607408209, "loss": 0.8536, "step": 198 },
    { "epoch": 0.2591145833333333, "grad_norm": 0.35052958130836487, "learning_rate": 0.00017297649041144575, "loss": 0.8592, "step": 199 },
    { "epoch": 0.2604166666666667, "grad_norm": 0.328745573759079, "learning_rate": 0.0001724177848998413, "loss": 0.8492, "step": 200 },
    { "epoch": 0.26171875, "grad_norm": 0.32306015491485596, "learning_rate": 0.00017185467400285644, "loss": 0.9631, "step": 201 },
    { "epoch": 0.2630208333333333, "grad_norm": 0.33527088165283203, "learning_rate": 0.00017128719245582374, "loss": 0.923, "step": 202 },
    { "epoch": 0.2643229166666667, "grad_norm": 0.31656643748283386, "learning_rate": 0.00017071537526367817, "loss": 0.814, "step": 203 },
    { "epoch": 0.265625, "grad_norm": 0.3177151679992676, "learning_rate": 0.00017013925769879755, "loss": 0.8771, "step": 204 },
    { "epoch": 0.2669270833333333, "grad_norm": 0.3306122124195099, "learning_rate": 0.00016955887529882714, "loss": 0.8597, "step": 205 },
    { "epoch": 0.2682291666666667, "grad_norm": 0.34116023778915405, "learning_rate": 0.0001689742638644871, "loss": 0.8552, "step": 206 },
    { "epoch": 0.26953125, "grad_norm": 0.3636139929294586, "learning_rate": 0.00016838545945736458, "loss": 0.8387, "step": 207 },
    { "epoch": 0.2708333333333333, "grad_norm": 0.3404369652271271, "learning_rate": 0.00016779249839768884, "loss": 0.8186, "step": 208 },
    { "epoch": 0.2721354166666667, "grad_norm": 0.3426530659198761, "learning_rate": 0.00016719541726209117, "loss": 0.9083, "step": 209 },
    { "epoch": 0.2734375, "grad_norm": 0.34500670433044434, "learning_rate": 0.00016659425288134854, "loss": 0.8645, "step": 210 },
    { "epoch": 0.2747395833333333, "grad_norm": 0.36617475748062134, "learning_rate": 0.00016598904233811168, "loss": 0.8138, "step": 211 },
    { "epoch": 0.2760416666666667, "grad_norm": 0.349728524684906, "learning_rate": 0.00016537982296461768, "loss": 0.8747, "step": 212 },
    { "epoch": 0.27734375, "grad_norm": 0.3206585645675659, "learning_rate": 0.00016476663234038717, "loss": 0.8638, "step": 213 },
    { "epoch": 0.2786458333333333, "grad_norm": 0.3095620274543762, "learning_rate": 0.00016414950828990625, "loss": 0.8839, "step": 214 },
    { "epoch": 0.2799479166666667, "grad_norm": 0.3462662100791931, "learning_rate": 0.00016352848888029326, "loss": 0.8475, "step": 215 },
    { "epoch": 0.28125, "grad_norm": 0.30881351232528687, "learning_rate": 0.00016290361241895064, "loss": 0.875, "step": 216 },
    { "epoch": 0.2825520833333333, "grad_norm": 0.3559829294681549, "learning_rate": 0.00016227491745120196, "loss": 0.8667, "step": 217 },
    { "epoch": 0.2838541666666667, "grad_norm": 0.3805384039878845, "learning_rate": 0.0001616424427579143, "loss": 0.8644, "step": 218 },
    { "epoch": 0.28515625, "grad_norm": 0.288903146982193, "learning_rate": 0.0001610062273531059, "loss": 0.9587, "step": 219 },
    { "epoch": 0.2864583333333333, "grad_norm": 0.3124583959579468, "learning_rate": 0.00016036631048153979, "loss": 0.9036, "step": 220 },
    { "epoch": 0.2877604166666667, "grad_norm": 0.3762437403202057, "learning_rate": 0.0001597227316163029, "loss": 0.8723, "step": 221 },
    { "epoch": 0.2890625, "grad_norm": 0.2969296872615814, "learning_rate": 0.00015907553045637116, "loss": 0.9165, "step": 222 },
    { "epoch": 0.2903645833333333, "grad_norm": 0.3156786561012268, "learning_rate": 0.00015842474692416068, "loss": 0.8797, "step": 223 },
    { "epoch": 0.2916666666666667, "grad_norm": 0.3499281406402588, "learning_rate": 0.0001577704211630652, "loss": 0.9325, "step": 224 },
    { "epoch": 0.29296875, "grad_norm": 0.32044610381126404, "learning_rate": 0.00015711259353497981, "loss": 0.9274, "step": 225 },
    { "epoch": 0.2942708333333333, "grad_norm": 0.3384315073490143, "learning_rate": 0.0001564513046178113, "loss": 0.8486, "step": 226 },
    { "epoch": 0.2955729166666667, "grad_norm": 0.35218653082847595, "learning_rate": 0.000155786595202975, "loss": 0.815, "step": 227 },
    { "epoch": 0.296875, "grad_norm": 0.308248907327652, "learning_rate": 0.00015511850629287865, "loss": 0.8801, "step": 228 },
    { "epoch": 0.2981770833333333, "grad_norm": 0.3766931891441345, "learning_rate": 0.00015444707909839325, "loss": 0.7229, "step": 229 },
    { "epoch": 0.2994791666666667, "grad_norm": 0.3386388421058655, "learning_rate": 0.00015377235503631083, "loss": 0.8249, "step": 230 },
    { "epoch": 0.30078125, "grad_norm": 0.3271256983280182, "learning_rate": 0.0001530943757267898, "loss": 0.8026, "step": 231 },
    { "epoch": 0.3020833333333333, "grad_norm": 0.3631853461265564, "learning_rate": 0.00015241318299078751, "loss": 0.8691, "step": 232 },
    { "epoch": 0.3033854166666667, "grad_norm": 0.32474151253700256, "learning_rate": 0.00015172881884748063, "loss": 0.8536, "step": 233 },
    { "epoch": 0.3046875, "grad_norm": 0.3289790153503418, "learning_rate": 0.00015104132551167318, "loss": 0.9035, "step": 234 },
    { "epoch": 0.3059895833333333, "grad_norm": 0.32092922925949097, "learning_rate": 0.00015035074539119248, "loss": 0.8217, "step": 235 },
    { "epoch": 0.3072916666666667, "grad_norm": 0.350707471370697, "learning_rate": 0.00014965712108427323, "loss": 0.8041, "step": 236 },
    { "epoch": 0.30859375, "grad_norm": 0.33856043219566345, "learning_rate": 0.00014896049537693005, "loss": 0.8546, "step": 237 },
    { "epoch": 0.3098958333333333, "grad_norm": 0.3133806884288788, "learning_rate": 0.00014826091124031792, "loss": 0.9246, "step": 238 },
    { "epoch": 0.3111979166666667, "grad_norm": 0.37403765320777893, "learning_rate": 0.0001475584118280817, "loss": 0.8921, "step": 239 },
    { "epoch": 0.3125, "grad_norm": 0.3411153256893158, "learning_rate": 0.00014685304047369423, "loss": 0.8605, "step": 240 },
    { "epoch": 0.3138020833333333, "grad_norm": 0.3260857164859772, "learning_rate": 0.00014614484068778324, "loss": 0.8922, "step": 241 },
    { "epoch": 0.3151041666666667, "grad_norm": 0.32127058506011963, "learning_rate": 0.00014543385615544744, "loss": 0.8851, "step": 242 },
    { "epoch": 0.31640625, "grad_norm": 0.30613449215888977, "learning_rate": 0.00014472013073356184, "loss": 0.8732, "step": 243 },
    { "epoch": 0.3177083333333333, "grad_norm": 0.3390278220176697, "learning_rate": 0.00014400370844807234, "loss": 0.8251, "step": 244 },
    { "epoch": 0.3190104166666667, "grad_norm": 0.34041738510131836, "learning_rate": 0.00014328463349128025, "loss": 0.8146, "step": 245 },
    { "epoch": 0.3203125, "grad_norm": 0.3534432053565979, "learning_rate": 0.000142562950219116, "loss": 0.8614, "step": 246 },
    { "epoch": 0.3216145833333333, "grad_norm": 0.3607141673564911, "learning_rate": 0.00014183870314840325, "loss": 0.8102, "step": 247 },
    { "epoch": 0.3229166666666667, "grad_norm": 0.3430651128292084, "learning_rate": 0.00014111193695411285, "loss": 0.8703, "step": 248 },
    { "epoch": 0.32421875, "grad_norm": 0.35520392656326294, "learning_rate": 0.00014038269646660703, "loss": 0.8424, "step": 249 },
    { "epoch": 0.3255208333333333, "grad_norm": 0.35280680656433105, "learning_rate": 0.00013965102666887408, "loss": 0.8588, "step": 250 },
    { "epoch": 0.3268229166666667, "grad_norm": 0.29341921210289, "learning_rate": 0.0001389169726937536, "loss": 0.8856, "step": 251 },
    { "epoch": 0.328125, "grad_norm": 0.344706267118454, "learning_rate": 0.0001381805798211525, "loss": 0.8694, "step": 252 },
    { "epoch": 0.3294270833333333, "grad_norm": 0.3089728355407715, "learning_rate": 0.00013744189347525182, "loss": 0.8805, "step": 253 },
    { "epoch": 0.3307291666666667, "grad_norm": 0.2982807159423828, "learning_rate": 0.00013670095922170498, "loss": 0.9559, "step": 254 },
    { "epoch": 0.33203125, "grad_norm": 0.35118257999420166, "learning_rate": 0.00013595782276482678, "loss": 0.8535, "step": 255 },
    { "epoch": 0.3333333333333333, "grad_norm": 0.31711286306381226, "learning_rate": 0.00013521252994477446, "loss": 0.9169, "step": 256 },
    { "epoch": 0.3346354166666667, "grad_norm": 0.28975602984428406, "learning_rate": 0.00013446512673471965, "loss": 0.9879, "step": 257 },
    { "epoch": 0.3359375, "grad_norm": 0.3587123453617096, "learning_rate": 0.0001337156592380131, "loss": 0.8419, "step": 258 },
    { "epoch": 0.3372395833333333, "grad_norm": 0.32705414295196533, "learning_rate": 0.0001329641736853402, "loss": 0.8646, "step": 259 },
    { "epoch": 0.3385416666666667, "grad_norm": 0.301553338766098, "learning_rate": 0.0001322107164318697, "loss": 0.8199, "step": 260 },
    { "epoch": 0.33984375, "grad_norm": 0.31590837240219116, "learning_rate": 0.00013145533395439405, "loss": 0.8675, "step": 261 },
    { "epoch": 0.3411458333333333, "grad_norm": 0.28727132081985474, "learning_rate": 0.0001306980728484627, "loss": 0.8971, "step": 262 },
    { "epoch": 0.3424479166666667, "grad_norm": 0.38094404339790344, "learning_rate": 0.00012993897982550764, "loss": 0.8053, "step": 263 },
    { "epoch": 0.34375, "grad_norm": 0.3682737648487091, "learning_rate": 0.00012917810170996218, "loss": 0.8066, "step": 264 },
    { "epoch": 0.3450520833333333, "grad_norm": 0.34040960669517517, "learning_rate": 0.0001284154854363725, "loss": 1.0175, "step": 265 },
    { "epoch": 0.3463541666666667, "grad_norm": 0.3567488193511963, "learning_rate": 0.00012765117804650267, "loss": 0.8878, "step": 266 },
    { "epoch": 0.34765625, "grad_norm": 0.3080446720123291, "learning_rate": 0.00012688522668643268, "loss": 0.7767, "step": 267 },
    { "epoch": 0.3489583333333333, "grad_norm": 0.29930922389030457, "learning_rate": 0.00012611767860365038, "loss": 0.8593, "step": 268 },
    { "epoch": 0.3502604166666667, "grad_norm": 0.3136909008026123, "learning_rate": 0.00012534858114413692, "loss": 0.8393, "step": 269 },
    { "epoch": 0.3515625, "grad_norm": 0.3064602017402649, "learning_rate": 0.00012457798174944645, "loss": 0.8666, "step": 270 },
    { "epoch": 0.3528645833333333, "grad_norm": 0.3652801215648651, "learning_rate": 0.0001238059279537795, "loss": 0.8608, "step": 271 },
    { "epoch": 0.3541666666666667, "grad_norm": 0.3020021319389343, "learning_rate": 0.00012303246738105082, "loss": 0.8949, "step": 272 },
    { "epoch": 0.35546875, "grad_norm": 0.3343137502670288, "learning_rate": 0.00012225764774195186, "loss": 0.8392, "step": 273 },
    { "epoch": 0.3567708333333333, "grad_norm": 0.33666467666625977, "learning_rate": 0.00012148151683100776, "loss": 0.8073, "step": 274 },
    { "epoch": 0.3580729166666667, "grad_norm": 0.3414088189601898, "learning_rate": 0.00012070412252362897, "loss": 0.7982, "step": 275 },
    { "epoch": 0.359375, "grad_norm": 0.34131115674972534, "learning_rate": 0.0001199255127731582, "loss": 0.8725, "step": 276 },
    { "epoch": 0.3606770833333333, "grad_norm": 0.3253263235092163, "learning_rate": 0.00011914573560791246, "loss": 0.8502, "step": 277 },
    { "epoch": 0.3619791666666667, "grad_norm": 0.2974787652492523, "learning_rate": 0.00011836483912822035, "loss": 0.8696, "step": 278 },
    { "epoch": 0.36328125, "grad_norm": 0.310968816280365, "learning_rate": 0.00011758287150345516, "loss": 0.8618, "step": 279 },
    { "epoch": 0.3645833333333333, "grad_norm": 0.30811068415641785, "learning_rate": 0.00011679988096906333, "loss": 0.7827, "step": 280 },
    { "epoch": 0.3658854166666667, "grad_norm": 0.31703218817710876, "learning_rate": 0.00011601591582358924, "loss": 0.8557, "step": 281 },
    { "epoch": 0.3671875, "grad_norm": 0.3499086797237396, "learning_rate": 0.00011523102442569585, "loss": 0.8819, "step": 282 },
    { "epoch": 0.3684895833333333, "grad_norm": 0.3098037838935852, "learning_rate": 0.00011444525519118179, "loss": 0.928, "step": 283 },
    { "epoch": 0.3697916666666667, "grad_norm": 0.30361640453338623, "learning_rate": 0.00011365865658999474, "loss": 0.8187, "step": 284 },
    { "epoch": 0.37109375, "grad_norm": 0.3158656060695648, "learning_rate": 0.00011287127714324162, "loss": 0.8389, "step": 285 },
    { "epoch": 0.3723958333333333, "grad_norm": 0.31333765387535095, "learning_rate": 0.00011208316542019556, "loss": 0.874, "step": 286 },
    { "epoch": 0.3736979166666667, "grad_norm": 0.3238910734653473, "learning_rate": 0.00011129437003530006, "loss": 0.8417, "step": 287 },
    { "epoch": 0.375, "grad_norm": 0.31688323616981506, "learning_rate": 0.00011050493964516997, "loss": 0.7645, "step": 288 },
    { "epoch": 0.3763020833333333, "grad_norm": 0.32902631163597107, "learning_rate": 0.00010971492294559029, "loss": 0.785, "step": 289 },
    { "epoch": 0.3776041666666667, "grad_norm": 0.3547747731208801, "learning_rate": 0.00010892436866851235, "loss": 0.8035, "step": 290 },
    { "epoch": 0.37890625, "grad_norm": 0.34230130910873413, "learning_rate": 0.00010813332557904784, "loss": 0.8288, "step": 291 },
    { "epoch": 0.3802083333333333, "grad_norm": 0.3104645013809204, "learning_rate": 0.00010734184247246066, "loss": 0.8458, "step": 292 },
    { "epoch": 0.3815104166666667, "grad_norm": 0.3715684711933136, "learning_rate": 0.00010654996817115704, "loss": 0.9265, "step": 293 },
    { "epoch": 0.3828125, "grad_norm": 0.30713942646980286, "learning_rate": 0.00010575775152167391, "loss": 0.904, "step": 294 },
    { "epoch": 0.3841145833333333, "grad_norm": 0.32410570979118347, "learning_rate": 0.00010496524139166594, "loss": 0.8621, "step": 295 },
    { "epoch": 0.3854166666666667, "grad_norm": 0.3331278860569, "learning_rate": 0.00010417248666689095, "loss": 0.8377, "step": 296 },
    { "epoch": 0.38671875, "grad_norm": 0.3093259334564209, "learning_rate": 0.00010337953624819464, "loss": 0.8502, "step": 297 },
    { "epoch": 0.3880208333333333, "grad_norm": 0.3295346796512604, "learning_rate": 0.0001025864390484939, "loss": 0.8671, "step": 298 },
    { "epoch": 0.3893229166666667, "grad_norm": 0.2909308075904846, "learning_rate": 0.00010179324398975984, "loss": 0.8519, "step": 299 },
    { "epoch": 0.390625, "grad_norm": 0.3205868899822235, "learning_rate": 0.000101, "loss": 0.8516, "step": 300 },
    { "epoch": 0.3919270833333333, "grad_norm": 0.3373914659023285, "learning_rate": 0.00010020675601024019, "loss": 0.9442, "step": 301 },
    { "epoch": 0.3932291666666667, "grad_norm": 0.31464096903800964, "learning_rate": 9.941356095150613e-05, "loss": 0.8757, "step": 302 },
    { "epoch": 0.39453125, "grad_norm": 0.32743626832962036, "learning_rate": 9.862046375180539e-05, "loss": 0.811, "step": 303 },
    { "epoch": 0.3958333333333333, "grad_norm": 0.36112740635871887, "learning_rate": 9.782751333310905e-05, "loss": 0.8523, "step": 304 },
    { "epoch": 0.3971354166666667, "grad_norm": 0.3145286738872528, "learning_rate": 9.70347586083341e-05, "loss": 0.9191, "step": 305 },
    { "epoch": 0.3984375, "grad_norm": 0.3485451638698578, "learning_rate": 9.62422484783261e-05, "loss": 0.8646, "step": 306 },
    { "epoch": 0.3997395833333333, "grad_norm": 0.33839061856269836, "learning_rate": 9.5450031828843e-05, "loss": 0.9219, "step": 307 },
    { "epoch": 0.4010416666666667, "grad_norm": 0.29558366537094116, "learning_rate": 9.465815752753935e-05, "loss": 0.8581, "step": 308 },
    { "epoch": 0.40234375, "grad_norm": 0.30999499559402466, "learning_rate": 9.386667442095219e-05, "loss": 0.9484, "step": 309 },
    { "epoch": 0.4036458333333333, "grad_norm": 0.3167370557785034, "learning_rate": 9.307563133148767e-05, "loss": 0.8414, "step": 310 },
    { "epoch": 0.4049479166666667, "grad_norm": 0.34423360228538513, "learning_rate": 9.228507705440976e-05, "loss": 0.8577, "step": 311 },
    { "epoch": 0.40625, "grad_norm": 0.33800390362739563, "learning_rate": 9.149506035483005e-05, "loss": 0.866, "step": 312 },
    { "epoch": 0.4075520833333333, "grad_norm": 0.31321457028388977, "learning_rate": 9.070562996469997e-05, "loss": 0.8437, "step": 313 },
    { "epoch": 0.4088541666666667, "grad_norm": 0.3370513916015625, "learning_rate": 8.991683457980443e-05, "loss": 0.8091, "step": 314 },
    { "epoch": 0.41015625, "grad_norm": 0.31457364559173584, "learning_rate": 8.912872285675841e-05, "loss": 0.7871, "step": 315 },
    { "epoch": 0.4114583333333333, "grad_norm": 0.3068949282169342, "learning_rate": 8.834134341000527e-05, "loss": 0.8504, "step": 316 },
    { "epoch": 0.4127604166666667, "grad_norm": 0.33699652552604675, "learning_rate": 8.755474480881823e-05, "loss": 0.8919, "step": 317 },
    { "epoch": 0.4140625, "grad_norm": 0.2944529056549072, "learning_rate": 8.676897557430415e-05, "loss": 0.8644, "step": 318 },
    { "epoch": 0.4153645833333333, "grad_norm": 0.3244035840034485, "learning_rate": 8.598408417641078e-05, "loss": 0.901, "step": 319 },
    { "epoch": 0.4166666666666667, "grad_norm": 0.29464152455329895, "learning_rate": 8.520011903093666e-05, "loss": 0.8057, "step": 320 },
    { "epoch": 0.41796875, "grad_norm": 0.347151517868042, "learning_rate": 8.441712849654485e-05, "loss": 0.8939, "step": 321 },
    { "epoch": 0.4192708333333333, "grad_norm": 0.3080258369445801, "learning_rate": 8.363516087177962e-05, "loss": 0.9819, "step": 322 },
    { "epoch": 0.4205729166666667, "grad_norm": 0.3100193440914154, "learning_rate": 8.285426439208755e-05, "loss": 0.8772, "step": 323 },
    { "epoch": 0.421875, "grad_norm": 0.34320497512817383, "learning_rate": 8.20744872268418e-05, "loss": 0.7966, "step": 324 },
    { "epoch": 0.4231770833333333, "grad_norm": 0.30067065358161926, "learning_rate": 8.129587747637105e-05, "loss": 0.8682, "step": 325 },
    { "epoch": 0.4244791666666667, "grad_norm": 0.34988993406295776, "learning_rate": 8.051848316899227e-05, "loss": 0.8989, "step": 326 },
    { "epoch": 0.42578125, "grad_norm": 0.34072110056877136, "learning_rate": 7.974235225804814e-05, "loss": 0.7937, "step": 327 },
    { "epoch": 0.4270833333333333, "grad_norm": 0.33146417140960693, "learning_rate": 7.896753261894923e-05, "loss": 0.7794, "step": 328 },
    { "epoch": 0.4283854166666667, "grad_norm": 0.31865736842155457, "learning_rate": 7.819407204622054e-05, "loss": 0.8462, "step": 329 },
    { "epoch": 0.4296875, "grad_norm": 0.35836029052734375, "learning_rate": 7.74220182505536e-05, "loss": 0.929, "step": 330 },
    { "epoch": 0.4309895833333333, "grad_norm": 0.3302249014377594, "learning_rate": 7.665141885586312e-05, "loss": 0.8297, "step": 331 },
    { "epoch": 0.4322916666666667, "grad_norm": 0.36180800199508667, "learning_rate": 7.588232139634968e-05, "loss": 0.8518, "step": 332 },
    { "epoch": 0.43359375, "grad_norm": 0.30696389079093933, "learning_rate": 7.511477331356733e-05, "loss": 0.8821, "step": 333 },
    { "epoch": 0.4348958333333333, "grad_norm": 0.3090788424015045, "learning_rate": 7.434882195349736e-05, "loss": 0.8593, "step": 334 },
    { "epoch": 0.4361979166666667, "grad_norm": 0.32818740606307983, "learning_rate": 7.358451456362751e-05, "loss": 0.8474, "step": 335 },
    { "epoch": 0.4375, "grad_norm": 0.33500784635543823, "learning_rate": 7.282189829003785e-05, "loss": 0.7979, "step": 336 },
    { "epoch": 0.4388020833333333, "grad_norm": 0.37808653712272644, "learning_rate": 7.206102017449237e-05, "loss": 0.857, "step": 337 },
    { "epoch": 0.4401041666666667, "grad_norm": 0.31529998779296875, "learning_rate": 7.130192715153731e-05, "loss": 0.8831, "step": 338 },
    { "epoch": 0.44140625, "grad_norm": 0.2927025556564331, "learning_rate": 7.054466604560595e-05, "loss": 0.8116, "step": 339 },
    { "epoch": 0.4427083333333333, "grad_norm": 0.3082600235939026, "learning_rate": 6.978928356813031e-05, "loss": 0.796, "step": 340 },
    { "epoch": 0.4440104166666667, "grad_norm": 0.30604058504104614, "learning_rate": 6.90358263146598e-05, "loss": 0.9001, "step": 341 },
    { "epoch": 0.4453125, "grad_norm": 0.320302814245224, "learning_rate": 6.828434076198693e-05, "loss": 0.8057, "step": 342 },
    { "epoch": 0.4466145833333333, "grad_norm": 0.34440967440605164, "learning_rate": 6.753487326528033e-05, "loss": 0.8786, "step": 343 },
    { "epoch": 0.4479166666666667, "grad_norm": 0.313021719455719, "learning_rate": 6.678747005522557e-05, "loss": 0.8573, "step": 344 },
    { "epoch": 0.44921875, "grad_norm": 0.3133790194988251, "learning_rate": 6.60421772351732e-05, "loss": 0.8614, "step": 345 },
    { "epoch": 0.4505208333333333, "grad_norm": 0.32770583033561707, "learning_rate": 6.529904077829505e-05, "loss": 0.829, "step": 346 },
    { "epoch": 0.4518229166666667, "grad_norm": 0.327179491519928, "learning_rate": 6.455810652474817e-05, "loss": 0.8383, "step": 347 },
    { "epoch": 0.453125, "grad_norm": 0.30281123518943787, "learning_rate": 6.381942017884753e-05, "loss": 0.871, "step": 348 },
    { "epoch": 0.4544270833333333, "grad_norm": 0.3344803750514984, "learning_rate": 6.30830273062464e-05, "loss": 0.7489, "step": 349 },
    { "epoch": 0.4557291666666667, "grad_norm": 0.3110140860080719, "learning_rate": 6.234897333112594e-05, "loss": 0.8757, "step": 350 },
    { "epoch": 0.45703125, "grad_norm": 0.33310666680336, "learning_rate": 6.161730353339302e-05, "loss": 0.9292, "step": 351 },
    { "epoch": 0.4583333333333333, "grad_norm": 0.3537331521511078, "learning_rate": 6.088806304588717e-05, "loss": 0.8505, "step": 352 },
    { "epoch": 0.4596354166666667, "grad_norm": 0.3160480856895447, "learning_rate": 6.0161296851596766e-05, "loss": 0.867, "step": 353 },
    { "epoch": 0.4609375, "grad_norm": 0.32747846841812134, "learning_rate": 5.943704978088402e-05, "loss": 0.8153, "step": 354 },
    { "epoch": 0.4622395833333333, "grad_norm": 0.342339426279068, "learning_rate": 5.871536650871979e-05, "loss": 0.8754, "step": 355 },
    { "epoch": 0.4635416666666667, "grad_norm": 0.31615254282951355, "learning_rate": 5.7996291551927666e-05, "loss": 0.8338, "step": 356 },
    { "epoch": 0.46484375, "grad_norm": 0.31168562173843384, "learning_rate": 5.7279869266438234e-05, "loss": 0.7774, "step": 357 },
    { "epoch": 0.4661458333333333, "grad_norm": 0.31545719504356384, "learning_rate": 5.656614384455257e-05, "loss": 0.8077, "step": 358 },
    { "epoch": 0.4674479166666667, "grad_norm": 0.31938084959983826, "learning_rate": 5.585515931221677e-05, "loss": 0.8627, "step": 359 },
    { "epoch": 0.46875, "grad_norm": 0.30571112036705017, "learning_rate": 5.514695952630578e-05, "loss": 0.8631, "step": 360 },
    { "epoch": 0.4700520833333333, "grad_norm": 0.31153225898742676, "learning_rate": 5.444158817191832e-05, "loss": 0.8503, "step": 361 },
    { "epoch": 0.4713541666666667, "grad_norm": 0.32617253065109253, "learning_rate": 5.373908875968211e-05, "loss": 0.8148, "step": 362 },
    { "epoch": 0.47265625, "grad_norm": 0.3407813608646393, "learning_rate": 5.3039504623069965e-05, "loss": 0.8903, "step": 363 },
    { "epoch": 0.4739583333333333, "grad_norm": 0.32965362071990967, "learning_rate": 5.234287891572674e-05, "loss": 0.8629, "step": 364 },
    { "epoch": 0.4752604166666667, "grad_norm": 0.3094812035560608, "learning_rate": 5.164925460880758e-05, "loss": 0.8478, "step": 365 },
    { "epoch": 0.4765625, "grad_norm": 0.3095204532146454, "learning_rate": 5.095867448832683e-05, "loss": 0.9215, "step": 366 },
    { "epoch": 0.4778645833333333, "grad_norm": 0.3161752223968506, "learning_rate": 5.027118115251938e-05, "loss": 0.8831, "step": 367 },
    { "epoch": 0.4791666666666667, "grad_norm": 0.2942347824573517, "learning_rate": 4.95868170092125e-05, "loss": 0.8385, "step": 368 },
    { "epoch": 0.48046875, "grad_norm": 0.3367157280445099, "learning_rate": 4.890562427321021e-05, "loss": 0.8329, "step": 369 },
    { "epoch": 0.4817708333333333, "grad_norm": 0.3312656283378601, "learning_rate": 4.822764496368917e-05, "loss": 0.8909, "step": 370 },
    { "epoch": 0.4830729166666667, "grad_norm": 0.2954017221927643, "learning_rate": 4.755292090160676e-05, "loss": 0.82, "step": 371 },
    { "epoch": 0.484375, "grad_norm": 0.28649380803108215, "learning_rate": 4.6881493707121315e-05, "loss": 0.8349, "step": 372 },
    { "epoch": 0.4856770833333333, "grad_norm": 0.3034970462322235, "learning_rate": 4.621340479702503e-05, "loss": 0.8533, "step": 373 },
    { "epoch": 0.4869791666666667, "grad_norm": 0.3515808582305908, "learning_rate": 4.554869538218868e-05, "loss": 0.8255, "step": 374 },
    { "epoch": 0.48828125, "grad_norm": 0.3340248763561249, "learning_rate": 4.48874064650202e-05, "loss": 0.7953, "step": 375 },
    { "epoch": 0.4895833333333333, "grad_norm": 0.3169967532157898, "learning_rate": 4.422957883693483e-05, "loss": 0.7667, "step": 376 },
    { "epoch": 0.4908854166666667, "grad_norm": 0.33045902848243713, "learning_rate": 4.357525307583933e-05, "loss": 0.8221, "step": 377 },
    { "epoch": 0.4921875, "grad_norm": 0.27641358971595764, "learning_rate": 4.29244695436289e-05, "loss": 0.8908, "step": 378 },
    { "epoch": 0.4934895833333333, "grad_norm": 0.34106072783470154, "learning_rate": 4.227726838369711e-05, "loss": 0.8107, "step": 379 },
|
{ |
|
"epoch": 0.4947916666666667, |
|
"grad_norm": 0.3046337962150574, |
|
"learning_rate": 4.1633689518460225e-05, |
|
"loss": 0.8278, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.49609375, |
|
"grad_norm": 0.3120030462741852, |
|
"learning_rate": 4.0993772646894116e-05, |
|
"loss": 0.8378, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.4973958333333333, |
|
"grad_norm": 0.3395197093486786, |
|
"learning_rate": 4.035755724208573e-05, |
|
"loss": 0.8597, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.4986979166666667, |
|
"grad_norm": 0.33718255162239075, |
|
"learning_rate": 3.972508254879805e-05, |
|
"loss": 0.7818, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3265829086303711, |
|
"learning_rate": 3.90963875810494e-05, |
|
"loss": 0.8203, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.5013020833333334, |
|
"grad_norm": 0.3368302583694458, |
|
"learning_rate": 3.847151111970676e-05, |
|
"loss": 0.8319, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.5026041666666666, |
|
"grad_norm": 0.39881670475006104, |
|
"learning_rate": 3.785049171009381e-05, |
|
"loss": 0.7538, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.50390625, |
|
"grad_norm": 0.31735971570014954, |
|
"learning_rate": 3.723336765961285e-05, |
|
"loss": 0.8627, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.5052083333333334, |
|
"grad_norm": 0.2991076707839966, |
|
"learning_rate": 3.662017703538234e-05, |
|
"loss": 0.8189, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.5065104166666666, |
|
"grad_norm": 0.3316771686077118, |
|
"learning_rate": 3.601095766188833e-05, |
|
"loss": 0.8321, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.5078125, |
|
"grad_norm": 0.32285547256469727, |
|
"learning_rate": 3.540574711865146e-05, |
|
"loss": 0.8444, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5091145833333334, |
|
"grad_norm": 0.30543044209480286, |
|
"learning_rate": 3.4804582737908825e-05, |
|
"loss": 0.8559, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.5104166666666666, |
|
"grad_norm": 0.36356985569000244, |
|
"learning_rate": 3.420750160231118e-05, |
|
"loss": 0.7583, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.51171875, |
|
"grad_norm": 0.3065100908279419, |
|
"learning_rate": 3.361454054263541e-05, |
|
"loss": 0.8257, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.5130208333333334, |
|
"grad_norm": 0.3157348334789276, |
|
"learning_rate": 3.302573613551292e-05, |
|
"loss": 0.8502, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.5143229166666666, |
|
"grad_norm": 0.337979257106781, |
|
"learning_rate": 3.244112470117288e-05, |
|
"loss": 0.8172, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.515625, |
|
"grad_norm": 0.34926459193229675, |
|
"learning_rate": 3.186074230120244e-05, |
|
"loss": 0.8366, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.5169270833333334, |
|
"grad_norm": 0.2944015860557556, |
|
"learning_rate": 3.1284624736321846e-05, |
|
"loss": 0.8571, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.5182291666666666, |
|
"grad_norm": 0.27319666743278503, |
|
"learning_rate": 3.071280754417626e-05, |
|
"loss": 0.8694, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.51953125, |
|
"grad_norm": 0.3642820417881012, |
|
"learning_rate": 3.0145325997143577e-05, |
|
"loss": 0.8504, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.5208333333333334, |
|
"grad_norm": 0.33083510398864746, |
|
"learning_rate": 2.9582215100158706e-05, |
|
"loss": 0.8655, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5221354166666666, |
|
"grad_norm": 0.32897526025772095, |
|
"learning_rate": 2.902350958855426e-05, |
|
"loss": 0.9169, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.5234375, |
|
"grad_norm": 0.3111579418182373, |
|
"learning_rate": 2.846924392591794e-05, |
|
"loss": 0.869, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.5247395833333334, |
|
"grad_norm": 0.31903326511383057, |
|
"learning_rate": 2.791945230196663e-05, |
|
"loss": 0.7606, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.5260416666666666, |
|
"grad_norm": 0.32733333110809326, |
|
"learning_rate": 2.7374168630437456e-05, |
|
"loss": 0.815, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.52734375, |
|
"grad_norm": 0.3421482443809509, |
|
"learning_rate": 2.6833426546995782e-05, |
|
"loss": 0.7627, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.5286458333333334, |
|
"grad_norm": 0.3265641927719116, |
|
"learning_rate": 2.629725940716041e-05, |
|
"loss": 0.8013, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.5299479166666666, |
|
"grad_norm": 0.3222253918647766, |
|
"learning_rate": 2.57657002842462e-05, |
|
"loss": 0.83, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 0.36147239804267883, |
|
"learning_rate": 2.523878196732358e-05, |
|
"loss": 0.8342, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.5325520833333334, |
|
"grad_norm": 0.30930349230766296, |
|
"learning_rate": 2.4716536959196462e-05, |
|
"loss": 0.9737, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.5338541666666666, |
|
"grad_norm": 0.33043771982192993, |
|
"learning_rate": 2.4198997474396877e-05, |
|
"loss": 0.7793, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.53515625, |
|
"grad_norm": 0.3458073139190674, |
|
"learning_rate": 2.3686195437198112e-05, |
|
"loss": 0.8822, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.5364583333333334, |
|
"grad_norm": 0.31358662247657776, |
|
"learning_rate": 2.31781624796453e-05, |
|
"loss": 0.8938, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.5377604166666666, |
|
"grad_norm": 0.32773175835609436, |
|
"learning_rate": 2.2674929939604332e-05, |
|
"loss": 0.8978, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.5390625, |
|
"grad_norm": 0.3206295073032379, |
|
"learning_rate": 2.217652885882869e-05, |
|
"loss": 0.8898, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.5403645833333334, |
|
"grad_norm": 0.31542304158210754, |
|
"learning_rate": 2.1682989981044783e-05, |
|
"loss": 0.8635, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.5416666666666666, |
|
"grad_norm": 0.3245406150817871, |
|
"learning_rate": 2.119434375005527e-05, |
|
"loss": 0.8411, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.54296875, |
|
"grad_norm": 0.3141809105873108, |
|
"learning_rate": 2.071062030786149e-05, |
|
"loss": 0.7775, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.5442708333333334, |
|
"grad_norm": 0.2986677885055542, |
|
"learning_rate": 2.0231849492803852e-05, |
|
"loss": 0.8618, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.5455729166666666, |
|
"grad_norm": 0.3049768805503845, |
|
"learning_rate": 1.9758060837721467e-05, |
|
"loss": 0.8662, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.546875, |
|
"grad_norm": 0.2944938838481903, |
|
"learning_rate": 1.928928356813032e-05, |
|
"loss": 0.8116, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5481770833333334, |
|
"grad_norm": 0.31740906834602356, |
|
"learning_rate": 1.882554660042052e-05, |
|
"loss": 0.8354, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.5494791666666666, |
|
"grad_norm": 0.3051731288433075, |
|
"learning_rate": 1.8366878540072614e-05, |
|
"loss": 0.8606, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.55078125, |
|
"grad_norm": 0.3050106465816498, |
|
"learning_rate": 1.7913307679893173e-05, |
|
"loss": 0.8115, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.5520833333333334, |
|
"grad_norm": 0.2921428084373474, |
|
"learning_rate": 1.7464861998269243e-05, |
|
"loss": 0.8507, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.5533854166666666, |
|
"grad_norm": 0.3205814063549042, |
|
"learning_rate": 1.702156915744292e-05, |
|
"loss": 0.8631, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.5546875, |
|
"grad_norm": 0.33637183904647827, |
|
"learning_rate": 1.6583456501804725e-05, |
|
"loss": 0.8691, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.5559895833333334, |
|
"grad_norm": 0.31378889083862305, |
|
"learning_rate": 1.6150551056206867e-05, |
|
"loss": 0.8181, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.5572916666666666, |
|
"grad_norm": 0.33184900879859924, |
|
"learning_rate": 1.57228795242965e-05, |
|
"loss": 0.7768, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.55859375, |
|
"grad_norm": 0.3419734537601471, |
|
"learning_rate": 1.5300468286868137e-05, |
|
"loss": 0.8905, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.5598958333333334, |
|
"grad_norm": 0.33187946677207947, |
|
"learning_rate": 1.488334340023669e-05, |
|
"loss": 0.8324, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5611979166666666, |
|
"grad_norm": 0.30014726519584656, |
|
"learning_rate": 1.4471530594629996e-05, |
|
"loss": 0.8708, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.27040472626686096, |
|
"learning_rate": 1.4065055272601703e-05, |
|
"loss": 0.8852, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.5638020833333334, |
|
"grad_norm": 0.31325167417526245, |
|
"learning_rate": 1.3663942507464348e-05, |
|
"loss": 0.7781, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.5651041666666666, |
|
"grad_norm": 0.3078592121601105, |
|
"learning_rate": 1.3268217041742701e-05, |
|
"loss": 0.84, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.56640625, |
|
"grad_norm": 0.32804566621780396, |
|
"learning_rate": 1.2877903285647486e-05, |
|
"loss": 0.8676, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.5677083333333334, |
|
"grad_norm": 0.3207748234272003, |
|
"learning_rate": 1.2493025315569801e-05, |
|
"loss": 0.858, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.5690104166666666, |
|
"grad_norm": 0.3700745105743408, |
|
"learning_rate": 1.2113606872595673e-05, |
|
"loss": 0.9087, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.5703125, |
|
"grad_norm": 0.33619681000709534, |
|
"learning_rate": 1.173967136104196e-05, |
|
"loss": 0.8805, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.5716145833333334, |
|
"grad_norm": 0.3331134617328644, |
|
"learning_rate": 1.1371241847012401e-05, |
|
"loss": 0.8811, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.5729166666666666, |
|
"grad_norm": 0.3446657061576843, |
|
"learning_rate": 1.1008341056974854e-05, |
|
"loss": 0.8863, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.57421875, |
|
"grad_norm": 0.298917680978775, |
|
"learning_rate": 1.0650991376359473e-05, |
|
"loss": 0.7943, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.5755208333333334, |
|
"grad_norm": 0.36596038937568665, |
|
"learning_rate": 1.029921484817783e-05, |
|
"loss": 0.7989, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.5768229166666666, |
|
"grad_norm": 0.3092229664325714, |
|
"learning_rate": 9.953033171663175e-06, |
|
"loss": 0.9328, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.578125, |
|
"grad_norm": 0.2970651090145111, |
|
"learning_rate": 9.612467700932045e-06, |
|
"loss": 0.7675, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.5794270833333334, |
|
"grad_norm": 0.3474785089492798, |
|
"learning_rate": 9.277539443666783e-06, |
|
"loss": 0.846, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.5807291666666666, |
|
"grad_norm": 0.28902187943458557, |
|
"learning_rate": 8.948269059820025e-06, |
|
"loss": 0.8712, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.58203125, |
|
"grad_norm": 0.3022470772266388, |
|
"learning_rate": 8.624676860340025e-06, |
|
"loss": 0.8763, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.5833333333333334, |
|
"grad_norm": 0.2670323848724365, |
|
"learning_rate": 8.306782805917904e-06, |
|
"loss": 0.886, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.5846354166666666, |
|
"grad_norm": 0.2715941369533539, |
|
"learning_rate": 7.994606505756355e-06, |
|
"loss": 0.8477, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.5859375, |
|
"grad_norm": 0.32153239846229553, |
|
"learning_rate": 7.68816721636004e-06, |
|
"loss": 0.8436, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5872395833333334, |
|
"grad_norm": 0.33201107382774353, |
|
"learning_rate": 7.3874838403478e-06, |
|
"loss": 0.887, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.5885416666666666, |
|
"grad_norm": 0.3307580053806305, |
|
"learning_rate": 7.092574925286614e-06, |
|
"loss": 0.8239, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.58984375, |
|
"grad_norm": 0.31865382194519043, |
|
"learning_rate": 6.803458662547507e-06, |
|
"loss": 0.8073, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.5911458333333334, |
|
"grad_norm": 0.30783191323280334, |
|
"learning_rate": 6.520152886183406e-06, |
|
"loss": 0.8604, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.5924479166666666, |
|
"grad_norm": 0.32232406735420227, |
|
"learning_rate": 6.242675071829111e-06, |
|
"loss": 0.8612, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 0.35587257146835327, |
|
"learning_rate": 5.971042335623229e-06, |
|
"loss": 0.8706, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.5950520833333334, |
|
"grad_norm": 0.2914402484893799, |
|
"learning_rate": 5.705271433152458e-06, |
|
"loss": 0.8478, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.5963541666666666, |
|
"grad_norm": 0.36631619930267334, |
|
"learning_rate": 5.445378758417925e-06, |
|
"loss": 0.7445, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.59765625, |
|
"grad_norm": 0.31569549441337585, |
|
"learning_rate": 5.191380342824035e-06, |
|
"loss": 0.8365, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.5989583333333334, |
|
"grad_norm": 0.31928232312202454, |
|
"learning_rate": 4.943291854189493e-06, |
|
"loss": 0.8951, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.6002604166666666, |
|
"grad_norm": 0.31012532114982605, |
|
"learning_rate": 4.701128595780878e-06, |
|
"loss": 0.9263, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.6015625, |
|
"grad_norm": 0.30832427740097046, |
|
"learning_rate": 4.464905505368658e-06, |
|
"loss": 0.8536, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.6028645833333334, |
|
"grad_norm": 0.32320111989974976, |
|
"learning_rate": 4.23463715430577e-06, |
|
"loss": 0.8638, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.6041666666666666, |
|
"grad_norm": 0.3501695990562439, |
|
"learning_rate": 4.010337746628751e-06, |
|
"loss": 0.7462, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.60546875, |
|
"grad_norm": 0.3446730673313141, |
|
"learning_rate": 3.792021118181636e-06, |
|
"loss": 0.8484, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.6067708333333334, |
|
"grad_norm": 0.3519136309623718, |
|
"learning_rate": 3.5797007357623945e-06, |
|
"loss": 0.8201, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.6080729166666666, |
|
"grad_norm": 0.30372154712677, |
|
"learning_rate": 3.3733896962923658e-06, |
|
"loss": 0.8871, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.609375, |
|
"grad_norm": 0.3335200250148773, |
|
"learning_rate": 3.1731007260082616e-06, |
|
"loss": 0.8273, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.6106770833333334, |
|
"grad_norm": 0.3028860092163086, |
|
"learning_rate": 2.9788461796772114e-06, |
|
"loss": 0.8267, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.6119791666666666, |
|
"grad_norm": 0.33701807260513306, |
|
"learning_rate": 2.790638039834668e-06, |
|
"loss": 0.8493, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.61328125, |
|
"grad_norm": 0.3008849620819092, |
|
"learning_rate": 2.6084879160452166e-06, |
|
"loss": 0.7696, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.6145833333333334, |
|
"grad_norm": 0.37394464015960693, |
|
"learning_rate": 2.432407044186509e-06, |
|
"loss": 0.939, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.6158854166666666, |
|
"grad_norm": 0.3297758102416992, |
|
"learning_rate": 2.26240628575615e-06, |
|
"loss": 0.8087, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.6171875, |
|
"grad_norm": 0.28165456652641296, |
|
"learning_rate": 2.098496127201648e-06, |
|
"loss": 0.8894, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.6184895833333334, |
|
"grad_norm": 0.315613716840744, |
|
"learning_rate": 1.9406866792737267e-06, |
|
"loss": 0.8807, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.6197916666666666, |
|
"grad_norm": 0.3206324279308319, |
|
"learning_rate": 1.7889876764024505e-06, |
|
"loss": 0.8192, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.62109375, |
|
"grad_norm": 0.3201400339603424, |
|
"learning_rate": 1.6434084760968697e-06, |
|
"loss": 0.8754, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.6223958333333334, |
|
"grad_norm": 0.3509829640388489, |
|
"learning_rate": 1.5039580583678393e-06, |
|
"loss": 0.86, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.6236979166666666, |
|
"grad_norm": 0.2788860499858856, |
|
"learning_rate": 1.3706450251739613e-06, |
|
"loss": 0.8008, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.3174276649951935, |
|
"learning_rate": 1.2434775998910964e-06, |
|
"loss": 0.8038, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6263020833333334, |
|
"grad_norm": 0.3377770483493805, |
|
"learning_rate": 1.1224636268050439e-06, |
|
"loss": 0.8609, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.6276041666666666, |
|
"grad_norm": 0.29502376914024353, |
|
"learning_rate": 1.0076105706276888e-06, |
|
"loss": 0.8764, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.62890625, |
|
"grad_norm": 0.30970874428749084, |
|
"learning_rate": 8.989255160365527e-07, |
|
"loss": 0.8442, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.6302083333333334, |
|
"grad_norm": 0.3286643624305725, |
|
"learning_rate": 7.964151672377458e-07, |
|
"loss": 0.7697, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.6315104166666666, |
|
"grad_norm": 0.32877275347709656, |
|
"learning_rate": 7.000858475524444e-07, |
|
"loss": 0.8386, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.6328125, |
|
"grad_norm": 0.31582364439964294, |
|
"learning_rate": 6.099434990268609e-07, |
|
"loss": 0.799, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.6341145833333334, |
|
"grad_norm": 0.32822880148887634, |
|
"learning_rate": 5.259936820656257e-07, |
|
"loss": 0.7853, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.6354166666666666, |
|
"grad_norm": 0.31782621145248413, |
|
"learning_rate": 4.482415750889204e-07, |
|
"loss": 0.8704, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.63671875, |
|
"grad_norm": 0.30536094307899475, |
|
"learning_rate": 3.766919742129331e-07, |
|
"loss": 0.8136, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.6380208333333334, |
|
"grad_norm": 0.31908681988716125, |
|
"learning_rate": 3.1134929295407564e-07, |
|
"loss": 0.8366, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.6393229166666666, |
|
"grad_norm": 0.32564041018486023, |
|
"learning_rate": 2.5221756195672563e-07, |
|
"loss": 0.7645, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.640625, |
|
"grad_norm": 0.30761629343032837, |
|
"learning_rate": 1.9930042874457254e-07, |
|
"loss": 0.8334, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.6419270833333334, |
|
"grad_norm": 0.31079787015914917, |
|
"learning_rate": 1.5260115749566882e-07, |
|
"loss": 0.8626, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.6432291666666666, |
|
"grad_norm": 0.34176650643348694, |
|
"learning_rate": 1.1212262884103974e-07, |
|
"loss": 0.69, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.64453125, |
|
"grad_norm": 0.31278514862060547, |
|
"learning_rate": 7.7867339686987e-08, |
|
"loss": 0.8859, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.6458333333333334, |
|
"grad_norm": 0.30534476041793823, |
|
"learning_rate": 4.98374030611084e-08, |
|
"loss": 0.8918, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.6471354166666666, |
|
"grad_norm": 0.3080224096775055, |
|
"learning_rate": 2.8034547981943713e-08, |
|
"loss": 0.8508, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.6484375, |
|
"grad_norm": 0.3025684654712677, |
|
"learning_rate": 1.246011935228064e-08, |
|
"loss": 0.8545, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.6497395833333334, |
|
"grad_norm": 0.3287234604358673, |
|
"learning_rate": 3.115077876243988e-09, |
|
"loss": 0.8297, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.6510416666666666, |
|
"grad_norm": 0.34251466393470764, |
|
"learning_rate": 0.0, |
|
"loss": 0.8901, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.827040194259845e+17, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |