|
{ |
|
"best_metric": 0.6766157746315002, |
|
"best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-7b_anyres/checkpoint-250", |
|
"epoch": 10.0, |
|
"eval_steps": 1.0, |
|
"global_step": 320, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 1.0039058937636163, |
|
"learning_rate": 0.0, |
|
"loss": 1.3969, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"eval_loss": 1.4111441373825073, |
|
"eval_runtime": 50.4639, |
|
"eval_samples_per_second": 3.963, |
|
"eval_steps_per_second": 0.495, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 0.8420754522690636, |
|
"learning_rate": 2e-05, |
|
"loss": 1.3382, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"eval_loss": 1.4111441373825073, |
|
"eval_runtime": 43.3333, |
|
"eval_samples_per_second": 4.615, |
|
"eval_steps_per_second": 0.577, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 0.8367925175081548, |
|
"learning_rate": 2e-05, |
|
"loss": 1.3867, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"eval_loss": 1.3688743114471436, |
|
"eval_runtime": 43.5247, |
|
"eval_samples_per_second": 4.595, |
|
"eval_steps_per_second": 0.574, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.7061648883003396, |
|
"learning_rate": 2e-05, |
|
"loss": 1.3331, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"eval_loss": 1.3259124755859375, |
|
"eval_runtime": 43.4317, |
|
"eval_samples_per_second": 4.605, |
|
"eval_steps_per_second": 0.576, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 0.8059747640123492, |
|
"learning_rate": 2e-05, |
|
"loss": 1.3031, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"eval_loss": 1.2872124910354614, |
|
"eval_runtime": 43.4379, |
|
"eval_samples_per_second": 4.604, |
|
"eval_steps_per_second": 0.576, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.7045153329302901, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2771, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"eval_loss": 1.2505193948745728, |
|
"eval_runtime": 43.5902, |
|
"eval_samples_per_second": 4.588, |
|
"eval_steps_per_second": 0.574, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 0.6329971562106237, |
|
"learning_rate": 2e-05, |
|
"loss": 1.249, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"eval_loss": 1.2199320793151855, |
|
"eval_runtime": 43.4066, |
|
"eval_samples_per_second": 4.608, |
|
"eval_steps_per_second": 0.576, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5550979385222247, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2257, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.1977466344833374, |
|
"eval_runtime": 43.5387, |
|
"eval_samples_per_second": 4.594, |
|
"eval_steps_per_second": 0.574, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 0.4406797963422461, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2462, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"eval_loss": 1.179214358329773, |
|
"eval_runtime": 43.4861, |
|
"eval_samples_per_second": 4.599, |
|
"eval_steps_per_second": 0.575, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.42022162096647486, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1858, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"eval_loss": 1.1616674661636353, |
|
"eval_runtime": 43.8611, |
|
"eval_samples_per_second": 4.56, |
|
"eval_steps_per_second": 0.57, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 0.39691998835013426, |
|
"learning_rate": 2e-05, |
|
"loss": 1.235, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"eval_loss": 1.1443771123886108, |
|
"eval_runtime": 43.5109, |
|
"eval_samples_per_second": 4.597, |
|
"eval_steps_per_second": 0.575, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.4500748148291364, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1953, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"eval_loss": 1.1261780261993408, |
|
"eval_runtime": 44.8553, |
|
"eval_samples_per_second": 4.459, |
|
"eval_steps_per_second": 0.557, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 0.4777471950803986, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2094, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"eval_loss": 1.1074599027633667, |
|
"eval_runtime": 43.6762, |
|
"eval_samples_per_second": 4.579, |
|
"eval_steps_per_second": 0.572, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 0.45433160021015917, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0426, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"eval_loss": 1.089483380317688, |
|
"eval_runtime": 43.9528, |
|
"eval_samples_per_second": 4.55, |
|
"eval_steps_per_second": 0.569, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.39854476457233645, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1595, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"eval_loss": 1.0731947422027588, |
|
"eval_runtime": 43.3809, |
|
"eval_samples_per_second": 4.61, |
|
"eval_steps_per_second": 0.576, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.41898459581564557, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0923, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.0587964057922363, |
|
"eval_runtime": 46.3861, |
|
"eval_samples_per_second": 4.312, |
|
"eval_steps_per_second": 0.539, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 0.3748700393546972, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0973, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"eval_loss": 1.0456310510635376, |
|
"eval_runtime": 44.8571, |
|
"eval_samples_per_second": 4.459, |
|
"eval_steps_per_second": 0.557, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.5226526211782249, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0901, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"eval_loss": 1.0317203998565674, |
|
"eval_runtime": 44.6579, |
|
"eval_samples_per_second": 4.478, |
|
"eval_steps_per_second": 0.56, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 0.3769885031745698, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0033, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"eval_loss": 1.0182812213897705, |
|
"eval_runtime": 44.6735, |
|
"eval_samples_per_second": 4.477, |
|
"eval_steps_per_second": 0.56, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.34752776954348064, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1256, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"eval_loss": 1.0062216520309448, |
|
"eval_runtime": 44.4317, |
|
"eval_samples_per_second": 4.501, |
|
"eval_steps_per_second": 0.563, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 0.275958956017114, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0333, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"eval_loss": 0.9957399964332581, |
|
"eval_runtime": 46.4719, |
|
"eval_samples_per_second": 4.304, |
|
"eval_steps_per_second": 0.538, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.31928085878737833, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0847, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"eval_loss": 0.9862645864486694, |
|
"eval_runtime": 46.7925, |
|
"eval_samples_per_second": 4.274, |
|
"eval_steps_per_second": 0.534, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 0.26966401299568643, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0678, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"eval_loss": 0.9774981141090393, |
|
"eval_runtime": 46.2095, |
|
"eval_samples_per_second": 4.328, |
|
"eval_steps_per_second": 0.541, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.24088872786986867, |
|
"learning_rate": 2e-05, |
|
"loss": 1.064, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.9695597887039185, |
|
"eval_runtime": 47.1059, |
|
"eval_samples_per_second": 4.246, |
|
"eval_steps_per_second": 0.531, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.27631902106476014, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0141, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"eval_loss": 0.9618983268737793, |
|
"eval_runtime": 46.1528, |
|
"eval_samples_per_second": 4.333, |
|
"eval_steps_per_second": 0.542, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.24434161495988888, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0376, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"eval_loss": 0.9548751711845398, |
|
"eval_runtime": 45.7844, |
|
"eval_samples_per_second": 4.368, |
|
"eval_steps_per_second": 0.546, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 0.25256672152337845, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9632, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"eval_loss": 0.9482427835464478, |
|
"eval_runtime": 47.8001, |
|
"eval_samples_per_second": 4.184, |
|
"eval_steps_per_second": 0.523, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.26872334126279845, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9819, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"eval_loss": 0.9416670203208923, |
|
"eval_runtime": 47.157, |
|
"eval_samples_per_second": 4.241, |
|
"eval_steps_per_second": 0.53, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 0.21711663558311656, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9953, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"eval_loss": 0.9355730414390564, |
|
"eval_runtime": 45.9328, |
|
"eval_samples_per_second": 4.354, |
|
"eval_steps_per_second": 0.544, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.21636473054277702, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0328, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"eval_loss": 0.9298823475837708, |
|
"eval_runtime": 46.0325, |
|
"eval_samples_per_second": 4.345, |
|
"eval_steps_per_second": 0.543, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 0.2530858798467821, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8713, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"eval_loss": 0.9241495728492737, |
|
"eval_runtime": 46.0309, |
|
"eval_samples_per_second": 4.345, |
|
"eval_steps_per_second": 0.543, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2500917296208238, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9831, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.9184038043022156, |
|
"eval_runtime": 46.1304, |
|
"eval_samples_per_second": 4.336, |
|
"eval_steps_per_second": 0.542, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"grad_norm": 0.25563291180685294, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0227, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"eval_loss": 0.9126191735267639, |
|
"eval_runtime": 52.6388, |
|
"eval_samples_per_second": 3.799, |
|
"eval_steps_per_second": 0.475, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 0.2225226787999786, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0241, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"eval_loss": 0.9070788621902466, |
|
"eval_runtime": 43.6322, |
|
"eval_samples_per_second": 4.584, |
|
"eval_steps_per_second": 0.573, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 0.2052840697405099, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0476, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"eval_loss": 0.9018412828445435, |
|
"eval_runtime": 43.1975, |
|
"eval_samples_per_second": 4.63, |
|
"eval_steps_per_second": 0.579, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.23676392278447683, |
|
"learning_rate": 2e-05, |
|
"loss": 1.01, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"eval_loss": 0.8966168761253357, |
|
"eval_runtime": 45.9216, |
|
"eval_samples_per_second": 4.355, |
|
"eval_steps_per_second": 0.544, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"grad_norm": 0.22099733575664926, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9525, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"eval_loss": 0.891795814037323, |
|
"eval_runtime": 44.7872, |
|
"eval_samples_per_second": 4.466, |
|
"eval_steps_per_second": 0.558, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 0.2527359179725302, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9627, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"eval_loss": 0.8872839212417603, |
|
"eval_runtime": 44.6369, |
|
"eval_samples_per_second": 4.481, |
|
"eval_steps_per_second": 0.56, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"grad_norm": 0.25432158026395235, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9972, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"eval_loss": 0.8827975988388062, |
|
"eval_runtime": 44.7753, |
|
"eval_samples_per_second": 4.467, |
|
"eval_steps_per_second": 0.558, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.24171584871667898, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9897, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8785097599029541, |
|
"eval_runtime": 45.0743, |
|
"eval_samples_per_second": 4.437, |
|
"eval_steps_per_second": 0.555, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"grad_norm": 0.23629659647320733, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9641, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"eval_loss": 0.8742367625236511, |
|
"eval_runtime": 45.6624, |
|
"eval_samples_per_second": 4.38, |
|
"eval_steps_per_second": 0.547, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 0.23515869880744614, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9445, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"eval_loss": 0.8701191544532776, |
|
"eval_runtime": 46.6778, |
|
"eval_samples_per_second": 4.285, |
|
"eval_steps_per_second": 0.536, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"grad_norm": 0.2328447853974619, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9098, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"eval_loss": 0.8661414980888367, |
|
"eval_runtime": 45.7682, |
|
"eval_samples_per_second": 4.37, |
|
"eval_steps_per_second": 0.546, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.2208565035546648, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9269, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"eval_loss": 0.8625122904777527, |
|
"eval_runtime": 47.7405, |
|
"eval_samples_per_second": 4.189, |
|
"eval_steps_per_second": 0.524, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 0.24194310531833832, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9126, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"eval_loss": 0.859275221824646, |
|
"eval_runtime": 46.14, |
|
"eval_samples_per_second": 4.335, |
|
"eval_steps_per_second": 0.542, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 0.23294071980639222, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9525, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"eval_loss": 0.8560716509819031, |
|
"eval_runtime": 47.2955, |
|
"eval_samples_per_second": 4.229, |
|
"eval_steps_per_second": 0.529, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"grad_norm": 0.22565596183142483, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9635, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"eval_loss": 0.8531911373138428, |
|
"eval_runtime": 46.3183, |
|
"eval_samples_per_second": 4.318, |
|
"eval_steps_per_second": 0.54, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.23251096636792043, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8684, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.8504599928855896, |
|
"eval_runtime": 45.7129, |
|
"eval_samples_per_second": 4.375, |
|
"eval_steps_per_second": 0.547, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"grad_norm": 0.253882583102031, |
|
"learning_rate": 2e-05, |
|
"loss": 0.881, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"eval_loss": 0.8476203680038452, |
|
"eval_runtime": 45.8764, |
|
"eval_samples_per_second": 4.36, |
|
"eval_steps_per_second": 0.545, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.2572282615843019, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8634, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"eval_loss": 0.8446447849273682, |
|
"eval_runtime": 46.1254, |
|
"eval_samples_per_second": 4.336, |
|
"eval_steps_per_second": 0.542, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"grad_norm": 0.24021257130991572, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8915, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"eval_loss": 0.8415327668190002, |
|
"eval_runtime": 45.7173, |
|
"eval_samples_per_second": 4.375, |
|
"eval_steps_per_second": 0.547, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.22076828593901424, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7849, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"eval_loss": 0.8386600017547607, |
|
"eval_runtime": 45.7889, |
|
"eval_samples_per_second": 4.368, |
|
"eval_steps_per_second": 0.546, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"grad_norm": 0.2255866641078328, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9282, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"eval_loss": 0.8356924653053284, |
|
"eval_runtime": 45.6221, |
|
"eval_samples_per_second": 4.384, |
|
"eval_steps_per_second": 0.548, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 0.22783298909181773, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9012, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"eval_loss": 0.8328012228012085, |
|
"eval_runtime": 47.1607, |
|
"eval_samples_per_second": 4.241, |
|
"eval_steps_per_second": 0.53, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 0.22832233862063558, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9055, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"eval_loss": 0.830295741558075, |
|
"eval_runtime": 46.0231, |
|
"eval_samples_per_second": 4.346, |
|
"eval_steps_per_second": 0.543, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.2160389858258543, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9646, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 0.8281158208847046, |
|
"eval_runtime": 50.2412, |
|
"eval_samples_per_second": 3.981, |
|
"eval_steps_per_second": 0.498, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"grad_norm": 0.2577519779258931, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8908, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"eval_loss": 0.8254660964012146, |
|
"eval_runtime": 43.4999, |
|
"eval_samples_per_second": 4.598, |
|
"eval_steps_per_second": 0.575, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 0.2425252190238059, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9392, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"eval_loss": 0.8230564594268799, |
|
"eval_runtime": 43.1396, |
|
"eval_samples_per_second": 4.636, |
|
"eval_steps_per_second": 0.58, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"grad_norm": 0.2403612422125405, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8458, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"eval_loss": 0.8206232190132141, |
|
"eval_runtime": 43.4097, |
|
"eval_samples_per_second": 4.607, |
|
"eval_steps_per_second": 0.576, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.24599794763439686, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8533, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"eval_loss": 0.8178582787513733, |
|
"eval_runtime": 43.3225, |
|
"eval_samples_per_second": 4.617, |
|
"eval_steps_per_second": 0.577, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"grad_norm": 0.24455796239061778, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9019, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"eval_loss": 0.81532883644104, |
|
"eval_runtime": 43.3919, |
|
"eval_samples_per_second": 4.609, |
|
"eval_steps_per_second": 0.576, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 0.25994876629591135, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9294, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"eval_loss": 0.813098669052124, |
|
"eval_runtime": 43.5546, |
|
"eval_samples_per_second": 4.592, |
|
"eval_steps_per_second": 0.574, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"grad_norm": 0.2671215171096013, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7728, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"eval_loss": 0.8106216192245483, |
|
"eval_runtime": 43.3363, |
|
"eval_samples_per_second": 4.615, |
|
"eval_steps_per_second": 0.577, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.26274475710090606, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8746, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.8080699443817139, |
|
"eval_runtime": 44.6331, |
|
"eval_samples_per_second": 4.481, |
|
"eval_steps_per_second": 0.56, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 0.2775753424365695, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8665, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"eval_loss": 0.8051960468292236, |
|
"eval_runtime": 43.2561, |
|
"eval_samples_per_second": 4.624, |
|
"eval_steps_per_second": 0.578, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 0.27249086550617724, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8868, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"eval_loss": 0.8029299378395081, |
|
"eval_runtime": 43.1171, |
|
"eval_samples_per_second": 4.639, |
|
"eval_steps_per_second": 0.58, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"grad_norm": 0.2719871749974866, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8651, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"eval_loss": 0.8006068468093872, |
|
"eval_runtime": 43.0661, |
|
"eval_samples_per_second": 4.644, |
|
"eval_steps_per_second": 0.581, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 0.24961006779343242, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9303, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"eval_loss": 0.7983291745185852, |
|
"eval_runtime": 44.5821, |
|
"eval_samples_per_second": 4.486, |
|
"eval_steps_per_second": 0.561, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"grad_norm": 0.26632839922388696, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8625, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"eval_loss": 0.7961746454238892, |
|
"eval_runtime": 44.7163, |
|
"eval_samples_per_second": 4.473, |
|
"eval_steps_per_second": 0.559, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 0.28665202557154024, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8084, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"eval_loss": 0.7937586307525635, |
|
"eval_runtime": 43.1349, |
|
"eval_samples_per_second": 4.637, |
|
"eval_steps_per_second": 0.58, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"grad_norm": 0.25474181970896226, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8943, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"eval_loss": 0.7917373776435852, |
|
"eval_runtime": 43.1701, |
|
"eval_samples_per_second": 4.633, |
|
"eval_steps_per_second": 0.579, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.28289708669257335, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8183, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 0.7898543477058411, |
|
"eval_runtime": 43.3669, |
|
"eval_samples_per_second": 4.612, |
|
"eval_steps_per_second": 0.576, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"grad_norm": 0.3081846543495751, |
|
"learning_rate": 2e-05, |
|
"loss": 0.866, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"eval_loss": 0.7878245711326599, |
|
"eval_runtime": 43.2404, |
|
"eval_samples_per_second": 4.625, |
|
"eval_steps_per_second": 0.578, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 0.25291911217221025, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8643, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"eval_loss": 0.7859254479408264, |
|
"eval_runtime": 43.158, |
|
"eval_samples_per_second": 4.634, |
|
"eval_steps_per_second": 0.579, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 0.2671411105926486, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9148, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"eval_loss": 0.7841793894767761, |
|
"eval_runtime": 43.5393, |
|
"eval_samples_per_second": 4.594, |
|
"eval_steps_per_second": 0.574, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 0.2649328385798148, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8322, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"eval_loss": 0.7824788093566895, |
|
"eval_runtime": 44.6161, |
|
"eval_samples_per_second": 4.483, |
|
"eval_steps_per_second": 0.56, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 0.2770584815336495, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8845, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"eval_loss": 0.7810197472572327, |
|
"eval_runtime": 44.3474, |
|
"eval_samples_per_second": 4.51, |
|
"eval_steps_per_second": 0.564, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 0.3134056914363824, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8764, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"eval_loss": 0.7796530723571777, |
|
"eval_runtime": 44.6727, |
|
"eval_samples_per_second": 4.477, |
|
"eval_steps_per_second": 0.56, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 0.31159260857820364, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8842, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"eval_loss": 0.7792640924453735, |
|
"eval_runtime": 44.9476, |
|
"eval_samples_per_second": 4.45, |
|
"eval_steps_per_second": 0.556, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.30072325605647415, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9214, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.7791906595230103, |
|
"eval_runtime": 44.5732, |
|
"eval_samples_per_second": 4.487, |
|
"eval_steps_per_second": 0.561, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 0.3021628861526586, |
|
"learning_rate": 2e-05, |
|
"loss": 0.854, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"eval_loss": 0.7786081433296204, |
|
"eval_runtime": 46.7962, |
|
"eval_samples_per_second": 4.274, |
|
"eval_steps_per_second": 0.534, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 0.28647643667873335, |
|
"learning_rate": 2e-05, |
|
"loss": 0.915, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"eval_loss": 0.777721643447876, |
|
"eval_runtime": 46.0168, |
|
"eval_samples_per_second": 4.346, |
|
"eval_steps_per_second": 0.543, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 0.3053967339779788, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8616, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"eval_loss": 0.7763125896453857, |
|
"eval_runtime": 46.9482, |
|
"eval_samples_per_second": 4.26, |
|
"eval_steps_per_second": 0.533, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 0.3285655628944688, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8242, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"eval_loss": 0.7744290232658386, |
|
"eval_runtime": 45.8201, |
|
"eval_samples_per_second": 4.365, |
|
"eval_steps_per_second": 0.546, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 0.29338609850548214, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7927, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"eval_loss": 0.7727124094963074, |
|
"eval_runtime": 47.0822, |
|
"eval_samples_per_second": 4.248, |
|
"eval_steps_per_second": 0.531, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 0.3360259804530201, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8225, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"eval_loss": 0.7707045078277588, |
|
"eval_runtime": 45.904, |
|
"eval_samples_per_second": 4.357, |
|
"eval_steps_per_second": 0.545, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 0.3086865804573199, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8428, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"eval_loss": 0.7689979672431946, |
|
"eval_runtime": 46.5498, |
|
"eval_samples_per_second": 4.296, |
|
"eval_steps_per_second": 0.537, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.3441174342366127, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9349, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_loss": 0.7670918107032776, |
|
"eval_runtime": 45.9533, |
|
"eval_samples_per_second": 4.352, |
|
"eval_steps_per_second": 0.544, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 0.3192564489143439, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8281, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"eval_loss": 0.7653720378875732, |
|
"eval_runtime": 46.4157, |
|
"eval_samples_per_second": 4.309, |
|
"eval_steps_per_second": 0.539, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 0.318307521318246, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8826, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"eval_loss": 0.7641046047210693, |
|
"eval_runtime": 43.6527, |
|
"eval_samples_per_second": 4.582, |
|
"eval_steps_per_second": 0.573, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 0.3088619418824691, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7792, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"eval_loss": 0.7630372643470764, |
|
"eval_runtime": 43.3688, |
|
"eval_samples_per_second": 4.612, |
|
"eval_steps_per_second": 0.576, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 0.31484830204628667, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8771, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"eval_loss": 0.7621588110923767, |
|
"eval_runtime": 43.4895, |
|
"eval_samples_per_second": 4.599, |
|
"eval_steps_per_second": 0.575, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 0.3210986538440627, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8125, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"eval_loss": 0.7610002160072327, |
|
"eval_runtime": 44.5951, |
|
"eval_samples_per_second": 4.485, |
|
"eval_steps_per_second": 0.561, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 0.3584955691897743, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8869, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"eval_loss": 0.7591326832771301, |
|
"eval_runtime": 44.778, |
|
"eval_samples_per_second": 4.466, |
|
"eval_steps_per_second": 0.558, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 0.3231987362149406, |
|
"learning_rate": 2e-05, |
|
"loss": 0.828, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"eval_loss": 0.7578966021537781, |
|
"eval_runtime": 44.832, |
|
"eval_samples_per_second": 4.461, |
|
"eval_steps_per_second": 0.558, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.3195106075306484, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.7563678026199341, |
|
"eval_runtime": 43.2334, |
|
"eval_samples_per_second": 4.626, |
|
"eval_steps_per_second": 0.578, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"grad_norm": 0.3319055768203625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7632, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"eval_loss": 0.7547956705093384, |
|
"eval_runtime": 50.7388, |
|
"eval_samples_per_second": 3.942, |
|
"eval_steps_per_second": 0.493, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 0.2995834652715153, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8407, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"eval_loss": 0.7533387541770935, |
|
"eval_runtime": 45.0847, |
|
"eval_samples_per_second": 4.436, |
|
"eval_steps_per_second": 0.555, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"grad_norm": 0.30711749226961915, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8117, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"eval_loss": 0.7517553567886353, |
|
"eval_runtime": 43.2975, |
|
"eval_samples_per_second": 4.619, |
|
"eval_steps_per_second": 0.577, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.3443284045264722, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8347, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"eval_loss": 0.749790370464325, |
|
"eval_runtime": 43.3922, |
|
"eval_samples_per_second": 4.609, |
|
"eval_steps_per_second": 0.576, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"grad_norm": 0.3080766546496095, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7748, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"eval_loss": 0.7480612397193909, |
|
"eval_runtime": 45.0132, |
|
"eval_samples_per_second": 4.443, |
|
"eval_steps_per_second": 0.555, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 0.34717566244235637, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8407, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"eval_loss": 0.7468411326408386, |
|
"eval_runtime": 43.1171, |
|
"eval_samples_per_second": 4.639, |
|
"eval_steps_per_second": 0.58, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"grad_norm": 0.3374839165175488, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8498, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"eval_loss": 0.7462002038955688, |
|
"eval_runtime": 44.7301, |
|
"eval_samples_per_second": 4.471, |
|
"eval_steps_per_second": 0.559, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.35610377004267274, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7608, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_loss": 0.7451856732368469, |
|
"eval_runtime": 43.1396, |
|
"eval_samples_per_second": 4.636, |
|
"eval_steps_per_second": 0.58, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"grad_norm": 0.3147450389365033, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8077, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"eval_loss": 0.7444003224372864, |
|
"eval_runtime": 45.0088, |
|
"eval_samples_per_second": 4.444, |
|
"eval_steps_per_second": 0.555, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 0.3706462973318254, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8401, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"eval_loss": 0.7432863116264343, |
|
"eval_runtime": 43.5403, |
|
"eval_samples_per_second": 4.593, |
|
"eval_steps_per_second": 0.574, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"grad_norm": 0.40870394852693054, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7369, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"eval_loss": 0.7409774661064148, |
|
"eval_runtime": 43.3731, |
|
"eval_samples_per_second": 4.611, |
|
"eval_steps_per_second": 0.576, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 0.36546514227995835, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7822, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"eval_loss": 0.7388054132461548, |
|
"eval_runtime": 43.2852, |
|
"eval_samples_per_second": 4.621, |
|
"eval_steps_per_second": 0.578, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"grad_norm": 0.3623356150462002, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7693, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"eval_loss": 0.7370558977127075, |
|
"eval_runtime": 43.2105, |
|
"eval_samples_per_second": 4.629, |
|
"eval_steps_per_second": 0.579, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 0.36956774509216733, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7631, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"eval_loss": 0.7354567050933838, |
|
"eval_runtime": 45.0512, |
|
"eval_samples_per_second": 4.439, |
|
"eval_steps_per_second": 0.555, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"grad_norm": 0.37499211223571893, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8397, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"eval_loss": 0.7342872619628906, |
|
"eval_runtime": 44.1989, |
|
"eval_samples_per_second": 4.525, |
|
"eval_steps_per_second": 0.566, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.3656781606255811, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8156, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_loss": 0.7334136962890625, |
|
"eval_runtime": 43.3314, |
|
"eval_samples_per_second": 4.616, |
|
"eval_steps_per_second": 0.577, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"grad_norm": 0.360531666311953, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9039, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"eval_loss": 0.732928454875946, |
|
"eval_runtime": 43.6452, |
|
"eval_samples_per_second": 4.582, |
|
"eval_steps_per_second": 0.573, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 0.4106498291544766, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7632, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"eval_loss": 0.7328732013702393, |
|
"eval_runtime": 43.2922, |
|
"eval_samples_per_second": 4.62, |
|
"eval_steps_per_second": 0.577, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"grad_norm": 0.35030054786635473, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8328, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"eval_loss": 0.7332839369773865, |
|
"eval_runtime": 43.1392, |
|
"eval_samples_per_second": 4.636, |
|
"eval_steps_per_second": 0.58, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 0.37866907463824806, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7992, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"eval_loss": 0.7333321571350098, |
|
"eval_runtime": 44.5672, |
|
"eval_samples_per_second": 4.488, |
|
"eval_steps_per_second": 0.561, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"grad_norm": 0.3868782215569731, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7929, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"eval_loss": 0.7327985167503357, |
|
"eval_runtime": 45.9132, |
|
"eval_samples_per_second": 4.356, |
|
"eval_steps_per_second": 0.545, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"grad_norm": 0.3823386198135366, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8064, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"eval_loss": 0.7325207591056824, |
|
"eval_runtime": 45.1557, |
|
"eval_samples_per_second": 4.429, |
|
"eval_steps_per_second": 0.554, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"grad_norm": 0.3586002374199349, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8677, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"eval_loss": 0.732402503490448, |
|
"eval_runtime": 44.5906, |
|
"eval_samples_per_second": 4.485, |
|
"eval_steps_per_second": 0.561, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.34075042751380596, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8119, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.7322152853012085, |
|
"eval_runtime": 44.3386, |
|
"eval_samples_per_second": 4.511, |
|
"eval_steps_per_second": 0.564, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"grad_norm": 0.38915259379047296, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7866, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"eval_loss": 0.7307778000831604, |
|
"eval_runtime": 45.0342, |
|
"eval_samples_per_second": 4.441, |
|
"eval_steps_per_second": 0.555, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"grad_norm": 0.39774471715347587, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8635, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"eval_loss": 0.7294437885284424, |
|
"eval_runtime": 47.2205, |
|
"eval_samples_per_second": 4.235, |
|
"eval_steps_per_second": 0.529, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"grad_norm": 0.3880340672056078, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7834, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"eval_loss": 0.7277958393096924, |
|
"eval_runtime": 45.5116, |
|
"eval_samples_per_second": 4.394, |
|
"eval_steps_per_second": 0.549, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 0.34955832039339413, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8048, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"eval_loss": 0.7262464761734009, |
|
"eval_runtime": 45.3196, |
|
"eval_samples_per_second": 4.413, |
|
"eval_steps_per_second": 0.552, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"grad_norm": 0.4502351954206266, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8494, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"eval_loss": 0.724558413028717, |
|
"eval_runtime": 45.2241, |
|
"eval_samples_per_second": 4.422, |
|
"eval_steps_per_second": 0.553, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"grad_norm": 0.40148506382728893, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8163, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"eval_loss": 0.7235116362571716, |
|
"eval_runtime": 46.1839, |
|
"eval_samples_per_second": 4.331, |
|
"eval_steps_per_second": 0.541, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"grad_norm": 0.41595103877364653, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7756, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"eval_loss": 0.7227371335029602, |
|
"eval_runtime": 43.5883, |
|
"eval_samples_per_second": 4.588, |
|
"eval_steps_per_second": 0.574, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.3959213167419436, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7107, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.721717357635498, |
|
"eval_runtime": 44.8751, |
|
"eval_samples_per_second": 4.457, |
|
"eval_steps_per_second": 0.557, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"grad_norm": 0.34668934768327436, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8028, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"eval_loss": 0.7208954095840454, |
|
"eval_runtime": 43.2092, |
|
"eval_samples_per_second": 4.629, |
|
"eval_steps_per_second": 0.579, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"grad_norm": 0.3776564287872586, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8162, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"eval_loss": 0.7200332880020142, |
|
"eval_runtime": 43.1981, |
|
"eval_samples_per_second": 4.63, |
|
"eval_steps_per_second": 0.579, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"grad_norm": 0.35166731437552645, |
|
"learning_rate": 2e-05, |
|
"loss": 0.814, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"eval_loss": 0.7193570137023926, |
|
"eval_runtime": 43.3306, |
|
"eval_samples_per_second": 4.616, |
|
"eval_steps_per_second": 0.577, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"grad_norm": 0.39783214883157875, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7743, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"eval_loss": 0.7187802791595459, |
|
"eval_runtime": 44.0701, |
|
"eval_samples_per_second": 4.538, |
|
"eval_steps_per_second": 0.567, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"grad_norm": 0.3828880469066703, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8766, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"eval_loss": 0.7184324860572815, |
|
"eval_runtime": 43.3218, |
|
"eval_samples_per_second": 4.617, |
|
"eval_steps_per_second": 0.577, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"grad_norm": 0.46175115507112535, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7827, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"eval_loss": 0.717852771282196, |
|
"eval_runtime": 43.3706, |
|
"eval_samples_per_second": 4.611, |
|
"eval_steps_per_second": 0.576, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"grad_norm": 0.39552167703322383, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7846, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"eval_loss": 0.7171714901924133, |
|
"eval_runtime": 43.3199, |
|
"eval_samples_per_second": 4.617, |
|
"eval_steps_per_second": 0.577, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.40883049825529505, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7711, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"eval_loss": 0.7167998552322388, |
|
"eval_runtime": 43.4601, |
|
"eval_samples_per_second": 4.602, |
|
"eval_steps_per_second": 0.575, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"grad_norm": 0.4411120151436577, |
|
"learning_rate": 2e-05, |
|
"loss": 0.755, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"eval_loss": 0.7161502838134766, |
|
"eval_runtime": 45.0586, |
|
"eval_samples_per_second": 4.439, |
|
"eval_steps_per_second": 0.555, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"grad_norm": 0.4307733167956254, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7708, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"eval_loss": 0.7155695557594299, |
|
"eval_runtime": 44.7913, |
|
"eval_samples_per_second": 4.465, |
|
"eval_steps_per_second": 0.558, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"grad_norm": 0.4303129845521591, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7384, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"eval_loss": 0.7146069407463074, |
|
"eval_runtime": 43.3745, |
|
"eval_samples_per_second": 4.611, |
|
"eval_steps_per_second": 0.576, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 0.4160861103360693, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7693, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"eval_loss": 0.7138718962669373, |
|
"eval_runtime": 43.2941, |
|
"eval_samples_per_second": 4.62, |
|
"eval_steps_per_second": 0.577, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"grad_norm": 0.3974304749908327, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7855, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"eval_loss": 0.7131789922714233, |
|
"eval_runtime": 43.6908, |
|
"eval_samples_per_second": 4.578, |
|
"eval_steps_per_second": 0.572, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"grad_norm": 0.42212623603465876, |
|
"learning_rate": 2e-05, |
|
"loss": 0.733, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"eval_loss": 0.7126344442367554, |
|
"eval_runtime": 43.5706, |
|
"eval_samples_per_second": 4.59, |
|
"eval_steps_per_second": 0.574, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"grad_norm": 0.4290602874698813, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7372, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"eval_loss": 0.7121153473854065, |
|
"eval_runtime": 44.0917, |
|
"eval_samples_per_second": 4.536, |
|
"eval_steps_per_second": 0.567, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.38778639331277664, |
|
"learning_rate": 2e-05, |
|
"loss": 0.715, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"eval_loss": 0.7114359140396118, |
|
"eval_runtime": 90.4172, |
|
"eval_samples_per_second": 2.212, |
|
"eval_steps_per_second": 0.276, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"grad_norm": 0.44014343297224434, |
|
"learning_rate": 2e-05, |
|
"loss": 0.802, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"eval_loss": 0.7106121778488159, |
|
"eval_runtime": 43.5235, |
|
"eval_samples_per_second": 4.595, |
|
"eval_steps_per_second": 0.574, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"grad_norm": 0.45549843169611287, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6899, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"eval_loss": 0.7094995975494385, |
|
"eval_runtime": 43.5264, |
|
"eval_samples_per_second": 4.595, |
|
"eval_steps_per_second": 0.574, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"grad_norm": 0.46209967918252776, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7503, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"eval_loss": 0.7082768082618713, |
|
"eval_runtime": 44.8411, |
|
"eval_samples_per_second": 4.46, |
|
"eval_steps_per_second": 0.558, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"grad_norm": 0.43001381014670376, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7041, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"eval_loss": 0.7072634696960449, |
|
"eval_runtime": 43.1988, |
|
"eval_samples_per_second": 4.63, |
|
"eval_steps_per_second": 0.579, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"grad_norm": 0.4151229594087744, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8181, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"eval_loss": 0.7068669199943542, |
|
"eval_runtime": 43.3996, |
|
"eval_samples_per_second": 4.608, |
|
"eval_steps_per_second": 0.576, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"grad_norm": 0.4534048991771139, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7411, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"eval_loss": 0.7062075734138489, |
|
"eval_runtime": 43.3013, |
|
"eval_samples_per_second": 4.619, |
|
"eval_steps_per_second": 0.577, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"grad_norm": 0.4739932075357852, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7621, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"eval_loss": 0.7047030925750732, |
|
"eval_runtime": 43.4211, |
|
"eval_samples_per_second": 4.606, |
|
"eval_steps_per_second": 0.576, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.46573796534078227, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7852, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"eval_loss": 0.7033020257949829, |
|
"eval_runtime": 43.4066, |
|
"eval_samples_per_second": 4.608, |
|
"eval_steps_per_second": 0.576, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"grad_norm": 0.463007545995704, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7331, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"eval_loss": 0.7021228671073914, |
|
"eval_runtime": 43.4184, |
|
"eval_samples_per_second": 4.606, |
|
"eval_steps_per_second": 0.576, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"grad_norm": 0.46580692487948094, |
|
"learning_rate": 2e-05, |
|
"loss": 0.76, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"eval_loss": 0.701519250869751, |
|
"eval_runtime": 44.9732, |
|
"eval_samples_per_second": 4.447, |
|
"eval_steps_per_second": 0.556, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"grad_norm": 0.47378674394843967, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6912, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"eval_loss": 0.7011644244194031, |
|
"eval_runtime": 44.898, |
|
"eval_samples_per_second": 4.455, |
|
"eval_steps_per_second": 0.557, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"grad_norm": 0.44883703516788587, |
|
"learning_rate": 2e-05, |
|
"loss": 0.812, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"eval_loss": 0.7009950876235962, |
|
"eval_runtime": 44.4765, |
|
"eval_samples_per_second": 4.497, |
|
"eval_steps_per_second": 0.562, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"grad_norm": 0.43366130955490684, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7902, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"eval_loss": 0.7011439800262451, |
|
"eval_runtime": 44.3528, |
|
"eval_samples_per_second": 4.509, |
|
"eval_steps_per_second": 0.564, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"grad_norm": 0.4501399670257468, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7927, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"eval_loss": 0.7011370062828064, |
|
"eval_runtime": 46.6518, |
|
"eval_samples_per_second": 4.287, |
|
"eval_steps_per_second": 0.536, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"grad_norm": 0.44946550972510596, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7437, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"eval_loss": 0.7008097767829895, |
|
"eval_runtime": 45.6401, |
|
"eval_samples_per_second": 4.382, |
|
"eval_steps_per_second": 0.548, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.455086081766797, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7274, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.7002915143966675, |
|
"eval_runtime": 44.5003, |
|
"eval_samples_per_second": 4.494, |
|
"eval_steps_per_second": 0.562, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"grad_norm": 0.42610507864697433, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7084, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"eval_loss": 0.6996615529060364, |
|
"eval_runtime": 50.423, |
|
"eval_samples_per_second": 3.966, |
|
"eval_steps_per_second": 0.496, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"grad_norm": 0.41530618486274595, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8549, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"eval_loss": 0.6996638774871826, |
|
"eval_runtime": 43.3726, |
|
"eval_samples_per_second": 4.611, |
|
"eval_steps_per_second": 0.576, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"grad_norm": 0.46020582285044187, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6554, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"eval_loss": 0.6997809410095215, |
|
"eval_runtime": 43.1108, |
|
"eval_samples_per_second": 4.639, |
|
"eval_steps_per_second": 0.58, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"grad_norm": 0.45217206658399783, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7908, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"eval_loss": 0.7001843452453613, |
|
"eval_runtime": 43.3575, |
|
"eval_samples_per_second": 4.613, |
|
"eval_steps_per_second": 0.577, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"grad_norm": 0.5297838342887452, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6311, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"eval_loss": 0.6998342871665955, |
|
"eval_runtime": 44.2692, |
|
"eval_samples_per_second": 4.518, |
|
"eval_steps_per_second": 0.565, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"grad_norm": 0.5041508044224997, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7407, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"eval_loss": 0.6997390985488892, |
|
"eval_runtime": 44.9429, |
|
"eval_samples_per_second": 4.45, |
|
"eval_steps_per_second": 0.556, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"grad_norm": 0.4379864270565459, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7601, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"eval_loss": 0.6998906135559082, |
|
"eval_runtime": 44.7922, |
|
"eval_samples_per_second": 4.465, |
|
"eval_steps_per_second": 0.558, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.4908573554102339, |
|
"learning_rate": 2e-05, |
|
"loss": 0.802, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"eval_loss": 0.6996601819992065, |
|
"eval_runtime": 44.8792, |
|
"eval_samples_per_second": 4.456, |
|
"eval_steps_per_second": 0.557, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"grad_norm": 0.4708754671143599, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7212, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"eval_loss": 0.699320375919342, |
|
"eval_runtime": 42.8958, |
|
"eval_samples_per_second": 4.662, |
|
"eval_steps_per_second": 0.583, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"grad_norm": 0.5157421152452428, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6919, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"eval_loss": 0.6992219686508179, |
|
"eval_runtime": 43.1543, |
|
"eval_samples_per_second": 4.635, |
|
"eval_steps_per_second": 0.579, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"grad_norm": 0.5604495452491726, |
|
"learning_rate": 2e-05, |
|
"loss": 0.708, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"eval_loss": 0.6983294486999512, |
|
"eval_runtime": 43.0431, |
|
"eval_samples_per_second": 4.647, |
|
"eval_steps_per_second": 0.581, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"grad_norm": 0.5538353889452822, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7922, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"eval_loss": 0.6967844367027283, |
|
"eval_runtime": 43.3554, |
|
"eval_samples_per_second": 4.613, |
|
"eval_steps_per_second": 0.577, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"grad_norm": 0.4750896425737706, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7552, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"eval_loss": 0.6954870820045471, |
|
"eval_runtime": 43.2105, |
|
"eval_samples_per_second": 4.629, |
|
"eval_steps_per_second": 0.579, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"grad_norm": 0.4939578777629157, |
|
"learning_rate": 2e-05, |
|
"loss": 0.793, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"eval_loss": 0.6942651271820068, |
|
"eval_runtime": 43.2018, |
|
"eval_samples_per_second": 4.629, |
|
"eval_steps_per_second": 0.579, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"grad_norm": 0.5275775814858564, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7812, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"eval_loss": 0.6938748359680176, |
|
"eval_runtime": 43.0238, |
|
"eval_samples_per_second": 4.649, |
|
"eval_steps_per_second": 0.581, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.516931179872771, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7157, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"eval_loss": 0.6937347650527954, |
|
"eval_runtime": 44.7687, |
|
"eval_samples_per_second": 4.467, |
|
"eval_steps_per_second": 0.558, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"grad_norm": 0.527427864430588, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7505, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"eval_loss": 0.6932395696640015, |
|
"eval_runtime": 44.5644, |
|
"eval_samples_per_second": 4.488, |
|
"eval_steps_per_second": 0.561, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"grad_norm": 0.5073638107520839, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7893, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"eval_loss": 0.692828357219696, |
|
"eval_runtime": 46.0526, |
|
"eval_samples_per_second": 4.343, |
|
"eval_steps_per_second": 0.543, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"grad_norm": 0.5234480045460208, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6786, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"eval_loss": 0.6927328705787659, |
|
"eval_runtime": 44.4221, |
|
"eval_samples_per_second": 4.502, |
|
"eval_steps_per_second": 0.563, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"grad_norm": 0.509921375319416, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6839, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"eval_loss": 0.6922880411148071, |
|
"eval_runtime": 44.5254, |
|
"eval_samples_per_second": 4.492, |
|
"eval_steps_per_second": 0.561, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"grad_norm": 0.5307701692724383, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6949, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"eval_loss": 0.6916860938072205, |
|
"eval_runtime": 46.1897, |
|
"eval_samples_per_second": 4.33, |
|
"eval_steps_per_second": 0.541, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"grad_norm": 0.5405944672270007, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6644, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"eval_loss": 0.6913076639175415, |
|
"eval_runtime": 45.6494, |
|
"eval_samples_per_second": 4.381, |
|
"eval_steps_per_second": 0.548, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"grad_norm": 0.5911050914106935, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6993, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"eval_loss": 0.6910421848297119, |
|
"eval_runtime": 45.6849, |
|
"eval_samples_per_second": 4.378, |
|
"eval_steps_per_second": 0.547, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.5738317262291136, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6909, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"eval_loss": 0.6906780004501343, |
|
"eval_runtime": 45.8103, |
|
"eval_samples_per_second": 4.366, |
|
"eval_steps_per_second": 0.546, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"grad_norm": 0.6176885912626084, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7418, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"eval_loss": 0.6897534132003784, |
|
"eval_runtime": 46.2895, |
|
"eval_samples_per_second": 4.321, |
|
"eval_steps_per_second": 0.54, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"grad_norm": 0.5804047612157957, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7046, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"eval_loss": 0.6883871555328369, |
|
"eval_runtime": 46.9282, |
|
"eval_samples_per_second": 4.262, |
|
"eval_steps_per_second": 0.533, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"grad_norm": 0.5408722725454089, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7561, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"eval_loss": 0.6878187656402588, |
|
"eval_runtime": 47.6969, |
|
"eval_samples_per_second": 4.193, |
|
"eval_steps_per_second": 0.524, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"grad_norm": 0.5492560188161619, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6903, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"eval_loss": 0.6882662773132324, |
|
"eval_runtime": 47.2072, |
|
"eval_samples_per_second": 4.237, |
|
"eval_steps_per_second": 0.53, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"grad_norm": 0.5286439760924038, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7036, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"eval_loss": 0.6890198588371277, |
|
"eval_runtime": 47.4378, |
|
"eval_samples_per_second": 4.216, |
|
"eval_steps_per_second": 0.527, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"grad_norm": 0.5540465829524065, |
|
"learning_rate": 2e-05, |
|
"loss": 0.715, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"eval_loss": 0.6893854737281799, |
|
"eval_runtime": 47.5957, |
|
"eval_samples_per_second": 4.202, |
|
"eval_steps_per_second": 0.525, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"grad_norm": 0.543055712644853, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7122, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"eval_loss": 0.688640296459198, |
|
"eval_runtime": 47.2791, |
|
"eval_samples_per_second": 4.23, |
|
"eval_steps_per_second": 0.529, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.5243011011968818, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6989, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.6877474784851074, |
|
"eval_runtime": 49.6808, |
|
"eval_samples_per_second": 4.026, |
|
"eval_steps_per_second": 0.503, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"grad_norm": 0.5427998890836598, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7643, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"eval_loss": 0.6871516704559326, |
|
"eval_runtime": 43.2416, |
|
"eval_samples_per_second": 4.625, |
|
"eval_steps_per_second": 0.578, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"grad_norm": 0.4848261239833822, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7333, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"eval_loss": 0.6872122287750244, |
|
"eval_runtime": 43.027, |
|
"eval_samples_per_second": 4.648, |
|
"eval_steps_per_second": 0.581, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"grad_norm": 0.5476878256408845, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6621, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"eval_loss": 0.6873424053192139, |
|
"eval_runtime": 43.0047, |
|
"eval_samples_per_second": 4.651, |
|
"eval_steps_per_second": 0.581, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"grad_norm": 0.5198863257357437, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6936, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"eval_loss": 0.6874563097953796, |
|
"eval_runtime": 43.2855, |
|
"eval_samples_per_second": 4.62, |
|
"eval_steps_per_second": 0.578, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"grad_norm": 0.5705568756769012, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7237, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"eval_loss": 0.6877203583717346, |
|
"eval_runtime": 44.8778, |
|
"eval_samples_per_second": 4.457, |
|
"eval_steps_per_second": 0.557, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"grad_norm": 0.5546703873264635, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8033, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"eval_loss": 0.6876934170722961, |
|
"eval_runtime": 43.3351, |
|
"eval_samples_per_second": 4.615, |
|
"eval_steps_per_second": 0.577, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"grad_norm": 0.5846944975931198, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6687, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"eval_loss": 0.6877866983413696, |
|
"eval_runtime": 43.1456, |
|
"eval_samples_per_second": 4.635, |
|
"eval_steps_per_second": 0.579, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.5882658410555619, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7169, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_loss": 0.6881275773048401, |
|
"eval_runtime": 44.9645, |
|
"eval_samples_per_second": 4.448, |
|
"eval_steps_per_second": 0.556, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"grad_norm": 0.5831610447904351, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7394, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"eval_loss": 0.6888833045959473, |
|
"eval_runtime": 45.09, |
|
"eval_samples_per_second": 4.436, |
|
"eval_steps_per_second": 0.554, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"grad_norm": 0.6592966385691889, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6537, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"eval_loss": 0.6880140900611877, |
|
"eval_runtime": 43.2447, |
|
"eval_samples_per_second": 4.625, |
|
"eval_steps_per_second": 0.578, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"grad_norm": 0.558654488415818, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7991, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"eval_loss": 0.6874076724052429, |
|
"eval_runtime": 42.9406, |
|
"eval_samples_per_second": 4.658, |
|
"eval_steps_per_second": 0.582, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"grad_norm": 0.6342316949523702, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6403, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"eval_loss": 0.6866291761398315, |
|
"eval_runtime": 43.1217, |
|
"eval_samples_per_second": 4.638, |
|
"eval_steps_per_second": 0.58, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"grad_norm": 0.544206621558966, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6314, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"eval_loss": 0.6863086223602295, |
|
"eval_runtime": 43.2951, |
|
"eval_samples_per_second": 4.619, |
|
"eval_steps_per_second": 0.577, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"grad_norm": 0.6380097809956626, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6851, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"eval_loss": 0.6859965324401855, |
|
"eval_runtime": 44.9257, |
|
"eval_samples_per_second": 4.452, |
|
"eval_steps_per_second": 0.556, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"grad_norm": 0.5870799307885896, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7367, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"eval_loss": 0.6856269836425781, |
|
"eval_runtime": 44.8384, |
|
"eval_samples_per_second": 4.46, |
|
"eval_steps_per_second": 0.558, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.6115022356518031, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6814, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"eval_loss": 0.6856591701507568, |
|
"eval_runtime": 42.9528, |
|
"eval_samples_per_second": 4.656, |
|
"eval_steps_per_second": 0.582, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"grad_norm": 0.6655918462314045, |
|
"learning_rate": 2e-05, |
|
"loss": 0.657, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"eval_loss": 0.6854197978973389, |
|
"eval_runtime": 43.2366, |
|
"eval_samples_per_second": 4.626, |
|
"eval_steps_per_second": 0.578, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"grad_norm": 0.6102352184035382, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6343, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"eval_loss": 0.6852834820747375, |
|
"eval_runtime": 43.1789, |
|
"eval_samples_per_second": 4.632, |
|
"eval_steps_per_second": 0.579, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"grad_norm": 0.6354143085331753, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6736, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"eval_loss": 0.6851873993873596, |
|
"eval_runtime": 44.5173, |
|
"eval_samples_per_second": 4.493, |
|
"eval_steps_per_second": 0.562, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"grad_norm": 0.6069083787831553, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6466, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"eval_loss": 0.6846270561218262, |
|
"eval_runtime": 44.7412, |
|
"eval_samples_per_second": 4.47, |
|
"eval_steps_per_second": 0.559, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"grad_norm": 0.5918704953369675, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7174, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"eval_loss": 0.6842523217201233, |
|
"eval_runtime": 46.0503, |
|
"eval_samples_per_second": 4.343, |
|
"eval_steps_per_second": 0.543, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"grad_norm": 0.5824866849171524, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6955, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"eval_loss": 0.6838890314102173, |
|
"eval_runtime": 44.5781, |
|
"eval_samples_per_second": 4.487, |
|
"eval_steps_per_second": 0.561, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"grad_norm": 0.6278777152900226, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6926, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"eval_loss": 0.6827735900878906, |
|
"eval_runtime": 44.483, |
|
"eval_samples_per_second": 4.496, |
|
"eval_steps_per_second": 0.562, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.6627082254561003, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6931, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"eval_loss": 0.6818405389785767, |
|
"eval_runtime": 46.0477, |
|
"eval_samples_per_second": 4.343, |
|
"eval_steps_per_second": 0.543, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"grad_norm": 0.6551951149808454, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6386, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"eval_loss": 0.6824897527694702, |
|
"eval_runtime": 47.3712, |
|
"eval_samples_per_second": 4.222, |
|
"eval_steps_per_second": 0.528, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"grad_norm": 0.6821330786477059, |
|
"learning_rate": 2e-05, |
|
"loss": 0.635, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"eval_loss": 0.6829469203948975, |
|
"eval_runtime": 46.2003, |
|
"eval_samples_per_second": 4.329, |
|
"eval_steps_per_second": 0.541, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"grad_norm": 0.7440273168609611, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7286, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"eval_loss": 0.6824621558189392, |
|
"eval_runtime": 45.8201, |
|
"eval_samples_per_second": 4.365, |
|
"eval_steps_per_second": 0.546, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"grad_norm": 0.7007032012854347, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7376, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"eval_loss": 0.6805981397628784, |
|
"eval_runtime": 45.7474, |
|
"eval_samples_per_second": 4.372, |
|
"eval_steps_per_second": 0.546, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"grad_norm": 0.6422764032088494, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6959, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"eval_loss": 0.679237961769104, |
|
"eval_runtime": 48.4646, |
|
"eval_samples_per_second": 4.127, |
|
"eval_steps_per_second": 0.516, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"grad_norm": 0.7159695125034813, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6894, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"eval_loss": 0.6775233745574951, |
|
"eval_runtime": 47.4563, |
|
"eval_samples_per_second": 4.214, |
|
"eval_steps_per_second": 0.527, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"grad_norm": 0.6358380926544867, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7073, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"eval_loss": 0.6766613721847534, |
|
"eval_runtime": 47.4483, |
|
"eval_samples_per_second": 4.215, |
|
"eval_steps_per_second": 0.527, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.6716901613635139, |
|
"learning_rate": 2e-05, |
|
"loss": 0.76, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.6770586371421814, |
|
"eval_runtime": 47.0209, |
|
"eval_samples_per_second": 4.253, |
|
"eval_steps_per_second": 0.532, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.03125, |
|
"grad_norm": 0.5953096184448028, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6798, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 7.03125, |
|
"eval_loss": 0.6774635314941406, |
|
"eval_runtime": 51.4624, |
|
"eval_samples_per_second": 3.886, |
|
"eval_steps_per_second": 0.486, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 7.0625, |
|
"grad_norm": 0.6549589081607252, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6122, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 7.0625, |
|
"eval_loss": 0.6784033179283142, |
|
"eval_runtime": 45.8732, |
|
"eval_samples_per_second": 4.36, |
|
"eval_steps_per_second": 0.545, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 7.09375, |
|
"grad_norm": 0.6573259751745981, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6829, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 7.09375, |
|
"eval_loss": 0.6796069145202637, |
|
"eval_runtime": 44.2994, |
|
"eval_samples_per_second": 4.515, |
|
"eval_steps_per_second": 0.564, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"grad_norm": 0.725599779122791, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6336, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"eval_loss": 0.681220531463623, |
|
"eval_runtime": 45.7641, |
|
"eval_samples_per_second": 4.37, |
|
"eval_steps_per_second": 0.546, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 7.15625, |
|
"grad_norm": 0.7811517272176121, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6387, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 7.15625, |
|
"eval_loss": 0.6828885674476624, |
|
"eval_runtime": 44.7953, |
|
"eval_samples_per_second": 4.465, |
|
"eval_steps_per_second": 0.558, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"grad_norm": 0.6760384395465522, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6245, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"eval_loss": 0.6845852732658386, |
|
"eval_runtime": 44.3812, |
|
"eval_samples_per_second": 4.506, |
|
"eval_steps_per_second": 0.563, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 7.21875, |
|
"grad_norm": 0.7361186814868562, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7128, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 7.21875, |
|
"eval_loss": 0.685402512550354, |
|
"eval_runtime": 44.3763, |
|
"eval_samples_per_second": 4.507, |
|
"eval_steps_per_second": 0.563, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.7299978196751681, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7176, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"eval_loss": 0.685026228427887, |
|
"eval_runtime": 44.3181, |
|
"eval_samples_per_second": 4.513, |
|
"eval_steps_per_second": 0.564, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 7.28125, |
|
"grad_norm": 0.8584091654553072, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6653, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 7.28125, |
|
"eval_loss": 0.6831257343292236, |
|
"eval_runtime": 44.3805, |
|
"eval_samples_per_second": 4.506, |
|
"eval_steps_per_second": 0.563, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 7.3125, |
|
"grad_norm": 0.6919046534495772, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6968, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 7.3125, |
|
"eval_loss": 0.6820144653320312, |
|
"eval_runtime": 44.3397, |
|
"eval_samples_per_second": 4.511, |
|
"eval_steps_per_second": 0.564, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 7.34375, |
|
"grad_norm": 0.6716381808914595, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6626, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.34375, |
|
"eval_loss": 0.6815916299819946, |
|
"eval_runtime": 44.2997, |
|
"eval_samples_per_second": 4.515, |
|
"eval_steps_per_second": 0.564, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"grad_norm": 0.7098466238055623, |
|
"learning_rate": 2e-05, |
|
"loss": 0.629, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"eval_loss": 0.681601881980896, |
|
"eval_runtime": 44.2722, |
|
"eval_samples_per_second": 4.518, |
|
"eval_steps_per_second": 0.565, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 7.40625, |
|
"grad_norm": 0.7700763843474521, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6796, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 7.40625, |
|
"eval_loss": 0.6809589862823486, |
|
"eval_runtime": 44.4518, |
|
"eval_samples_per_second": 4.499, |
|
"eval_steps_per_second": 0.562, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 7.4375, |
|
"grad_norm": 0.7925088234539602, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6722, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.4375, |
|
"eval_loss": 0.6801493763923645, |
|
"eval_runtime": 44.4078, |
|
"eval_samples_per_second": 4.504, |
|
"eval_steps_per_second": 0.563, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.46875, |
|
"grad_norm": 0.6778717561377235, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6889, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 7.46875, |
|
"eval_loss": 0.6798510551452637, |
|
"eval_runtime": 44.3303, |
|
"eval_samples_per_second": 4.512, |
|
"eval_steps_per_second": 0.564, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.6683599876699755, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6383, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"eval_loss": 0.6800721883773804, |
|
"eval_runtime": 44.5868, |
|
"eval_samples_per_second": 4.486, |
|
"eval_steps_per_second": 0.561, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.53125, |
|
"grad_norm": 0.6242371910913779, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6809, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 7.53125, |
|
"eval_loss": 0.6809727549552917, |
|
"eval_runtime": 44.7112, |
|
"eval_samples_per_second": 4.473, |
|
"eval_steps_per_second": 0.559, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 7.5625, |
|
"grad_norm": 0.6966989602775038, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6777, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 7.5625, |
|
"eval_loss": 0.6819994449615479, |
|
"eval_runtime": 44.3272, |
|
"eval_samples_per_second": 4.512, |
|
"eval_steps_per_second": 0.564, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 7.59375, |
|
"grad_norm": 0.7373050917062219, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6622, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 7.59375, |
|
"eval_loss": 0.6821829080581665, |
|
"eval_runtime": 46.0527, |
|
"eval_samples_per_second": 4.343, |
|
"eval_steps_per_second": 0.543, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"grad_norm": 0.8266617785650243, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7248, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"eval_loss": 0.6813778877258301, |
|
"eval_runtime": 45.7663, |
|
"eval_samples_per_second": 4.37, |
|
"eval_steps_per_second": 0.546, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"grad_norm": 0.7459146574284048, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6301, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"eval_loss": 0.6811490654945374, |
|
"eval_runtime": 45.9361, |
|
"eval_samples_per_second": 4.354, |
|
"eval_steps_per_second": 0.544, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.6875, |
|
"grad_norm": 0.7612602223178182, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6713, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 7.6875, |
|
"eval_loss": 0.6800392866134644, |
|
"eval_runtime": 46.3359, |
|
"eval_samples_per_second": 4.316, |
|
"eval_steps_per_second": 0.54, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 7.71875, |
|
"grad_norm": 0.7391445622441601, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6721, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 7.71875, |
|
"eval_loss": 0.6794085502624512, |
|
"eval_runtime": 46.9877, |
|
"eval_samples_per_second": 4.256, |
|
"eval_steps_per_second": 0.532, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.7019243161207622, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6578, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"eval_loss": 0.6786046624183655, |
|
"eval_runtime": 46.0364, |
|
"eval_samples_per_second": 4.344, |
|
"eval_steps_per_second": 0.543, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 7.78125, |
|
"grad_norm": 0.7933438921741315, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7023, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 7.78125, |
|
"eval_loss": 0.6770951747894287, |
|
"eval_runtime": 45.6655, |
|
"eval_samples_per_second": 4.38, |
|
"eval_steps_per_second": 0.547, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"grad_norm": 0.7313927502966258, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7114, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"eval_loss": 0.6766157746315002, |
|
"eval_runtime": 45.7602, |
|
"eval_samples_per_second": 4.371, |
|
"eval_steps_per_second": 0.546, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 7.84375, |
|
"grad_norm": 0.7235467321597684, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6259, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 7.84375, |
|
"eval_loss": 0.6770395040512085, |
|
"eval_runtime": 46.9839, |
|
"eval_samples_per_second": 4.257, |
|
"eval_steps_per_second": 0.532, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"grad_norm": 0.773244621810685, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6262, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"eval_loss": 0.6780049800872803, |
|
"eval_runtime": 46.9808, |
|
"eval_samples_per_second": 4.257, |
|
"eval_steps_per_second": 0.532, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 7.90625, |
|
"grad_norm": 0.7620627775664955, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7219, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 7.90625, |
|
"eval_loss": 0.6781153678894043, |
|
"eval_runtime": 49.7208, |
|
"eval_samples_per_second": 4.022, |
|
"eval_steps_per_second": 0.503, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 7.9375, |
|
"grad_norm": 0.7332381519045823, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6777, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 7.9375, |
|
"eval_loss": 0.6787923574447632, |
|
"eval_runtime": 43.1001, |
|
"eval_samples_per_second": 4.64, |
|
"eval_steps_per_second": 0.58, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"grad_norm": 0.7847956878083815, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5983, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"eval_loss": 0.6779956817626953, |
|
"eval_runtime": 43.1273, |
|
"eval_samples_per_second": 4.637, |
|
"eval_steps_per_second": 0.58, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.7095399891563587, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6609, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.677204430103302, |
|
"eval_runtime": 43.0632, |
|
"eval_samples_per_second": 4.644, |
|
"eval_steps_per_second": 0.581, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 8.03125, |
|
"grad_norm": 0.7654004838243704, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6297, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 8.03125, |
|
"eval_loss": 0.6774580478668213, |
|
"eval_runtime": 50.3948, |
|
"eval_samples_per_second": 3.969, |
|
"eval_steps_per_second": 0.496, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 8.0625, |
|
"grad_norm": 0.7337064337590912, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6446, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 8.0625, |
|
"eval_loss": 0.6788855195045471, |
|
"eval_runtime": 43.163, |
|
"eval_samples_per_second": 4.634, |
|
"eval_steps_per_second": 0.579, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 8.09375, |
|
"grad_norm": 0.7426969285671609, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6732, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 8.09375, |
|
"eval_loss": 0.6811656355857849, |
|
"eval_runtime": 43.113, |
|
"eval_samples_per_second": 4.639, |
|
"eval_steps_per_second": 0.58, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"grad_norm": 0.8495552484217858, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6857, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"eval_loss": 0.6831929683685303, |
|
"eval_runtime": 43.0506, |
|
"eval_samples_per_second": 4.646, |
|
"eval_steps_per_second": 0.581, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 8.15625, |
|
"grad_norm": 0.8137654207236353, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6076, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 8.15625, |
|
"eval_loss": 0.685956597328186, |
|
"eval_runtime": 43.6958, |
|
"eval_samples_per_second": 4.577, |
|
"eval_steps_per_second": 0.572, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 8.1875, |
|
"grad_norm": 0.7920289131050305, |
|
"learning_rate": 2e-05, |
|
"loss": 0.723, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 8.1875, |
|
"eval_loss": 0.6895143389701843, |
|
"eval_runtime": 44.5485, |
|
"eval_samples_per_second": 4.489, |
|
"eval_steps_per_second": 0.561, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 8.21875, |
|
"grad_norm": 0.9058951636873679, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5836, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 8.21875, |
|
"eval_loss": 0.6920652985572815, |
|
"eval_runtime": 43.2986, |
|
"eval_samples_per_second": 4.619, |
|
"eval_steps_per_second": 0.577, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.8945234539908303, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6484, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"eval_loss": 0.69307541847229, |
|
"eval_runtime": 43.0302, |
|
"eval_samples_per_second": 4.648, |
|
"eval_steps_per_second": 0.581, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 8.28125, |
|
"grad_norm": 0.9973855113532047, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6735, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 8.28125, |
|
"eval_loss": 0.6918882727622986, |
|
"eval_runtime": 43.035, |
|
"eval_samples_per_second": 4.647, |
|
"eval_steps_per_second": 0.581, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 8.3125, |
|
"grad_norm": 0.8604633375599925, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6618, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 8.3125, |
|
"eval_loss": 0.6895372867584229, |
|
"eval_runtime": 43.31, |
|
"eval_samples_per_second": 4.618, |
|
"eval_steps_per_second": 0.577, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 8.34375, |
|
"grad_norm": 0.8414418828391491, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5879, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 8.34375, |
|
"eval_loss": 0.687466561794281, |
|
"eval_runtime": 43.1943, |
|
"eval_samples_per_second": 4.63, |
|
"eval_steps_per_second": 0.579, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 8.375, |
|
"grad_norm": 0.9186307751895403, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6488, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 8.375, |
|
"eval_loss": 0.6843683123588562, |
|
"eval_runtime": 43.0073, |
|
"eval_samples_per_second": 4.65, |
|
"eval_steps_per_second": 0.581, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 8.40625, |
|
"grad_norm": 0.8308076771594943, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6357, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 8.40625, |
|
"eval_loss": 0.6821109056472778, |
|
"eval_runtime": 43.2217, |
|
"eval_samples_per_second": 4.627, |
|
"eval_steps_per_second": 0.578, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"grad_norm": 0.7743250830620387, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6408, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"eval_loss": 0.6811809539794922, |
|
"eval_runtime": 44.8789, |
|
"eval_samples_per_second": 4.456, |
|
"eval_steps_per_second": 0.557, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 8.46875, |
|
"grad_norm": 0.8351441656367814, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5387, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 8.46875, |
|
"eval_loss": 0.6824797987937927, |
|
"eval_runtime": 43.2723, |
|
"eval_samples_per_second": 4.622, |
|
"eval_steps_per_second": 0.578, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.7808346216305826, |
|
"learning_rate": 2e-05, |
|
"loss": 0.637, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"eval_loss": 0.6853922009468079, |
|
"eval_runtime": 43.1091, |
|
"eval_samples_per_second": 4.639, |
|
"eval_steps_per_second": 0.58, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 8.53125, |
|
"grad_norm": 0.8566382439854656, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6524, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 8.53125, |
|
"eval_loss": 0.6853267550468445, |
|
"eval_runtime": 43.1515, |
|
"eval_samples_per_second": 4.635, |
|
"eval_steps_per_second": 0.579, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 8.5625, |
|
"grad_norm": 0.872427052560813, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6393, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 8.5625, |
|
"eval_loss": 0.6836146712303162, |
|
"eval_runtime": 44.9084, |
|
"eval_samples_per_second": 4.454, |
|
"eval_steps_per_second": 0.557, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 8.59375, |
|
"grad_norm": 0.8437899827314175, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6506, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 8.59375, |
|
"eval_loss": 0.6817864179611206, |
|
"eval_runtime": 44.8879, |
|
"eval_samples_per_second": 4.456, |
|
"eval_steps_per_second": 0.557, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 8.625, |
|
"grad_norm": 0.8790612317241222, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6442, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 8.625, |
|
"eval_loss": 0.6796035766601562, |
|
"eval_runtime": 43.3127, |
|
"eval_samples_per_second": 4.618, |
|
"eval_steps_per_second": 0.577, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 8.65625, |
|
"grad_norm": 0.8158092597576191, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5893, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 8.65625, |
|
"eval_loss": 0.6795459985733032, |
|
"eval_runtime": 44.6925, |
|
"eval_samples_per_second": 4.475, |
|
"eval_steps_per_second": 0.559, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 8.6875, |
|
"grad_norm": 0.878065597316925, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6418, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 8.6875, |
|
"eval_loss": 0.6804844737052917, |
|
"eval_runtime": 43.1496, |
|
"eval_samples_per_second": 4.635, |
|
"eval_steps_per_second": 0.579, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 8.71875, |
|
"grad_norm": 0.8184085366861941, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6007, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 8.71875, |
|
"eval_loss": 0.6821385025978088, |
|
"eval_runtime": 44.9156, |
|
"eval_samples_per_second": 4.453, |
|
"eval_steps_per_second": 0.557, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.9005368790411379, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6683, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"eval_loss": 0.6848174333572388, |
|
"eval_runtime": 43.544, |
|
"eval_samples_per_second": 4.593, |
|
"eval_steps_per_second": 0.574, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 8.78125, |
|
"grad_norm": 0.8154265443661354, |
|
"learning_rate": 2e-05, |
|
"loss": 0.612, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 8.78125, |
|
"eval_loss": 0.6864734888076782, |
|
"eval_runtime": 44.5814, |
|
"eval_samples_per_second": 4.486, |
|
"eval_steps_per_second": 0.561, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 8.8125, |
|
"grad_norm": 0.8905054313305548, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5992, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 8.8125, |
|
"eval_loss": 0.6864038109779358, |
|
"eval_runtime": 44.0748, |
|
"eval_samples_per_second": 4.538, |
|
"eval_steps_per_second": 0.567, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 8.84375, |
|
"grad_norm": 0.8492838619646935, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5775, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 8.84375, |
|
"eval_loss": 0.686205267906189, |
|
"eval_runtime": 44.1813, |
|
"eval_samples_per_second": 4.527, |
|
"eval_steps_per_second": 0.566, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 8.875, |
|
"grad_norm": 0.8739982729224768, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5447, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 8.875, |
|
"eval_loss": 0.6865501403808594, |
|
"eval_runtime": 46.5428, |
|
"eval_samples_per_second": 4.297, |
|
"eval_steps_per_second": 0.537, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 8.90625, |
|
"grad_norm": 0.9936570525936491, |
|
"learning_rate": 2e-05, |
|
"loss": 0.59, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 8.90625, |
|
"eval_loss": 0.6856868267059326, |
|
"eval_runtime": 44.6352, |
|
"eval_samples_per_second": 4.481, |
|
"eval_steps_per_second": 0.56, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 8.9375, |
|
"grad_norm": 0.9517307402112732, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7253, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 8.9375, |
|
"eval_loss": 0.6847086548805237, |
|
"eval_runtime": 47.1289, |
|
"eval_samples_per_second": 4.244, |
|
"eval_steps_per_second": 0.53, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 8.96875, |
|
"grad_norm": 0.8541430299481336, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6436, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 8.96875, |
|
"eval_loss": 0.6847487092018127, |
|
"eval_runtime": 46.395, |
|
"eval_samples_per_second": 4.311, |
|
"eval_steps_per_second": 0.539, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.9356185152979635, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5919, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 0.6830996870994568, |
|
"eval_runtime": 45.0389, |
|
"eval_samples_per_second": 4.441, |
|
"eval_steps_per_second": 0.555, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 9.03125, |
|
"grad_norm": 0.895841912664687, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6074, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 9.03125, |
|
"eval_loss": 0.6805940866470337, |
|
"eval_runtime": 43.275, |
|
"eval_samples_per_second": 4.622, |
|
"eval_steps_per_second": 0.578, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"grad_norm": 0.8181374187415763, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6233, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"eval_loss": 0.679899275302887, |
|
"eval_runtime": 43.4137, |
|
"eval_samples_per_second": 4.607, |
|
"eval_steps_per_second": 0.576, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 9.09375, |
|
"grad_norm": 0.8491986564498026, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6262, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 9.09375, |
|
"eval_loss": 0.682360053062439, |
|
"eval_runtime": 45.2147, |
|
"eval_samples_per_second": 4.423, |
|
"eval_steps_per_second": 0.553, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 9.125, |
|
"grad_norm": 0.9355368723165358, |
|
"learning_rate": 2e-05, |
|
"loss": 0.595, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 9.125, |
|
"eval_loss": 0.6852359175682068, |
|
"eval_runtime": 44.335, |
|
"eval_samples_per_second": 4.511, |
|
"eval_steps_per_second": 0.564, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 9.15625, |
|
"grad_norm": 0.9196086439363605, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6534, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 9.15625, |
|
"eval_loss": 0.6872662305831909, |
|
"eval_runtime": 46.5007, |
|
"eval_samples_per_second": 4.301, |
|
"eval_steps_per_second": 0.538, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 9.1875, |
|
"grad_norm": 0.8393737542433595, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5908, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 9.1875, |
|
"eval_loss": 0.6902926564216614, |
|
"eval_runtime": 46.1133, |
|
"eval_samples_per_second": 4.337, |
|
"eval_steps_per_second": 0.542, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 9.21875, |
|
"grad_norm": 1.00507877022181, |
|
"learning_rate": 2e-05, |
|
"loss": 0.536, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 9.21875, |
|
"eval_loss": 0.6969813704490662, |
|
"eval_runtime": 45.209, |
|
"eval_samples_per_second": 4.424, |
|
"eval_steps_per_second": 0.553, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.9241164807887086, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5562, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"eval_loss": 0.7055781483650208, |
|
"eval_runtime": 44.1347, |
|
"eval_samples_per_second": 4.532, |
|
"eval_steps_per_second": 0.566, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 9.28125, |
|
"grad_norm": 1.085449108925152, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6582, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 9.28125, |
|
"eval_loss": 0.7090529799461365, |
|
"eval_runtime": 46.0924, |
|
"eval_samples_per_second": 4.339, |
|
"eval_steps_per_second": 0.542, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 9.3125, |
|
"grad_norm": 1.2857794830276748, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5942, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 9.3125, |
|
"eval_loss": 0.7092991471290588, |
|
"eval_runtime": 45.9455, |
|
"eval_samples_per_second": 4.353, |
|
"eval_steps_per_second": 0.544, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 9.34375, |
|
"grad_norm": 1.1012657793973455, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5681, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 9.34375, |
|
"eval_loss": 0.7078263759613037, |
|
"eval_runtime": 44.3361, |
|
"eval_samples_per_second": 4.511, |
|
"eval_steps_per_second": 0.564, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"grad_norm": 1.0150133491916107, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5829, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"eval_loss": 0.7039945721626282, |
|
"eval_runtime": 46.0368, |
|
"eval_samples_per_second": 4.344, |
|
"eval_steps_per_second": 0.543, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 9.40625, |
|
"grad_norm": 1.0183449928898174, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5622, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 9.40625, |
|
"eval_loss": 0.6970013380050659, |
|
"eval_runtime": 44.3071, |
|
"eval_samples_per_second": 4.514, |
|
"eval_steps_per_second": 0.564, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 9.4375, |
|
"grad_norm": 1.160561076731859, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6207, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 9.4375, |
|
"eval_loss": 0.6882898211479187, |
|
"eval_runtime": 44.3423, |
|
"eval_samples_per_second": 4.51, |
|
"eval_steps_per_second": 0.564, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 9.46875, |
|
"grad_norm": 0.9775130871533282, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6121, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 9.46875, |
|
"eval_loss": 0.6842953562736511, |
|
"eval_runtime": 45.0998, |
|
"eval_samples_per_second": 4.435, |
|
"eval_steps_per_second": 0.554, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.8440645832373606, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6495, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"eval_loss": 0.6841378808021545, |
|
"eval_runtime": 44.4679, |
|
"eval_samples_per_second": 4.498, |
|
"eval_steps_per_second": 0.562, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 9.53125, |
|
"grad_norm": 0.9112261594523882, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6188, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 9.53125, |
|
"eval_loss": 0.6845135688781738, |
|
"eval_runtime": 44.4427, |
|
"eval_samples_per_second": 4.5, |
|
"eval_steps_per_second": 0.563, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 9.5625, |
|
"grad_norm": 1.0253409237396724, |
|
"learning_rate": 2e-05, |
|
"loss": 0.602, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 9.5625, |
|
"eval_loss": 0.6839584112167358, |
|
"eval_runtime": 44.1975, |
|
"eval_samples_per_second": 4.525, |
|
"eval_steps_per_second": 0.566, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 9.59375, |
|
"grad_norm": 1.0395385110757185, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6007, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 9.59375, |
|
"eval_loss": 0.6852008104324341, |
|
"eval_runtime": 44.4015, |
|
"eval_samples_per_second": 4.504, |
|
"eval_steps_per_second": 0.563, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 9.625, |
|
"grad_norm": 0.9468230481893222, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6376, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 9.625, |
|
"eval_loss": 0.6902636885643005, |
|
"eval_runtime": 45.6849, |
|
"eval_samples_per_second": 4.378, |
|
"eval_steps_per_second": 0.547, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 9.65625, |
|
"grad_norm": 0.9298141136824676, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6094, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 9.65625, |
|
"eval_loss": 0.6970698833465576, |
|
"eval_runtime": 44.2879, |
|
"eval_samples_per_second": 4.516, |
|
"eval_steps_per_second": 0.564, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"grad_norm": 1.2537810836544294, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6049, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"eval_loss": 0.6991828083992004, |
|
"eval_runtime": 46.2429, |
|
"eval_samples_per_second": 4.325, |
|
"eval_steps_per_second": 0.541, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 9.71875, |
|
"grad_norm": 1.082420692181638, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5241, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 9.71875, |
|
"eval_loss": 0.7002778649330139, |
|
"eval_runtime": 44.2468, |
|
"eval_samples_per_second": 4.52, |
|
"eval_steps_per_second": 0.565, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 1.0383910110357883, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6162, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"eval_loss": 0.7004844546318054, |
|
"eval_runtime": 44.357, |
|
"eval_samples_per_second": 4.509, |
|
"eval_steps_per_second": 0.564, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 9.78125, |
|
"grad_norm": 0.9375392905585037, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6082, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 9.78125, |
|
"eval_loss": 0.6998957991600037, |
|
"eval_runtime": 44.3911, |
|
"eval_samples_per_second": 4.505, |
|
"eval_steps_per_second": 0.563, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 9.8125, |
|
"grad_norm": 1.080227501802435, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5826, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 9.8125, |
|
"eval_loss": 0.698168158531189, |
|
"eval_runtime": 44.2481, |
|
"eval_samples_per_second": 4.52, |
|
"eval_steps_per_second": 0.565, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 9.84375, |
|
"grad_norm": 0.9707388919250783, |
|
"learning_rate": 2e-05, |
|
"loss": 0.61, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 9.84375, |
|
"eval_loss": 0.6951956152915955, |
|
"eval_runtime": 44.4353, |
|
"eval_samples_per_second": 4.501, |
|
"eval_steps_per_second": 0.563, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 9.875, |
|
"grad_norm": 0.9491238644745222, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5957, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 9.875, |
|
"eval_loss": 0.6926063299179077, |
|
"eval_runtime": 45.2893, |
|
"eval_samples_per_second": 4.416, |
|
"eval_steps_per_second": 0.552, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 9.90625, |
|
"grad_norm": 1.0530872213679219, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5611, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 9.90625, |
|
"eval_loss": 0.6899718642234802, |
|
"eval_runtime": 45.2963, |
|
"eval_samples_per_second": 4.415, |
|
"eval_steps_per_second": 0.552, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 9.9375, |
|
"grad_norm": 1.0052684640770637, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5838, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 9.9375, |
|
"eval_loss": 0.6875657439231873, |
|
"eval_runtime": 45.4935, |
|
"eval_samples_per_second": 4.396, |
|
"eval_steps_per_second": 0.55, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 9.96875, |
|
"grad_norm": 1.1010229534386275, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6106, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 9.96875, |
|
"eval_loss": 0.6842039227485657, |
|
"eval_runtime": 46.6347, |
|
"eval_samples_per_second": 4.289, |
|
"eval_steps_per_second": 0.536, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.0125768255592298, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5698, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.6834940910339355, |
|
"eval_runtime": 45.1875, |
|
"eval_samples_per_second": 4.426, |
|
"eval_steps_per_second": 0.553, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 320, |
|
"total_flos": 414794833330176.0, |
|
"train_loss": 0.12300790920853615, |
|
"train_runtime": 3683.354, |
|
"train_samples_per_second": 2.715, |
|
"train_steps_per_second": 0.087 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 320, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 5, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 414794833330176.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|