|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 8969, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 468.0, |
|
"learning_rate": 0.001, |
|
"loss": 11.4202, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 30.75, |
|
"learning_rate": 0.001, |
|
"loss": 19.6347, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 14.0, |
|
"learning_rate": 0.001, |
|
"loss": 10.8315, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 0.001, |
|
"loss": 9.9845, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 7.65625, |
|
"learning_rate": 0.001, |
|
"loss": 9.1775, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 0.001, |
|
"loss": 8.5303, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 0.001, |
|
"loss": 8.0888, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 0.001, |
|
"loss": 7.8222, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 0.001, |
|
"loss": 7.3621, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 0.001, |
|
"loss": 7.1087, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 0.001, |
|
"loss": 6.9641, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 0.001, |
|
"loss": 6.7323, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 0.001, |
|
"loss": 6.5935, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 0.001, |
|
"loss": 6.7137, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 0.001, |
|
"loss": 6.4158, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 0.001, |
|
"loss": 6.3833, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 0.001, |
|
"loss": 6.4733, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 0.001, |
|
"loss": 6.2427, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 0.001, |
|
"loss": 6.2661, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 0.001, |
|
"loss": 6.1774, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 0.001, |
|
"loss": 6.1615, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.001, |
|
"loss": 6.1184, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 0.001, |
|
"loss": 5.9304, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 0.001, |
|
"loss": 5.9465, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 0.001, |
|
"loss": 5.8792, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 0.001, |
|
"loss": 5.768, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.001, |
|
"loss": 5.7801, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 0.001, |
|
"loss": 5.7555, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 0.001, |
|
"loss": 5.5612, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.001, |
|
"loss": 5.6745, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.001, |
|
"loss": 5.6213, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 0.001, |
|
"loss": 5.5163, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 0.001, |
|
"loss": 5.4944, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 0.001, |
|
"loss": 5.5869, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 0.001, |
|
"loss": 5.4195, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 0.001, |
|
"loss": 5.3747, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 0.001, |
|
"loss": 5.3658, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 0.001, |
|
"loss": 5.2998, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 0.001, |
|
"loss": 5.4196, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.001, |
|
"loss": 5.3192, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 0.001, |
|
"loss": 5.2397, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 0.001, |
|
"loss": 5.2464, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.001, |
|
"loss": 5.2192, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 0.001, |
|
"loss": 5.1029, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 0.001, |
|
"loss": 5.1451, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 0.001, |
|
"loss": 5.1545, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 0.001, |
|
"loss": 5.0767, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 0.001, |
|
"loss": 5.0355, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 0.001, |
|
"loss": 5.0593, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 0.001, |
|
"loss": 5.0839, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 4.9712, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.001, |
|
"loss": 4.968, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 0.001, |
|
"loss": 5.0095, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 0.001, |
|
"loss": 4.9378, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.001, |
|
"loss": 4.9852, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.75, |
|
"learning_rate": 0.001, |
|
"loss": 4.9289, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.001, |
|
"loss": 4.9237, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.001, |
|
"loss": 4.7961, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 0.001, |
|
"loss": 4.8586, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 0.001, |
|
"loss": 4.7868, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.001, |
|
"loss": 4.7914, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.001, |
|
"loss": 4.8173, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.001, |
|
"loss": 4.8026, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 4.7621, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.001, |
|
"loss": 4.7661, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 0.001, |
|
"loss": 4.7696, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.001, |
|
"loss": 4.657, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 0.001, |
|
"loss": 4.7683, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 0.001, |
|
"loss": 4.6882, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.001, |
|
"loss": 4.7274, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 0.001, |
|
"loss": 4.7224, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 0.001, |
|
"loss": 4.753, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 4.5646, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 0.001, |
|
"loss": 4.6671, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 0.001, |
|
"loss": 4.6817, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.001, |
|
"loss": 4.5382, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.001, |
|
"loss": 4.5105, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.001, |
|
"loss": 4.6714, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 4.5989, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 0.001, |
|
"loss": 4.6162, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.001, |
|
"loss": 4.5086, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.001, |
|
"loss": 4.5757, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.001, |
|
"loss": 4.5496, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 4.5085, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 4.4707, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 4.4716, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 0.001, |
|
"loss": 4.5159, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 0.001, |
|
"loss": 4.5706, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 0.001, |
|
"loss": 4.4737, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.001, |
|
"loss": 4.4327, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 0.001, |
|
"loss": 4.4394, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 0.001, |
|
"loss": 4.495, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 4.3558, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 4.3452, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 4.3795, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.001, |
|
"loss": 4.3633, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 0.001, |
|
"loss": 4.3212, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 4.4133, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 4.3349, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 4.355, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 4.2691, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 4.2382, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 4.3767, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.001, |
|
"loss": 4.323, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 0.001, |
|
"loss": 4.3758, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 0.001, |
|
"loss": 4.3027, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 4.3068, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.001, |
|
"loss": 4.3135, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 4.2856, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 4.291, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 4.262, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 4.3267, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 4.268, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 0.001, |
|
"loss": 4.3043, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 4.083, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 4.1305, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 4.1327, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 4.2689, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.001, |
|
"loss": 4.1684, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 4.1342, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 4.1759, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 4.1624, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.001, |
|
"loss": 4.201, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 4.2148, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 4.1464, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 4.1184, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.001, |
|
"loss": 4.027, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.001, |
|
"loss": 4.0665, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 4.1904, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.001, |
|
"loss": 4.0893, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 4.0258, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 4.0683, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 4.0854, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.9921, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 0.001, |
|
"loss": 4.1034, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 4.119, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.001, |
|
"loss": 4.1259, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 4.0325, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 4.002, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 4.1466, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 3.9527, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 4.1504, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.001, |
|
"loss": 4.0782, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 3.9826, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 4.0219, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 4.0031, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 3.9652, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 3.9176, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 3.9693, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 4.1027, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 3.9645, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.8947, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.001, |
|
"loss": 4.015, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.9853, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 3.9032, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.9793, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 3.8555, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.853, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.7918, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 3.8864, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.9658, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 3.9346, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.9032, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 3.8612, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.8483, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 0.001, |
|
"loss": 3.8366, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 3.8611, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 3.8658, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.7546, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 3.8955, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 3.8909, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7381, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.9318, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 3.8855, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 3.9868, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.837, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 3.7942, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7073, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.7289, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.7217, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 3.8807, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 3.6495, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.001, |
|
"loss": 3.7232, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7907, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7654, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.6906, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.7171, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7887, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.841, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 3.7338, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.7955, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 3.7954, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.6753, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.731, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 3.7734, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.7521, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.001, |
|
"loss": 3.786, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7505, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.6746, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.6631, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 3.6193, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 3.6175, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.8167, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.7462, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7337, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 3.8234, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.7221, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.61, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 3.696, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 3.6931, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 3.8107, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7183, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.703, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7271, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.805, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.6972, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 3.6404, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.6821, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 3.718, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.6238, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 0.001, |
|
"loss": 3.727, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.6449, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.7375, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7099, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.7355, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.001, |
|
"loss": 3.6202, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.6891, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.6321, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.6889, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 3.6289, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.6676, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.5734, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.536, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.5922, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.5445, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.6754, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.5503, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 3.6787, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.6661, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.629, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.6414, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.6187, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.6122, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.6192, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 3.6335, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 3.5721, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.5466, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 3.5322, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.5597, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.583, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.6154, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.5325, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.5572, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.5091, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.6152, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.4866, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.5305, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.5229, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.4918, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.5985, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.467, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 3.5325, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.591, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 3.4826, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 3.4778, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.5079, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.5206, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.5359, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.5054, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3876, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 3.5013, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.4053, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 3.4865, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 3.556, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.4782, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.4029, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 3.5246, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.4893, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.4819, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 3.5376, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.4777, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.438, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.4363, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.5214, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.5338, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3541, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 3.4375, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 3.4896, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.5238, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3979, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.3255, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.4087, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.3471, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3955, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.4651, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.4859, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3995, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.5291, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.5589, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.3669, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 3.373, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.4424, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3791, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.4511, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.001, |
|
"loss": 3.3719, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3859, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.414, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.4589, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3673, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 3.399, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.3631, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3376, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2622, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.3692, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 3.401, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3968, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.4257, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3389, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.3529, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 3.4625, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3096, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3768, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2796, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2988, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.4475, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3677, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2559, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2786, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 3.3831, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3864, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.3959, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2918, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2775, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.3034, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 3.4116, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3383, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.3282, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.2493, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2698, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2708, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2881, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 3.3755, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 3.3396, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3008, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 3.2302, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3688, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2136, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3421, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2805, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2745, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.2147, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1843, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2455, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 3.2143, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1009, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2751, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3078, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2281, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1855, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.3936, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1809, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.2644, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.4201, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2126, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3431, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.3002, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3268, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2517, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.3158, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1521, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 3.1654, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.198, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1798, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.192, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2478, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 3.165, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2028, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2514, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2128, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1707, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 3.2027, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3155, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1595, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 3.2946, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2708, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2351, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2298, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2229, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.2835, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 3.258, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.3006, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1846, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2121, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0832, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2398, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1761, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.1958, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2419, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1544, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 3.1974, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.0909, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1775, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0954, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1065, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 3.2287, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1186, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2063, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2042, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.001, |
|
"loss": 3.208, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 3.203, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0422, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.3197, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2007, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1827, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1037, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0695, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.1399, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1754, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.0863, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 3.1698, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2883, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 3.1698, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 3.2341, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.0373, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.1708, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1345, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0521, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 3.1135, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 3.2268, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.0748, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0793, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0799, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 3.0898, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1049, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0061, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.1097, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.1246, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1433, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0436, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.1603, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0944, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 3.1839, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0935, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.1054, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0938, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 3.0771, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 3.0439, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.1046, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1784, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 3.1278, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.0594, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.1073, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0362, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0847, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.9977, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.0647, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.1407, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9501, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9796, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.9868, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 3.1029, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0329, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 3.0854, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 3.0628, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 3.1078, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 3.0313, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0872, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.9973, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9909, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.958, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.9788, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0805, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.981, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.0405, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0477, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.9396, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0082, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.948, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0341, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9754, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0006, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9661, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9024, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.9043, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 3.1154, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 3.0356, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.9301, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.932, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.9737, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9865, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.9408, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8823, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.9784, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.9849, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 3.0205, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 0.001, |
|
"loss": 2.9721, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.9403, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9458, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.993, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.9178, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.001, |
|
"loss": 3.0433, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.8434, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8175, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.9316, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8653, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8296, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7935, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.9531, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8647, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9229, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8533, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.906, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9979, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.9295, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8835, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.8187, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8908, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.9334, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.8633, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8475, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.9979, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.8673, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8746, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.9179, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.9361, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.8913, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8556, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8247, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.9346, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8897, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.9438, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7655, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8996, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9045, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8311, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9492, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8869, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8418, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.9442, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.8688, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.9113, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8833, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.9694, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8323, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8322, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.8532, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8846, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7953, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7917, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7999, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7832, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.7757, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.9216, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7279, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8547, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.7974, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7325, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.8651, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.8027, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.8023, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7645, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8216, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7154, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.7438, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7664, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8217, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7378, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8457, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7347, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7388, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.8177, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8578, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7455, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7682, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8223, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8354, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8103, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8094, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7705, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7731, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.7048, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.7777, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8811, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7433, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7328, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7672, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6969, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6923, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.7048, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7554, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7421, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7542, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7626, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7957, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7428, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7071, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7833, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7262, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.706, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7063, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7755, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.7543, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7099, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6922, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7727, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7885, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8368, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6819, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6766, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.734, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8116, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6718, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6894, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6376, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.8011, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.7382, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7586, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8034, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7853, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.6979, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6383, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7292, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6234, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7024, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7145, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6933, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6224, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7334, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6673, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7149, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5578, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6527, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6242, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5558, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7399, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.8314, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7248, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6502, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.692, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7619, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.7757, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6067, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6181, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5756, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.6381, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7008, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7283, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.599, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6984, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.6424, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6042, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.7193, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6789, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.6459, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.6203, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.6347, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.7011, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6418, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6523, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.567, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5959, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6507, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5627, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6199, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6042, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6501, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6254, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.611, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5959, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5988, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5914, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5807, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5726, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6163, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.7316, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5764, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.619, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.7615, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6547, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5934, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5967, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 0.001, |
|
"loss": 2.599, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6839, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5772, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6305, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5354, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5099, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.575, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5827, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.6243, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6192, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5388, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.587, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5446, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6233, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.6057, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5322, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.6265, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.6747, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5641, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 0.001, |
|
"loss": 2.643, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5575, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6074, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5276, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5163, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5769, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5597, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5999, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5962, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5758, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 2.6153, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5332, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6169, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5641, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5202, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5156, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.485, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.557, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6137, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5028, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5713, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5385, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.75, |
|
"learning_rate": 0.001, |
|
"loss": 2.568, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5535, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4861, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5154, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5672, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5576, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5735, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5436, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.512, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4986, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5719, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.558, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6109, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4702, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5414, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4311, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5505, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5599, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5399, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 2.5745, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5591, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5645, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5349, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.64, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5241, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5465, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.548, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5006, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4411, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6081, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5661, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4901, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5783, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5633, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4383, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5552, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4974, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4294, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5585, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4823, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4983, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4712, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5071, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.378, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5846, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4814, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5205, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6069, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6136, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5675, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4644, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5886, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.6005, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5263, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6189, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4413, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.517, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5185, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5422, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4782, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5178, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5289, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5569, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5402, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4138, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4729, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4512, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3337, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4999, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 2.4464, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4477, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4136, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5954, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.582, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4916, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4851, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5241, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4568, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4604, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4243, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4554, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4236, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4336, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4906, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 2.4008, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4638, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4525, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4088, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5329, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 2.5568, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4684, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4425, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4634, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.5238, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4865, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.551, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4611, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5396, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4379, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4577, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5041, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4483, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5035, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5613, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4445, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.6129, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3973, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3411, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5052, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4608, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5404, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4894, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4287, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4712, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.411, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.449, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5091, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4494, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.4689, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5923, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4636, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4478, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4567, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3894, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4179, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5042, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4784, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4666, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3777, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4888, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4483, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3399, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4195, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4564, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4967, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.44, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4448, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4057, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.443, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4622, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3848, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.509, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.5265, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5115, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4607, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.5261, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.419, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4465, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4207, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4445, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5083, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4347, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4745, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.515, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5217, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4455, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4412, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4647, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4476, |
|
"step": 4405 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4614, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5262, |
|
"step": 4415 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4971, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4713, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3779, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4566, |
|
"step": 4435 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5437, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4165, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4083, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3645, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4619, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4517, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4964, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3903, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4048, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5493, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4204, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4851, |
|
"step": 4495 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4582, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3493, |
|
"step": 4505 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4725, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4007, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4246, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4925, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3759, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.431, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.387, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.5083, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3847, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4038, |
|
"step": 4555 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3835, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4496, |
|
"step": 4565 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4231, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3516, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3787, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.331, |
|
"step": 4585 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4552, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3459, |
|
"step": 4595 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4763, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3367, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4216, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4127, |
|
"step": 4615 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5296, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4159, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4337, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3075, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3824, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4537, |
|
"step": 4645 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4326, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4517, |
|
"step": 4655 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4543, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5044, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3716, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4272, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4517, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3564, |
|
"step": 4685 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4594, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4573, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4016, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4329, |
|
"step": 4705 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4285, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4041, |
|
"step": 4715 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4431, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4893, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4273, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3858, |
|
"step": 4735 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4082, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3506, |
|
"step": 4745 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3004, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4124, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3558, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.394, |
|
"step": 4765 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3589, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3722, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3893, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.386, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4663, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4395, |
|
"step": 4795 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4552, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3771, |
|
"step": 4805 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4676, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4427, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3732, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3805, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3883, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4687, |
|
"step": 4835 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4263, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3938, |
|
"step": 4845 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.371, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4461, |
|
"step": 4855 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4477, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2894, |
|
"step": 4865 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3719, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4017, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3104, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4409, |
|
"step": 4885 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3999, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3703, |
|
"step": 4895 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3998, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3721, |
|
"step": 4905 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3626, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4011, |
|
"step": 4915 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3683, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3509, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2919, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.304, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3685, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3315, |
|
"step": 4945 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3417, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5319, |
|
"step": 4955 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3706, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3488, |
|
"step": 4965 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.343, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.339, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3289, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3635, |
|
"step": 4985 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3701, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3865, |
|
"step": 4995 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.3312, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3723, |
|
"step": 5005 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3693, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4211, |
|
"step": 5015 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3302, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4063, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4358, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.327, |
|
"step": 5035 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2614, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2946, |
|
"step": 5045 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4384, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3465, |
|
"step": 5055 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.355, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3374, |
|
"step": 5065 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3319, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4058, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3335, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4275, |
|
"step": 5085 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2337, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.472, |
|
"step": 5095 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3425, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3226, |
|
"step": 5105 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3202, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4078, |
|
"step": 5115 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3514, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3719, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3483, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3145, |
|
"step": 5135 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3698, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4254, |
|
"step": 5145 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3652, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3388, |
|
"step": 5155 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3005, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.343, |
|
"step": 5165 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4356, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3843, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3718, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2947, |
|
"step": 5185 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3222, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.4169, |
|
"step": 5195 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3647, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3263, |
|
"step": 5205 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.363, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2993, |
|
"step": 5215 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.339, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.331, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3842, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4073, |
|
"step": 5235 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3942, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.4361, |
|
"step": 5245 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2972, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2457, |
|
"step": 5255 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3928, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3553, |
|
"step": 5265 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.308, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4344, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.345, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3255, |
|
"step": 5285 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3048, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3411, |
|
"step": 5295 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.208, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2552, |
|
"step": 5305 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2885, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3439, |
|
"step": 5315 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3595, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2758, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3548, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4227, |
|
"step": 5335 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.394, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3507, |
|
"step": 5345 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.382, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3912, |
|
"step": 5355 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3112, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.408, |
|
"step": 5365 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2344, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4095, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3205, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3861, |
|
"step": 5385 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3335, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3337, |
|
"step": 5395 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3939, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3017, |
|
"step": 5405 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2953, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3509, |
|
"step": 5415 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3585, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2923, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3309, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 2.3466, |
|
"step": 5435 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2919, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3037, |
|
"step": 5445 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3836, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3335, |
|
"step": 5455 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3305, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2483, |
|
"step": 5465 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.31, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3389, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3941, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3019, |
|
"step": 5485 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2751, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3538, |
|
"step": 5495 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3456, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3096, |
|
"step": 5505 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3386, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.328, |
|
"step": 5515 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 2.2578, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2215, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3991, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2228, |
|
"step": 5535 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.259, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2526, |
|
"step": 5545 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3117, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3628, |
|
"step": 5555 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3557, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2213, |
|
"step": 5565 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3814, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3071, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2928, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2477, |
|
"step": 5585 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4124, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2786, |
|
"step": 5595 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3296, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3383, |
|
"step": 5605 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3517, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2402, |
|
"step": 5615 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.315, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3727, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3322, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3768, |
|
"step": 5635 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3349, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3919, |
|
"step": 5645 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.4263, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3149, |
|
"step": 5655 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.383, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4073, |
|
"step": 5665 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.277, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3267, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.283, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3412, |
|
"step": 5685 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3126, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2937, |
|
"step": 5695 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2517, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3529, |
|
"step": 5705 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3374, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3249, |
|
"step": 5715 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2751, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2705, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2959, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3365, |
|
"step": 5735 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3377, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.387, |
|
"step": 5745 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3485, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3841, |
|
"step": 5755 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2726, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2472, |
|
"step": 5765 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3245, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3735, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3062, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3034, |
|
"step": 5785 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3342, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2772, |
|
"step": 5795 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3663, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.298, |
|
"step": 5805 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3009, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2918, |
|
"step": 5815 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2476, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2734, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2312, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3424, |
|
"step": 5835 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2396, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3923, |
|
"step": 5845 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.4244, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3257, |
|
"step": 5855 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2825, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3113, |
|
"step": 5865 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3198, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3237, |
|
"step": 5875 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2635, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3256, |
|
"step": 5885 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2391, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3338, |
|
"step": 5895 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2332, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2905, |
|
"step": 5905 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3601, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.299, |
|
"step": 5915 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3948, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4341, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3247, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4271, |
|
"step": 5935 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2825, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3141, |
|
"step": 5945 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.364, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3063, |
|
"step": 5955 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2324, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3069, |
|
"step": 5965 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2703, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2642, |
|
"step": 5975 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3167, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3079, |
|
"step": 5985 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3698, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2639, |
|
"step": 5995 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2953, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2728, |
|
"step": 6005 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2379, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2972, |
|
"step": 6015 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2669, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3334, |
|
"step": 6025 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2345, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2557, |
|
"step": 6035 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3104, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2174, |
|
"step": 6045 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2894, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3237, |
|
"step": 6055 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2313, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.386, |
|
"step": 6065 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3001, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1863, |
|
"step": 6075 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3525, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2239, |
|
"step": 6085 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2718, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2479, |
|
"step": 6095 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2787, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2199, |
|
"step": 6105 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3662, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.284, |
|
"step": 6115 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2711, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2889, |
|
"step": 6125 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.256, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2337, |
|
"step": 6135 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.75, |
|
"learning_rate": 0.001, |
|
"loss": 2.2716, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2392, |
|
"step": 6145 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2354, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1689, |
|
"step": 6155 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2155, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3151, |
|
"step": 6165 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2376, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3778, |
|
"step": 6175 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2291, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2969, |
|
"step": 6185 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3347, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2838, |
|
"step": 6195 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2417, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.001, |
|
"loss": 2.227, |
|
"step": 6205 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2919, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1453, |
|
"step": 6215 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4227, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2441, |
|
"step": 6225 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3082, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2227, |
|
"step": 6235 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3381, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2592, |
|
"step": 6245 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3264, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3429, |
|
"step": 6255 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.258, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3763, |
|
"step": 6265 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3234, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2315, |
|
"step": 6275 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2926, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2738, |
|
"step": 6285 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.273, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2739, |
|
"step": 6295 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2088, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3184, |
|
"step": 6305 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2584, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2646, |
|
"step": 6315 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2761, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2677, |
|
"step": 6325 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2255, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2457, |
|
"step": 6335 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3369, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1955, |
|
"step": 6345 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2271, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1785, |
|
"step": 6355 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2299, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1963, |
|
"step": 6365 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1968, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2366, |
|
"step": 6375 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2328, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.2762, |
|
"step": 6385 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1722, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2829, |
|
"step": 6395 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2764, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3164, |
|
"step": 6405 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2618, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2912, |
|
"step": 6415 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.2168, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2411, |
|
"step": 6425 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2966, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2664, |
|
"step": 6435 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2472, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2971, |
|
"step": 6445 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2707, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2137, |
|
"step": 6455 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2746, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3172, |
|
"step": 6465 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.215, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3012, |
|
"step": 6475 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2543, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2402, |
|
"step": 6485 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2796, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3052, |
|
"step": 6495 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2445, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.5, |
|
"learning_rate": 0.001, |
|
"loss": 2.2294, |
|
"step": 6505 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2323, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2513, |
|
"step": 6515 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2792, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2732, |
|
"step": 6525 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2925, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2928, |
|
"step": 6535 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2555, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2544, |
|
"step": 6545 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3104, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2852, |
|
"step": 6555 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.238, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1404, |
|
"step": 6565 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.167, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1843, |
|
"step": 6575 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3116, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.2582, |
|
"step": 6585 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1951, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3272, |
|
"step": 6595 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2032, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2803, |
|
"step": 6605 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1923, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.278, |
|
"step": 6615 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.208, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3667, |
|
"step": 6625 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.23, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3159, |
|
"step": 6635 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1728, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3895, |
|
"step": 6645 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2639, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2976, |
|
"step": 6655 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2421, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3828, |
|
"step": 6665 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3166, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.2423, |
|
"step": 6675 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1668, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2309, |
|
"step": 6685 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2477, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1868, |
|
"step": 6695 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2333, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.232, |
|
"step": 6705 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2175, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2547, |
|
"step": 6715 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2624, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2089, |
|
"step": 6725 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1886, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2837, |
|
"step": 6735 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2128, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2201, |
|
"step": 6745 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1656, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.229, |
|
"step": 6755 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3482, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2294, |
|
"step": 6765 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2277, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2251, |
|
"step": 6775 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2425, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1912, |
|
"step": 6785 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1729, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2105, |
|
"step": 6795 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2288, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1926, |
|
"step": 6805 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2566, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1782, |
|
"step": 6815 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2487, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2902, |
|
"step": 6825 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1668, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2062, |
|
"step": 6835 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2033, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3064, |
|
"step": 6845 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2519, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2718, |
|
"step": 6855 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2755, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2878, |
|
"step": 6865 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1555, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2228, |
|
"step": 6875 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3159, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1421, |
|
"step": 6885 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.218, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.249, |
|
"step": 6895 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.257, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2311, |
|
"step": 6905 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.3024, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3048, |
|
"step": 6915 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2174, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2053, |
|
"step": 6925 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2763, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2094, |
|
"step": 6935 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2562, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1941, |
|
"step": 6945 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2871, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2773, |
|
"step": 6955 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2026, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1996, |
|
"step": 6965 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2896, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2455, |
|
"step": 6975 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2318, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3176, |
|
"step": 6985 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 0.001, |
|
"loss": 2.302, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2314, |
|
"step": 6995 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1735, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2345, |
|
"step": 7005 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1579, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2407, |
|
"step": 7015 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2276, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 8.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.2251, |
|
"step": 7025 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2293, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.164, |
|
"step": 7035 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1919, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2568, |
|
"step": 7045 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1385, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1111, |
|
"step": 7055 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2902, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2044, |
|
"step": 7065 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.212, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1734, |
|
"step": 7075 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2047, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1626, |
|
"step": 7085 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.188, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1195, |
|
"step": 7095 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2254, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1368, |
|
"step": 7105 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1321, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1857, |
|
"step": 7115 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2296, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2473, |
|
"step": 7125 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.3584, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2099, |
|
"step": 7135 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1661, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3145, |
|
"step": 7145 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2612, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2163, |
|
"step": 7155 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1993, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1804, |
|
"step": 7165 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2222, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2598, |
|
"step": 7175 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.222, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1146, |
|
"step": 7185 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2832, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2666, |
|
"step": 7195 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2489, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.001, |
|
"loss": 2.2843, |
|
"step": 7205 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2427, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1722, |
|
"step": 7215 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2615, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2178, |
|
"step": 7225 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2657, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2596, |
|
"step": 7235 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3205, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1141, |
|
"step": 7245 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2048, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1992, |
|
"step": 7255 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2168, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1951, |
|
"step": 7265 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1946, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2296, |
|
"step": 7275 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1909, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2634, |
|
"step": 7285 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1766, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2185, |
|
"step": 7295 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.3009, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.234, |
|
"step": 7305 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2108, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1009, |
|
"step": 7315 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1628, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2285, |
|
"step": 7325 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1172, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2362, |
|
"step": 7335 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1721, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.0911, |
|
"step": 7345 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1961, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2337, |
|
"step": 7355 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2182, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1035, |
|
"step": 7365 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1426, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2156, |
|
"step": 7375 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2241, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1933, |
|
"step": 7385 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2293, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2288, |
|
"step": 7395 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2141, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1613, |
|
"step": 7405 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2172, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2277, |
|
"step": 7415 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1964, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1474, |
|
"step": 7425 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1842, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2594, |
|
"step": 7435 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1938, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2454, |
|
"step": 7445 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1547, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2064, |
|
"step": 7455 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2425, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2289, |
|
"step": 7465 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1156, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2579, |
|
"step": 7475 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2268, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1241, |
|
"step": 7485 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1078, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2325, |
|
"step": 7495 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1536, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2238, |
|
"step": 7505 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.001, |
|
"loss": 2.223, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2105, |
|
"step": 7515 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2338, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.182, |
|
"step": 7525 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1806, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.1253, |
|
"step": 7535 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.065, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1058, |
|
"step": 7545 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1723, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2771, |
|
"step": 7555 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.178, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1832, |
|
"step": 7565 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.106, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1701, |
|
"step": 7575 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2016, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1198, |
|
"step": 7585 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.222, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1769, |
|
"step": 7595 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1416, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1255, |
|
"step": 7605 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1879, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1509, |
|
"step": 7615 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1773, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1697, |
|
"step": 7625 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1141, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2385, |
|
"step": 7635 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1444, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1821, |
|
"step": 7645 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2419, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1277, |
|
"step": 7655 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1443, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1499, |
|
"step": 7665 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1843, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1554, |
|
"step": 7675 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1742, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.253, |
|
"step": 7685 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2189, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.2244, |
|
"step": 7695 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2498, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1689, |
|
"step": 7705 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1435, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.0968, |
|
"step": 7715 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1769, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2411, |
|
"step": 7725 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2172, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1888, |
|
"step": 7735 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2317, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1126, |
|
"step": 7745 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1826, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1961, |
|
"step": 7755 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.0871, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2262, |
|
"step": 7765 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1916, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1953, |
|
"step": 7775 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2019, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2187, |
|
"step": 7785 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2037, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.0825, |
|
"step": 7795 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1189, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2302, |
|
"step": 7805 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1359, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2286, |
|
"step": 7815 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2267, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2747, |
|
"step": 7825 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1479, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2157, |
|
"step": 7835 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2047, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1771, |
|
"step": 7845 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2158, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1596, |
|
"step": 7855 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.226, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.138, |
|
"step": 7865 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.166, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1684, |
|
"step": 7875 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1095, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1289, |
|
"step": 7885 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2142, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.0919, |
|
"step": 7895 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2185, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1294, |
|
"step": 7905 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.001, |
|
"loss": 2.186, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1617, |
|
"step": 7915 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1857, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2289, |
|
"step": 7925 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2337, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2364, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2024, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1375, |
|
"step": 7945 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1636, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2233, |
|
"step": 7955 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1445, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.145, |
|
"step": 7965 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2116, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1848, |
|
"step": 7975 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1689, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2128, |
|
"step": 7985 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2035, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1877, |
|
"step": 7995 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2685, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1212, |
|
"step": 8005 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1883, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1043, |
|
"step": 8015 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.27, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1822, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2655, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2037, |
|
"step": 8035 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1545, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1945, |
|
"step": 8045 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2214, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1683, |
|
"step": 8055 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1066, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2586, |
|
"step": 8065 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1627, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2267, |
|
"step": 8075 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1868, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1543, |
|
"step": 8085 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.191, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1399, |
|
"step": 8095 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1241, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.0788, |
|
"step": 8105 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.154, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1215, |
|
"step": 8115 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1879, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1597, |
|
"step": 8125 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2198, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2092, |
|
"step": 8135 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.177, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1563, |
|
"step": 8145 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1434, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.1589, |
|
"step": 8155 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1215, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2276, |
|
"step": 8165 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2408, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1113, |
|
"step": 8175 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1163, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1909, |
|
"step": 8185 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1932, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.2223, |
|
"step": 8195 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1728, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2604, |
|
"step": 8205 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1194, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.0889, |
|
"step": 8215 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1805, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.1755, |
|
"step": 8225 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2337, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2678, |
|
"step": 8235 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1309, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1571, |
|
"step": 8245 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1756, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2012, |
|
"step": 8255 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1615, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.19, |
|
"step": 8265 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1919, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1195, |
|
"step": 8275 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1805, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.0899, |
|
"step": 8285 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2288, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1627, |
|
"step": 8295 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.0923, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1777, |
|
"step": 8305 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.0752, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1106, |
|
"step": 8315 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1276, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1765, |
|
"step": 8325 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1726, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2546, |
|
"step": 8335 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1831, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1938, |
|
"step": 8345 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.0966, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.0815, |
|
"step": 8355 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2203, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.0747, |
|
"step": 8365 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.159, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1264, |
|
"step": 8375 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1399, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1996, |
|
"step": 8385 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2055, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2027, |
|
"step": 8395 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1516, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1234, |
|
"step": 8405 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.1325, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1561, |
|
"step": 8415 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2077, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 2.0967, |
|
"step": 8425 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1205, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1933, |
|
"step": 8435 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2021, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1096, |
|
"step": 8445 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1006, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.0802, |
|
"step": 8455 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1566, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2041, |
|
"step": 8465 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2042, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.0913, |
|
"step": 8475 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1835, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1648, |
|
"step": 8485 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1875, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.194, |
|
"step": 8495 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.098, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1823, |
|
"step": 8505 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1069, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2228, |
|
"step": 8515 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1541, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1089, |
|
"step": 8525 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1029, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2125, |
|
"step": 8535 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1722, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2255, |
|
"step": 8545 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1747, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1575, |
|
"step": 8555 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.24, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2122, |
|
"step": 8565 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.001, |
|
"loss": 2.0925, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2339, |
|
"step": 8575 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1486, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.208, |
|
"step": 8585 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1253, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1997, |
|
"step": 8595 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.0718, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1493, |
|
"step": 8605 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1384, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1194, |
|
"step": 8615 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1114, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.001, |
|
"loss": 2.0944, |
|
"step": 8625 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1073, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.001, |
|
"loss": 2.0895, |
|
"step": 8635 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2211, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.1039, |
|
"step": 8645 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1231, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.128, |
|
"step": 8655 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1983, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1355, |
|
"step": 8665 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.001, |
|
"loss": 2.175, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1366, |
|
"step": 8675 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1074, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.0969, |
|
"step": 8685 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1339, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1782, |
|
"step": 8695 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.001, |
|
"loss": 2.141, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1213, |
|
"step": 8705 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1564, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2358, |
|
"step": 8715 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1675, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1651, |
|
"step": 8725 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1643, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1753, |
|
"step": 8735 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2613, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1604, |
|
"step": 8745 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1436, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1084, |
|
"step": 8755 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1185, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1427, |
|
"step": 8765 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1018, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2008, |
|
"step": 8775 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.0736, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1633, |
|
"step": 8785 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.001, |
|
"loss": 2.2193, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.209, |
|
"step": 8795 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1654, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.001, |
|
"loss": 2.183, |
|
"step": 8805 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1428, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1481, |
|
"step": 8815 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.0974, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.04, |
|
"step": 8825 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1917, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1512, |
|
"step": 8835 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1508, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1739, |
|
"step": 8845 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1486, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.13, |
|
"step": 8855 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1971, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1131, |
|
"step": 8865 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1483, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1244, |
|
"step": 8875 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1369, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.1743, |
|
"step": 8885 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.001, |
|
"loss": 2.0747, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1792, |
|
"step": 8895 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.145, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1802, |
|
"step": 8905 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.001, |
|
"loss": 2.067, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.1613, |
|
"step": 8915 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.001, |
|
"loss": 2.142, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1794, |
|
"step": 8925 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.071, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1635, |
|
"step": 8935 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2223, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.001, |
|
"loss": 2.0432, |
|
"step": 8945 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.001, |
|
"loss": 2.0315, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1065, |
|
"step": 8955 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.001, |
|
"loss": 2.1791, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2013, |
|
"step": 8965 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.164351224899292, |
|
"eval_runtime": 1701.8515, |
|
"eval_samples_per_second": 9.332, |
|
"eval_steps_per_second": 1.167, |
|
"step": 8969 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 8969, |
|
"total_flos": 1.8254804498880922e+18, |
|
"train_loss": 2.789915239123786, |
|
"train_runtime": 47169.5182, |
|
"train_samples_per_second": 3.042, |
|
"train_steps_per_second": 0.19 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 8969, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"total_flos": 1.8254804498880922e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|