|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 5, |
|
"global_step": 910, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01098901098901099, |
|
"grad_norm": 2.787256796216871, |
|
"learning_rate": 1.098901098901099e-08, |
|
"loss": 2.6306, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.054945054945054944, |
|
"grad_norm": 2.7208828057352172, |
|
"learning_rate": 5.494505494505494e-08, |
|
"loss": 2.652, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.054945054945054944, |
|
"eval_loss": 2.6433982849121094, |
|
"eval_runtime": 98.8619, |
|
"eval_samples_per_second": 33.157, |
|
"eval_steps_per_second": 0.829, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.10989010989010989, |
|
"grad_norm": 2.7039289723117808, |
|
"learning_rate": 1.0989010989010988e-07, |
|
"loss": 2.6262, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10989010989010989, |
|
"eval_loss": 2.6424529552459717, |
|
"eval_runtime": 96.9771, |
|
"eval_samples_per_second": 33.802, |
|
"eval_steps_per_second": 0.846, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.16483516483516483, |
|
"grad_norm": 2.9271293165910413, |
|
"learning_rate": 1.6483516483516482e-07, |
|
"loss": 2.6452, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.16483516483516483, |
|
"eval_loss": 2.6390280723571777, |
|
"eval_runtime": 94.3235, |
|
"eval_samples_per_second": 34.753, |
|
"eval_steps_per_second": 0.869, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.21978021978021978, |
|
"grad_norm": 2.480796984302828, |
|
"learning_rate": 2.1978021978021976e-07, |
|
"loss": 2.6387, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21978021978021978, |
|
"eval_loss": 2.6297805309295654, |
|
"eval_runtime": 97.0464, |
|
"eval_samples_per_second": 33.778, |
|
"eval_steps_per_second": 0.845, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.27472527472527475, |
|
"grad_norm": 2.3675810470522483, |
|
"learning_rate": 2.7472527472527475e-07, |
|
"loss": 2.6259, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.27472527472527475, |
|
"eval_loss": 2.6251583099365234, |
|
"eval_runtime": 94.763, |
|
"eval_samples_per_second": 34.592, |
|
"eval_steps_per_second": 0.865, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.32967032967032966, |
|
"grad_norm": 1.9222673443941214, |
|
"learning_rate": 3.2967032967032963e-07, |
|
"loss": 2.5997, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.32967032967032966, |
|
"eval_loss": 2.603151559829712, |
|
"eval_runtime": 95.4458, |
|
"eval_samples_per_second": 34.344, |
|
"eval_steps_per_second": 0.859, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 1.9849926292026194, |
|
"learning_rate": 3.8461538461538463e-07, |
|
"loss": 2.5946, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"eval_loss": 2.593891143798828, |
|
"eval_runtime": 94.3775, |
|
"eval_samples_per_second": 34.733, |
|
"eval_steps_per_second": 0.869, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.43956043956043955, |
|
"grad_norm": 1.7366188084370735, |
|
"learning_rate": 4.395604395604395e-07, |
|
"loss": 2.5801, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.43956043956043955, |
|
"eval_loss": 2.5582449436187744, |
|
"eval_runtime": 95.1258, |
|
"eval_samples_per_second": 34.46, |
|
"eval_steps_per_second": 0.862, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4945054945054945, |
|
"grad_norm": 1.5634290965231614, |
|
"learning_rate": 4.945054945054945e-07, |
|
"loss": 2.5387, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4945054945054945, |
|
"eval_loss": 2.539731979370117, |
|
"eval_runtime": 94.4215, |
|
"eval_samples_per_second": 34.717, |
|
"eval_steps_per_second": 0.868, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.5494505494505495, |
|
"grad_norm": 1.5589987032678356, |
|
"learning_rate": 5.494505494505495e-07, |
|
"loss": 2.5425, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5494505494505495, |
|
"eval_loss": 2.5231664180755615, |
|
"eval_runtime": 94.9946, |
|
"eval_samples_per_second": 34.507, |
|
"eval_steps_per_second": 0.863, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6043956043956044, |
|
"grad_norm": 1.357945104882359, |
|
"learning_rate": 6.043956043956043e-07, |
|
"loss": 2.5065, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.6043956043956044, |
|
"eval_loss": 2.5029337406158447, |
|
"eval_runtime": 95.3244, |
|
"eval_samples_per_second": 34.388, |
|
"eval_steps_per_second": 0.86, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.6593406593406593, |
|
"grad_norm": 1.351901982993815, |
|
"learning_rate": 6.593406593406593e-07, |
|
"loss": 2.4884, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6593406593406593, |
|
"eval_loss": 2.48612380027771, |
|
"eval_runtime": 94.5792, |
|
"eval_samples_per_second": 34.659, |
|
"eval_steps_per_second": 0.867, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 1.3411768977322787, |
|
"learning_rate": 7.142857142857143e-07, |
|
"loss": 2.4708, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"eval_loss": 2.4696457386016846, |
|
"eval_runtime": 95.9513, |
|
"eval_samples_per_second": 34.163, |
|
"eval_steps_per_second": 0.855, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 1.3228056998181177, |
|
"learning_rate": 7.692307692307693e-07, |
|
"loss": 2.4527, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"eval_loss": 2.4544637203216553, |
|
"eval_runtime": 94.037, |
|
"eval_samples_per_second": 34.859, |
|
"eval_steps_per_second": 0.872, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.8241758241758241, |
|
"grad_norm": 1.3089892743121356, |
|
"learning_rate": 8.241758241758241e-07, |
|
"loss": 2.4525, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.8241758241758241, |
|
"eval_loss": 2.4406723976135254, |
|
"eval_runtime": 96.5873, |
|
"eval_samples_per_second": 33.938, |
|
"eval_steps_per_second": 0.849, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.8791208791208791, |
|
"grad_norm": 1.352286963239502, |
|
"learning_rate": 8.79120879120879e-07, |
|
"loss": 2.4281, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8791208791208791, |
|
"eval_loss": 2.428374767303467, |
|
"eval_runtime": 94.3709, |
|
"eval_samples_per_second": 34.735, |
|
"eval_steps_per_second": 0.869, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.9340659340659341, |
|
"grad_norm": 1.3538673636384089, |
|
"learning_rate": 9.340659340659341e-07, |
|
"loss": 2.4286, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.9340659340659341, |
|
"eval_loss": 2.417691946029663, |
|
"eval_runtime": 96.296, |
|
"eval_samples_per_second": 34.041, |
|
"eval_steps_per_second": 0.852, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.989010989010989, |
|
"grad_norm": 1.314242339957954, |
|
"learning_rate": 9.89010989010989e-07, |
|
"loss": 2.4127, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.989010989010989, |
|
"eval_loss": 2.4079270362854004, |
|
"eval_runtime": 94.1094, |
|
"eval_samples_per_second": 34.832, |
|
"eval_steps_per_second": 0.871, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.043956043956044, |
|
"grad_norm": 1.292934852988739, |
|
"learning_rate": 9.999411449933815e-07, |
|
"loss": 2.3982, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.043956043956044, |
|
"eval_loss": 2.3987655639648438, |
|
"eval_runtime": 96.8027, |
|
"eval_samples_per_second": 33.863, |
|
"eval_steps_per_second": 0.847, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.098901098901099, |
|
"grad_norm": 1.329633653617423, |
|
"learning_rate": 9.997020702755352e-07, |
|
"loss": 2.3983, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.098901098901099, |
|
"eval_loss": 2.390303611755371, |
|
"eval_runtime": 94.1549, |
|
"eval_samples_per_second": 34.815, |
|
"eval_steps_per_second": 0.871, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 1.3449275632813102, |
|
"learning_rate": 9.992791852820708e-07, |
|
"loss": 2.3845, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"eval_loss": 2.3824305534362793, |
|
"eval_runtime": 96.1609, |
|
"eval_samples_per_second": 34.089, |
|
"eval_steps_per_second": 0.853, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.2087912087912087, |
|
"grad_norm": 1.2976258496024031, |
|
"learning_rate": 9.986726455668912e-07, |
|
"loss": 2.3741, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.2087912087912087, |
|
"eval_loss": 2.374979257583618, |
|
"eval_runtime": 93.9379, |
|
"eval_samples_per_second": 34.895, |
|
"eval_steps_per_second": 0.873, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.2637362637362637, |
|
"grad_norm": 1.3352972965753644, |
|
"learning_rate": 9.978826742394025e-07, |
|
"loss": 2.374, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.2637362637362637, |
|
"eval_loss": 2.367839813232422, |
|
"eval_runtime": 95.5399, |
|
"eval_samples_per_second": 34.31, |
|
"eval_steps_per_second": 0.858, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.3186813186813187, |
|
"grad_norm": 1.351442511494053, |
|
"learning_rate": 9.969095618824461e-07, |
|
"loss": 2.354, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.3186813186813187, |
|
"eval_loss": 2.360889434814453, |
|
"eval_runtime": 94.4101, |
|
"eval_samples_per_second": 34.721, |
|
"eval_steps_per_second": 0.869, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.3736263736263736, |
|
"grad_norm": 1.3462760051666864, |
|
"learning_rate": 9.95753666445411e-07, |
|
"loss": 2.3498, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.3736263736263736, |
|
"eval_loss": 2.3541159629821777, |
|
"eval_runtime": 94.7992, |
|
"eval_samples_per_second": 34.578, |
|
"eval_steps_per_second": 0.865, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 1.3496352842968289, |
|
"learning_rate": 9.944154131125642e-07, |
|
"loss": 2.3324, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"eval_loss": 2.347334861755371, |
|
"eval_runtime": 95.6461, |
|
"eval_samples_per_second": 34.272, |
|
"eval_steps_per_second": 0.857, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.4835164835164836, |
|
"grad_norm": 1.3292817866036872, |
|
"learning_rate": 9.928952941466537e-07, |
|
"loss": 2.3386, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.4835164835164836, |
|
"eval_loss": 2.3408091068267822, |
|
"eval_runtime": 94.7751, |
|
"eval_samples_per_second": 34.587, |
|
"eval_steps_per_second": 0.865, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 1.3030870244645119, |
|
"learning_rate": 9.911938687078323e-07, |
|
"loss": 2.3199, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 2.334508180618286, |
|
"eval_runtime": 96.7229, |
|
"eval_samples_per_second": 33.891, |
|
"eval_steps_per_second": 0.848, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.5934065934065935, |
|
"grad_norm": 1.357889365983369, |
|
"learning_rate": 9.893117626479776e-07, |
|
"loss": 2.3192, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.5934065934065935, |
|
"eval_loss": 2.3284120559692383, |
|
"eval_runtime": 94.5936, |
|
"eval_samples_per_second": 34.654, |
|
"eval_steps_per_second": 0.867, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.6483516483516483, |
|
"grad_norm": 1.3682230754482416, |
|
"learning_rate": 9.87249668280478e-07, |
|
"loss": 2.3205, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.6483516483516483, |
|
"eval_loss": 2.322436809539795, |
|
"eval_runtime": 96.3599, |
|
"eval_samples_per_second": 34.018, |
|
"eval_steps_per_second": 0.851, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.7032967032967035, |
|
"grad_norm": 1.356238814707341, |
|
"learning_rate": 9.850083441255734e-07, |
|
"loss": 2.3152, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.7032967032967035, |
|
"eval_loss": 2.3167214393615723, |
|
"eval_runtime": 94.0628, |
|
"eval_samples_per_second": 34.849, |
|
"eval_steps_per_second": 0.872, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.7582417582417582, |
|
"grad_norm": 1.3669835941127673, |
|
"learning_rate": 9.8258861463134e-07, |
|
"loss": 2.309, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.7582417582417582, |
|
"eval_loss": 2.311154365539551, |
|
"eval_runtime": 95.3173, |
|
"eval_samples_per_second": 34.39, |
|
"eval_steps_per_second": 0.86, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.8131868131868132, |
|
"grad_norm": 1.370896874190673, |
|
"learning_rate": 9.799913698704268e-07, |
|
"loss": 2.2937, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.8131868131868132, |
|
"eval_loss": 2.3057687282562256, |
|
"eval_runtime": 95.653, |
|
"eval_samples_per_second": 34.27, |
|
"eval_steps_per_second": 0.857, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.8681318681318682, |
|
"grad_norm": 1.383985193963144, |
|
"learning_rate": 9.772175652126504e-07, |
|
"loss": 2.2957, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.8681318681318682, |
|
"eval_loss": 2.300417423248291, |
|
"eval_runtime": 94.5548, |
|
"eval_samples_per_second": 34.668, |
|
"eval_steps_per_second": 0.867, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 1.3861435502681547, |
|
"learning_rate": 9.742682209735727e-07, |
|
"loss": 2.2941, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"eval_loss": 2.2951645851135254, |
|
"eval_runtime": 96.181, |
|
"eval_samples_per_second": 34.082, |
|
"eval_steps_per_second": 0.853, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.978021978021978, |
|
"grad_norm": 1.387939254087276, |
|
"learning_rate": 9.711444220391885e-07, |
|
"loss": 2.2986, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.978021978021978, |
|
"eval_loss": 2.290034055709839, |
|
"eval_runtime": 94.7723, |
|
"eval_samples_per_second": 34.588, |
|
"eval_steps_per_second": 0.865, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.032967032967033, |
|
"grad_norm": 1.4083527156579858, |
|
"learning_rate": 9.678473174668605e-07, |
|
"loss": 2.2759, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.032967032967033, |
|
"eval_loss": 2.2850897312164307, |
|
"eval_runtime": 96.6234, |
|
"eval_samples_per_second": 33.926, |
|
"eval_steps_per_second": 0.849, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.087912087912088, |
|
"grad_norm": 1.3901375675143754, |
|
"learning_rate": 9.64378120062651e-07, |
|
"loss": 2.26, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.087912087912088, |
|
"eval_loss": 2.2804834842681885, |
|
"eval_runtime": 94.7228, |
|
"eval_samples_per_second": 34.606, |
|
"eval_steps_per_second": 0.866, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 1.3559893959650073, |
|
"learning_rate": 9.607381059352038e-07, |
|
"loss": 2.2661, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"eval_loss": 2.27602481842041, |
|
"eval_runtime": 95.5105, |
|
"eval_samples_per_second": 34.321, |
|
"eval_steps_per_second": 0.859, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.197802197802198, |
|
"grad_norm": 1.4338659258910222, |
|
"learning_rate": 9.569286140263397e-07, |
|
"loss": 2.265, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.197802197802198, |
|
"eval_loss": 2.271735429763794, |
|
"eval_runtime": 95.2547, |
|
"eval_samples_per_second": 34.413, |
|
"eval_steps_per_second": 0.861, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.2527472527472527, |
|
"grad_norm": 1.3511461203348951, |
|
"learning_rate": 9.529510456185417e-07, |
|
"loss": 2.2554, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.2527472527472527, |
|
"eval_loss": 2.2676820755004883, |
|
"eval_runtime": 95.0959, |
|
"eval_samples_per_second": 34.47, |
|
"eval_steps_per_second": 0.862, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 1.3787695258849668, |
|
"learning_rate": 9.488068638195071e-07, |
|
"loss": 2.2558, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"eval_loss": 2.2637946605682373, |
|
"eval_runtime": 96.7344, |
|
"eval_samples_per_second": 33.887, |
|
"eval_steps_per_second": 0.848, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.3626373626373627, |
|
"grad_norm": 1.3944429206878337, |
|
"learning_rate": 9.444975930239581e-07, |
|
"loss": 2.2508, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.3626373626373627, |
|
"eval_loss": 2.2600150108337402, |
|
"eval_runtime": 94.3417, |
|
"eval_samples_per_second": 34.746, |
|
"eval_steps_per_second": 0.869, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.4175824175824174, |
|
"grad_norm": 1.36473652575597, |
|
"learning_rate": 9.400248183529092e-07, |
|
"loss": 2.2405, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.4175824175824174, |
|
"eval_loss": 2.256455898284912, |
|
"eval_runtime": 96.4969, |
|
"eval_samples_per_second": 33.97, |
|
"eval_steps_per_second": 0.85, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.4725274725274726, |
|
"grad_norm": 1.3975755281584308, |
|
"learning_rate": 9.353901850705972e-07, |
|
"loss": 2.2484, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.4725274725274726, |
|
"eval_loss": 2.2530221939086914, |
|
"eval_runtime": 94.3914, |
|
"eval_samples_per_second": 34.728, |
|
"eval_steps_per_second": 0.869, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.5274725274725274, |
|
"grad_norm": 1.4268198012910995, |
|
"learning_rate": 9.305953979792864e-07, |
|
"loss": 2.2388, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.5274725274725274, |
|
"eval_loss": 2.2498600482940674, |
|
"eval_runtime": 96.755, |
|
"eval_samples_per_second": 33.879, |
|
"eval_steps_per_second": 0.848, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.5824175824175826, |
|
"grad_norm": 1.405521450176673, |
|
"learning_rate": 9.256422207921756e-07, |
|
"loss": 2.2472, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.5824175824175826, |
|
"eval_loss": 2.2467663288116455, |
|
"eval_runtime": 94.2329, |
|
"eval_samples_per_second": 34.786, |
|
"eval_steps_per_second": 0.87, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.6373626373626373, |
|
"grad_norm": 1.3800086312953763, |
|
"learning_rate": 9.205324754846339e-07, |
|
"loss": 2.2253, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.6373626373626373, |
|
"eval_loss": 2.2438271045684814, |
|
"eval_runtime": 95.1079, |
|
"eval_samples_per_second": 34.466, |
|
"eval_steps_per_second": 0.862, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 1.424541861104674, |
|
"learning_rate": 9.152680416240058e-07, |
|
"loss": 2.2356, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"eval_loss": 2.2410500049591064, |
|
"eval_runtime": 94.5581, |
|
"eval_samples_per_second": 34.667, |
|
"eval_steps_per_second": 0.867, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.7472527472527473, |
|
"grad_norm": 1.4211612108269323, |
|
"learning_rate": 9.09850855678232e-07, |
|
"loss": 2.2202, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.7472527472527473, |
|
"eval_loss": 2.2383668422698975, |
|
"eval_runtime": 95.1818, |
|
"eval_samples_per_second": 34.439, |
|
"eval_steps_per_second": 0.862, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.802197802197802, |
|
"grad_norm": 1.3505783784300605, |
|
"learning_rate": 9.042829103035389e-07, |
|
"loss": 2.2325, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.802197802197802, |
|
"eval_loss": 2.2357189655303955, |
|
"eval_runtime": 96.4181, |
|
"eval_samples_per_second": 33.998, |
|
"eval_steps_per_second": 0.85, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 1.413017015451658, |
|
"learning_rate": 8.985662536114612e-07, |
|
"loss": 2.2208, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"eval_loss": 2.2331881523132324, |
|
"eval_runtime": 95.0055, |
|
"eval_samples_per_second": 34.503, |
|
"eval_steps_per_second": 0.863, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.912087912087912, |
|
"grad_norm": 1.4113571267131109, |
|
"learning_rate": 8.927029884154645e-07, |
|
"loss": 2.2228, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.912087912087912, |
|
"eval_loss": 2.2307634353637695, |
|
"eval_runtime": 95.2567, |
|
"eval_samples_per_second": 34.412, |
|
"eval_steps_per_second": 0.861, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.967032967032967, |
|
"grad_norm": 1.4351733805268447, |
|
"learning_rate": 8.866952714574469e-07, |
|
"loss": 2.204, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.967032967032967, |
|
"eval_loss": 2.228379964828491, |
|
"eval_runtime": 94.9872, |
|
"eval_samples_per_second": 34.51, |
|
"eval_steps_per_second": 0.863, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.021978021978022, |
|
"grad_norm": 1.4281979614756792, |
|
"learning_rate": 8.805453126144047e-07, |
|
"loss": 2.2071, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 3.021978021978022, |
|
"eval_loss": 2.2260966300964355, |
|
"eval_runtime": 95.0392, |
|
"eval_samples_per_second": 34.491, |
|
"eval_steps_per_second": 0.863, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 1.3917979753684913, |
|
"learning_rate": 8.742553740855505e-07, |
|
"loss": 2.2045, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 2.2239363193511963, |
|
"eval_runtime": 95.5212, |
|
"eval_samples_per_second": 34.317, |
|
"eval_steps_per_second": 0.858, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.131868131868132, |
|
"grad_norm": 1.3437438345520127, |
|
"learning_rate": 8.678277695601871e-07, |
|
"loss": 2.201, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 3.131868131868132, |
|
"eval_loss": 2.221846103668213, |
|
"eval_runtime": 94.5168, |
|
"eval_samples_per_second": 34.682, |
|
"eval_steps_per_second": 0.868, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 3.186813186813187, |
|
"grad_norm": 1.4708322103599034, |
|
"learning_rate": 8.612648633666406e-07, |
|
"loss": 2.2055, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.186813186813187, |
|
"eval_loss": 2.2196929454803467, |
|
"eval_runtime": 96.225, |
|
"eval_samples_per_second": 34.066, |
|
"eval_steps_per_second": 0.852, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.241758241758242, |
|
"grad_norm": 1.3792124479226802, |
|
"learning_rate": 8.545690696025665e-07, |
|
"loss": 2.194, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 3.241758241758242, |
|
"eval_loss": 2.2177319526672363, |
|
"eval_runtime": 95.0779, |
|
"eval_samples_per_second": 34.477, |
|
"eval_steps_per_second": 0.862, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 3.2967032967032965, |
|
"grad_norm": 1.3913138741477784, |
|
"learning_rate": 8.477428512469487e-07, |
|
"loss": 2.2053, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.2967032967032965, |
|
"eval_loss": 2.215750217437744, |
|
"eval_runtime": 96.5467, |
|
"eval_samples_per_second": 33.952, |
|
"eval_steps_per_second": 0.849, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.3516483516483517, |
|
"grad_norm": 1.3956533495293986, |
|
"learning_rate": 8.407887192541176e-07, |
|
"loss": 2.1918, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 3.3516483516483517, |
|
"eval_loss": 2.2138054370880127, |
|
"eval_runtime": 94.5042, |
|
"eval_samples_per_second": 34.686, |
|
"eval_steps_per_second": 0.868, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 3.4065934065934065, |
|
"grad_norm": 1.403182174578336, |
|
"learning_rate": 8.337092316301222e-07, |
|
"loss": 2.1896, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.4065934065934065, |
|
"eval_loss": 2.2118804454803467, |
|
"eval_runtime": 95.8227, |
|
"eval_samples_per_second": 34.209, |
|
"eval_steps_per_second": 0.856, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.4615384615384617, |
|
"grad_norm": 1.455355254263044, |
|
"learning_rate": 8.265069924917924e-07, |
|
"loss": 2.2064, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 3.4615384615384617, |
|
"eval_loss": 2.2100589275360107, |
|
"eval_runtime": 94.8251, |
|
"eval_samples_per_second": 34.569, |
|
"eval_steps_per_second": 0.865, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 3.5164835164835164, |
|
"grad_norm": 1.3983545229342624, |
|
"learning_rate": 8.191846511088434e-07, |
|
"loss": 2.1783, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.5164835164835164, |
|
"eval_loss": 2.2082486152648926, |
|
"eval_runtime": 95.215, |
|
"eval_samples_per_second": 34.427, |
|
"eval_steps_per_second": 0.861, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 1.4400799384553278, |
|
"learning_rate": 8.117449009293668e-07, |
|
"loss": 2.1767, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"eval_loss": 2.2065377235412598, |
|
"eval_runtime": 95.7035, |
|
"eval_samples_per_second": 34.252, |
|
"eval_steps_per_second": 0.857, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 3.6263736263736264, |
|
"grad_norm": 1.3475863787147715, |
|
"learning_rate": 8.041904785890748e-07, |
|
"loss": 2.1903, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.6263736263736264, |
|
"eval_loss": 2.20478892326355, |
|
"eval_runtime": 94.3907, |
|
"eval_samples_per_second": 34.728, |
|
"eval_steps_per_second": 0.869, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.6813186813186816, |
|
"grad_norm": 1.3998921895857097, |
|
"learning_rate": 7.96524162904657e-07, |
|
"loss": 2.1832, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.6813186813186816, |
|
"eval_loss": 2.2030811309814453, |
|
"eval_runtime": 96.3082, |
|
"eval_samples_per_second": 34.037, |
|
"eval_steps_per_second": 0.851, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.7362637362637363, |
|
"grad_norm": 1.4095299216187385, |
|
"learning_rate": 7.8874877385162e-07, |
|
"loss": 2.175, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.7362637362637363, |
|
"eval_loss": 2.2014122009277344, |
|
"eval_runtime": 94.5209, |
|
"eval_samples_per_second": 34.68, |
|
"eval_steps_per_second": 0.868, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.791208791208791, |
|
"grad_norm": 1.4014291539296442, |
|
"learning_rate": 7.808671715269894e-07, |
|
"loss": 2.1822, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.791208791208791, |
|
"eval_loss": 2.199869155883789, |
|
"eval_runtime": 96.9073, |
|
"eval_samples_per_second": 33.826, |
|
"eval_steps_per_second": 0.846, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 1.4012685963925429, |
|
"learning_rate": 7.728822550972522e-07, |
|
"loss": 2.1737, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"eval_loss": 2.1982617378234863, |
|
"eval_runtime": 94.1727, |
|
"eval_samples_per_second": 34.808, |
|
"eval_steps_per_second": 0.871, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.901098901098901, |
|
"grad_norm": 1.429348338650559, |
|
"learning_rate": 7.647969617319282e-07, |
|
"loss": 2.1792, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.901098901098901, |
|
"eval_loss": 2.1967613697052, |
|
"eval_runtime": 96.0694, |
|
"eval_samples_per_second": 34.121, |
|
"eval_steps_per_second": 0.854, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.956043956043956, |
|
"grad_norm": 1.417580581225819, |
|
"learning_rate": 7.566142655231621e-07, |
|
"loss": 2.1815, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.956043956043956, |
|
"eval_loss": 2.1953213214874268, |
|
"eval_runtime": 94.1895, |
|
"eval_samples_per_second": 34.802, |
|
"eval_steps_per_second": 0.871, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.010989010989011, |
|
"grad_norm": 1.3810060520884369, |
|
"learning_rate": 7.483371763917345e-07, |
|
"loss": 2.1754, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 4.010989010989011, |
|
"eval_loss": 2.19380259513855, |
|
"eval_runtime": 95.3311, |
|
"eval_samples_per_second": 34.385, |
|
"eval_steps_per_second": 0.86, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 4.065934065934066, |
|
"grad_norm": 1.3763895625796003, |
|
"learning_rate": 7.399687389798932e-07, |
|
"loss": 2.1689, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.065934065934066, |
|
"eval_loss": 2.192410945892334, |
|
"eval_runtime": 94.2976, |
|
"eval_samples_per_second": 34.762, |
|
"eval_steps_per_second": 0.87, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.1208791208791204, |
|
"grad_norm": 1.4248863012562965, |
|
"learning_rate": 7.315120315314134e-07, |
|
"loss": 2.1618, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 4.1208791208791204, |
|
"eval_loss": 2.1910791397094727, |
|
"eval_runtime": 95.4906, |
|
"eval_samples_per_second": 34.328, |
|
"eval_steps_per_second": 0.859, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 4.175824175824176, |
|
"grad_norm": 1.3935879500953352, |
|
"learning_rate": 7.229701647592965e-07, |
|
"loss": 2.1729, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.175824175824176, |
|
"eval_loss": 2.1896872520446777, |
|
"eval_runtime": 94.2919, |
|
"eval_samples_per_second": 34.764, |
|
"eval_steps_per_second": 0.87, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.230769230769231, |
|
"grad_norm": 1.3317520241186727, |
|
"learning_rate": 7.14346280701527e-07, |
|
"loss": 2.1576, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 4.230769230769231, |
|
"eval_loss": 2.188359498977661, |
|
"eval_runtime": 95.1008, |
|
"eval_samples_per_second": 34.469, |
|
"eval_steps_per_second": 0.862, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 1.4243898404737314, |
|
"learning_rate": 7.056435515653058e-07, |
|
"loss": 2.1719, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"eval_loss": 2.1870105266571045, |
|
"eval_runtime": 95.4713, |
|
"eval_samples_per_second": 34.335, |
|
"eval_steps_per_second": 0.859, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 4.34065934065934, |
|
"grad_norm": 1.4055766721244105, |
|
"learning_rate": 6.968651785601858e-07, |
|
"loss": 2.1569, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 4.34065934065934, |
|
"eval_loss": 2.1857264041900635, |
|
"eval_runtime": 94.6681, |
|
"eval_samples_per_second": 34.626, |
|
"eval_steps_per_second": 0.866, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 4.395604395604396, |
|
"grad_norm": 1.4109071900833123, |
|
"learning_rate": 6.88014390720541e-07, |
|
"loss": 2.1602, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.395604395604396, |
|
"eval_loss": 2.1845154762268066, |
|
"eval_runtime": 96.5176, |
|
"eval_samples_per_second": 33.963, |
|
"eval_steps_per_second": 0.85, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.450549450549451, |
|
"grad_norm": 1.3966564162441588, |
|
"learning_rate": 6.790944437177983e-07, |
|
"loss": 2.1444, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 4.450549450549451, |
|
"eval_loss": 2.183314800262451, |
|
"eval_runtime": 94.7139, |
|
"eval_samples_per_second": 34.61, |
|
"eval_steps_per_second": 0.866, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 4.5054945054945055, |
|
"grad_norm": 1.4078350070104653, |
|
"learning_rate": 6.701086186628731e-07, |
|
"loss": 2.1507, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 4.5054945054945055, |
|
"eval_loss": 2.182114362716675, |
|
"eval_runtime": 95.3459, |
|
"eval_samples_per_second": 34.38, |
|
"eval_steps_per_second": 0.86, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 4.56043956043956, |
|
"grad_norm": 1.3979239637585987, |
|
"learning_rate": 6.610602208992452e-07, |
|
"loss": 2.1562, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 4.56043956043956, |
|
"eval_loss": 2.1809184551239014, |
|
"eval_runtime": 94.8819, |
|
"eval_samples_per_second": 34.548, |
|
"eval_steps_per_second": 0.864, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 1.4173323621445668, |
|
"learning_rate": 6.519525787871234e-07, |
|
"loss": 2.1507, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"eval_loss": 2.179769992828369, |
|
"eval_runtime": 94.615, |
|
"eval_samples_per_second": 34.646, |
|
"eval_steps_per_second": 0.867, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.670329670329671, |
|
"grad_norm": 1.4495494766154244, |
|
"learning_rate": 6.427890424791413e-07, |
|
"loss": 2.1456, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 4.670329670329671, |
|
"eval_loss": 2.1786587238311768, |
|
"eval_runtime": 95.2766, |
|
"eval_samples_per_second": 34.405, |
|
"eval_steps_per_second": 0.861, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 4.725274725274725, |
|
"grad_norm": 1.3978628311813146, |
|
"learning_rate": 6.335729826880389e-07, |
|
"loss": 2.1527, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 4.725274725274725, |
|
"eval_loss": 2.177562713623047, |
|
"eval_runtime": 95.0839, |
|
"eval_samples_per_second": 34.475, |
|
"eval_steps_per_second": 0.862, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 4.78021978021978, |
|
"grad_norm": 1.3616533998206952, |
|
"learning_rate": 6.243077894467799e-07, |
|
"loss": 2.1523, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 4.78021978021978, |
|
"eval_loss": 2.1765596866607666, |
|
"eval_runtime": 95.2854, |
|
"eval_samples_per_second": 34.402, |
|
"eval_steps_per_second": 0.861, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 4.835164835164835, |
|
"grad_norm": 1.40687668442058, |
|
"learning_rate": 6.149968708615634e-07, |
|
"loss": 2.1514, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 4.835164835164835, |
|
"eval_loss": 2.175467014312744, |
|
"eval_runtime": 94.6288, |
|
"eval_samples_per_second": 34.641, |
|
"eval_steps_per_second": 0.867, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 4.8901098901098905, |
|
"grad_norm": 1.3739992174314783, |
|
"learning_rate": 6.056436518581864e-07, |
|
"loss": 2.1363, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 4.8901098901098905, |
|
"eval_loss": 2.174509048461914, |
|
"eval_runtime": 95.7561, |
|
"eval_samples_per_second": 34.233, |
|
"eval_steps_per_second": 0.856, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 4.945054945054945, |
|
"grad_norm": 1.4046699052180842, |
|
"learning_rate": 5.962515729222208e-07, |
|
"loss": 2.1515, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 4.945054945054945, |
|
"eval_loss": 2.173485040664673, |
|
"eval_runtime": 94.9608, |
|
"eval_samples_per_second": 34.52, |
|
"eval_steps_per_second": 0.864, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.3856994205796709, |
|
"learning_rate": 5.868240888334652e-07, |
|
"loss": 2.1446, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.1724860668182373, |
|
"eval_runtime": 95.9629, |
|
"eval_samples_per_second": 34.159, |
|
"eval_steps_per_second": 0.854, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 5.054945054945055, |
|
"grad_norm": 1.359907555644168, |
|
"learning_rate": 5.773646673951406e-07, |
|
"loss": 2.1449, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.054945054945055, |
|
"eval_loss": 2.1716203689575195, |
|
"eval_runtime": 94.6944, |
|
"eval_samples_per_second": 34.617, |
|
"eval_steps_per_second": 0.866, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.1098901098901095, |
|
"grad_norm": 1.397318242165103, |
|
"learning_rate": 5.67876788158294e-07, |
|
"loss": 2.151, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 5.1098901098901095, |
|
"eval_loss": 2.170758008956909, |
|
"eval_runtime": 94.1168, |
|
"eval_samples_per_second": 34.829, |
|
"eval_steps_per_second": 0.871, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 5.164835164835165, |
|
"grad_norm": 1.4236578949268166, |
|
"learning_rate": 5.58363941141881e-07, |
|
"loss": 2.135, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 5.164835164835165, |
|
"eval_loss": 2.1698837280273438, |
|
"eval_runtime": 95.886, |
|
"eval_samples_per_second": 34.186, |
|
"eval_steps_per_second": 0.855, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 5.21978021978022, |
|
"grad_norm": 1.3923617942385993, |
|
"learning_rate": 5.48829625548999e-07, |
|
"loss": 2.1378, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 5.21978021978022, |
|
"eval_loss": 2.169065475463867, |
|
"eval_runtime": 94.683, |
|
"eval_samples_per_second": 34.621, |
|
"eval_steps_per_second": 0.866, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 5.274725274725275, |
|
"grad_norm": 1.3673257335946538, |
|
"learning_rate": 5.392773484797406e-07, |
|
"loss": 2.1312, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 5.274725274725275, |
|
"eval_loss": 2.1681604385375977, |
|
"eval_runtime": 96.7854, |
|
"eval_samples_per_second": 33.869, |
|
"eval_steps_per_second": 0.847, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 5.329670329670329, |
|
"grad_norm": 1.376334253291603, |
|
"learning_rate": 5.297106236411431e-07, |
|
"loss": 2.1334, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 5.329670329670329, |
|
"eval_loss": 2.1673035621643066, |
|
"eval_runtime": 94.0748, |
|
"eval_samples_per_second": 34.845, |
|
"eval_steps_per_second": 0.872, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 5.384615384615385, |
|
"grad_norm": 1.4116620429830442, |
|
"learning_rate": 5.201329700547076e-07, |
|
"loss": 2.1287, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 5.384615384615385, |
|
"eval_loss": 2.1665842533111572, |
|
"eval_runtime": 97.1342, |
|
"eval_samples_per_second": 33.747, |
|
"eval_steps_per_second": 0.844, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 5.43956043956044, |
|
"grad_norm": 1.3882977350312697, |
|
"learning_rate": 5.105479107619623e-07, |
|
"loss": 2.1371, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 5.43956043956044, |
|
"eval_loss": 2.1657893657684326, |
|
"eval_runtime": 94.6092, |
|
"eval_samples_per_second": 34.648, |
|
"eval_steps_per_second": 0.867, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 5.4945054945054945, |
|
"grad_norm": 1.3387430328337173, |
|
"learning_rate": 5.009589715285492e-07, |
|
"loss": 2.1283, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 5.4945054945054945, |
|
"eval_loss": 2.1649773120880127, |
|
"eval_runtime": 95.6455, |
|
"eval_samples_per_second": 34.272, |
|
"eval_steps_per_second": 0.857, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 5.549450549450549, |
|
"grad_norm": 1.3369955383377354, |
|
"learning_rate": 4.913696795473058e-07, |
|
"loss": 2.1304, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 5.549450549450549, |
|
"eval_loss": 2.1642661094665527, |
|
"eval_runtime": 94.082, |
|
"eval_samples_per_second": 34.842, |
|
"eval_steps_per_second": 0.872, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 5.604395604395604, |
|
"grad_norm": 1.4094356210593881, |
|
"learning_rate": 4.81783562140825e-07, |
|
"loss": 2.1263, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 5.604395604395604, |
|
"eval_loss": 2.1635727882385254, |
|
"eval_runtime": 96.1766, |
|
"eval_samples_per_second": 34.083, |
|
"eval_steps_per_second": 0.853, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 5.65934065934066, |
|
"grad_norm": 1.304106079776388, |
|
"learning_rate": 4.722041454639645e-07, |
|
"loss": 2.1367, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 5.65934065934066, |
|
"eval_loss": 2.1628613471984863, |
|
"eval_runtime": 94.3841, |
|
"eval_samples_per_second": 34.73, |
|
"eval_steps_per_second": 0.869, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 1.3827334256393926, |
|
"learning_rate": 4.626349532067879e-07, |
|
"loss": 2.1207, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"eval_loss": 2.162167549133301, |
|
"eval_runtime": 95.9116, |
|
"eval_samples_per_second": 34.177, |
|
"eval_steps_per_second": 0.855, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 5.769230769230769, |
|
"grad_norm": 1.4133703140162626, |
|
"learning_rate": 4.530795052984104e-07, |
|
"loss": 2.126, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 5.769230769230769, |
|
"eval_loss": 2.161437511444092, |
|
"eval_runtime": 94.7934, |
|
"eval_samples_per_second": 34.58, |
|
"eval_steps_per_second": 0.865, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 5.824175824175824, |
|
"grad_norm": 1.3623534884797142, |
|
"learning_rate": 4.4354131661222993e-07, |
|
"loss": 2.1178, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 5.824175824175824, |
|
"eval_loss": 2.1607697010040283, |
|
"eval_runtime": 95.1583, |
|
"eval_samples_per_second": 34.448, |
|
"eval_steps_per_second": 0.862, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 5.8791208791208796, |
|
"grad_norm": 1.3721824145937465, |
|
"learning_rate": 4.3402389567301687e-07, |
|
"loss": 2.1317, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 5.8791208791208796, |
|
"eval_loss": 2.1601507663726807, |
|
"eval_runtime": 95.1179, |
|
"eval_samples_per_second": 34.462, |
|
"eval_steps_per_second": 0.862, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 5.934065934065934, |
|
"grad_norm": 1.354947196331287, |
|
"learning_rate": 4.245307433663388e-07, |
|
"loss": 2.1208, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 5.934065934065934, |
|
"eval_loss": 2.15952730178833, |
|
"eval_runtime": 95.2363, |
|
"eval_samples_per_second": 34.42, |
|
"eval_steps_per_second": 0.861, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 5.989010989010989, |
|
"grad_norm": 1.3836862806091863, |
|
"learning_rate": 4.1506535165079637e-07, |
|
"loss": 2.131, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 5.989010989010989, |
|
"eval_loss": 2.1588995456695557, |
|
"eval_runtime": 95.977, |
|
"eval_samples_per_second": 34.154, |
|
"eval_steps_per_second": 0.854, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 6.043956043956044, |
|
"grad_norm": 1.4174811691080422, |
|
"learning_rate": 4.056312022735417e-07, |
|
"loss": 2.1282, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 6.043956043956044, |
|
"eval_loss": 2.1583850383758545, |
|
"eval_runtime": 94.9634, |
|
"eval_samples_per_second": 34.519, |
|
"eval_steps_per_second": 0.863, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 6.0989010989010985, |
|
"grad_norm": 1.386136241074018, |
|
"learning_rate": 3.962317654895532e-07, |
|
"loss": 2.1071, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 6.0989010989010985, |
|
"eval_loss": 2.157827854156494, |
|
"eval_runtime": 96.6235, |
|
"eval_samples_per_second": 33.925, |
|
"eval_steps_per_second": 0.849, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 1.382143872347298, |
|
"learning_rate": 3.86870498785139e-07, |
|
"loss": 2.1152, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"eval_loss": 2.1572675704956055, |
|
"eval_runtime": 94.9844, |
|
"eval_samples_per_second": 34.511, |
|
"eval_steps_per_second": 0.863, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 6.208791208791209, |
|
"grad_norm": 1.3117267968176445, |
|
"learning_rate": 3.7755084560613454e-07, |
|
"loss": 2.1274, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 6.208791208791209, |
|
"eval_loss": 2.156782865524292, |
|
"eval_runtime": 95.0906, |
|
"eval_samples_per_second": 34.472, |
|
"eval_steps_per_second": 0.862, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 6.263736263736264, |
|
"grad_norm": 1.3153022809674055, |
|
"learning_rate": 3.682762340912681e-07, |
|
"loss": 2.125, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 6.263736263736264, |
|
"eval_loss": 2.156236410140991, |
|
"eval_runtime": 94.7673, |
|
"eval_samples_per_second": 34.59, |
|
"eval_steps_per_second": 0.865, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 6.318681318681318, |
|
"grad_norm": 1.363719531307913, |
|
"learning_rate": 3.590500758111537e-07, |
|
"loss": 2.1253, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 6.318681318681318, |
|
"eval_loss": 2.1556880474090576, |
|
"eval_runtime": 95.0783, |
|
"eval_samples_per_second": 34.477, |
|
"eval_steps_per_second": 0.862, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 6.373626373626374, |
|
"grad_norm": 1.3306699270632527, |
|
"learning_rate": 3.498757645133805e-07, |
|
"loss": 2.1105, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 6.373626373626374, |
|
"eval_loss": 2.1551740169525146, |
|
"eval_runtime": 95.3795, |
|
"eval_samples_per_second": 34.368, |
|
"eval_steps_per_second": 0.86, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 6.428571428571429, |
|
"grad_norm": 1.4013786507282588, |
|
"learning_rate": 3.4075667487415785e-07, |
|
"loss": 2.1233, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 6.428571428571429, |
|
"eval_loss": 2.1547253131866455, |
|
"eval_runtime": 94.9401, |
|
"eval_samples_per_second": 34.527, |
|
"eval_steps_per_second": 0.864, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 6.483516483516484, |
|
"grad_norm": 1.340212819021932, |
|
"learning_rate": 3.3169616125697485e-07, |
|
"loss": 2.1082, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 6.483516483516484, |
|
"eval_loss": 2.1542816162109375, |
|
"eval_runtime": 96.1807, |
|
"eval_samples_per_second": 34.082, |
|
"eval_steps_per_second": 0.853, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 6.538461538461538, |
|
"grad_norm": 1.37559843039388, |
|
"learning_rate": 3.2269755647873214e-07, |
|
"loss": 2.116, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 6.538461538461538, |
|
"eval_loss": 2.153905153274536, |
|
"eval_runtime": 94.5357, |
|
"eval_samples_per_second": 34.675, |
|
"eval_steps_per_second": 0.867, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 6.593406593406593, |
|
"grad_norm": 1.3234676252365452, |
|
"learning_rate": 3.137641705838003e-07, |
|
"loss": 2.114, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 6.593406593406593, |
|
"eval_loss": 2.153465747833252, |
|
"eval_runtime": 96.7145, |
|
"eval_samples_per_second": 33.894, |
|
"eval_steps_per_second": 0.848, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 6.648351648351649, |
|
"grad_norm": 1.2963273381270386, |
|
"learning_rate": 3.048992896264527e-07, |
|
"loss": 2.1025, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 6.648351648351649, |
|
"eval_loss": 2.153031587600708, |
|
"eval_runtime": 94.9067, |
|
"eval_samples_per_second": 34.539, |
|
"eval_steps_per_second": 0.864, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 6.7032967032967035, |
|
"grad_norm": 1.363956310159158, |
|
"learning_rate": 2.9610617446212494e-07, |
|
"loss": 2.1174, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 6.7032967032967035, |
|
"eval_loss": 2.1526174545288086, |
|
"eval_runtime": 96.5514, |
|
"eval_samples_per_second": 33.951, |
|
"eval_steps_per_second": 0.849, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 6.758241758241758, |
|
"grad_norm": 1.324164018147815, |
|
"learning_rate": 2.8738805954794295e-07, |
|
"loss": 2.1158, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 6.758241758241758, |
|
"eval_loss": 2.1521835327148438, |
|
"eval_runtime": 94.9388, |
|
"eval_samples_per_second": 34.527, |
|
"eval_steps_per_second": 0.864, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 6.813186813186813, |
|
"grad_norm": 1.3310397406861454, |
|
"learning_rate": 2.7874815175296e-07, |
|
"loss": 2.1118, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 6.813186813186813, |
|
"eval_loss": 2.1518325805664062, |
|
"eval_runtime": 95.7818, |
|
"eval_samples_per_second": 34.224, |
|
"eval_steps_per_second": 0.856, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 6.868131868131869, |
|
"grad_norm": 1.306713284641784, |
|
"learning_rate": 2.7018962917854416e-07, |
|
"loss": 2.1219, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 6.868131868131869, |
|
"eval_loss": 2.1515119075775146, |
|
"eval_runtime": 94.8621, |
|
"eval_samples_per_second": 34.555, |
|
"eval_steps_per_second": 0.864, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 6.923076923076923, |
|
"grad_norm": 1.328321525865242, |
|
"learning_rate": 2.61715639989346e-07, |
|
"loss": 2.1088, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 6.923076923076923, |
|
"eval_loss": 2.1511785984039307, |
|
"eval_runtime": 94.9757, |
|
"eval_samples_per_second": 34.514, |
|
"eval_steps_per_second": 0.863, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 6.978021978021978, |
|
"grad_norm": 1.3354659608167203, |
|
"learning_rate": 2.5332930125527785e-07, |
|
"loss": 2.1188, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 6.978021978021978, |
|
"eval_loss": 2.150792360305786, |
|
"eval_runtime": 95.0995, |
|
"eval_samples_per_second": 34.469, |
|
"eval_steps_per_second": 0.862, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 7.032967032967033, |
|
"grad_norm": 1.3252421337619178, |
|
"learning_rate": 2.4503369780493216e-07, |
|
"loss": 2.0958, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 7.032967032967033, |
|
"eval_loss": 2.150482416152954, |
|
"eval_runtime": 94.5987, |
|
"eval_samples_per_second": 34.652, |
|
"eval_steps_per_second": 0.867, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 7.087912087912088, |
|
"grad_norm": 1.3025182371568629, |
|
"learning_rate": 2.3683188109085877e-07, |
|
"loss": 2.1162, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 7.087912087912088, |
|
"eval_loss": 2.150195598602295, |
|
"eval_runtime": 95.7892, |
|
"eval_samples_per_second": 34.221, |
|
"eval_steps_per_second": 0.856, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 7.142857142857143, |
|
"grad_norm": 1.3417203401169715, |
|
"learning_rate": 2.2872686806712032e-07, |
|
"loss": 2.112, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 7.142857142857143, |
|
"eval_loss": 2.14986515045166, |
|
"eval_runtime": 94.108, |
|
"eval_samples_per_second": 34.832, |
|
"eval_steps_per_second": 0.871, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 7.197802197802198, |
|
"grad_norm": 1.3125833946773793, |
|
"learning_rate": 2.2072164007953515e-07, |
|
"loss": 2.1108, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 7.197802197802198, |
|
"eval_loss": 2.1495869159698486, |
|
"eval_runtime": 96.3106, |
|
"eval_samples_per_second": 34.036, |
|
"eval_steps_per_second": 0.851, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 7.252747252747253, |
|
"grad_norm": 1.3727056594091913, |
|
"learning_rate": 2.1281914176902106e-07, |
|
"loss": 2.1105, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 7.252747252747253, |
|
"eval_loss": 2.149327039718628, |
|
"eval_runtime": 94.0785, |
|
"eval_samples_per_second": 34.843, |
|
"eval_steps_per_second": 0.872, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 7.3076923076923075, |
|
"grad_norm": 1.26720608418739, |
|
"learning_rate": 2.050222799884387e-07, |
|
"loss": 2.1119, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 7.3076923076923075, |
|
"eval_loss": 2.1490209102630615, |
|
"eval_runtime": 95.0812, |
|
"eval_samples_per_second": 34.476, |
|
"eval_steps_per_second": 0.862, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 7.362637362637362, |
|
"grad_norm": 1.3075408289058195, |
|
"learning_rate": 1.9733392273333595e-07, |
|
"loss": 2.1125, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 7.362637362637362, |
|
"eval_loss": 2.148756980895996, |
|
"eval_runtime": 94.7034, |
|
"eval_samples_per_second": 34.613, |
|
"eval_steps_per_second": 0.866, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 7.417582417582418, |
|
"grad_norm": 1.338274034103587, |
|
"learning_rate": 1.8975689808698546e-07, |
|
"loss": 2.1085, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 7.417582417582418, |
|
"eval_loss": 2.1485321521759033, |
|
"eval_runtime": 95.2917, |
|
"eval_samples_per_second": 34.4, |
|
"eval_steps_per_second": 0.861, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 7.472527472527473, |
|
"grad_norm": 1.2732739133631568, |
|
"learning_rate": 1.8229399318010234e-07, |
|
"loss": 2.113, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 7.472527472527473, |
|
"eval_loss": 2.1483404636383057, |
|
"eval_runtime": 96.15, |
|
"eval_samples_per_second": 34.093, |
|
"eval_steps_per_second": 0.853, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 7.527472527472527, |
|
"grad_norm": 1.3397177765715196, |
|
"learning_rate": 1.7494795316562787e-07, |
|
"loss": 2.1022, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 7.527472527472527, |
|
"eval_loss": 2.1481196880340576, |
|
"eval_runtime": 94.6054, |
|
"eval_samples_per_second": 34.649, |
|
"eval_steps_per_second": 0.867, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 7.582417582417582, |
|
"grad_norm": 1.315974901665471, |
|
"learning_rate": 1.6772148020895227e-07, |
|
"loss": 2.1005, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 7.582417582417582, |
|
"eval_loss": 2.1478824615478516, |
|
"eval_runtime": 96.7642, |
|
"eval_samples_per_second": 33.876, |
|
"eval_steps_per_second": 0.847, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 7.637362637362637, |
|
"grad_norm": 1.3175415298356188, |
|
"learning_rate": 1.6061723249395103e-07, |
|
"loss": 2.1061, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 7.637362637362637, |
|
"eval_loss": 2.147655963897705, |
|
"eval_runtime": 94.8865, |
|
"eval_samples_per_second": 34.547, |
|
"eval_steps_per_second": 0.864, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 1.2676400902435017, |
|
"learning_rate": 1.536378232452003e-07, |
|
"loss": 2.1113, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"eval_loss": 2.147427797317505, |
|
"eval_runtime": 96.4923, |
|
"eval_samples_per_second": 33.972, |
|
"eval_steps_per_second": 0.85, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 7.747252747252747, |
|
"grad_norm": 1.3178958103097314, |
|
"learning_rate": 1.4678581976672748e-07, |
|
"loss": 2.0939, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 7.747252747252747, |
|
"eval_loss": 2.147228479385376, |
|
"eval_runtime": 94.325, |
|
"eval_samples_per_second": 34.752, |
|
"eval_steps_per_second": 0.869, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 7.802197802197802, |
|
"grad_norm": 1.3384998637586858, |
|
"learning_rate": 1.4006374249765596e-07, |
|
"loss": 2.0993, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 7.802197802197802, |
|
"eval_loss": 2.147052764892578, |
|
"eval_runtime": 96.9054, |
|
"eval_samples_per_second": 33.827, |
|
"eval_steps_per_second": 0.846, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 7.857142857142857, |
|
"grad_norm": 1.280750721967852, |
|
"learning_rate": 1.3347406408508694e-07, |
|
"loss": 2.1115, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 7.857142857142857, |
|
"eval_loss": 2.146883249282837, |
|
"eval_runtime": 94.446, |
|
"eval_samples_per_second": 34.708, |
|
"eval_steps_per_second": 0.868, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 7.912087912087912, |
|
"grad_norm": 1.3140429519277008, |
|
"learning_rate": 1.2701920847456166e-07, |
|
"loss": 2.1043, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 7.912087912087912, |
|
"eval_loss": 2.146697998046875, |
|
"eval_runtime": 95.4996, |
|
"eval_samples_per_second": 34.325, |
|
"eval_steps_per_second": 0.859, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 7.967032967032967, |
|
"grad_norm": 1.3312499353255274, |
|
"learning_rate": 1.2070155001843835e-07, |
|
"loss": 2.1032, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 7.967032967032967, |
|
"eval_loss": 2.1465156078338623, |
|
"eval_runtime": 93.9989, |
|
"eval_samples_per_second": 34.873, |
|
"eval_steps_per_second": 0.872, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 8.021978021978022, |
|
"grad_norm": 1.3390325157168175, |
|
"learning_rate": 1.1452341260251019e-07, |
|
"loss": 2.0988, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 8.021978021978022, |
|
"eval_loss": 2.1463658809661865, |
|
"eval_runtime": 94.8926, |
|
"eval_samples_per_second": 34.544, |
|
"eval_steps_per_second": 0.864, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 8.076923076923077, |
|
"grad_norm": 1.3132771567223507, |
|
"learning_rate": 1.084870687911889e-07, |
|
"loss": 2.1021, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 8.076923076923077, |
|
"eval_loss": 2.146247625350952, |
|
"eval_runtime": 96.2255, |
|
"eval_samples_per_second": 34.066, |
|
"eval_steps_per_second": 0.852, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 8.131868131868131, |
|
"grad_norm": 1.2967777616488498, |
|
"learning_rate": 1.0259473899156429e-07, |
|
"loss": 2.0972, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 8.131868131868131, |
|
"eval_loss": 2.1461329460144043, |
|
"eval_runtime": 94.5501, |
|
"eval_samples_per_second": 34.669, |
|
"eval_steps_per_second": 0.867, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 8.186813186813186, |
|
"grad_norm": 1.2698397159733246, |
|
"learning_rate": 9.684859063665057e-08, |
|
"loss": 2.1034, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 8.186813186813186, |
|
"eval_loss": 2.1460211277008057, |
|
"eval_runtime": 95.3329, |
|
"eval_samples_per_second": 34.385, |
|
"eval_steps_per_second": 0.86, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 8.241758241758241, |
|
"grad_norm": 1.278484842006247, |
|
"learning_rate": 9.125073738811917e-08, |
|
"loss": 2.0955, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 8.241758241758241, |
|
"eval_loss": 2.1458938121795654, |
|
"eval_runtime": 94.9309, |
|
"eval_samples_per_second": 34.53, |
|
"eval_steps_per_second": 0.864, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 8.296703296703297, |
|
"grad_norm": 1.289197769580351, |
|
"learning_rate": 8.580323835880859e-08, |
|
"loss": 2.0997, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 8.296703296703297, |
|
"eval_loss": 2.1457700729370117, |
|
"eval_runtime": 94.6214, |
|
"eval_samples_per_second": 34.643, |
|
"eval_steps_per_second": 0.867, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 8.351648351648352, |
|
"grad_norm": 1.301153991983438, |
|
"learning_rate": 8.050809735530207e-08, |
|
"loss": 2.1016, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 8.351648351648352, |
|
"eval_loss": 2.1456449031829834, |
|
"eval_runtime": 95.3277, |
|
"eval_samples_per_second": 34.387, |
|
"eval_steps_per_second": 0.86, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 8.406593406593407, |
|
"grad_norm": 1.3044699064545437, |
|
"learning_rate": 7.53672621408472e-08, |
|
"loss": 2.107, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 8.406593406593407, |
|
"eval_loss": 2.145549774169922, |
|
"eval_runtime": 93.7346, |
|
"eval_samples_per_second": 34.971, |
|
"eval_steps_per_second": 0.875, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 8.461538461538462, |
|
"grad_norm": 1.2805052222372935, |
|
"learning_rate": 7.038262371889159e-08, |
|
"loss": 2.1033, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 8.461538461538462, |
|
"eval_loss": 2.145460605621338, |
|
"eval_runtime": 95.8972, |
|
"eval_samples_per_second": 34.182, |
|
"eval_steps_per_second": 0.855, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 8.516483516483516, |
|
"grad_norm": 1.3344128534699025, |
|
"learning_rate": 6.555601563749674e-08, |
|
"loss": 2.1081, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 8.516483516483516, |
|
"eval_loss": 2.1453983783721924, |
|
"eval_runtime": 94.7645, |
|
"eval_samples_per_second": 34.591, |
|
"eval_steps_per_second": 0.865, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 8.571428571428571, |
|
"grad_norm": 1.3534141294417172, |
|
"learning_rate": 6.088921331488566e-08, |
|
"loss": 2.1007, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 8.571428571428571, |
|
"eval_loss": 2.1453309059143066, |
|
"eval_runtime": 95.9005, |
|
"eval_samples_per_second": 34.181, |
|
"eval_steps_per_second": 0.855, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 8.626373626373626, |
|
"grad_norm": 1.289360061293496, |
|
"learning_rate": 5.6383933386374316e-08, |
|
"loss": 2.0954, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 8.626373626373626, |
|
"eval_loss": 2.1452713012695312, |
|
"eval_runtime": 94.327, |
|
"eval_samples_per_second": 34.751, |
|
"eval_steps_per_second": 0.869, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 8.68131868131868, |
|
"grad_norm": 1.290799173451955, |
|
"learning_rate": 5.204183307292409e-08, |
|
"loss": 2.0966, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 8.68131868131868, |
|
"eval_loss": 2.1452112197875977, |
|
"eval_runtime": 96.6287, |
|
"eval_samples_per_second": 33.924, |
|
"eval_steps_per_second": 0.849, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 8.736263736263737, |
|
"grad_norm": 1.296710622293866, |
|
"learning_rate": 4.786450957155064e-08, |
|
"loss": 2.105, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 8.736263736263737, |
|
"eval_loss": 2.1451542377471924, |
|
"eval_runtime": 94.7724, |
|
"eval_samples_per_second": 34.588, |
|
"eval_steps_per_second": 0.865, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 8.791208791208792, |
|
"grad_norm": 1.293112762829761, |
|
"learning_rate": 4.385349946781136e-08, |
|
"loss": 2.11, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 8.791208791208792, |
|
"eval_loss": 2.145106315612793, |
|
"eval_runtime": 95.5587, |
|
"eval_samples_per_second": 34.304, |
|
"eval_steps_per_second": 0.858, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 8.846153846153847, |
|
"grad_norm": 1.326895910564294, |
|
"learning_rate": 4.0010278170587884e-08, |
|
"loss": 2.1025, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 8.846153846153847, |
|
"eval_loss": 2.1450610160827637, |
|
"eval_runtime": 94.083, |
|
"eval_samples_per_second": 34.842, |
|
"eval_steps_per_second": 0.872, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 8.901098901098901, |
|
"grad_norm": 1.303529256363753, |
|
"learning_rate": 3.633625936937229e-08, |
|
"loss": 2.1057, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 8.901098901098901, |
|
"eval_loss": 2.145014762878418, |
|
"eval_runtime": 94.2778, |
|
"eval_samples_per_second": 34.77, |
|
"eval_steps_per_second": 0.87, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 8.956043956043956, |
|
"grad_norm": 1.300189968606814, |
|
"learning_rate": 3.28327945142558e-08, |
|
"loss": 2.1084, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 8.956043956043956, |
|
"eval_loss": 2.1449670791625977, |
|
"eval_runtime": 95.579, |
|
"eval_samples_per_second": 34.296, |
|
"eval_steps_per_second": 0.858, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 9.010989010989011, |
|
"grad_norm": 1.3121642088661138, |
|
"learning_rate": 2.950117231881183e-08, |
|
"loss": 2.0948, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 9.010989010989011, |
|
"eval_loss": 2.144935131072998, |
|
"eval_runtime": 93.9326, |
|
"eval_samples_per_second": 34.897, |
|
"eval_steps_per_second": 0.873, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 9.065934065934066, |
|
"grad_norm": 1.2914729270696104, |
|
"learning_rate": 2.634261828605594e-08, |
|
"loss": 2.1031, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 9.065934065934066, |
|
"eval_loss": 2.144901752471924, |
|
"eval_runtime": 96.1619, |
|
"eval_samples_per_second": 34.088, |
|
"eval_steps_per_second": 0.853, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 9.12087912087912, |
|
"grad_norm": 1.2899551643117746, |
|
"learning_rate": 2.335829425765712e-08, |
|
"loss": 2.0946, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 9.12087912087912, |
|
"eval_loss": 2.1448748111724854, |
|
"eval_runtime": 93.6382, |
|
"eval_samples_per_second": 35.007, |
|
"eval_steps_per_second": 0.876, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 9.175824175824175, |
|
"grad_norm": 1.2788813311509164, |
|
"learning_rate": 2.0549297986566183e-08, |
|
"loss": 2.1076, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 9.175824175824175, |
|
"eval_loss": 2.144841194152832, |
|
"eval_runtime": 96.4177, |
|
"eval_samples_per_second": 33.998, |
|
"eval_steps_per_second": 0.85, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 1.3297319077943937, |
|
"learning_rate": 1.7916662733218846e-08, |
|
"loss": 2.0962, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"eval_loss": 2.144815683364868, |
|
"eval_runtime": 94.5433, |
|
"eval_samples_per_second": 34.672, |
|
"eval_steps_per_second": 0.867, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 9.285714285714286, |
|
"grad_norm": 1.3729180320680576, |
|
"learning_rate": 1.5461356885461075e-08, |
|
"loss": 2.0884, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 9.285714285714286, |
|
"eval_loss": 2.1447927951812744, |
|
"eval_runtime": 96.9528, |
|
"eval_samples_per_second": 33.81, |
|
"eval_steps_per_second": 0.846, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 9.340659340659341, |
|
"grad_norm": 1.3282581119274621, |
|
"learning_rate": 1.3184283602337864e-08, |
|
"loss": 2.1016, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 9.340659340659341, |
|
"eval_loss": 2.144777536392212, |
|
"eval_runtime": 94.2281, |
|
"eval_samples_per_second": 34.788, |
|
"eval_steps_per_second": 0.87, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 9.395604395604396, |
|
"grad_norm": 1.303774253858923, |
|
"learning_rate": 1.1086280481875653e-08, |
|
"loss": 2.1091, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 9.395604395604396, |
|
"eval_loss": 2.144756555557251, |
|
"eval_runtime": 96.7444, |
|
"eval_samples_per_second": 33.883, |
|
"eval_steps_per_second": 0.848, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 9.45054945054945, |
|
"grad_norm": 1.3340713352584999, |
|
"learning_rate": 9.168119252979945e-09, |
|
"loss": 2.1084, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 9.45054945054945, |
|
"eval_loss": 2.1447436809539795, |
|
"eval_runtime": 94.7907, |
|
"eval_samples_per_second": 34.581, |
|
"eval_steps_per_second": 0.865, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 9.505494505494505, |
|
"grad_norm": 1.297522419820979, |
|
"learning_rate": 7.430505491563099e-09, |
|
"loss": 2.1069, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 9.505494505494505, |
|
"eval_loss": 2.14473295211792, |
|
"eval_runtime": 95.2303, |
|
"eval_samples_per_second": 34.422, |
|
"eval_steps_per_second": 0.861, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 9.56043956043956, |
|
"grad_norm": 1.2976358260740801, |
|
"learning_rate": 5.874078361005563e-09, |
|
"loss": 2.1049, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 9.56043956043956, |
|
"eval_loss": 2.1447255611419678, |
|
"eval_runtime": 94.6371, |
|
"eval_samples_per_second": 34.638, |
|
"eval_steps_per_second": 0.866, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 9.615384615384615, |
|
"grad_norm": 1.30150242446251, |
|
"learning_rate": 4.499410377045765e-09, |
|
"loss": 2.0981, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 9.615384615384615, |
|
"eval_loss": 2.144716739654541, |
|
"eval_runtime": 94.4817, |
|
"eval_samples_per_second": 34.695, |
|
"eval_steps_per_second": 0.868, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 9.67032967032967, |
|
"grad_norm": 1.265689030842284, |
|
"learning_rate": 3.3070071971867398e-09, |
|
"loss": 2.0975, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 9.67032967032967, |
|
"eval_loss": 2.1447131633758545, |
|
"eval_runtime": 97.3588, |
|
"eval_samples_per_second": 33.669, |
|
"eval_steps_per_second": 0.842, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 9.725274725274724, |
|
"grad_norm": 1.3221353338950448, |
|
"learning_rate": 2.297307434694473e-09, |
|
"loss": 2.1033, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 9.725274725274724, |
|
"eval_loss": 2.14471173286438, |
|
"eval_runtime": 94.4717, |
|
"eval_samples_per_second": 34.698, |
|
"eval_steps_per_second": 0.868, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 9.780219780219781, |
|
"grad_norm": 1.2607874314626535, |
|
"learning_rate": 1.4706824972591237e-09, |
|
"loss": 2.1034, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 9.780219780219781, |
|
"eval_loss": 2.1447081565856934, |
|
"eval_runtime": 96.6148, |
|
"eval_samples_per_second": 33.929, |
|
"eval_steps_per_second": 0.849, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 9.835164835164836, |
|
"grad_norm": 1.293461540963835, |
|
"learning_rate": 8.274364503760845e-10, |
|
"loss": 2.0956, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 9.835164835164836, |
|
"eval_loss": 2.1447083950042725, |
|
"eval_runtime": 94.3064, |
|
"eval_samples_per_second": 34.759, |
|
"eval_steps_per_second": 0.87, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 9.89010989010989, |
|
"grad_norm": 1.279822041703267, |
|
"learning_rate": 3.678059054988969e-10, |
|
"loss": 2.0917, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 9.89010989010989, |
|
"eval_loss": 2.1447057723999023, |
|
"eval_runtime": 96.7952, |
|
"eval_samples_per_second": 33.865, |
|
"eval_steps_per_second": 0.847, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 9.945054945054945, |
|
"grad_norm": 1.3339227803991875, |
|
"learning_rate": 9.19599330039822e-11, |
|
"loss": 2.0983, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 9.945054945054945, |
|
"eval_loss": 2.144705295562744, |
|
"eval_runtime": 93.8029, |
|
"eval_samples_per_second": 34.946, |
|
"eval_steps_per_second": 0.874, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.2734998688150378, |
|
"learning_rate": 0.0, |
|
"loss": 2.1039, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 2.1447036266326904, |
|
"eval_runtime": 97.1323, |
|
"eval_samples_per_second": 33.748, |
|
"eval_steps_per_second": 0.844, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 910, |
|
"total_flos": 7.930133386100736e+16, |
|
"train_loss": 2.201316048024775, |
|
"train_runtime": 48928.7509, |
|
"train_samples_per_second": 5.948, |
|
"train_steps_per_second": 0.019 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 910, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.930133386100736e+16, |
|
"train_batch_size": 10, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|