{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 5, "global_step": 910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01098901098901099, "grad_norm": 2.787256796216871, "learning_rate": 1.098901098901099e-08, "loss": 2.6306, "step": 1 }, { "epoch": 0.054945054945054944, "grad_norm": 2.7208828057352172, "learning_rate": 5.494505494505494e-08, "loss": 2.652, "step": 5 }, { "epoch": 0.054945054945054944, "eval_loss": 2.6433982849121094, "eval_runtime": 98.8619, "eval_samples_per_second": 33.157, "eval_steps_per_second": 0.829, "step": 5 }, { "epoch": 0.10989010989010989, "grad_norm": 2.7039289723117808, "learning_rate": 1.0989010989010988e-07, "loss": 2.6262, "step": 10 }, { "epoch": 0.10989010989010989, "eval_loss": 2.6424529552459717, "eval_runtime": 96.9771, "eval_samples_per_second": 33.802, "eval_steps_per_second": 0.846, "step": 10 }, { "epoch": 0.16483516483516483, "grad_norm": 2.9271293165910413, "learning_rate": 1.6483516483516482e-07, "loss": 2.6452, "step": 15 }, { "epoch": 0.16483516483516483, "eval_loss": 2.6390280723571777, "eval_runtime": 94.3235, "eval_samples_per_second": 34.753, "eval_steps_per_second": 0.869, "step": 15 }, { "epoch": 0.21978021978021978, "grad_norm": 2.480796984302828, "learning_rate": 2.1978021978021976e-07, "loss": 2.6387, "step": 20 }, { "epoch": 0.21978021978021978, "eval_loss": 2.6297805309295654, "eval_runtime": 97.0464, "eval_samples_per_second": 33.778, "eval_steps_per_second": 0.845, "step": 20 }, { "epoch": 0.27472527472527475, "grad_norm": 2.3675810470522483, "learning_rate": 2.7472527472527475e-07, "loss": 2.6259, "step": 25 }, { "epoch": 0.27472527472527475, "eval_loss": 2.6251583099365234, "eval_runtime": 94.763, "eval_samples_per_second": 34.592, "eval_steps_per_second": 0.865, "step": 25 }, { "epoch": 0.32967032967032966, "grad_norm": 1.9222673443941214, "learning_rate": 3.2967032967032963e-07, "loss": 2.5997, "step": 30 }, { "epoch": 0.32967032967032966, "eval_loss": 2.603151559829712, "eval_runtime": 95.4458, "eval_samples_per_second": 34.344, "eval_steps_per_second": 0.859, "step": 30 }, { "epoch": 0.38461538461538464, "grad_norm": 1.9849926292026194, "learning_rate": 3.8461538461538463e-07, "loss": 2.5946, "step": 35 }, { "epoch": 0.38461538461538464, "eval_loss": 2.593891143798828, "eval_runtime": 94.3775, "eval_samples_per_second": 34.733, "eval_steps_per_second": 0.869, "step": 35 }, { "epoch": 0.43956043956043955, "grad_norm": 1.7366188084370735, "learning_rate": 4.395604395604395e-07, "loss": 2.5801, "step": 40 }, { "epoch": 0.43956043956043955, "eval_loss": 2.5582449436187744, "eval_runtime": 95.1258, "eval_samples_per_second": 34.46, "eval_steps_per_second": 0.862, "step": 40 }, { "epoch": 0.4945054945054945, "grad_norm": 1.5634290965231614, "learning_rate": 4.945054945054945e-07, "loss": 2.5387, "step": 45 }, { "epoch": 0.4945054945054945, "eval_loss": 2.539731979370117, "eval_runtime": 94.4215, "eval_samples_per_second": 34.717, "eval_steps_per_second": 0.868, "step": 45 }, { "epoch": 0.5494505494505495, "grad_norm": 1.5589987032678356, "learning_rate": 5.494505494505495e-07, "loss": 2.5425, "step": 50 }, { "epoch": 0.5494505494505495, "eval_loss": 2.5231664180755615, "eval_runtime": 94.9946, "eval_samples_per_second": 34.507, "eval_steps_per_second": 0.863, "step": 50 }, { "epoch": 0.6043956043956044, "grad_norm": 1.357945104882359, "learning_rate": 6.043956043956043e-07, "loss": 2.5065, "step": 55 }, { "epoch": 0.6043956043956044, "eval_loss": 2.5029337406158447, "eval_runtime": 95.3244, "eval_samples_per_second": 34.388, "eval_steps_per_second": 0.86, "step": 55 }, { "epoch": 0.6593406593406593, "grad_norm": 1.351901982993815, "learning_rate": 6.593406593406593e-07, "loss": 2.4884, "step": 60 }, { "epoch": 0.6593406593406593, "eval_loss": 2.48612380027771, "eval_runtime": 94.5792, "eval_samples_per_second": 34.659, "eval_steps_per_second": 0.867, "step": 60 }, { "epoch": 0.7142857142857143, "grad_norm": 1.3411768977322787, "learning_rate": 7.142857142857143e-07, "loss": 2.4708, "step": 65 }, { "epoch": 0.7142857142857143, "eval_loss": 2.4696457386016846, "eval_runtime": 95.9513, "eval_samples_per_second": 34.163, "eval_steps_per_second": 0.855, "step": 65 }, { "epoch": 0.7692307692307693, "grad_norm": 1.3228056998181177, "learning_rate": 7.692307692307693e-07, "loss": 2.4527, "step": 70 }, { "epoch": 0.7692307692307693, "eval_loss": 2.4544637203216553, "eval_runtime": 94.037, "eval_samples_per_second": 34.859, "eval_steps_per_second": 0.872, "step": 70 }, { "epoch": 0.8241758241758241, "grad_norm": 1.3089892743121356, "learning_rate": 8.241758241758241e-07, "loss": 2.4525, "step": 75 }, { "epoch": 0.8241758241758241, "eval_loss": 2.4406723976135254, "eval_runtime": 96.5873, "eval_samples_per_second": 33.938, "eval_steps_per_second": 0.849, "step": 75 }, { "epoch": 0.8791208791208791, "grad_norm": 1.352286963239502, "learning_rate": 8.79120879120879e-07, "loss": 2.4281, "step": 80 }, { "epoch": 0.8791208791208791, "eval_loss": 2.428374767303467, "eval_runtime": 94.3709, "eval_samples_per_second": 34.735, "eval_steps_per_second": 0.869, "step": 80 }, { "epoch": 0.9340659340659341, "grad_norm": 1.3538673636384089, "learning_rate": 9.340659340659341e-07, "loss": 2.4286, "step": 85 }, { "epoch": 0.9340659340659341, "eval_loss": 2.417691946029663, "eval_runtime": 96.296, "eval_samples_per_second": 34.041, "eval_steps_per_second": 0.852, "step": 85 }, { "epoch": 0.989010989010989, "grad_norm": 1.314242339957954, "learning_rate": 9.89010989010989e-07, "loss": 2.4127, "step": 90 }, { "epoch": 0.989010989010989, "eval_loss": 2.4079270362854004, "eval_runtime": 94.1094, "eval_samples_per_second": 34.832, "eval_steps_per_second": 0.871, "step": 90 }, { "epoch": 1.043956043956044, "grad_norm": 1.292934852988739, "learning_rate": 9.999411449933815e-07, "loss": 2.3982, "step": 95 }, { "epoch": 1.043956043956044, "eval_loss": 2.3987655639648438, "eval_runtime": 96.8027, "eval_samples_per_second": 33.863, "eval_steps_per_second": 0.847, "step": 95 }, { "epoch": 1.098901098901099, "grad_norm": 1.329633653617423, "learning_rate": 9.997020702755352e-07, "loss": 2.3983, "step": 100 }, { "epoch": 1.098901098901099, "eval_loss": 2.390303611755371, "eval_runtime": 94.1549, "eval_samples_per_second": 34.815, "eval_steps_per_second": 0.871, "step": 100 }, { "epoch": 1.1538461538461537, "grad_norm": 1.3449275632813102, "learning_rate": 9.992791852820708e-07, "loss": 2.3845, "step": 105 }, { "epoch": 1.1538461538461537, "eval_loss": 2.3824305534362793, "eval_runtime": 96.1609, "eval_samples_per_second": 34.089, "eval_steps_per_second": 0.853, "step": 105 }, { "epoch": 1.2087912087912087, "grad_norm": 1.2976258496024031, "learning_rate": 9.986726455668912e-07, "loss": 2.3741, "step": 110 }, { "epoch": 1.2087912087912087, "eval_loss": 2.374979257583618, "eval_runtime": 93.9379, "eval_samples_per_second": 34.895, "eval_steps_per_second": 0.873, "step": 110 }, { "epoch": 1.2637362637362637, "grad_norm": 1.3352972965753644, "learning_rate": 9.978826742394025e-07, "loss": 2.374, "step": 115 }, { "epoch": 1.2637362637362637, "eval_loss": 2.367839813232422, "eval_runtime": 95.5399, "eval_samples_per_second": 34.31, "eval_steps_per_second": 0.858, "step": 115 }, { "epoch": 1.3186813186813187, "grad_norm": 1.351442511494053, "learning_rate": 9.969095618824461e-07, "loss": 2.354, "step": 120 }, { "epoch": 1.3186813186813187, "eval_loss": 2.360889434814453, "eval_runtime": 94.4101, "eval_samples_per_second": 34.721, "eval_steps_per_second": 0.869, "step": 120 }, { "epoch": 1.3736263736263736, "grad_norm": 1.3462760051666864, "learning_rate": 9.95753666445411e-07, "loss": 2.3498, "step": 125 }, { "epoch": 1.3736263736263736, "eval_loss": 2.3541159629821777, "eval_runtime": 94.7992, "eval_samples_per_second": 34.578, "eval_steps_per_second": 0.865, "step": 125 }, { "epoch": 1.4285714285714286, "grad_norm": 1.3496352842968289, "learning_rate": 9.944154131125642e-07, "loss": 2.3324, "step": 130 }, { "epoch": 1.4285714285714286, "eval_loss": 2.347334861755371, "eval_runtime": 95.6461, "eval_samples_per_second": 34.272, "eval_steps_per_second": 0.857, "step": 130 }, { "epoch": 1.4835164835164836, "grad_norm": 1.3292817866036872, "learning_rate": 9.928952941466537e-07, "loss": 2.3386, "step": 135 }, { "epoch": 1.4835164835164836, "eval_loss": 2.3408091068267822, "eval_runtime": 94.7751, "eval_samples_per_second": 34.587, "eval_steps_per_second": 0.865, "step": 135 }, { "epoch": 1.5384615384615383, "grad_norm": 1.3030870244645119, "learning_rate": 9.911938687078323e-07, "loss": 2.3199, "step": 140 }, { "epoch": 1.5384615384615383, "eval_loss": 2.334508180618286, "eval_runtime": 96.7229, "eval_samples_per_second": 33.891, "eval_steps_per_second": 0.848, "step": 140 }, { "epoch": 1.5934065934065935, "grad_norm": 1.357889365983369, "learning_rate": 9.893117626479776e-07, "loss": 2.3192, "step": 145 }, { "epoch": 1.5934065934065935, "eval_loss": 2.3284120559692383, "eval_runtime": 94.5936, "eval_samples_per_second": 34.654, "eval_steps_per_second": 0.867, "step": 145 }, { "epoch": 1.6483516483516483, "grad_norm": 1.3682230754482416, "learning_rate": 9.87249668280478e-07, "loss": 2.3205, "step": 150 }, { "epoch": 1.6483516483516483, "eval_loss": 2.322436809539795, "eval_runtime": 96.3599, "eval_samples_per_second": 34.018, "eval_steps_per_second": 0.851, "step": 150 }, { "epoch": 1.7032967032967035, "grad_norm": 1.356238814707341, "learning_rate": 9.850083441255734e-07, "loss": 2.3152, "step": 155 }, { "epoch": 1.7032967032967035, "eval_loss": 2.3167214393615723, "eval_runtime": 94.0628, "eval_samples_per_second": 34.849, "eval_steps_per_second": 0.872, "step": 155 }, { "epoch": 1.7582417582417582, "grad_norm": 1.3669835941127673, "learning_rate": 9.8258861463134e-07, "loss": 2.309, "step": 160 }, { "epoch": 1.7582417582417582, "eval_loss": 2.311154365539551, "eval_runtime": 95.3173, "eval_samples_per_second": 34.39, "eval_steps_per_second": 0.86, "step": 160 }, { "epoch": 1.8131868131868132, "grad_norm": 1.370896874190673, "learning_rate": 9.799913698704268e-07, "loss": 2.2937, "step": 165 }, { "epoch": 1.8131868131868132, "eval_loss": 2.3057687282562256, "eval_runtime": 95.653, "eval_samples_per_second": 34.27, "eval_steps_per_second": 0.857, "step": 165 }, { "epoch": 1.8681318681318682, "grad_norm": 1.383985193963144, "learning_rate": 9.772175652126504e-07, "loss": 2.2957, "step": 170 }, { "epoch": 1.8681318681318682, "eval_loss": 2.300417423248291, "eval_runtime": 94.5548, "eval_samples_per_second": 34.668, "eval_steps_per_second": 0.867, "step": 170 }, { "epoch": 1.9230769230769231, "grad_norm": 1.3861435502681547, "learning_rate": 9.742682209735727e-07, "loss": 2.2941, "step": 175 }, { "epoch": 1.9230769230769231, "eval_loss": 2.2951645851135254, "eval_runtime": 96.181, "eval_samples_per_second": 34.082, "eval_steps_per_second": 0.853, "step": 175 }, { "epoch": 1.978021978021978, "grad_norm": 1.387939254087276, "learning_rate": 9.711444220391885e-07, "loss": 2.2986, "step": 180 }, { "epoch": 1.978021978021978, "eval_loss": 2.290034055709839, "eval_runtime": 94.7723, "eval_samples_per_second": 34.588, "eval_steps_per_second": 0.865, "step": 180 }, { "epoch": 2.032967032967033, "grad_norm": 1.4083527156579858, "learning_rate": 9.678473174668605e-07, "loss": 2.2759, "step": 185 }, { "epoch": 2.032967032967033, "eval_loss": 2.2850897312164307, "eval_runtime": 96.6234, "eval_samples_per_second": 33.926, "eval_steps_per_second": 0.849, "step": 185 }, { "epoch": 2.087912087912088, "grad_norm": 1.3901375675143754, "learning_rate": 9.64378120062651e-07, "loss": 2.26, "step": 190 }, { "epoch": 2.087912087912088, "eval_loss": 2.2804834842681885, "eval_runtime": 94.7228, "eval_samples_per_second": 34.606, "eval_steps_per_second": 0.866, "step": 190 }, { "epoch": 2.142857142857143, "grad_norm": 1.3559893959650073, "learning_rate": 9.607381059352038e-07, "loss": 2.2661, "step": 195 }, { "epoch": 2.142857142857143, "eval_loss": 2.27602481842041, "eval_runtime": 95.5105, "eval_samples_per_second": 34.321, "eval_steps_per_second": 0.859, "step": 195 }, { "epoch": 2.197802197802198, "grad_norm": 1.4338659258910222, "learning_rate": 9.569286140263397e-07, "loss": 2.265, "step": 200 }, { "epoch": 2.197802197802198, "eval_loss": 2.271735429763794, "eval_runtime": 95.2547, "eval_samples_per_second": 34.413, "eval_steps_per_second": 0.861, "step": 200 }, { "epoch": 2.2527472527472527, "grad_norm": 1.3511461203348951, "learning_rate": 9.529510456185417e-07, "loss": 2.2554, "step": 205 }, { "epoch": 2.2527472527472527, "eval_loss": 2.2676820755004883, "eval_runtime": 95.0959, "eval_samples_per_second": 34.47, "eval_steps_per_second": 0.862, "step": 205 }, { "epoch": 2.3076923076923075, "grad_norm": 1.3787695258849668, "learning_rate": 9.488068638195071e-07, "loss": 2.2558, "step": 210 }, { "epoch": 2.3076923076923075, "eval_loss": 2.2637946605682373, "eval_runtime": 96.7344, "eval_samples_per_second": 33.887, "eval_steps_per_second": 0.848, "step": 210 }, { "epoch": 2.3626373626373627, "grad_norm": 1.3944429206878337, "learning_rate": 9.444975930239581e-07, "loss": 2.2508, "step": 215 }, { "epoch": 2.3626373626373627, "eval_loss": 2.2600150108337402, "eval_runtime": 94.3417, "eval_samples_per_second": 34.746, "eval_steps_per_second": 0.869, "step": 215 }, { "epoch": 2.4175824175824174, "grad_norm": 1.36473652575597, "learning_rate": 9.400248183529092e-07, "loss": 2.2405, "step": 220 }, { "epoch": 2.4175824175824174, "eval_loss": 2.256455898284912, "eval_runtime": 96.4969, "eval_samples_per_second": 33.97, "eval_steps_per_second": 0.85, "step": 220 }, { "epoch": 2.4725274725274726, "grad_norm": 1.3975755281584308, "learning_rate": 9.353901850705972e-07, "loss": 2.2484, "step": 225 }, { "epoch": 2.4725274725274726, "eval_loss": 2.2530221939086914, "eval_runtime": 94.3914, "eval_samples_per_second": 34.728, "eval_steps_per_second": 0.869, "step": 225 }, { "epoch": 2.5274725274725274, "grad_norm": 1.4268198012910995, "learning_rate": 9.305953979792864e-07, "loss": 2.2388, "step": 230 }, { "epoch": 2.5274725274725274, "eval_loss": 2.2498600482940674, "eval_runtime": 96.755, "eval_samples_per_second": 33.879, "eval_steps_per_second": 0.848, "step": 230 }, { "epoch": 2.5824175824175826, "grad_norm": 1.405521450176673, "learning_rate": 9.256422207921756e-07, "loss": 2.2472, "step": 235 }, { "epoch": 2.5824175824175826, "eval_loss": 2.2467663288116455, "eval_runtime": 94.2329, "eval_samples_per_second": 34.786, "eval_steps_per_second": 0.87, "step": 235 }, { "epoch": 2.6373626373626373, "grad_norm": 1.3800086312953763, "learning_rate": 9.205324754846339e-07, "loss": 2.2253, "step": 240 }, { "epoch": 2.6373626373626373, "eval_loss": 2.2438271045684814, "eval_runtime": 95.1079, "eval_samples_per_second": 34.466, "eval_steps_per_second": 0.862, "step": 240 }, { "epoch": 2.6923076923076925, "grad_norm": 1.424541861104674, "learning_rate": 9.152680416240058e-07, "loss": 2.2356, "step": 245 }, { "epoch": 2.6923076923076925, "eval_loss": 2.2410500049591064, "eval_runtime": 94.5581, "eval_samples_per_second": 34.667, "eval_steps_per_second": 0.867, "step": 245 }, { "epoch": 2.7472527472527473, "grad_norm": 1.4211612108269323, "learning_rate": 9.09850855678232e-07, "loss": 2.2202, "step": 250 }, { "epoch": 2.7472527472527473, "eval_loss": 2.2383668422698975, "eval_runtime": 95.1818, "eval_samples_per_second": 34.439, "eval_steps_per_second": 0.862, "step": 250 }, { "epoch": 2.802197802197802, "grad_norm": 1.3505783784300605, "learning_rate": 9.042829103035389e-07, "loss": 2.2325, "step": 255 }, { "epoch": 2.802197802197802, "eval_loss": 2.2357189655303955, "eval_runtime": 96.4181, "eval_samples_per_second": 33.998, "eval_steps_per_second": 0.85, "step": 255 }, { "epoch": 2.857142857142857, "grad_norm": 1.413017015451658, "learning_rate": 8.985662536114612e-07, "loss": 2.2208, "step": 260 }, { "epoch": 2.857142857142857, "eval_loss": 2.2331881523132324, "eval_runtime": 95.0055, "eval_samples_per_second": 34.503, "eval_steps_per_second": 0.863, "step": 260 }, { "epoch": 2.912087912087912, "grad_norm": 1.4113571267131109, "learning_rate": 8.927029884154645e-07, "loss": 2.2228, "step": 265 }, { "epoch": 2.912087912087912, "eval_loss": 2.2307634353637695, "eval_runtime": 95.2567, "eval_samples_per_second": 34.412, "eval_steps_per_second": 0.861, "step": 265 }, { "epoch": 2.967032967032967, "grad_norm": 1.4351733805268447, "learning_rate": 8.866952714574469e-07, "loss": 2.204, "step": 270 }, { "epoch": 2.967032967032967, "eval_loss": 2.228379964828491, "eval_runtime": 94.9872, "eval_samples_per_second": 34.51, "eval_steps_per_second": 0.863, "step": 270 }, { "epoch": 3.021978021978022, "grad_norm": 1.4281979614756792, "learning_rate": 8.805453126144047e-07, "loss": 2.2071, "step": 275 }, { "epoch": 3.021978021978022, "eval_loss": 2.2260966300964355, "eval_runtime": 95.0392, "eval_samples_per_second": 34.491, "eval_steps_per_second": 0.863, "step": 275 }, { "epoch": 3.076923076923077, "grad_norm": 1.3917979753684913, "learning_rate": 8.742553740855505e-07, "loss": 2.2045, "step": 280 }, { "epoch": 3.076923076923077, "eval_loss": 2.2239363193511963, "eval_runtime": 95.5212, "eval_samples_per_second": 34.317, "eval_steps_per_second": 0.858, "step": 280 }, { "epoch": 3.131868131868132, "grad_norm": 1.3437438345520127, "learning_rate": 8.678277695601871e-07, "loss": 2.201, "step": 285 }, { "epoch": 3.131868131868132, "eval_loss": 2.221846103668213, "eval_runtime": 94.5168, "eval_samples_per_second": 34.682, "eval_steps_per_second": 0.868, "step": 285 }, { "epoch": 3.186813186813187, "grad_norm": 1.4708322103599034, "learning_rate": 8.612648633666406e-07, "loss": 2.2055, "step": 290 }, { "epoch": 3.186813186813187, "eval_loss": 2.2196929454803467, "eval_runtime": 96.225, "eval_samples_per_second": 34.066, "eval_steps_per_second": 0.852, "step": 290 }, { "epoch": 3.241758241758242, "grad_norm": 1.3792124479226802, "learning_rate": 8.545690696025665e-07, "loss": 2.194, "step": 295 }, { "epoch": 3.241758241758242, "eval_loss": 2.2177319526672363, "eval_runtime": 95.0779, "eval_samples_per_second": 34.477, "eval_steps_per_second": 0.862, "step": 295 }, { "epoch": 3.2967032967032965, "grad_norm": 1.3913138741477784, "learning_rate": 8.477428512469487e-07, "loss": 2.2053, "step": 300 }, { "epoch": 3.2967032967032965, "eval_loss": 2.215750217437744, "eval_runtime": 96.5467, "eval_samples_per_second": 33.952, "eval_steps_per_second": 0.849, "step": 300 }, { "epoch": 3.3516483516483517, "grad_norm": 1.3956533495293986, "learning_rate": 8.407887192541176e-07, "loss": 2.1918, "step": 305 }, { "epoch": 3.3516483516483517, "eval_loss": 2.2138054370880127, "eval_runtime": 94.5042, "eval_samples_per_second": 34.686, "eval_steps_per_second": 0.868, "step": 305 }, { "epoch": 3.4065934065934065, "grad_norm": 1.403182174578336, "learning_rate": 8.337092316301222e-07, "loss": 2.1896, "step": 310 }, { "epoch": 3.4065934065934065, "eval_loss": 2.2118804454803467, "eval_runtime": 95.8227, "eval_samples_per_second": 34.209, "eval_steps_per_second": 0.856, "step": 310 }, { "epoch": 3.4615384615384617, "grad_norm": 1.455355254263044, "learning_rate": 8.265069924917924e-07, "loss": 2.2064, "step": 315 }, { "epoch": 3.4615384615384617, "eval_loss": 2.2100589275360107, "eval_runtime": 94.8251, "eval_samples_per_second": 34.569, "eval_steps_per_second": 0.865, "step": 315 }, { "epoch": 3.5164835164835164, "grad_norm": 1.3983545229342624, "learning_rate": 8.191846511088434e-07, "loss": 2.1783, "step": 320 }, { "epoch": 3.5164835164835164, "eval_loss": 2.2082486152648926, "eval_runtime": 95.215, "eval_samples_per_second": 34.427, "eval_steps_per_second": 0.861, "step": 320 }, { "epoch": 3.571428571428571, "grad_norm": 1.4400799384553278, "learning_rate": 8.117449009293668e-07, "loss": 2.1767, "step": 325 }, { "epoch": 3.571428571428571, "eval_loss": 2.2065377235412598, "eval_runtime": 95.7035, "eval_samples_per_second": 34.252, "eval_steps_per_second": 0.857, "step": 325 }, { "epoch": 3.6263736263736264, "grad_norm": 1.3475863787147715, "learning_rate": 8.041904785890748e-07, "loss": 2.1903, "step": 330 }, { "epoch": 3.6263736263736264, "eval_loss": 2.20478892326355, "eval_runtime": 94.3907, "eval_samples_per_second": 34.728, "eval_steps_per_second": 0.869, "step": 330 }, { "epoch": 3.6813186813186816, "grad_norm": 1.3998921895857097, "learning_rate": 7.96524162904657e-07, "loss": 2.1832, "step": 335 }, { "epoch": 3.6813186813186816, "eval_loss": 2.2030811309814453, "eval_runtime": 96.3082, "eval_samples_per_second": 34.037, "eval_steps_per_second": 0.851, "step": 335 }, { "epoch": 3.7362637362637363, "grad_norm": 1.4095299216187385, "learning_rate": 7.8874877385162e-07, "loss": 2.175, "step": 340 }, { "epoch": 3.7362637362637363, "eval_loss": 2.2014122009277344, "eval_runtime": 94.5209, "eval_samples_per_second": 34.68, "eval_steps_per_second": 0.868, "step": 340 }, { "epoch": 3.791208791208791, "grad_norm": 1.4014291539296442, "learning_rate": 7.808671715269894e-07, "loss": 2.1822, "step": 345 }, { "epoch": 3.791208791208791, "eval_loss": 2.199869155883789, "eval_runtime": 96.9073, "eval_samples_per_second": 33.826, "eval_steps_per_second": 0.846, "step": 345 }, { "epoch": 3.8461538461538463, "grad_norm": 1.4012685963925429, "learning_rate": 7.728822550972522e-07, "loss": 2.1737, "step": 350 }, { "epoch": 3.8461538461538463, "eval_loss": 2.1982617378234863, "eval_runtime": 94.1727, "eval_samples_per_second": 34.808, "eval_steps_per_second": 0.871, "step": 350 }, { "epoch": 3.901098901098901, "grad_norm": 1.429348338650559, "learning_rate": 7.647969617319282e-07, "loss": 2.1792, "step": 355 }, { "epoch": 3.901098901098901, "eval_loss": 2.1967613697052, "eval_runtime": 96.0694, "eval_samples_per_second": 34.121, "eval_steps_per_second": 0.854, "step": 355 }, { "epoch": 3.956043956043956, "grad_norm": 1.417580581225819, "learning_rate": 7.566142655231621e-07, "loss": 2.1815, "step": 360 }, { "epoch": 3.956043956043956, "eval_loss": 2.1953213214874268, "eval_runtime": 94.1895, "eval_samples_per_second": 34.802, "eval_steps_per_second": 0.871, "step": 360 }, { "epoch": 4.010989010989011, "grad_norm": 1.3810060520884369, "learning_rate": 7.483371763917345e-07, "loss": 2.1754, "step": 365 }, { "epoch": 4.010989010989011, "eval_loss": 2.19380259513855, "eval_runtime": 95.3311, "eval_samples_per_second": 34.385, "eval_steps_per_second": 0.86, "step": 365 }, { "epoch": 4.065934065934066, "grad_norm": 1.3763895625796003, "learning_rate": 7.399687389798932e-07, "loss": 2.1689, "step": 370 }, { "epoch": 4.065934065934066, "eval_loss": 2.192410945892334, "eval_runtime": 94.2976, "eval_samples_per_second": 34.762, "eval_steps_per_second": 0.87, "step": 370 }, { "epoch": 4.1208791208791204, "grad_norm": 1.4248863012562965, "learning_rate": 7.315120315314134e-07, "loss": 2.1618, "step": 375 }, { "epoch": 4.1208791208791204, "eval_loss": 2.1910791397094727, "eval_runtime": 95.4906, "eval_samples_per_second": 34.328, "eval_steps_per_second": 0.859, "step": 375 }, { "epoch": 4.175824175824176, "grad_norm": 1.3935879500953352, "learning_rate": 7.229701647592965e-07, "loss": 2.1729, "step": 380 }, { "epoch": 4.175824175824176, "eval_loss": 2.1896872520446777, "eval_runtime": 94.2919, "eval_samples_per_second": 34.764, "eval_steps_per_second": 0.87, "step": 380 }, { "epoch": 4.230769230769231, "grad_norm": 1.3317520241186727, "learning_rate": 7.14346280701527e-07, "loss": 2.1576, "step": 385 }, { "epoch": 4.230769230769231, "eval_loss": 2.188359498977661, "eval_runtime": 95.1008, "eval_samples_per_second": 34.469, "eval_steps_per_second": 0.862, "step": 385 }, { "epoch": 4.285714285714286, "grad_norm": 1.4243898404737314, "learning_rate": 7.056435515653058e-07, "loss": 2.1719, "step": 390 }, { "epoch": 4.285714285714286, "eval_loss": 2.1870105266571045, "eval_runtime": 95.4713, "eval_samples_per_second": 34.335, "eval_steps_per_second": 0.859, "step": 390 }, { "epoch": 4.34065934065934, "grad_norm": 1.4055766721244105, "learning_rate": 6.968651785601858e-07, "loss": 2.1569, "step": 395 }, { "epoch": 4.34065934065934, "eval_loss": 2.1857264041900635, "eval_runtime": 94.6681, "eval_samples_per_second": 34.626, "eval_steps_per_second": 0.866, "step": 395 }, { "epoch": 4.395604395604396, "grad_norm": 1.4109071900833123, "learning_rate": 6.88014390720541e-07, "loss": 2.1602, "step": 400 }, { "epoch": 4.395604395604396, "eval_loss": 2.1845154762268066, "eval_runtime": 96.5176, "eval_samples_per_second": 33.963, "eval_steps_per_second": 0.85, "step": 400 }, { "epoch": 4.450549450549451, "grad_norm": 1.3966564162441588, "learning_rate": 6.790944437177983e-07, "loss": 2.1444, "step": 405 }, { "epoch": 4.450549450549451, "eval_loss": 2.183314800262451, "eval_runtime": 94.7139, "eval_samples_per_second": 34.61, "eval_steps_per_second": 0.866, "step": 405 }, { "epoch": 4.5054945054945055, "grad_norm": 1.4078350070104653, "learning_rate": 6.701086186628731e-07, "loss": 2.1507, "step": 410 }, { "epoch": 4.5054945054945055, "eval_loss": 2.182114362716675, "eval_runtime": 95.3459, "eval_samples_per_second": 34.38, "eval_steps_per_second": 0.86, "step": 410 }, { "epoch": 4.56043956043956, "grad_norm": 1.3979239637585987, "learning_rate": 6.610602208992452e-07, "loss": 2.1562, "step": 415 }, { "epoch": 4.56043956043956, "eval_loss": 2.1809184551239014, "eval_runtime": 94.8819, "eval_samples_per_second": 34.548, "eval_steps_per_second": 0.864, "step": 415 }, { "epoch": 4.615384615384615, "grad_norm": 1.4173323621445668, "learning_rate": 6.519525787871234e-07, "loss": 2.1507, "step": 420 }, { "epoch": 4.615384615384615, "eval_loss": 2.179769992828369, "eval_runtime": 94.615, "eval_samples_per_second": 34.646, "eval_steps_per_second": 0.867, "step": 420 }, { "epoch": 4.670329670329671, "grad_norm": 1.4495494766154244, "learning_rate": 6.427890424791413e-07, "loss": 2.1456, "step": 425 }, { "epoch": 4.670329670329671, "eval_loss": 2.1786587238311768, "eval_runtime": 95.2766, "eval_samples_per_second": 34.405, "eval_steps_per_second": 0.861, "step": 425 }, { "epoch": 4.725274725274725, "grad_norm": 1.3978628311813146, "learning_rate": 6.335729826880389e-07, "loss": 2.1527, "step": 430 }, { "epoch": 4.725274725274725, "eval_loss": 2.177562713623047, "eval_runtime": 95.0839, "eval_samples_per_second": 34.475, "eval_steps_per_second": 0.862, "step": 430 }, { "epoch": 4.78021978021978, "grad_norm": 1.3616533998206952, "learning_rate": 6.243077894467799e-07, "loss": 2.1523, "step": 435 }, { "epoch": 4.78021978021978, "eval_loss": 2.1765596866607666, "eval_runtime": 95.2854, "eval_samples_per_second": 34.402, "eval_steps_per_second": 0.861, "step": 435 }, { "epoch": 4.835164835164835, "grad_norm": 1.40687668442058, "learning_rate": 6.149968708615634e-07, "loss": 2.1514, "step": 440 }, { "epoch": 4.835164835164835, "eval_loss": 2.175467014312744, "eval_runtime": 94.6288, "eval_samples_per_second": 34.641, "eval_steps_per_second": 0.867, "step": 440 }, { "epoch": 4.8901098901098905, "grad_norm": 1.3739992174314783, "learning_rate": 6.056436518581864e-07, "loss": 2.1363, "step": 445 }, { "epoch": 4.8901098901098905, "eval_loss": 2.174509048461914, "eval_runtime": 95.7561, "eval_samples_per_second": 34.233, "eval_steps_per_second": 0.856, "step": 445 }, { "epoch": 4.945054945054945, "grad_norm": 1.4046699052180842, "learning_rate": 5.962515729222208e-07, "loss": 2.1515, "step": 450 }, { "epoch": 4.945054945054945, "eval_loss": 2.173485040664673, "eval_runtime": 94.9608, "eval_samples_per_second": 34.52, "eval_steps_per_second": 0.864, "step": 450 }, { "epoch": 5.0, "grad_norm": 1.3856994205796709, "learning_rate": 5.868240888334652e-07, "loss": 2.1446, "step": 455 }, { "epoch": 5.0, "eval_loss": 2.1724860668182373, "eval_runtime": 95.9629, "eval_samples_per_second": 34.159, "eval_steps_per_second": 0.854, "step": 455 }, { "epoch": 5.054945054945055, "grad_norm": 1.359907555644168, "learning_rate": 5.773646673951406e-07, "loss": 2.1449, "step": 460 }, { "epoch": 5.054945054945055, "eval_loss": 2.1716203689575195, "eval_runtime": 94.6944, "eval_samples_per_second": 34.617, "eval_steps_per_second": 0.866, "step": 460 }, { "epoch": 5.1098901098901095, "grad_norm": 1.397318242165103, "learning_rate": 5.67876788158294e-07, "loss": 2.151, "step": 465 }, { "epoch": 5.1098901098901095, "eval_loss": 2.170758008956909, "eval_runtime": 94.1168, "eval_samples_per_second": 34.829, "eval_steps_per_second": 0.871, "step": 465 }, { "epoch": 5.164835164835165, "grad_norm": 1.4236578949268166, "learning_rate": 5.58363941141881e-07, "loss": 2.135, "step": 470 }, { "epoch": 5.164835164835165, "eval_loss": 2.1698837280273438, "eval_runtime": 95.886, "eval_samples_per_second": 34.186, "eval_steps_per_second": 0.855, "step": 470 }, { "epoch": 5.21978021978022, "grad_norm": 1.3923617942385993, "learning_rate": 5.48829625548999e-07, "loss": 2.1378, "step": 475 }, { "epoch": 5.21978021978022, "eval_loss": 2.169065475463867, "eval_runtime": 94.683, "eval_samples_per_second": 34.621, "eval_steps_per_second": 0.866, "step": 475 }, { "epoch": 5.274725274725275, "grad_norm": 1.3673257335946538, "learning_rate": 5.392773484797406e-07, "loss": 2.1312, "step": 480 }, { "epoch": 5.274725274725275, "eval_loss": 2.1681604385375977, "eval_runtime": 96.7854, "eval_samples_per_second": 33.869, "eval_steps_per_second": 0.847, "step": 480 }, { "epoch": 5.329670329670329, "grad_norm": 1.376334253291603, "learning_rate": 5.297106236411431e-07, "loss": 2.1334, "step": 485 }, { "epoch": 5.329670329670329, "eval_loss": 2.1673035621643066, "eval_runtime": 94.0748, "eval_samples_per_second": 34.845, "eval_steps_per_second": 0.872, "step": 485 }, { "epoch": 5.384615384615385, "grad_norm": 1.4116620429830442, "learning_rate": 5.201329700547076e-07, "loss": 2.1287, "step": 490 }, { "epoch": 5.384615384615385, "eval_loss": 2.1665842533111572, "eval_runtime": 97.1342, "eval_samples_per_second": 33.747, "eval_steps_per_second": 0.844, "step": 490 }, { "epoch": 5.43956043956044, "grad_norm": 1.3882977350312697, "learning_rate": 5.105479107619623e-07, "loss": 2.1371, "step": 495 }, { "epoch": 5.43956043956044, "eval_loss": 2.1657893657684326, "eval_runtime": 94.6092, "eval_samples_per_second": 34.648, "eval_steps_per_second": 0.867, "step": 495 }, { "epoch": 5.4945054945054945, "grad_norm": 1.3387430328337173, "learning_rate": 5.009589715285492e-07, "loss": 2.1283, "step": 500 }, { "epoch": 5.4945054945054945, "eval_loss": 2.1649773120880127, "eval_runtime": 95.6455, "eval_samples_per_second": 34.272, "eval_steps_per_second": 0.857, "step": 500 }, { "epoch": 5.549450549450549, "grad_norm": 1.3369955383377354, "learning_rate": 4.913696795473058e-07, "loss": 2.1304, "step": 505 }, { "epoch": 5.549450549450549, "eval_loss": 2.1642661094665527, "eval_runtime": 94.082, "eval_samples_per_second": 34.842, "eval_steps_per_second": 0.872, "step": 505 }, { "epoch": 5.604395604395604, "grad_norm": 1.4094356210593881, "learning_rate": 4.81783562140825e-07, "loss": 2.1263, "step": 510 }, { "epoch": 5.604395604395604, "eval_loss": 2.1635727882385254, "eval_runtime": 96.1766, "eval_samples_per_second": 34.083, "eval_steps_per_second": 0.853, "step": 510 }, { "epoch": 5.65934065934066, "grad_norm": 1.304106079776388, "learning_rate": 4.722041454639645e-07, "loss": 2.1367, "step": 515 }, { "epoch": 5.65934065934066, "eval_loss": 2.1628613471984863, "eval_runtime": 94.3841, "eval_samples_per_second": 34.73, "eval_steps_per_second": 0.869, "step": 515 }, { "epoch": 5.714285714285714, "grad_norm": 1.3827334256393926, "learning_rate": 4.626349532067879e-07, "loss": 2.1207, "step": 520 }, { "epoch": 5.714285714285714, "eval_loss": 2.162167549133301, "eval_runtime": 95.9116, "eval_samples_per_second": 34.177, "eval_steps_per_second": 0.855, "step": 520 }, { "epoch": 5.769230769230769, "grad_norm": 1.4133703140162626, "learning_rate": 4.530795052984104e-07, "loss": 2.126, "step": 525 }, { "epoch": 5.769230769230769, "eval_loss": 2.161437511444092, "eval_runtime": 94.7934, "eval_samples_per_second": 34.58, "eval_steps_per_second": 0.865, "step": 525 }, { "epoch": 5.824175824175824, "grad_norm": 1.3623534884797142, "learning_rate": 4.4354131661222993e-07, "loss": 2.1178, "step": 530 }, { "epoch": 5.824175824175824, "eval_loss": 2.1607697010040283, "eval_runtime": 95.1583, "eval_samples_per_second": 34.448, "eval_steps_per_second": 0.862, "step": 530 }, { "epoch": 5.8791208791208796, "grad_norm": 1.3721824145937465, "learning_rate": 4.3402389567301687e-07, "loss": 2.1317, "step": 535 }, { "epoch": 5.8791208791208796, "eval_loss": 2.1601507663726807, "eval_runtime": 95.1179, "eval_samples_per_second": 34.462, "eval_steps_per_second": 0.862, "step": 535 }, { "epoch": 5.934065934065934, "grad_norm": 1.354947196331287, "learning_rate": 4.245307433663388e-07, "loss": 2.1208, "step": 540 }, { "epoch": 5.934065934065934, "eval_loss": 2.15952730178833, "eval_runtime": 95.2363, "eval_samples_per_second": 34.42, "eval_steps_per_second": 0.861, "step": 540 }, { "epoch": 5.989010989010989, "grad_norm": 1.3836862806091863, "learning_rate": 4.1506535165079637e-07, "loss": 2.131, "step": 545 }, { "epoch": 5.989010989010989, "eval_loss": 2.1588995456695557, "eval_runtime": 95.977, "eval_samples_per_second": 34.154, "eval_steps_per_second": 0.854, "step": 545 }, { "epoch": 6.043956043956044, "grad_norm": 1.4174811691080422, "learning_rate": 4.056312022735417e-07, "loss": 2.1282, "step": 550 }, { "epoch": 6.043956043956044, "eval_loss": 2.1583850383758545, "eval_runtime": 94.9634, "eval_samples_per_second": 34.519, "eval_steps_per_second": 0.863, "step": 550 }, { "epoch": 6.0989010989010985, "grad_norm": 1.386136241074018, "learning_rate": 3.962317654895532e-07, "loss": 2.1071, "step": 555 }, { "epoch": 6.0989010989010985, "eval_loss": 2.157827854156494, "eval_runtime": 96.6235, "eval_samples_per_second": 33.925, "eval_steps_per_second": 0.849, "step": 555 }, { "epoch": 6.153846153846154, "grad_norm": 1.382143872347298, "learning_rate": 3.86870498785139e-07, "loss": 2.1152, "step": 560 }, { "epoch": 6.153846153846154, "eval_loss": 2.1572675704956055, "eval_runtime": 94.9844, "eval_samples_per_second": 34.511, "eval_steps_per_second": 0.863, "step": 560 }, { "epoch": 6.208791208791209, "grad_norm": 1.3117267968176445, "learning_rate": 3.7755084560613454e-07, "loss": 2.1274, "step": 565 }, { "epoch": 6.208791208791209, "eval_loss": 2.156782865524292, "eval_runtime": 95.0906, "eval_samples_per_second": 34.472, "eval_steps_per_second": 0.862, "step": 565 }, { "epoch": 6.263736263736264, "grad_norm": 1.3153022809674055, "learning_rate": 3.682762340912681e-07, "loss": 2.125, "step": 570 }, { "epoch": 6.263736263736264, "eval_loss": 2.156236410140991, "eval_runtime": 94.7673, "eval_samples_per_second": 34.59, "eval_steps_per_second": 0.865, "step": 570 }, { "epoch": 6.318681318681318, "grad_norm": 1.363719531307913, "learning_rate": 3.590500758111537e-07, "loss": 2.1253, "step": 575 }, { "epoch": 6.318681318681318, "eval_loss": 2.1556880474090576, "eval_runtime": 95.0783, "eval_samples_per_second": 34.477, "eval_steps_per_second": 0.862, "step": 575 }, { "epoch": 6.373626373626374, "grad_norm": 1.3306699270632527, "learning_rate": 3.498757645133805e-07, "loss": 2.1105, "step": 580 }, { "epoch": 6.373626373626374, "eval_loss": 2.1551740169525146, "eval_runtime": 95.3795, "eval_samples_per_second": 34.368, "eval_steps_per_second": 0.86, "step": 580 }, { "epoch": 6.428571428571429, "grad_norm": 1.4013786507282588, "learning_rate": 3.4075667487415785e-07, "loss": 2.1233, "step": 585 }, { "epoch": 6.428571428571429, "eval_loss": 2.1547253131866455, "eval_runtime": 94.9401, "eval_samples_per_second": 34.527, "eval_steps_per_second": 0.864, "step": 585 }, { "epoch": 6.483516483516484, "grad_norm": 1.340212819021932, "learning_rate": 3.3169616125697485e-07, "loss": 2.1082, "step": 590 }, { "epoch": 6.483516483516484, "eval_loss": 2.1542816162109375, "eval_runtime": 96.1807, "eval_samples_per_second": 34.082, "eval_steps_per_second": 0.853, "step": 590 }, { "epoch": 6.538461538461538, "grad_norm": 1.37559843039388, "learning_rate": 3.2269755647873214e-07, "loss": 2.116, "step": 595 }, { "epoch": 6.538461538461538, "eval_loss": 2.153905153274536, "eval_runtime": 94.5357, "eval_samples_per_second": 34.675, "eval_steps_per_second": 0.867, "step": 595 }, { "epoch": 6.593406593406593, "grad_norm": 1.3234676252365452, "learning_rate": 3.137641705838003e-07, "loss": 2.114, "step": 600 }, { "epoch": 6.593406593406593, "eval_loss": 2.153465747833252, "eval_runtime": 96.7145, "eval_samples_per_second": 33.894, "eval_steps_per_second": 0.848, "step": 600 }, { "epoch": 6.648351648351649, "grad_norm": 1.2963273381270386, "learning_rate": 3.048992896264527e-07, "loss": 2.1025, "step": 605 }, { "epoch": 6.648351648351649, "eval_loss": 2.153031587600708, "eval_runtime": 94.9067, "eval_samples_per_second": 34.539, "eval_steps_per_second": 0.864, "step": 605 }, { "epoch": 6.7032967032967035, "grad_norm": 1.363956310159158, "learning_rate": 2.9610617446212494e-07, "loss": 2.1174, "step": 610 }, { "epoch": 6.7032967032967035, "eval_loss": 2.1526174545288086, "eval_runtime": 96.5514, "eval_samples_per_second": 33.951, "eval_steps_per_second": 0.849, "step": 610 }, { "epoch": 6.758241758241758, "grad_norm": 1.324164018147815, "learning_rate": 2.8738805954794295e-07, "loss": 2.1158, "step": 615 }, { "epoch": 6.758241758241758, "eval_loss": 2.1521835327148438, "eval_runtime": 94.9388, "eval_samples_per_second": 34.527, "eval_steps_per_second": 0.864, "step": 615 }, { "epoch": 6.813186813186813, "grad_norm": 1.3310397406861454, "learning_rate": 2.7874815175296e-07, "loss": 2.1118, "step": 620 }, { "epoch": 6.813186813186813, "eval_loss": 2.1518325805664062, "eval_runtime": 95.7818, "eval_samples_per_second": 34.224, "eval_steps_per_second": 0.856, "step": 620 }, { "epoch": 6.868131868131869, "grad_norm": 1.306713284641784, "learning_rate": 2.7018962917854416e-07, "loss": 2.1219, "step": 625 }, { "epoch": 6.868131868131869, "eval_loss": 2.1515119075775146, "eval_runtime": 94.8621, "eval_samples_per_second": 34.555, "eval_steps_per_second": 0.864, "step": 625 }, { "epoch": 6.923076923076923, "grad_norm": 1.328321525865242, "learning_rate": 2.61715639989346e-07, "loss": 2.1088, "step": 630 }, { "epoch": 6.923076923076923, "eval_loss": 2.1511785984039307, "eval_runtime": 94.9757, "eval_samples_per_second": 34.514, "eval_steps_per_second": 0.863, "step": 630 }, { "epoch": 6.978021978021978, "grad_norm": 1.3354659608167203, "learning_rate": 2.5332930125527785e-07, "loss": 2.1188, "step": 635 }, { "epoch": 6.978021978021978, "eval_loss": 2.150792360305786, "eval_runtime": 95.0995, "eval_samples_per_second": 34.469, "eval_steps_per_second": 0.862, "step": 635 }, { "epoch": 7.032967032967033, "grad_norm": 1.3252421337619178, "learning_rate": 2.4503369780493216e-07, "loss": 2.0958, "step": 640 }, { "epoch": 7.032967032967033, "eval_loss": 2.150482416152954, "eval_runtime": 94.5987, "eval_samples_per_second": 34.652, "eval_steps_per_second": 0.867, "step": 640 }, { "epoch": 7.087912087912088, "grad_norm": 1.3025182371568629, "learning_rate": 2.3683188109085877e-07, "loss": 2.1162, "step": 645 }, { "epoch": 7.087912087912088, "eval_loss": 2.150195598602295, "eval_runtime": 95.7892, "eval_samples_per_second": 34.221, "eval_steps_per_second": 0.856, "step": 645 }, { "epoch": 7.142857142857143, "grad_norm": 1.3417203401169715, "learning_rate": 2.2872686806712032e-07, "loss": 2.112, "step": 650 }, { "epoch": 7.142857142857143, "eval_loss": 2.14986515045166, "eval_runtime": 94.108, "eval_samples_per_second": 34.832, "eval_steps_per_second": 0.871, "step": 650 }, { "epoch": 7.197802197802198, "grad_norm": 1.3125833946773793, "learning_rate": 2.2072164007953515e-07, "loss": 2.1108, "step": 655 }, { "epoch": 7.197802197802198, "eval_loss": 2.1495869159698486, "eval_runtime": 96.3106, "eval_samples_per_second": 34.036, "eval_steps_per_second": 0.851, "step": 655 }, { "epoch": 7.252747252747253, "grad_norm": 1.3727056594091913, "learning_rate": 2.1281914176902106e-07, "loss": 2.1105, "step": 660 }, { "epoch": 7.252747252747253, "eval_loss": 2.149327039718628, "eval_runtime": 94.0785, "eval_samples_per_second": 34.843, "eval_steps_per_second": 0.872, "step": 660 }, { "epoch": 7.3076923076923075, "grad_norm": 1.26720608418739, "learning_rate": 2.050222799884387e-07, "loss": 2.1119, "step": 665 }, { "epoch": 7.3076923076923075, "eval_loss": 2.1490209102630615, "eval_runtime": 95.0812, "eval_samples_per_second": 34.476, "eval_steps_per_second": 0.862, "step": 665 }, { "epoch": 7.362637362637362, "grad_norm": 1.3075408289058195, "learning_rate": 1.9733392273333595e-07, "loss": 2.1125, "step": 670 }, { "epoch": 7.362637362637362, "eval_loss": 2.148756980895996, "eval_runtime": 94.7034, "eval_samples_per_second": 34.613, "eval_steps_per_second": 0.866, "step": 670 }, { "epoch": 7.417582417582418, "grad_norm": 1.338274034103587, "learning_rate": 1.8975689808698546e-07, "loss": 2.1085, "step": 675 }, { "epoch": 7.417582417582418, "eval_loss": 2.1485321521759033, "eval_runtime": 95.2917, "eval_samples_per_second": 34.4, "eval_steps_per_second": 0.861, "step": 675 }, { "epoch": 7.472527472527473, "grad_norm": 1.2732739133631568, "learning_rate": 1.8229399318010234e-07, "loss": 2.113, "step": 680 }, { "epoch": 7.472527472527473, "eval_loss": 2.1483404636383057, "eval_runtime": 96.15, "eval_samples_per_second": 34.093, "eval_steps_per_second": 0.853, "step": 680 }, { "epoch": 7.527472527472527, "grad_norm": 1.3397177765715196, "learning_rate": 1.7494795316562787e-07, "loss": 2.1022, "step": 685 }, { "epoch": 7.527472527472527, "eval_loss": 2.1481196880340576, "eval_runtime": 94.6054, "eval_samples_per_second": 34.649, "eval_steps_per_second": 0.867, "step": 685 }, { "epoch": 7.582417582417582, "grad_norm": 1.315974901665471, "learning_rate": 1.6772148020895227e-07, "loss": 2.1005, "step": 690 }, { "epoch": 7.582417582417582, "eval_loss": 2.1478824615478516, "eval_runtime": 96.7642, "eval_samples_per_second": 33.876, "eval_steps_per_second": 0.847, "step": 690 }, { "epoch": 7.637362637362637, "grad_norm": 1.3175415298356188, "learning_rate": 1.6061723249395103e-07, "loss": 2.1061, "step": 695 }, { "epoch": 7.637362637362637, "eval_loss": 2.147655963897705, "eval_runtime": 94.8865, "eval_samples_per_second": 34.547, "eval_steps_per_second": 0.864, "step": 695 }, { "epoch": 7.6923076923076925, "grad_norm": 1.2676400902435017, "learning_rate": 1.536378232452003e-07, "loss": 2.1113, "step": 700 }, { "epoch": 7.6923076923076925, "eval_loss": 2.147427797317505, "eval_runtime": 96.4923, "eval_samples_per_second": 33.972, "eval_steps_per_second": 0.85, "step": 700 }, { "epoch": 7.747252747252747, "grad_norm": 1.3178958103097314, "learning_rate": 1.4678581976672748e-07, "loss": 2.0939, "step": 705 }, { "epoch": 7.747252747252747, "eval_loss": 2.147228479385376, "eval_runtime": 94.325, "eval_samples_per_second": 34.752, "eval_steps_per_second": 0.869, "step": 705 }, { "epoch": 7.802197802197802, "grad_norm": 1.3384998637586858, "learning_rate": 1.4006374249765596e-07, "loss": 2.0993, "step": 710 }, { "epoch": 7.802197802197802, "eval_loss": 2.147052764892578, "eval_runtime": 96.9054, "eval_samples_per_second": 33.827, "eval_steps_per_second": 0.846, "step": 710 }, { "epoch": 7.857142857142857, "grad_norm": 1.280750721967852, "learning_rate": 1.3347406408508694e-07, "loss": 2.1115, "step": 715 }, { "epoch": 7.857142857142857, "eval_loss": 2.146883249282837, "eval_runtime": 94.446, "eval_samples_per_second": 34.708, "eval_steps_per_second": 0.868, "step": 715 }, { "epoch": 7.912087912087912, "grad_norm": 1.3140429519277008, "learning_rate": 1.2701920847456166e-07, "loss": 2.1043, "step": 720 }, { "epoch": 7.912087912087912, "eval_loss": 2.146697998046875, "eval_runtime": 95.4996, "eval_samples_per_second": 34.325, "eval_steps_per_second": 0.859, "step": 720 }, { "epoch": 7.967032967032967, "grad_norm": 1.3312499353255274, "learning_rate": 1.2070155001843835e-07, "loss": 2.1032, "step": 725 }, { "epoch": 7.967032967032967, "eval_loss": 2.1465156078338623, "eval_runtime": 93.9989, "eval_samples_per_second": 34.873, "eval_steps_per_second": 0.872, "step": 725 }, { "epoch": 8.021978021978022, "grad_norm": 1.3390325157168175, "learning_rate": 1.1452341260251019e-07, "loss": 2.0988, "step": 730 }, { "epoch": 8.021978021978022, "eval_loss": 2.1463658809661865, "eval_runtime": 94.8926, "eval_samples_per_second": 34.544, "eval_steps_per_second": 0.864, "step": 730 }, { "epoch": 8.076923076923077, "grad_norm": 1.3132771567223507, "learning_rate": 1.084870687911889e-07, "loss": 2.1021, "step": 735 }, { "epoch": 8.076923076923077, "eval_loss": 2.146247625350952, "eval_runtime": 96.2255, "eval_samples_per_second": 34.066, "eval_steps_per_second": 0.852, "step": 735 }, { "epoch": 8.131868131868131, "grad_norm": 1.2967777616488498, "learning_rate": 1.0259473899156429e-07, "loss": 2.0972, "step": 740 }, { "epoch": 8.131868131868131, "eval_loss": 2.1461329460144043, "eval_runtime": 94.5501, "eval_samples_per_second": 34.669, "eval_steps_per_second": 0.867, "step": 740 }, { "epoch": 8.186813186813186, "grad_norm": 1.2698397159733246, "learning_rate": 9.684859063665057e-08, "loss": 2.1034, "step": 745 }, { "epoch": 8.186813186813186, "eval_loss": 2.1460211277008057, "eval_runtime": 95.3329, "eval_samples_per_second": 34.385, "eval_steps_per_second": 0.86, "step": 745 }, { "epoch": 8.241758241758241, "grad_norm": 1.278484842006247, "learning_rate": 9.125073738811917e-08, "loss": 2.0955, "step": 750 }, { "epoch": 8.241758241758241, "eval_loss": 2.1458938121795654, "eval_runtime": 94.9309, "eval_samples_per_second": 34.53, "eval_steps_per_second": 0.864, "step": 750 }, { "epoch": 8.296703296703297, "grad_norm": 1.289197769580351, "learning_rate": 8.580323835880859e-08, "loss": 2.0997, "step": 755 }, { "epoch": 8.296703296703297, "eval_loss": 2.1457700729370117, "eval_runtime": 94.6214, "eval_samples_per_second": 34.643, "eval_steps_per_second": 0.867, "step": 755 }, { "epoch": 8.351648351648352, "grad_norm": 1.301153991983438, "learning_rate": 8.050809735530207e-08, "loss": 2.1016, "step": 760 }, { "epoch": 8.351648351648352, "eval_loss": 2.1456449031829834, "eval_runtime": 95.3277, "eval_samples_per_second": 34.387, "eval_steps_per_second": 0.86, "step": 760 }, { "epoch": 8.406593406593407, "grad_norm": 1.3044699064545437, "learning_rate": 7.53672621408472e-08, "loss": 2.107, "step": 765 }, { "epoch": 8.406593406593407, "eval_loss": 2.145549774169922, "eval_runtime": 93.7346, "eval_samples_per_second": 34.971, "eval_steps_per_second": 0.875, "step": 765 }, { "epoch": 8.461538461538462, "grad_norm": 1.2805052222372935, "learning_rate": 7.038262371889159e-08, "loss": 2.1033, "step": 770 }, { "epoch": 8.461538461538462, "eval_loss": 2.145460605621338, "eval_runtime": 95.8972, "eval_samples_per_second": 34.182, "eval_steps_per_second": 0.855, "step": 770 }, { "epoch": 8.516483516483516, "grad_norm": 1.3344128534699025, "learning_rate": 6.555601563749674e-08, "loss": 2.1081, "step": 775 }, { "epoch": 8.516483516483516, "eval_loss": 2.1453983783721924, "eval_runtime": 94.7645, "eval_samples_per_second": 34.591, "eval_steps_per_second": 0.865, "step": 775 }, { "epoch": 8.571428571428571, "grad_norm": 1.3534141294417172, "learning_rate": 6.088921331488566e-08, "loss": 2.1007, "step": 780 }, { "epoch": 8.571428571428571, "eval_loss": 2.1453309059143066, "eval_runtime": 95.9005, "eval_samples_per_second": 34.181, "eval_steps_per_second": 0.855, "step": 780 }, { "epoch": 8.626373626373626, "grad_norm": 1.289360061293496, "learning_rate": 5.6383933386374316e-08, "loss": 2.0954, "step": 785 }, { "epoch": 8.626373626373626, "eval_loss": 2.1452713012695312, "eval_runtime": 94.327, "eval_samples_per_second": 34.751, "eval_steps_per_second": 0.869, "step": 785 }, { "epoch": 8.68131868131868, "grad_norm": 1.290799173451955, "learning_rate": 5.204183307292409e-08, "loss": 2.0966, "step": 790 }, { "epoch": 8.68131868131868, "eval_loss": 2.1452112197875977, "eval_runtime": 96.6287, "eval_samples_per_second": 33.924, "eval_steps_per_second": 0.849, "step": 790 }, { "epoch": 8.736263736263737, "grad_norm": 1.296710622293866, "learning_rate": 4.786450957155064e-08, "loss": 2.105, "step": 795 }, { "epoch": 8.736263736263737, "eval_loss": 2.1451542377471924, "eval_runtime": 94.7724, "eval_samples_per_second": 34.588, "eval_steps_per_second": 0.865, "step": 795 }, { "epoch": 8.791208791208792, "grad_norm": 1.293112762829761, "learning_rate": 4.385349946781136e-08, "loss": 2.11, "step": 800 }, { "epoch": 8.791208791208792, "eval_loss": 2.145106315612793, "eval_runtime": 95.5587, "eval_samples_per_second": 34.304, "eval_steps_per_second": 0.858, "step": 800 }, { "epoch": 8.846153846153847, "grad_norm": 1.326895910564294, "learning_rate": 4.0010278170587884e-08, "loss": 2.1025, "step": 805 }, { "epoch": 8.846153846153847, "eval_loss": 2.1450610160827637, "eval_runtime": 94.083, "eval_samples_per_second": 34.842, "eval_steps_per_second": 0.872, "step": 805 }, { "epoch": 8.901098901098901, "grad_norm": 1.303529256363753, "learning_rate": 3.633625936937229e-08, "loss": 2.1057, "step": 810 }, { "epoch": 8.901098901098901, "eval_loss": 2.145014762878418, "eval_runtime": 94.2778, "eval_samples_per_second": 34.77, "eval_steps_per_second": 0.87, "step": 810 }, { "epoch": 8.956043956043956, "grad_norm": 1.300189968606814, "learning_rate": 3.28327945142558e-08, "loss": 2.1084, "step": 815 }, { "epoch": 8.956043956043956, "eval_loss": 2.1449670791625977, "eval_runtime": 95.579, "eval_samples_per_second": 34.296, "eval_steps_per_second": 0.858, "step": 815 }, { "epoch": 9.010989010989011, "grad_norm": 1.3121642088661138, "learning_rate": 2.950117231881183e-08, "loss": 2.0948, "step": 820 }, { "epoch": 9.010989010989011, "eval_loss": 2.144935131072998, "eval_runtime": 93.9326, "eval_samples_per_second": 34.897, "eval_steps_per_second": 0.873, "step": 820 }, { "epoch": 9.065934065934066, "grad_norm": 1.2914729270696104, "learning_rate": 2.634261828605594e-08, "loss": 2.1031, "step": 825 }, { "epoch": 9.065934065934066, "eval_loss": 2.144901752471924, "eval_runtime": 96.1619, "eval_samples_per_second": 34.088, "eval_steps_per_second": 0.853, "step": 825 }, { "epoch": 9.12087912087912, "grad_norm": 1.2899551643117746, "learning_rate": 2.335829425765712e-08, "loss": 2.0946, "step": 830 }, { "epoch": 9.12087912087912, "eval_loss": 2.1448748111724854, "eval_runtime": 93.6382, "eval_samples_per_second": 35.007, "eval_steps_per_second": 0.876, "step": 830 }, { "epoch": 9.175824175824175, "grad_norm": 1.2788813311509164, "learning_rate": 2.0549297986566183e-08, "loss": 2.1076, "step": 835 }, { "epoch": 9.175824175824175, "eval_loss": 2.144841194152832, "eval_runtime": 96.4177, "eval_samples_per_second": 33.998, "eval_steps_per_second": 0.85, "step": 835 }, { "epoch": 9.23076923076923, "grad_norm": 1.3297319077943937, "learning_rate": 1.7916662733218846e-08, "loss": 2.0962, "step": 840 }, { "epoch": 9.23076923076923, "eval_loss": 2.144815683364868, "eval_runtime": 94.5433, "eval_samples_per_second": 34.672, "eval_steps_per_second": 0.867, "step": 840 }, { "epoch": 9.285714285714286, "grad_norm": 1.3729180320680576, "learning_rate": 1.5461356885461075e-08, "loss": 2.0884, "step": 845 }, { "epoch": 9.285714285714286, "eval_loss": 2.1447927951812744, "eval_runtime": 96.9528, "eval_samples_per_second": 33.81, "eval_steps_per_second": 0.846, "step": 845 }, { "epoch": 9.340659340659341, "grad_norm": 1.3282581119274621, "learning_rate": 1.3184283602337864e-08, "loss": 2.1016, "step": 850 }, { "epoch": 9.340659340659341, "eval_loss": 2.144777536392212, "eval_runtime": 94.2281, "eval_samples_per_second": 34.788, "eval_steps_per_second": 0.87, "step": 850 }, { "epoch": 9.395604395604396, "grad_norm": 1.303774253858923, "learning_rate": 1.1086280481875653e-08, "loss": 2.1091, "step": 855 }, { "epoch": 9.395604395604396, "eval_loss": 2.144756555557251, "eval_runtime": 96.7444, "eval_samples_per_second": 33.883, "eval_steps_per_second": 0.848, "step": 855 }, { "epoch": 9.45054945054945, "grad_norm": 1.3340713352584999, "learning_rate": 9.168119252979945e-09, "loss": 2.1084, "step": 860 }, { "epoch": 9.45054945054945, "eval_loss": 2.1447436809539795, "eval_runtime": 94.7907, "eval_samples_per_second": 34.581, "eval_steps_per_second": 0.865, "step": 860 }, { "epoch": 9.505494505494505, "grad_norm": 1.297522419820979, "learning_rate": 7.430505491563099e-09, "loss": 2.1069, "step": 865 }, { "epoch": 9.505494505494505, "eval_loss": 2.14473295211792, "eval_runtime": 95.2303, "eval_samples_per_second": 34.422, "eval_steps_per_second": 0.861, "step": 865 }, { "epoch": 9.56043956043956, "grad_norm": 1.2976358260740801, "learning_rate": 5.874078361005563e-09, "loss": 2.1049, "step": 870 }, { "epoch": 9.56043956043956, "eval_loss": 2.1447255611419678, "eval_runtime": 94.6371, "eval_samples_per_second": 34.638, "eval_steps_per_second": 0.866, "step": 870 }, { "epoch": 9.615384615384615, "grad_norm": 1.30150242446251, "learning_rate": 4.499410377045765e-09, "loss": 2.0981, "step": 875 }, { "epoch": 9.615384615384615, "eval_loss": 2.144716739654541, "eval_runtime": 94.4817, "eval_samples_per_second": 34.695, "eval_steps_per_second": 0.868, "step": 875 }, { "epoch": 9.67032967032967, "grad_norm": 1.265689030842284, "learning_rate": 3.3070071971867398e-09, "loss": 2.0975, "step": 880 }, { "epoch": 9.67032967032967, "eval_loss": 2.1447131633758545, "eval_runtime": 97.3588, "eval_samples_per_second": 33.669, "eval_steps_per_second": 0.842, "step": 880 }, { "epoch": 9.725274725274724, "grad_norm": 1.3221353338950448, "learning_rate": 2.297307434694473e-09, "loss": 2.1033, "step": 885 }, { "epoch": 9.725274725274724, "eval_loss": 2.14471173286438, "eval_runtime": 94.4717, "eval_samples_per_second": 34.698, "eval_steps_per_second": 0.868, "step": 885 }, { "epoch": 9.780219780219781, "grad_norm": 1.2607874314626535, "learning_rate": 1.4706824972591237e-09, "loss": 2.1034, "step": 890 }, { "epoch": 9.780219780219781, "eval_loss": 2.1447081565856934, "eval_runtime": 96.6148, "eval_samples_per_second": 33.929, "eval_steps_per_second": 0.849, "step": 890 }, { "epoch": 9.835164835164836, "grad_norm": 1.293461540963835, "learning_rate": 8.274364503760845e-10, "loss": 2.0956, "step": 895 }, { "epoch": 9.835164835164836, "eval_loss": 2.1447083950042725, "eval_runtime": 94.3064, "eval_samples_per_second": 34.759, "eval_steps_per_second": 0.87, "step": 895 }, { "epoch": 9.89010989010989, "grad_norm": 1.279822041703267, "learning_rate": 3.678059054988969e-10, "loss": 2.0917, "step": 900 }, { "epoch": 9.89010989010989, "eval_loss": 2.1447057723999023, "eval_runtime": 96.7952, "eval_samples_per_second": 33.865, "eval_steps_per_second": 0.847, "step": 900 }, { "epoch": 9.945054945054945, "grad_norm": 1.3339227803991875, "learning_rate": 9.19599330039822e-11, "loss": 2.0983, "step": 905 }, { "epoch": 9.945054945054945, "eval_loss": 2.144705295562744, "eval_runtime": 93.8029, "eval_samples_per_second": 34.946, "eval_steps_per_second": 0.874, "step": 905 }, { "epoch": 10.0, "grad_norm": 1.2734998688150378, "learning_rate": 0.0, "loss": 2.1039, "step": 910 }, { "epoch": 10.0, "eval_loss": 2.1447036266326904, "eval_runtime": 97.1323, "eval_samples_per_second": 33.748, "eval_steps_per_second": 0.844, "step": 910 }, { "epoch": 10.0, "step": 910, "total_flos": 7.930133386100736e+16, "train_loss": 2.201316048024775, "train_runtime": 48928.7509, "train_samples_per_second": 5.948, "train_steps_per_second": 0.019 } ], "logging_steps": 5, "max_steps": 910, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.930133386100736e+16, "train_batch_size": 10, "trial_name": null, "trial_params": null }