{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.23,
  "eval_steps": 500,
  "global_step": 23000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001,
      "grad_norm": 12.033066749572754,
      "learning_rate": 9.998000000000002e-06,
      "loss": 5.0328,
      "step": 100
    },
    {
      "epoch": 0.002,
      "grad_norm": 12.140076637268066,
      "learning_rate": 9.996e-06,
      "loss": 4.8188,
      "step": 200
    },
    {
      "epoch": 0.003,
      "grad_norm": 11.839982032775879,
      "learning_rate": 9.994000000000001e-06,
      "loss": 4.6021,
      "step": 300
    },
    {
      "epoch": 0.004,
      "grad_norm": 13.72381591796875,
      "learning_rate": 9.992e-06,
      "loss": 4.4731,
      "step": 400
    },
    {
      "epoch": 0.005,
      "grad_norm": 11.587430000305176,
      "learning_rate": 9.990000000000001e-06,
      "loss": 4.4251,
      "step": 500
    },
    {
      "epoch": 0.006,
      "grad_norm": 12.022308349609375,
      "learning_rate": 9.988000000000002e-06,
      "loss": 4.4416,
      "step": 600
    },
    {
      "epoch": 0.007,
      "grad_norm": 12.038565635681152,
      "learning_rate": 9.986e-06,
      "loss": 4.3319,
      "step": 700
    },
    {
      "epoch": 0.008,
      "grad_norm": 11.564003944396973,
      "learning_rate": 9.984e-06,
      "loss": 4.3267,
      "step": 800
    },
    {
      "epoch": 0.009,
      "grad_norm": 11.518474578857422,
      "learning_rate": 9.982e-06,
      "loss": 4.2818,
      "step": 900
    },
    {
      "epoch": 0.01,
      "grad_norm": 12.206191062927246,
      "learning_rate": 9.980000000000001e-06,
      "loss": 4.3167,
      "step": 1000
    },
    {
      "epoch": 0.011,
      "grad_norm": 11.940592765808105,
      "learning_rate": 9.978000000000002e-06,
      "loss": 4.3204,
      "step": 1100
    },
    {
      "epoch": 0.012,
      "grad_norm": 12.109590530395508,
      "learning_rate": 9.976e-06,
      "loss": 4.3286,
      "step": 1200
    },
    {
      "epoch": 0.013,
      "grad_norm": 12.218742370605469,
      "learning_rate": 9.974e-06,
      "loss": 4.4106,
      "step": 1300
    },
    {
      "epoch": 0.014,
      "grad_norm": 11.53864860534668,
      "learning_rate": 9.972e-06,
      "loss": 4.3091,
      "step": 1400
    },
    {
      "epoch": 0.015,
      "grad_norm": 13.800053596496582,
      "learning_rate": 9.970000000000001e-06,
      "loss": 4.2455,
      "step": 1500
    },
    {
      "epoch": 0.016,
      "grad_norm": 13.468424797058105,
      "learning_rate": 9.968000000000002e-06,
      "loss": 4.305,
      "step": 1600
    },
    {
      "epoch": 0.017,
      "grad_norm": 11.736213684082031,
      "learning_rate": 9.966e-06,
      "loss": 4.3232,
      "step": 1700
    },
    {
      "epoch": 0.018,
      "grad_norm": 10.893006324768066,
      "learning_rate": 9.964e-06,
      "loss": 4.3139,
      "step": 1800
    },
    {
      "epoch": 0.019,
      "grad_norm": 14.407844543457031,
      "learning_rate": 9.962e-06,
      "loss": 4.2437,
      "step": 1900
    },
    {
      "epoch": 0.02,
      "grad_norm": 11.889874458312988,
      "learning_rate": 9.960000000000001e-06,
      "loss": 4.2543,
      "step": 2000
    },
    {
      "epoch": 0.021,
      "grad_norm": 11.396763801574707,
      "learning_rate": 9.958e-06,
      "loss": 4.1477,
      "step": 2100
    },
    {
      "epoch": 0.022,
      "grad_norm": 13.99250316619873,
      "learning_rate": 9.956000000000001e-06,
      "loss": 4.248,
      "step": 2200
    },
    {
      "epoch": 0.023,
      "grad_norm": 11.850446701049805,
      "learning_rate": 9.954e-06,
      "loss": 4.3014,
      "step": 2300
    },
    {
      "epoch": 0.024,
      "grad_norm": 13.699837684631348,
      "learning_rate": 9.952e-06,
      "loss": 4.1937,
      "step": 2400
    },
    {
      "epoch": 0.025,
      "grad_norm": 11.772696495056152,
      "learning_rate": 9.950000000000001e-06,
      "loss": 4.264,
      "step": 2500
    },
    {
      "epoch": 0.026,
      "grad_norm": 13.460022926330566,
      "learning_rate": 9.948e-06,
      "loss": 4.2278,
      "step": 2600
    },
    {
      "epoch": 0.027,
      "grad_norm": 11.80987548828125,
      "learning_rate": 9.946000000000001e-06,
      "loss": 4.2416,
      "step": 2700
    },
    {
      "epoch": 0.028,
      "grad_norm": 12.391595840454102,
      "learning_rate": 9.944e-06,
      "loss": 4.2183,
      "step": 2800
    },
    {
      "epoch": 0.029,
      "grad_norm": 12.336369514465332,
      "learning_rate": 9.942e-06,
      "loss": 4.225,
      "step": 2900
    },
    {
      "epoch": 0.03,
      "grad_norm": 12.137269020080566,
      "learning_rate": 9.940000000000001e-06,
      "loss": 4.2045,
      "step": 3000
    },
    {
      "epoch": 0.031,
      "grad_norm": 12.397940635681152,
      "learning_rate": 9.938e-06,
      "loss": 4.1523,
      "step": 3100
    },
    {
      "epoch": 0.032,
      "grad_norm": 12.940911293029785,
      "learning_rate": 9.936000000000001e-06,
      "loss": 4.0193,
      "step": 3200
    },
    {
      "epoch": 0.033,
      "grad_norm": 16.68646812438965,
      "learning_rate": 9.934e-06,
      "loss": 4.1939,
      "step": 3300
    },
    {
      "epoch": 0.034,
      "grad_norm": 12.541526794433594,
      "learning_rate": 9.932e-06,
      "loss": 4.0829,
      "step": 3400
    },
    {
      "epoch": 0.035,
      "grad_norm": 11.975446701049805,
      "learning_rate": 9.930000000000001e-06,
      "loss": 4.2043,
      "step": 3500
    },
    {
      "epoch": 0.036,
      "grad_norm": 12.638479232788086,
      "learning_rate": 9.928e-06,
      "loss": 4.1315,
      "step": 3600
    },
    {
      "epoch": 0.037,
      "grad_norm": 13.302968978881836,
      "learning_rate": 9.926000000000001e-06,
      "loss": 4.0406,
      "step": 3700
    },
    {
      "epoch": 0.038,
      "grad_norm": 12.131388664245605,
      "learning_rate": 9.924e-06,
      "loss": 4.0571,
      "step": 3800
    },
    {
      "epoch": 0.039,
      "grad_norm": 13.895309448242188,
      "learning_rate": 9.922000000000001e-06,
      "loss": 4.0597,
      "step": 3900
    },
    {
      "epoch": 0.04,
      "grad_norm": 15.263091087341309,
      "learning_rate": 9.920000000000002e-06,
      "loss": 4.1587,
      "step": 4000
    },
    {
      "epoch": 0.041,
      "grad_norm": 12.314478874206543,
      "learning_rate": 9.918e-06,
      "loss": 4.0695,
      "step": 4100
    },
    {
      "epoch": 0.042,
      "grad_norm": 13.542490005493164,
      "learning_rate": 9.916000000000001e-06,
      "loss": 4.088,
      "step": 4200
    },
    {
      "epoch": 0.043,
      "grad_norm": 14.835192680358887,
      "learning_rate": 9.914e-06,
      "loss": 4.1067,
      "step": 4300
    },
    {
      "epoch": 0.044,
      "grad_norm": 13.269238471984863,
      "learning_rate": 9.912000000000001e-06,
      "loss": 4.0273,
      "step": 4400
    },
    {
      "epoch": 0.045,
      "grad_norm": 12.532042503356934,
      "learning_rate": 9.91e-06,
      "loss": 3.9738,
      "step": 4500
    },
    {
      "epoch": 0.046,
      "grad_norm": 14.506613731384277,
      "learning_rate": 9.908e-06,
      "loss": 4.0215,
      "step": 4600
    },
    {
      "epoch": 0.047,
      "grad_norm": 12.91763973236084,
      "learning_rate": 9.906000000000001e-06,
      "loss": 3.9976,
      "step": 4700
    },
    {
      "epoch": 0.048,
      "grad_norm": 12.20261001586914,
      "learning_rate": 9.904e-06,
      "loss": 3.9172,
      "step": 4800
    },
    {
      "epoch": 0.049,
      "grad_norm": 13.156211853027344,
      "learning_rate": 9.902000000000001e-06,
      "loss": 3.9871,
      "step": 4900
    },
    {
      "epoch": 0.05,
      "grad_norm": 13.59281063079834,
      "learning_rate": 9.9e-06,
      "loss": 3.9724,
      "step": 5000
    },
    {
      "epoch": 0.051,
      "grad_norm": 13.202598571777344,
      "learning_rate": 9.898e-06,
      "loss": 3.8869,
      "step": 5100
    },
    {
      "epoch": 0.052,
      "grad_norm": 16.82631492614746,
      "learning_rate": 9.896000000000001e-06,
      "loss": 4.0354,
      "step": 5200
    },
    {
      "epoch": 0.053,
      "grad_norm": 14.205794334411621,
      "learning_rate": 9.894e-06,
      "loss": 4.0322,
      "step": 5300
    },
    {
      "epoch": 0.054,
      "grad_norm": 14.135138511657715,
      "learning_rate": 9.892000000000001e-06,
      "loss": 4.01,
      "step": 5400
    },
    {
      "epoch": 0.055,
      "grad_norm": 14.365863800048828,
      "learning_rate": 9.89e-06,
      "loss": 3.9914,
      "step": 5500
    },
    {
      "epoch": 0.056,
      "grad_norm": 14.389483451843262,
      "learning_rate": 9.888000000000001e-06,
      "loss": 3.8672,
      "step": 5600
    },
    {
      "epoch": 0.057,
      "grad_norm": 12.534111976623535,
      "learning_rate": 9.886000000000002e-06,
      "loss": 3.8888,
      "step": 5700
    },
    {
      "epoch": 0.058,
      "grad_norm": 12.352408409118652,
      "learning_rate": 9.884e-06,
      "loss": 3.9466,
      "step": 5800
    },
    {
      "epoch": 0.059,
      "grad_norm": 11.810559272766113,
      "learning_rate": 9.882000000000001e-06,
      "loss": 3.8322,
      "step": 5900
    },
    {
      "epoch": 0.06,
      "grad_norm": 15.00880241394043,
      "learning_rate": 9.88e-06,
      "loss": 3.9209,
      "step": 6000
    },
    {
      "epoch": 0.061,
      "grad_norm": 13.07725715637207,
      "learning_rate": 9.878000000000001e-06,
      "loss": 3.935,
      "step": 6100
    },
    {
      "epoch": 0.062,
      "grad_norm": 11.981454849243164,
      "learning_rate": 9.876000000000002e-06,
      "loss": 3.808,
      "step": 6200
    },
    {
      "epoch": 0.063,
      "grad_norm": 14.732353210449219,
      "learning_rate": 9.874e-06,
      "loss": 3.9113,
      "step": 6300
    },
    {
      "epoch": 0.064,
      "grad_norm": 14.65374755859375,
      "learning_rate": 9.872e-06,
      "loss": 3.9368,
      "step": 6400
    },
    {
      "epoch": 0.065,
      "grad_norm": 19.678359985351562,
      "learning_rate": 9.87e-06,
      "loss": 3.8806,
      "step": 6500
    },
    {
      "epoch": 0.066,
      "grad_norm": 13.10519790649414,
      "learning_rate": 9.868000000000001e-06,
      "loss": 3.7696,
      "step": 6600
    },
    {
      "epoch": 0.067,
      "grad_norm": 13.150310516357422,
      "learning_rate": 9.866000000000002e-06,
      "loss": 3.7716,
      "step": 6700
    },
    {
      "epoch": 0.068,
      "grad_norm": 12.890090942382812,
      "learning_rate": 9.864e-06,
      "loss": 3.8475,
      "step": 6800
    },
    {
      "epoch": 0.069,
      "grad_norm": 14.92232894897461,
      "learning_rate": 9.862e-06,
      "loss": 3.9018,
      "step": 6900
    },
    {
      "epoch": 0.07,
      "grad_norm": 17.389636993408203,
      "learning_rate": 9.86e-06,
      "loss": 3.8824,
      "step": 7000
    },
    {
      "epoch": 0.071,
      "grad_norm": 13.9534330368042,
      "learning_rate": 9.858000000000001e-06,
      "loss": 3.8621,
      "step": 7100
    },
    {
      "epoch": 0.072,
      "grad_norm": 13.286906242370605,
      "learning_rate": 9.856000000000002e-06,
      "loss": 3.9293,
      "step": 7200
    },
    {
      "epoch": 0.073,
      "grad_norm": 13.590744972229004,
      "learning_rate": 9.854000000000001e-06,
      "loss": 3.8883,
      "step": 7300
    },
    {
      "epoch": 0.074,
      "grad_norm": 10.841891288757324,
      "learning_rate": 9.852e-06,
      "loss": 3.8022,
      "step": 7400
    },
    {
      "epoch": 0.075,
      "grad_norm": 14.516188621520996,
      "learning_rate": 9.85e-06,
      "loss": 3.713,
      "step": 7500
    },
    {
      "epoch": 0.076,
      "grad_norm": 16.878511428833008,
      "learning_rate": 9.848000000000001e-06,
      "loss": 3.8758,
      "step": 7600
    },
    {
      "epoch": 0.077,
      "grad_norm": 16.681041717529297,
      "learning_rate": 9.846000000000002e-06,
      "loss": 3.7602,
      "step": 7700
    },
    {
      "epoch": 0.078,
      "grad_norm": 14.792035102844238,
      "learning_rate": 9.844000000000001e-06,
      "loss": 3.7145,
      "step": 7800
    },
    {
      "epoch": 0.079,
      "grad_norm": 15.74644660949707,
      "learning_rate": 9.842e-06,
      "loss": 3.7551,
      "step": 7900
    },
    {
      "epoch": 0.08,
      "grad_norm": 16.060367584228516,
      "learning_rate": 9.84e-06,
      "loss": 3.7701,
      "step": 8000
    },
    {
      "epoch": 0.081,
      "grad_norm": 13.487128257751465,
      "learning_rate": 9.838000000000001e-06,
      "loss": 3.6881,
      "step": 8100
    },
    {
      "epoch": 0.082,
      "grad_norm": 15.940110206604004,
      "learning_rate": 9.836e-06,
      "loss": 3.9418,
      "step": 8200
    },
    {
      "epoch": 0.083,
      "grad_norm": 13.603132247924805,
      "learning_rate": 9.834000000000001e-06,
      "loss": 3.7518,
      "step": 8300
    },
    {
      "epoch": 0.084,
      "grad_norm": 17.045244216918945,
      "learning_rate": 9.832e-06,
      "loss": 3.7167,
      "step": 8400
    },
    {
      "epoch": 0.085,
      "grad_norm": 12.39263916015625,
      "learning_rate": 9.83e-06,
      "loss": 3.6829,
      "step": 8500
    },
    {
      "epoch": 0.086,
      "grad_norm": 12.677363395690918,
      "learning_rate": 9.828000000000001e-06,
      "loss": 3.7458,
      "step": 8600
    },
    {
      "epoch": 0.087,
      "grad_norm": 15.024678230285645,
      "learning_rate": 9.826e-06,
      "loss": 3.7334,
      "step": 8700
    },
    {
      "epoch": 0.088,
      "grad_norm": 17.41254997253418,
      "learning_rate": 9.824000000000001e-06,
      "loss": 3.7704,
      "step": 8800
    },
    {
      "epoch": 0.089,
      "grad_norm": 19.782014846801758,
      "learning_rate": 9.822e-06,
      "loss": 3.6834,
      "step": 8900
    },
    {
      "epoch": 0.09,
      "grad_norm": 16.899019241333008,
      "learning_rate": 9.820000000000001e-06,
      "loss": 3.7304,
      "step": 9000
    },
    {
      "epoch": 0.091,
      "grad_norm": 14.481075286865234,
      "learning_rate": 9.818000000000002e-06,
      "loss": 3.7307,
      "step": 9100
    },
    {
      "epoch": 0.092,
      "grad_norm": 18.121864318847656,
      "learning_rate": 9.816e-06,
      "loss": 3.6793,
      "step": 9200
    },
    {
      "epoch": 0.093,
      "grad_norm": 15.916873931884766,
      "learning_rate": 9.814000000000001e-06,
      "loss": 3.7664,
      "step": 9300
    },
    {
      "epoch": 0.094,
      "grad_norm": 18.305234909057617,
      "learning_rate": 9.812e-06,
      "loss": 3.6887,
      "step": 9400
    },
    {
      "epoch": 0.095,
      "grad_norm": 18.262725830078125,
      "learning_rate": 9.810000000000001e-06,
      "loss": 3.6865,
      "step": 9500
    },
    {
      "epoch": 0.096,
      "grad_norm": 21.94981575012207,
      "learning_rate": 9.808000000000002e-06,
      "loss": 3.5916,
      "step": 9600
    },
    {
      "epoch": 0.097,
      "grad_norm": 15.031508445739746,
      "learning_rate": 9.806e-06,
      "loss": 3.5732,
      "step": 9700
    },
    {
      "epoch": 0.098,
      "grad_norm": 13.64002799987793,
      "learning_rate": 9.804000000000001e-06,
      "loss": 3.6104,
      "step": 9800
    },
    {
      "epoch": 0.099,
      "grad_norm": 22.877960205078125,
      "learning_rate": 9.802e-06,
      "loss": 3.622,
      "step": 9900
    },
    {
      "epoch": 0.1,
      "grad_norm": 13.404180526733398,
      "learning_rate": 9.800000000000001e-06,
      "loss": 3.6693,
      "step": 10000
    },
    {
      "epoch": 0.101,
      "grad_norm": 14.348539352416992,
      "learning_rate": 9.798e-06,
      "loss": 3.4753,
      "step": 10100
    },
    {
      "epoch": 0.102,
      "grad_norm": 15.996590614318848,
      "learning_rate": 9.796e-06,
      "loss": 3.5059,
      "step": 10200
    },
    {
      "epoch": 0.103,
      "grad_norm": 16.6004638671875,
      "learning_rate": 9.794000000000001e-06,
      "loss": 3.7168,
      "step": 10300
    },
    {
      "epoch": 0.104,
      "grad_norm": 22.660940170288086,
      "learning_rate": 9.792e-06,
      "loss": 3.5521,
      "step": 10400
    },
    {
      "epoch": 0.105,
      "grad_norm": 16.634521484375,
      "learning_rate": 9.790000000000001e-06,
      "loss": 3.5894,
      "step": 10500
    },
    {
      "epoch": 0.106,
      "grad_norm": 17.89203643798828,
      "learning_rate": 9.788e-06,
      "loss": 3.5524,
      "step": 10600
    },
    {
      "epoch": 0.107,
      "grad_norm": 17.027833938598633,
      "learning_rate": 9.786e-06,
      "loss": 3.5953,
      "step": 10700
    },
    {
      "epoch": 0.108,
      "grad_norm": 19.78436279296875,
      "learning_rate": 9.784000000000002e-06,
      "loss": 3.6688,
      "step": 10800
    },
    {
      "epoch": 0.109,
      "grad_norm": 17.20643424987793,
      "learning_rate": 9.782e-06,
      "loss": 3.5023,
      "step": 10900
    },
    {
      "epoch": 0.11,
      "grad_norm": 14.809402465820312,
      "learning_rate": 9.780000000000001e-06,
      "loss": 3.4398,
      "step": 11000
    },
    {
      "epoch": 0.111,
      "grad_norm": 21.907175064086914,
      "learning_rate": 9.778e-06,
      "loss": 3.5254,
      "step": 11100
    },
    {
      "epoch": 0.112,
      "grad_norm": 23.719179153442383,
      "learning_rate": 9.776000000000001e-06,
      "loss": 3.4801,
      "step": 11200
    },
    {
      "epoch": 0.113,
      "grad_norm": 15.437023162841797,
      "learning_rate": 9.774000000000002e-06,
      "loss": 3.5724,
      "step": 11300
    },
    {
      "epoch": 0.114,
      "grad_norm": 21.82857322692871,
      "learning_rate": 9.772e-06,
      "loss": 3.5787,
      "step": 11400
    },
    {
      "epoch": 0.115,
      "grad_norm": 20.26848602294922,
      "learning_rate": 9.770000000000001e-06,
      "loss": 3.6333,
      "step": 11500
    },
    {
      "epoch": 0.116,
      "grad_norm": 17.0762882232666,
      "learning_rate": 9.768e-06,
      "loss": 3.3418,
      "step": 11600
    },
    {
      "epoch": 0.117,
      "grad_norm": 20.440383911132812,
      "learning_rate": 9.766000000000001e-06,
      "loss": 3.5072,
      "step": 11700
    },
    {
      "epoch": 0.118,
      "grad_norm": 17.19301986694336,
      "learning_rate": 9.764000000000002e-06,
      "loss": 3.491,
      "step": 11800
    },
    {
      "epoch": 0.119,
      "grad_norm": 20.88847541809082,
      "learning_rate": 9.762e-06,
      "loss": 3.5482,
      "step": 11900
    },
    {
      "epoch": 0.12,
      "grad_norm": 17.677921295166016,
      "learning_rate": 9.760000000000001e-06,
      "loss": 3.5495,
      "step": 12000
    },
    {
      "epoch": 0.121,
      "grad_norm": 19.91204833984375,
      "learning_rate": 9.758e-06,
      "loss": 3.652,
      "step": 12100
    },
    {
      "epoch": 0.122,
      "grad_norm": 39.53171920776367,
      "learning_rate": 9.756000000000001e-06,
      "loss": 3.5966,
      "step": 12200
    },
    {
      "epoch": 0.123,
      "grad_norm": 15.958291053771973,
      "learning_rate": 9.754000000000002e-06,
      "loss": 3.5364,
      "step": 12300
    },
    {
      "epoch": 0.124,
      "grad_norm": 15.132060050964355,
      "learning_rate": 9.752e-06,
      "loss": 3.468,
      "step": 12400
    },
    {
      "epoch": 0.125,
      "grad_norm": 14.320738792419434,
      "learning_rate": 9.75e-06,
      "loss": 3.3429,
      "step": 12500
    },
    {
      "epoch": 0.126,
      "grad_norm": 20.6189022064209,
      "learning_rate": 9.748e-06,
      "loss": 3.3507,
      "step": 12600
    },
    {
      "epoch": 0.127,
      "grad_norm": 14.898783683776855,
      "learning_rate": 9.746000000000001e-06,
      "loss": 3.3785,
      "step": 12700
    },
    {
      "epoch": 0.128,
      "grad_norm": 22.72285270690918,
      "learning_rate": 9.744000000000002e-06,
      "loss": 3.5126,
      "step": 12800
    },
    {
      "epoch": 0.129,
      "grad_norm": 22.329116821289062,
      "learning_rate": 9.742000000000001e-06,
      "loss": 3.4503,
      "step": 12900
    },
    {
      "epoch": 0.13,
      "grad_norm": 18.049467086791992,
      "learning_rate": 9.74e-06,
      "loss": 3.4891,
      "step": 13000
    },
    {
      "epoch": 0.131,
      "grad_norm": 14.28784465789795,
      "learning_rate": 9.738e-06,
      "loss": 3.33,
      "step": 13100
    },
    {
      "epoch": 0.132,
      "grad_norm": 19.659822463989258,
      "learning_rate": 9.736000000000001e-06,
      "loss": 3.4724,
      "step": 13200
    },
    {
      "epoch": 0.133,
      "grad_norm": 19.972923278808594,
      "learning_rate": 9.734000000000002e-06,
      "loss": 3.3965,
      "step": 13300
    },
    {
      "epoch": 0.134,
      "grad_norm": 21.733108520507812,
      "learning_rate": 9.732000000000001e-06,
      "loss": 3.584,
      "step": 13400
    },
    {
      "epoch": 0.135,
      "grad_norm": 13.769856452941895,
      "learning_rate": 9.73e-06,
      "loss": 3.4789,
      "step": 13500
    },
    {
      "epoch": 0.136,
      "grad_norm": 15.672243118286133,
      "learning_rate": 9.728e-06,
      "loss": 3.5081,
      "step": 13600
    },
    {
      "epoch": 0.137,
      "grad_norm": 17.671894073486328,
      "learning_rate": 9.726000000000001e-06,
      "loss": 3.3696,
      "step": 13700
    },
    {
      "epoch": 0.138,
      "grad_norm": 19.69550323486328,
      "learning_rate": 9.724e-06,
      "loss": 3.3707,
      "step": 13800
    },
    {
      "epoch": 0.139,
      "grad_norm": 14.621719360351562,
      "learning_rate": 9.722000000000001e-06,
      "loss": 3.4709,
      "step": 13900
    },
    {
      "epoch": 0.14,
      "grad_norm": 17.52949333190918,
      "learning_rate": 9.72e-06,
      "loss": 3.4979,
      "step": 14000
    },
    {
      "epoch": 0.141,
      "grad_norm": 15.679729461669922,
      "learning_rate": 9.718e-06,
      "loss": 3.4584,
      "step": 14100
    },
    {
      "epoch": 0.142,
      "grad_norm": 17.527435302734375,
      "learning_rate": 9.716000000000002e-06,
      "loss": 3.3812,
      "step": 14200
    },
    {
      "epoch": 0.143,
      "grad_norm": 24.084278106689453,
      "learning_rate": 9.714e-06,
      "loss": 3.4052,
      "step": 14300
    },
    {
      "epoch": 0.144,
      "grad_norm": 20.039127349853516,
      "learning_rate": 9.712e-06,
      "loss": 3.3659,
      "step": 14400
    },
    {
      "epoch": 0.145,
      "grad_norm": 18.518939971923828,
      "learning_rate": 9.71e-06,
      "loss": 3.4249,
      "step": 14500
    },
    {
      "epoch": 0.146,
      "grad_norm": 19.596946716308594,
      "learning_rate": 9.708000000000001e-06,
      "loss": 3.3909,
      "step": 14600
    },
    {
      "epoch": 0.147,
      "grad_norm": 14.774336814880371,
      "learning_rate": 9.706000000000002e-06,
      "loss": 3.3216,
      "step": 14700
    },
    {
      "epoch": 0.148,
      "grad_norm": 27.980627059936523,
      "learning_rate": 9.704e-06,
      "loss": 3.3794,
      "step": 14800
    },
    {
      "epoch": 0.149,
      "grad_norm": 16.481491088867188,
      "learning_rate": 9.702e-06,
      "loss": 3.4138,
      "step": 14900
    },
    {
      "epoch": 0.15,
      "grad_norm": 18.48386573791504,
      "learning_rate": 9.7e-06,
      "loss": 3.3635,
      "step": 15000
    },
    {
      "epoch": 0.151,
      "grad_norm": 14.089752197265625,
      "learning_rate": 9.698000000000001e-06,
      "loss": 3.363,
      "step": 15100
    },
    {
      "epoch": 0.152,
      "grad_norm": 15.205988883972168,
      "learning_rate": 9.696000000000002e-06,
      "loss": 3.3038,
      "step": 15200
    },
    {
      "epoch": 0.153,
      "grad_norm": 18.17800521850586,
      "learning_rate": 9.694e-06,
      "loss": 3.2748,
      "step": 15300
    },
    {
      "epoch": 0.154,
      "grad_norm": 15.958276748657227,
      "learning_rate": 9.692e-06,
      "loss": 3.2809,
      "step": 15400
    },
    {
      "epoch": 0.155,
      "grad_norm": 20.20997428894043,
      "learning_rate": 9.69e-06,
      "loss": 3.3366,
      "step": 15500
    },
    {
      "epoch": 0.156,
      "grad_norm": 13.844518661499023,
      "learning_rate": 9.688000000000001e-06,
      "loss": 3.424,
      "step": 15600
    },
    {
      "epoch": 0.157,
      "grad_norm": 28.56269645690918,
      "learning_rate": 9.686000000000002e-06,
      "loss": 3.4567,
      "step": 15700
    },
    {
      "epoch": 0.158,
      "grad_norm": 14.436336517333984,
      "learning_rate": 9.684e-06,
      "loss": 3.2089,
      "step": 15800
    },
    {
      "epoch": 0.159,
      "grad_norm": 28.26078987121582,
      "learning_rate": 9.682e-06,
      "loss": 3.439,
      "step": 15900
    },
    {
      "epoch": 0.16,
      "grad_norm": 19.394569396972656,
      "learning_rate": 9.68e-06,
      "loss": 3.3376,
      "step": 16000
    },
    {
      "epoch": 0.161,
      "grad_norm": 18.12739372253418,
      "learning_rate": 9.678000000000001e-06,
      "loss": 3.2295,
      "step": 16100
    },
    {
      "epoch": 0.162,
      "grad_norm": 20.420162200927734,
      "learning_rate": 9.676e-06,
      "loss": 3.307,
      "step": 16200
    },
    {
      "epoch": 0.163,
      "grad_norm": 15.34536361694336,
      "learning_rate": 9.674000000000001e-06,
      "loss": 3.3014,
      "step": 16300
    },
    {
      "epoch": 0.164,
      "grad_norm": 20.239303588867188,
      "learning_rate": 9.672e-06,
      "loss": 3.3339,
      "step": 16400
    },
    {
      "epoch": 0.165,
      "grad_norm": 15.123329162597656,
      "learning_rate": 9.67e-06,
      "loss": 3.3073,
      "step": 16500
    },
    {
      "epoch": 0.166,
      "grad_norm": 21.282299041748047,
      "learning_rate": 9.668000000000001e-06,
      "loss": 3.2809,
      "step": 16600
    },
    {
      "epoch": 0.167,
      "grad_norm": 18.039140701293945,
      "learning_rate": 9.666e-06,
      "loss": 3.3368,
      "step": 16700
    },
    {
      "epoch": 0.168,
      "grad_norm": 17.391878128051758,
      "learning_rate": 9.664000000000001e-06,
      "loss": 3.2135,
      "step": 16800
    },
    {
      "epoch": 0.169,
      "grad_norm": 29.216005325317383,
      "learning_rate": 9.662e-06,
      "loss": 3.2238,
      "step": 16900
    },
    {
      "epoch": 0.17,
      "grad_norm": 24.716182708740234,
      "learning_rate": 9.66e-06,
      "loss": 3.4032,
      "step": 17000
    },
    {
      "epoch": 0.171,
      "grad_norm": 19.68560791015625,
      "learning_rate": 9.658000000000001e-06,
      "loss": 3.2144,
      "step": 17100
    },
    {
      "epoch": 0.172,
      "grad_norm": 20.55443572998047,
      "learning_rate": 9.656e-06,
      "loss": 3.347,
      "step": 17200
    },
    {
      "epoch": 0.173,
      "grad_norm": 23.09670639038086,
      "learning_rate": 9.654000000000001e-06,
      "loss": 3.2204,
      "step": 17300
    },
    {
      "epoch": 0.174,
      "grad_norm": 21.916152954101562,
      "learning_rate": 9.652e-06,
      "loss": 3.1898,
      "step": 17400
    },
    {
      "epoch": 0.175,
      "grad_norm": 15.10058879852295,
      "learning_rate": 9.65e-06,
      "loss": 3.3174,
      "step": 17500
    },
    {
      "epoch": 0.176,
      "grad_norm": 18.47793197631836,
      "learning_rate": 9.648000000000001e-06,
      "loss": 3.152,
      "step": 17600
    },
    {
      "epoch": 0.177,
      "grad_norm": 20.482669830322266,
      "learning_rate": 9.646e-06,
      "loss": 3.2341,
      "step": 17700
    },
    {
      "epoch": 0.178,
      "grad_norm": 17.341407775878906,
      "learning_rate": 9.644000000000001e-06,
      "loss": 3.2945,
      "step": 17800
    },
    {
      "epoch": 0.179,
      "grad_norm": 25.537378311157227,
      "learning_rate": 9.642e-06,
      "loss": 3.3279,
      "step": 17900
    },
    {
      "epoch": 0.18,
      "grad_norm": 25.134294509887695,
      "learning_rate": 9.640000000000001e-06,
      "loss": 3.4005,
      "step": 18000
    },
    {
      "epoch": 0.181,
      "grad_norm": 14.844265937805176,
      "learning_rate": 9.638e-06,
      "loss": 3.2125,
      "step": 18100
    },
    {
      "epoch": 0.182,
      "grad_norm": 12.517401695251465,
      "learning_rate": 9.636e-06,
      "loss": 3.4222,
      "step": 18200
    },
    {
      "epoch": 0.183,
      "grad_norm": 13.91508674621582,
      "learning_rate": 9.634000000000001e-06,
      "loss": 3.2464,
      "step": 18300
    },
    {
      "epoch": 0.184,
      "grad_norm": 20.34067726135254,
      "learning_rate": 9.632e-06,
      "loss": 3.1909,
      "step": 18400
    },
    {
      "epoch": 0.185,
      "grad_norm": 20.126605987548828,
      "learning_rate": 9.630000000000001e-06,
      "loss": 3.1106,
      "step": 18500
    },
    {
      "epoch": 0.186,
      "grad_norm": 20.69412612915039,
      "learning_rate": 9.628e-06,
      "loss": 3.2615,
      "step": 18600
    },
    {
      "epoch": 0.187,
      "grad_norm": 29.957561492919922,
      "learning_rate": 9.626e-06,
      "loss": 3.2019,
      "step": 18700
    },
    {
      "epoch": 0.188,
      "grad_norm": 12.36296558380127,
      "learning_rate": 9.624000000000001e-06,
      "loss": 3.2337,
      "step": 18800
    },
    {
      "epoch": 0.189,
      "grad_norm": 13.685921669006348,
      "learning_rate": 9.622000000000002e-06,
      "loss": 3.282,
      "step": 18900
    },
    {
      "epoch": 0.19,
      "grad_norm": 20.060331344604492,
      "learning_rate": 9.620000000000001e-06,
      "loss": 3.1947,
      "step": 19000
    },
    {
      "epoch": 0.191,
      "grad_norm": 13.936528205871582,
      "learning_rate": 9.618e-06,
      "loss": 3.174,
      "step": 19100
    },
    {
      "epoch": 0.192,
      "grad_norm": 21.87002944946289,
      "learning_rate": 9.616e-06,
      "loss": 3.2355,
      "step": 19200
    },
    {
      "epoch": 0.193,
      "grad_norm": 24.834264755249023,
      "learning_rate": 9.614000000000001e-06,
      "loss": 3.1823,
      "step": 19300
    },
    {
      "epoch": 0.194,
      "grad_norm": 30.83481216430664,
      "learning_rate": 9.612000000000002e-06,
      "loss": 3.1645,
      "step": 19400
    },
    {
      "epoch": 0.195,
      "grad_norm": 18.5225830078125,
      "learning_rate": 9.610000000000001e-06,
      "loss": 3.2636,
      "step": 19500
    },
    {
      "epoch": 0.196,
      "grad_norm": 27.33846664428711,
      "learning_rate": 9.608e-06,
      "loss": 3.1772,
      "step": 19600
    },
    {
      "epoch": 0.197,
      "grad_norm": 21.9919490814209,
      "learning_rate": 9.606000000000001e-06,
      "loss": 3.229,
      "step": 19700
    },
    {
      "epoch": 0.198,
      "grad_norm": 19.65387725830078,
      "learning_rate": 9.604000000000002e-06,
      "loss": 3.1524,
      "step": 19800
    },
    {
      "epoch": 0.199,
      "grad_norm": 18.683229446411133,
      "learning_rate": 9.602e-06,
      "loss": 3.4325,
      "step": 19900
    },
    {
      "epoch": 0.2,
      "grad_norm": 16.26070785522461,
      "learning_rate": 9.600000000000001e-06,
      "loss": 3.1583,
      "step": 20000
    },
    {
      "epoch": 0.201,
      "grad_norm": 17.30815887451172,
      "learning_rate": 9.598e-06,
      "loss": 3.1904,
      "step": 20100
    },
    {
      "epoch": 0.202,
      "grad_norm": 28.912694931030273,
      "learning_rate": 9.596000000000001e-06,
      "loss": 3.0794,
      "step": 20200
    },
    {
      "epoch": 0.203,
      "grad_norm": 20.792774200439453,
      "learning_rate": 9.594000000000002e-06,
      "loss": 3.2403,
      "step": 20300
    },
    {
      "epoch": 0.204,
      "grad_norm": 22.178218841552734,
      "learning_rate": 9.592e-06,
      "loss": 3.1408,
      "step": 20400
    },
    {
      "epoch": 0.205,
      "grad_norm": 15.090167045593262,
      "learning_rate": 9.59e-06,
      "loss": 3.1513,
      "step": 20500
    },
    {
      "epoch": 0.206,
      "grad_norm": 19.66379737854004,
      "learning_rate": 9.588e-06,
      "loss": 3.1574,
      "step": 20600
    },
    {
      "epoch": 0.207,
      "grad_norm": 20.961610794067383,
      "learning_rate": 9.586000000000001e-06,
      "loss": 3.284,
      "step": 20700
    },
    {
      "epoch": 0.208,
      "grad_norm": 19.434553146362305,
      "learning_rate": 9.584000000000002e-06,
      "loss": 3.0802,
      "step": 20800
    },
    {
      "epoch": 0.209,
      "grad_norm": 30.214740753173828,
      "learning_rate": 9.582e-06,
      "loss": 3.2555,
      "step": 20900
    },
    {
      "epoch": 0.21,
      "grad_norm": 18.16490364074707,
      "learning_rate": 9.58e-06,
      "loss": 3.2179,
      "step": 21000
    },
    {
      "epoch": 0.211,
      "grad_norm": 22.568527221679688,
      "learning_rate": 9.578e-06,
      "loss": 3.2268,
      "step": 21100
    },
    {
      "epoch": 0.212,
      "grad_norm": 20.349346160888672,
      "learning_rate": 9.576000000000001e-06,
      "loss": 3.159,
      "step": 21200
    },
    {
      "epoch": 0.213,
      "grad_norm": 23.45667266845703,
      "learning_rate": 9.574000000000002e-06,
      "loss": 3.1728,
      "step": 21300
    },
    {
      "epoch": 0.214,
      "grad_norm": 20.883718490600586,
      "learning_rate": 9.572000000000001e-06,
      "loss": 3.1258,
      "step": 21400
    },
    {
      "epoch": 0.215,
      "grad_norm": 25.16787338256836,
      "learning_rate": 9.57e-06,
      "loss": 3.0726,
      "step": 21500
    },
    {
      "epoch": 0.216,
      "grad_norm": 21.36046028137207,
      "learning_rate": 9.568e-06,
      "loss": 3.1267,
      "step": 21600
    },
    {
      "epoch": 0.217,
      "grad_norm": 26.431421279907227,
      "learning_rate": 9.566000000000001e-06,
      "loss": 3.1588,
      "step": 21700
    },
    {
      "epoch": 0.218,
      "grad_norm": 27.33740997314453,
      "learning_rate": 9.564e-06,
      "loss": 3.0679,
      "step": 21800
    },
    {
      "epoch": 0.219,
      "grad_norm": 17.818220138549805,
      "learning_rate": 9.562000000000001e-06,
      "loss": 3.1104,
      "step": 21900
    },
    {
      "epoch": 0.22,
      "grad_norm": 25.339937210083008,
      "learning_rate": 9.56e-06,
      "loss": 3.2342,
      "step": 22000
    },
    {
      "epoch": 0.221,
      "grad_norm": 19.325305938720703,
      "learning_rate": 9.558e-06,
      "loss": 3.1085,
      "step": 22100
    },
    {
      "epoch": 0.222,
      "grad_norm": 19.849441528320312,
      "learning_rate": 9.556000000000001e-06,
      "loss": 3.2078,
      "step": 22200
    },
    {
      "epoch": 0.223,
      "grad_norm": 22.334917068481445,
      "learning_rate": 9.554e-06,
      "loss": 3.1536,
      "step": 22300
    },
    {
      "epoch": 0.224,
      "grad_norm": 16.38900375366211,
      "learning_rate": 9.552000000000001e-06,
      "loss": 3.106,
      "step": 22400
    },
    {
      "epoch": 0.225,
      "grad_norm": 24.00871467590332,
      "learning_rate": 9.55e-06,
      "loss": 3.1776,
      "step": 22500
    },
    {
      "epoch": 0.226,
      "grad_norm": 19.4804744720459,
      "learning_rate": 9.548e-06,
      "loss": 3.164,
      "step": 22600
    },
    {
      "epoch": 0.227,
      "grad_norm": 17.325008392333984,
      "learning_rate": 9.546000000000001e-06,
      "loss": 3.2499,
      "step": 22700
    },
    {
      "epoch": 0.228,
      "grad_norm": 19.2254695892334,
      "learning_rate": 9.544e-06,
      "loss": 3.1912,
      "step": 22800
    },
    {
      "epoch": 0.229,
      "grad_norm": 19.877927780151367,
      "learning_rate": 9.542000000000001e-06,
      "loss": 3.1645,
      "step": 22900
    },
    {
      "epoch": 0.23,
      "grad_norm": 21.79277992248535,
      "learning_rate": 9.54e-06,
      "loss": 3.0794,
      "step": 23000
    }
  ],
  "logging_steps": 100,
  "max_steps": 500000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.4239469785088e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}