{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.23,
"eval_steps": 500,
"global_step": 23000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001,
"grad_norm": 12.033066749572754,
"learning_rate": 9.998000000000002e-06,
"loss": 5.0328,
"step": 100
},
{
"epoch": 0.002,
"grad_norm": 12.140076637268066,
"learning_rate": 9.996e-06,
"loss": 4.8188,
"step": 200
},
{
"epoch": 0.003,
"grad_norm": 11.839982032775879,
"learning_rate": 9.994000000000001e-06,
"loss": 4.6021,
"step": 300
},
{
"epoch": 0.004,
"grad_norm": 13.72381591796875,
"learning_rate": 9.992e-06,
"loss": 4.4731,
"step": 400
},
{
"epoch": 0.005,
"grad_norm": 11.587430000305176,
"learning_rate": 9.990000000000001e-06,
"loss": 4.4251,
"step": 500
},
{
"epoch": 0.006,
"grad_norm": 12.022308349609375,
"learning_rate": 9.988000000000002e-06,
"loss": 4.4416,
"step": 600
},
{
"epoch": 0.007,
"grad_norm": 12.038565635681152,
"learning_rate": 9.986e-06,
"loss": 4.3319,
"step": 700
},
{
"epoch": 0.008,
"grad_norm": 11.564003944396973,
"learning_rate": 9.984e-06,
"loss": 4.3267,
"step": 800
},
{
"epoch": 0.009,
"grad_norm": 11.518474578857422,
"learning_rate": 9.982e-06,
"loss": 4.2818,
"step": 900
},
{
"epoch": 0.01,
"grad_norm": 12.206191062927246,
"learning_rate": 9.980000000000001e-06,
"loss": 4.3167,
"step": 1000
},
{
"epoch": 0.011,
"grad_norm": 11.940592765808105,
"learning_rate": 9.978000000000002e-06,
"loss": 4.3204,
"step": 1100
},
{
"epoch": 0.012,
"grad_norm": 12.109590530395508,
"learning_rate": 9.976e-06,
"loss": 4.3286,
"step": 1200
},
{
"epoch": 0.013,
"grad_norm": 12.218742370605469,
"learning_rate": 9.974e-06,
"loss": 4.4106,
"step": 1300
},
{
"epoch": 0.014,
"grad_norm": 11.53864860534668,
"learning_rate": 9.972e-06,
"loss": 4.3091,
"step": 1400
},
{
"epoch": 0.015,
"grad_norm": 13.800053596496582,
"learning_rate": 9.970000000000001e-06,
"loss": 4.2455,
"step": 1500
},
{
"epoch": 0.016,
"grad_norm": 13.468424797058105,
"learning_rate": 9.968000000000002e-06,
"loss": 4.305,
"step": 1600
},
{
"epoch": 0.017,
"grad_norm": 11.736213684082031,
"learning_rate": 9.966e-06,
"loss": 4.3232,
"step": 1700
},
{
"epoch": 0.018,
"grad_norm": 10.893006324768066,
"learning_rate": 9.964e-06,
"loss": 4.3139,
"step": 1800
},
{
"epoch": 0.019,
"grad_norm": 14.407844543457031,
"learning_rate": 9.962e-06,
"loss": 4.2437,
"step": 1900
},
{
"epoch": 0.02,
"grad_norm": 11.889874458312988,
"learning_rate": 9.960000000000001e-06,
"loss": 4.2543,
"step": 2000
},
{
"epoch": 0.021,
"grad_norm": 11.396763801574707,
"learning_rate": 9.958e-06,
"loss": 4.1477,
"step": 2100
},
{
"epoch": 0.022,
"grad_norm": 13.99250316619873,
"learning_rate": 9.956000000000001e-06,
"loss": 4.248,
"step": 2200
},
{
"epoch": 0.023,
"grad_norm": 11.850446701049805,
"learning_rate": 9.954e-06,
"loss": 4.3014,
"step": 2300
},
{
"epoch": 0.024,
"grad_norm": 13.699837684631348,
"learning_rate": 9.952e-06,
"loss": 4.1937,
"step": 2400
},
{
"epoch": 0.025,
"grad_norm": 11.772696495056152,
"learning_rate": 9.950000000000001e-06,
"loss": 4.264,
"step": 2500
},
{
"epoch": 0.026,
"grad_norm": 13.460022926330566,
"learning_rate": 9.948e-06,
"loss": 4.2278,
"step": 2600
},
{
"epoch": 0.027,
"grad_norm": 11.80987548828125,
"learning_rate": 9.946000000000001e-06,
"loss": 4.2416,
"step": 2700
},
{
"epoch": 0.028,
"grad_norm": 12.391595840454102,
"learning_rate": 9.944e-06,
"loss": 4.2183,
"step": 2800
},
{
"epoch": 0.029,
"grad_norm": 12.336369514465332,
"learning_rate": 9.942e-06,
"loss": 4.225,
"step": 2900
},
{
"epoch": 0.03,
"grad_norm": 12.137269020080566,
"learning_rate": 9.940000000000001e-06,
"loss": 4.2045,
"step": 3000
},
{
"epoch": 0.031,
"grad_norm": 12.397940635681152,
"learning_rate": 9.938e-06,
"loss": 4.1523,
"step": 3100
},
{
"epoch": 0.032,
"grad_norm": 12.940911293029785,
"learning_rate": 9.936000000000001e-06,
"loss": 4.0193,
"step": 3200
},
{
"epoch": 0.033,
"grad_norm": 16.68646812438965,
"learning_rate": 9.934e-06,
"loss": 4.1939,
"step": 3300
},
{
"epoch": 0.034,
"grad_norm": 12.541526794433594,
"learning_rate": 9.932e-06,
"loss": 4.0829,
"step": 3400
},
{
"epoch": 0.035,
"grad_norm": 11.975446701049805,
"learning_rate": 9.930000000000001e-06,
"loss": 4.2043,
"step": 3500
},
{
"epoch": 0.036,
"grad_norm": 12.638479232788086,
"learning_rate": 9.928e-06,
"loss": 4.1315,
"step": 3600
},
{
"epoch": 0.037,
"grad_norm": 13.302968978881836,
"learning_rate": 9.926000000000001e-06,
"loss": 4.0406,
"step": 3700
},
{
"epoch": 0.038,
"grad_norm": 12.131388664245605,
"learning_rate": 9.924e-06,
"loss": 4.0571,
"step": 3800
},
{
"epoch": 0.039,
"grad_norm": 13.895309448242188,
"learning_rate": 9.922000000000001e-06,
"loss": 4.0597,
"step": 3900
},
{
"epoch": 0.04,
"grad_norm": 15.263091087341309,
"learning_rate": 9.920000000000002e-06,
"loss": 4.1587,
"step": 4000
},
{
"epoch": 0.041,
"grad_norm": 12.314478874206543,
"learning_rate": 9.918e-06,
"loss": 4.0695,
"step": 4100
},
{
"epoch": 0.042,
"grad_norm": 13.542490005493164,
"learning_rate": 9.916000000000001e-06,
"loss": 4.088,
"step": 4200
},
{
"epoch": 0.043,
"grad_norm": 14.835192680358887,
"learning_rate": 9.914e-06,
"loss": 4.1067,
"step": 4300
},
{
"epoch": 0.044,
"grad_norm": 13.269238471984863,
"learning_rate": 9.912000000000001e-06,
"loss": 4.0273,
"step": 4400
},
{
"epoch": 0.045,
"grad_norm": 12.532042503356934,
"learning_rate": 9.91e-06,
"loss": 3.9738,
"step": 4500
},
{
"epoch": 0.046,
"grad_norm": 14.506613731384277,
"learning_rate": 9.908e-06,
"loss": 4.0215,
"step": 4600
},
{
"epoch": 0.047,
"grad_norm": 12.91763973236084,
"learning_rate": 9.906000000000001e-06,
"loss": 3.9976,
"step": 4700
},
{
"epoch": 0.048,
"grad_norm": 12.20261001586914,
"learning_rate": 9.904e-06,
"loss": 3.9172,
"step": 4800
},
{
"epoch": 0.049,
"grad_norm": 13.156211853027344,
"learning_rate": 9.902000000000001e-06,
"loss": 3.9871,
"step": 4900
},
{
"epoch": 0.05,
"grad_norm": 13.59281063079834,
"learning_rate": 9.9e-06,
"loss": 3.9724,
"step": 5000
},
{
"epoch": 0.051,
"grad_norm": 13.202598571777344,
"learning_rate": 9.898e-06,
"loss": 3.8869,
"step": 5100
},
{
"epoch": 0.052,
"grad_norm": 16.82631492614746,
"learning_rate": 9.896000000000001e-06,
"loss": 4.0354,
"step": 5200
},
{
"epoch": 0.053,
"grad_norm": 14.205794334411621,
"learning_rate": 9.894e-06,
"loss": 4.0322,
"step": 5300
},
{
"epoch": 0.054,
"grad_norm": 14.135138511657715,
"learning_rate": 9.892000000000001e-06,
"loss": 4.01,
"step": 5400
},
{
"epoch": 0.055,
"grad_norm": 14.365863800048828,
"learning_rate": 9.89e-06,
"loss": 3.9914,
"step": 5500
},
{
"epoch": 0.056,
"grad_norm": 14.389483451843262,
"learning_rate": 9.888000000000001e-06,
"loss": 3.8672,
"step": 5600
},
{
"epoch": 0.057,
"grad_norm": 12.534111976623535,
"learning_rate": 9.886000000000002e-06,
"loss": 3.8888,
"step": 5700
},
{
"epoch": 0.058,
"grad_norm": 12.352408409118652,
"learning_rate": 9.884e-06,
"loss": 3.9466,
"step": 5800
},
{
"epoch": 0.059,
"grad_norm": 11.810559272766113,
"learning_rate": 9.882000000000001e-06,
"loss": 3.8322,
"step": 5900
},
{
"epoch": 0.06,
"grad_norm": 15.00880241394043,
"learning_rate": 9.88e-06,
"loss": 3.9209,
"step": 6000
},
{
"epoch": 0.061,
"grad_norm": 13.07725715637207,
"learning_rate": 9.878000000000001e-06,
"loss": 3.935,
"step": 6100
},
{
"epoch": 0.062,
"grad_norm": 11.981454849243164,
"learning_rate": 9.876000000000002e-06,
"loss": 3.808,
"step": 6200
},
{
"epoch": 0.063,
"grad_norm": 14.732353210449219,
"learning_rate": 9.874e-06,
"loss": 3.9113,
"step": 6300
},
{
"epoch": 0.064,
"grad_norm": 14.65374755859375,
"learning_rate": 9.872e-06,
"loss": 3.9368,
"step": 6400
},
{
"epoch": 0.065,
"grad_norm": 19.678359985351562,
"learning_rate": 9.87e-06,
"loss": 3.8806,
"step": 6500
},
{
"epoch": 0.066,
"grad_norm": 13.10519790649414,
"learning_rate": 9.868000000000001e-06,
"loss": 3.7696,
"step": 6600
},
{
"epoch": 0.067,
"grad_norm": 13.150310516357422,
"learning_rate": 9.866000000000002e-06,
"loss": 3.7716,
"step": 6700
},
{
"epoch": 0.068,
"grad_norm": 12.890090942382812,
"learning_rate": 9.864e-06,
"loss": 3.8475,
"step": 6800
},
{
"epoch": 0.069,
"grad_norm": 14.92232894897461,
"learning_rate": 9.862e-06,
"loss": 3.9018,
"step": 6900
},
{
"epoch": 0.07,
"grad_norm": 17.389636993408203,
"learning_rate": 9.86e-06,
"loss": 3.8824,
"step": 7000
},
{
"epoch": 0.071,
"grad_norm": 13.9534330368042,
"learning_rate": 9.858000000000001e-06,
"loss": 3.8621,
"step": 7100
},
{
"epoch": 0.072,
"grad_norm": 13.286906242370605,
"learning_rate": 9.856000000000002e-06,
"loss": 3.9293,
"step": 7200
},
{
"epoch": 0.073,
"grad_norm": 13.590744972229004,
"learning_rate": 9.854000000000001e-06,
"loss": 3.8883,
"step": 7300
},
{
"epoch": 0.074,
"grad_norm": 10.841891288757324,
"learning_rate": 9.852e-06,
"loss": 3.8022,
"step": 7400
},
{
"epoch": 0.075,
"grad_norm": 14.516188621520996,
"learning_rate": 9.85e-06,
"loss": 3.713,
"step": 7500
},
{
"epoch": 0.076,
"grad_norm": 16.878511428833008,
"learning_rate": 9.848000000000001e-06,
"loss": 3.8758,
"step": 7600
},
{
"epoch": 0.077,
"grad_norm": 16.681041717529297,
"learning_rate": 9.846000000000002e-06,
"loss": 3.7602,
"step": 7700
},
{
"epoch": 0.078,
"grad_norm": 14.792035102844238,
"learning_rate": 9.844000000000001e-06,
"loss": 3.7145,
"step": 7800
},
{
"epoch": 0.079,
"grad_norm": 15.74644660949707,
"learning_rate": 9.842e-06,
"loss": 3.7551,
"step": 7900
},
{
"epoch": 0.08,
"grad_norm": 16.060367584228516,
"learning_rate": 9.84e-06,
"loss": 3.7701,
"step": 8000
},
{
"epoch": 0.081,
"grad_norm": 13.487128257751465,
"learning_rate": 9.838000000000001e-06,
"loss": 3.6881,
"step": 8100
},
{
"epoch": 0.082,
"grad_norm": 15.940110206604004,
"learning_rate": 9.836e-06,
"loss": 3.9418,
"step": 8200
},
{
"epoch": 0.083,
"grad_norm": 13.603132247924805,
"learning_rate": 9.834000000000001e-06,
"loss": 3.7518,
"step": 8300
},
{
"epoch": 0.084,
"grad_norm": 17.045244216918945,
"learning_rate": 9.832e-06,
"loss": 3.7167,
"step": 8400
},
{
"epoch": 0.085,
"grad_norm": 12.39263916015625,
"learning_rate": 9.83e-06,
"loss": 3.6829,
"step": 8500
},
{
"epoch": 0.086,
"grad_norm": 12.677363395690918,
"learning_rate": 9.828000000000001e-06,
"loss": 3.7458,
"step": 8600
},
{
"epoch": 0.087,
"grad_norm": 15.024678230285645,
"learning_rate": 9.826e-06,
"loss": 3.7334,
"step": 8700
},
{
"epoch": 0.088,
"grad_norm": 17.41254997253418,
"learning_rate": 9.824000000000001e-06,
"loss": 3.7704,
"step": 8800
},
{
"epoch": 0.089,
"grad_norm": 19.782014846801758,
"learning_rate": 9.822e-06,
"loss": 3.6834,
"step": 8900
},
{
"epoch": 0.09,
"grad_norm": 16.899019241333008,
"learning_rate": 9.820000000000001e-06,
"loss": 3.7304,
"step": 9000
},
{
"epoch": 0.091,
"grad_norm": 14.481075286865234,
"learning_rate": 9.818000000000002e-06,
"loss": 3.7307,
"step": 9100
},
{
"epoch": 0.092,
"grad_norm": 18.121864318847656,
"learning_rate": 9.816e-06,
"loss": 3.6793,
"step": 9200
},
{
"epoch": 0.093,
"grad_norm": 15.916873931884766,
"learning_rate": 9.814000000000001e-06,
"loss": 3.7664,
"step": 9300
},
{
"epoch": 0.094,
"grad_norm": 18.305234909057617,
"learning_rate": 9.812e-06,
"loss": 3.6887,
"step": 9400
},
{
"epoch": 0.095,
"grad_norm": 18.262725830078125,
"learning_rate": 9.810000000000001e-06,
"loss": 3.6865,
"step": 9500
},
{
"epoch": 0.096,
"grad_norm": 21.94981575012207,
"learning_rate": 9.808000000000002e-06,
"loss": 3.5916,
"step": 9600
},
{
"epoch": 0.097,
"grad_norm": 15.031508445739746,
"learning_rate": 9.806e-06,
"loss": 3.5732,
"step": 9700
},
{
"epoch": 0.098,
"grad_norm": 13.64002799987793,
"learning_rate": 9.804000000000001e-06,
"loss": 3.6104,
"step": 9800
},
{
"epoch": 0.099,
"grad_norm": 22.877960205078125,
"learning_rate": 9.802e-06,
"loss": 3.622,
"step": 9900
},
{
"epoch": 0.1,
"grad_norm": 13.404180526733398,
"learning_rate": 9.800000000000001e-06,
"loss": 3.6693,
"step": 10000
},
{
"epoch": 0.101,
"grad_norm": 14.348539352416992,
"learning_rate": 9.798e-06,
"loss": 3.4753,
"step": 10100
},
{
"epoch": 0.102,
"grad_norm": 15.996590614318848,
"learning_rate": 9.796e-06,
"loss": 3.5059,
"step": 10200
},
{
"epoch": 0.103,
"grad_norm": 16.6004638671875,
"learning_rate": 9.794000000000001e-06,
"loss": 3.7168,
"step": 10300
},
{
"epoch": 0.104,
"grad_norm": 22.660940170288086,
"learning_rate": 9.792e-06,
"loss": 3.5521,
"step": 10400
},
{
"epoch": 0.105,
"grad_norm": 16.634521484375,
"learning_rate": 9.790000000000001e-06,
"loss": 3.5894,
"step": 10500
},
{
"epoch": 0.106,
"grad_norm": 17.89203643798828,
"learning_rate": 9.788e-06,
"loss": 3.5524,
"step": 10600
},
{
"epoch": 0.107,
"grad_norm": 17.027833938598633,
"learning_rate": 9.786e-06,
"loss": 3.5953,
"step": 10700
},
{
"epoch": 0.108,
"grad_norm": 19.78436279296875,
"learning_rate": 9.784000000000002e-06,
"loss": 3.6688,
"step": 10800
},
{
"epoch": 0.109,
"grad_norm": 17.20643424987793,
"learning_rate": 9.782e-06,
"loss": 3.5023,
"step": 10900
},
{
"epoch": 0.11,
"grad_norm": 14.809402465820312,
"learning_rate": 9.780000000000001e-06,
"loss": 3.4398,
"step": 11000
},
{
"epoch": 0.111,
"grad_norm": 21.907175064086914,
"learning_rate": 9.778e-06,
"loss": 3.5254,
"step": 11100
},
{
"epoch": 0.112,
"grad_norm": 23.719179153442383,
"learning_rate": 9.776000000000001e-06,
"loss": 3.4801,
"step": 11200
},
{
"epoch": 0.113,
"grad_norm": 15.437023162841797,
"learning_rate": 9.774000000000002e-06,
"loss": 3.5724,
"step": 11300
},
{
"epoch": 0.114,
"grad_norm": 21.82857322692871,
"learning_rate": 9.772e-06,
"loss": 3.5787,
"step": 11400
},
{
"epoch": 0.115,
"grad_norm": 20.26848602294922,
"learning_rate": 9.770000000000001e-06,
"loss": 3.6333,
"step": 11500
},
{
"epoch": 0.116,
"grad_norm": 17.0762882232666,
"learning_rate": 9.768e-06,
"loss": 3.3418,
"step": 11600
},
{
"epoch": 0.117,
"grad_norm": 20.440383911132812,
"learning_rate": 9.766000000000001e-06,
"loss": 3.5072,
"step": 11700
},
{
"epoch": 0.118,
"grad_norm": 17.19301986694336,
"learning_rate": 9.764000000000002e-06,
"loss": 3.491,
"step": 11800
},
{
"epoch": 0.119,
"grad_norm": 20.88847541809082,
"learning_rate": 9.762e-06,
"loss": 3.5482,
"step": 11900
},
{
"epoch": 0.12,
"grad_norm": 17.677921295166016,
"learning_rate": 9.760000000000001e-06,
"loss": 3.5495,
"step": 12000
},
{
"epoch": 0.121,
"grad_norm": 19.91204833984375,
"learning_rate": 9.758e-06,
"loss": 3.652,
"step": 12100
},
{
"epoch": 0.122,
"grad_norm": 39.53171920776367,
"learning_rate": 9.756000000000001e-06,
"loss": 3.5966,
"step": 12200
},
{
"epoch": 0.123,
"grad_norm": 15.958291053771973,
"learning_rate": 9.754000000000002e-06,
"loss": 3.5364,
"step": 12300
},
{
"epoch": 0.124,
"grad_norm": 15.132060050964355,
"learning_rate": 9.752e-06,
"loss": 3.468,
"step": 12400
},
{
"epoch": 0.125,
"grad_norm": 14.320738792419434,
"learning_rate": 9.75e-06,
"loss": 3.3429,
"step": 12500
},
{
"epoch": 0.126,
"grad_norm": 20.6189022064209,
"learning_rate": 9.748e-06,
"loss": 3.3507,
"step": 12600
},
{
"epoch": 0.127,
"grad_norm": 14.898783683776855,
"learning_rate": 9.746000000000001e-06,
"loss": 3.3785,
"step": 12700
},
{
"epoch": 0.128,
"grad_norm": 22.72285270690918,
"learning_rate": 9.744000000000002e-06,
"loss": 3.5126,
"step": 12800
},
{
"epoch": 0.129,
"grad_norm": 22.329116821289062,
"learning_rate": 9.742000000000001e-06,
"loss": 3.4503,
"step": 12900
},
{
"epoch": 0.13,
"grad_norm": 18.049467086791992,
"learning_rate": 9.74e-06,
"loss": 3.4891,
"step": 13000
},
{
"epoch": 0.131,
"grad_norm": 14.28784465789795,
"learning_rate": 9.738e-06,
"loss": 3.33,
"step": 13100
},
{
"epoch": 0.132,
"grad_norm": 19.659822463989258,
"learning_rate": 9.736000000000001e-06,
"loss": 3.4724,
"step": 13200
},
{
"epoch": 0.133,
"grad_norm": 19.972923278808594,
"learning_rate": 9.734000000000002e-06,
"loss": 3.3965,
"step": 13300
},
{
"epoch": 0.134,
"grad_norm": 21.733108520507812,
"learning_rate": 9.732000000000001e-06,
"loss": 3.584,
"step": 13400
},
{
"epoch": 0.135,
"grad_norm": 13.769856452941895,
"learning_rate": 9.73e-06,
"loss": 3.4789,
"step": 13500
},
{
"epoch": 0.136,
"grad_norm": 15.672243118286133,
"learning_rate": 9.728e-06,
"loss": 3.5081,
"step": 13600
},
{
"epoch": 0.137,
"grad_norm": 17.671894073486328,
"learning_rate": 9.726000000000001e-06,
"loss": 3.3696,
"step": 13700
},
{
"epoch": 0.138,
"grad_norm": 19.69550323486328,
"learning_rate": 9.724e-06,
"loss": 3.3707,
"step": 13800
},
{
"epoch": 0.139,
"grad_norm": 14.621719360351562,
"learning_rate": 9.722000000000001e-06,
"loss": 3.4709,
"step": 13900
},
{
"epoch": 0.14,
"grad_norm": 17.52949333190918,
"learning_rate": 9.72e-06,
"loss": 3.4979,
"step": 14000
},
{
"epoch": 0.141,
"grad_norm": 15.679729461669922,
"learning_rate": 9.718e-06,
"loss": 3.4584,
"step": 14100
},
{
"epoch": 0.142,
"grad_norm": 17.527435302734375,
"learning_rate": 9.716000000000002e-06,
"loss": 3.3812,
"step": 14200
},
{
"epoch": 0.143,
"grad_norm": 24.084278106689453,
"learning_rate": 9.714e-06,
"loss": 3.4052,
"step": 14300
},
{
"epoch": 0.144,
"grad_norm": 20.039127349853516,
"learning_rate": 9.712e-06,
"loss": 3.3659,
"step": 14400
},
{
"epoch": 0.145,
"grad_norm": 18.518939971923828,
"learning_rate": 9.71e-06,
"loss": 3.4249,
"step": 14500
},
{
"epoch": 0.146,
"grad_norm": 19.596946716308594,
"learning_rate": 9.708000000000001e-06,
"loss": 3.3909,
"step": 14600
},
{
"epoch": 0.147,
"grad_norm": 14.774336814880371,
"learning_rate": 9.706000000000002e-06,
"loss": 3.3216,
"step": 14700
},
{
"epoch": 0.148,
"grad_norm": 27.980627059936523,
"learning_rate": 9.704e-06,
"loss": 3.3794,
"step": 14800
},
{
"epoch": 0.149,
"grad_norm": 16.481491088867188,
"learning_rate": 9.702e-06,
"loss": 3.4138,
"step": 14900
},
{
"epoch": 0.15,
"grad_norm": 18.48386573791504,
"learning_rate": 9.7e-06,
"loss": 3.3635,
"step": 15000
},
{
"epoch": 0.151,
"grad_norm": 14.089752197265625,
"learning_rate": 9.698000000000001e-06,
"loss": 3.363,
"step": 15100
},
{
"epoch": 0.152,
"grad_norm": 15.205988883972168,
"learning_rate": 9.696000000000002e-06,
"loss": 3.3038,
"step": 15200
},
{
"epoch": 0.153,
"grad_norm": 18.17800521850586,
"learning_rate": 9.694e-06,
"loss": 3.2748,
"step": 15300
},
{
"epoch": 0.154,
"grad_norm": 15.958276748657227,
"learning_rate": 9.692e-06,
"loss": 3.2809,
"step": 15400
},
{
"epoch": 0.155,
"grad_norm": 20.20997428894043,
"learning_rate": 9.69e-06,
"loss": 3.3366,
"step": 15500
},
{
"epoch": 0.156,
"grad_norm": 13.844518661499023,
"learning_rate": 9.688000000000001e-06,
"loss": 3.424,
"step": 15600
},
{
"epoch": 0.157,
"grad_norm": 28.56269645690918,
"learning_rate": 9.686000000000002e-06,
"loss": 3.4567,
"step": 15700
},
{
"epoch": 0.158,
"grad_norm": 14.436336517333984,
"learning_rate": 9.684e-06,
"loss": 3.2089,
"step": 15800
},
{
"epoch": 0.159,
"grad_norm": 28.26078987121582,
"learning_rate": 9.682e-06,
"loss": 3.439,
"step": 15900
},
{
"epoch": 0.16,
"grad_norm": 19.394569396972656,
"learning_rate": 9.68e-06,
"loss": 3.3376,
"step": 16000
},
{
"epoch": 0.161,
"grad_norm": 18.12739372253418,
"learning_rate": 9.678000000000001e-06,
"loss": 3.2295,
"step": 16100
},
{
"epoch": 0.162,
"grad_norm": 20.420162200927734,
"learning_rate": 9.676e-06,
"loss": 3.307,
"step": 16200
},
{
"epoch": 0.163,
"grad_norm": 15.34536361694336,
"learning_rate": 9.674000000000001e-06,
"loss": 3.3014,
"step": 16300
},
{
"epoch": 0.164,
"grad_norm": 20.239303588867188,
"learning_rate": 9.672e-06,
"loss": 3.3339,
"step": 16400
},
{
"epoch": 0.165,
"grad_norm": 15.123329162597656,
"learning_rate": 9.67e-06,
"loss": 3.3073,
"step": 16500
},
{
"epoch": 0.166,
"grad_norm": 21.282299041748047,
"learning_rate": 9.668000000000001e-06,
"loss": 3.2809,
"step": 16600
},
{
"epoch": 0.167,
"grad_norm": 18.039140701293945,
"learning_rate": 9.666e-06,
"loss": 3.3368,
"step": 16700
},
{
"epoch": 0.168,
"grad_norm": 17.391878128051758,
"learning_rate": 9.664000000000001e-06,
"loss": 3.2135,
"step": 16800
},
{
"epoch": 0.169,
"grad_norm": 29.216005325317383,
"learning_rate": 9.662e-06,
"loss": 3.2238,
"step": 16900
},
{
"epoch": 0.17,
"grad_norm": 24.716182708740234,
"learning_rate": 9.66e-06,
"loss": 3.4032,
"step": 17000
},
{
"epoch": 0.171,
"grad_norm": 19.68560791015625,
"learning_rate": 9.658000000000001e-06,
"loss": 3.2144,
"step": 17100
},
{
"epoch": 0.172,
"grad_norm": 20.55443572998047,
"learning_rate": 9.656e-06,
"loss": 3.347,
"step": 17200
},
{
"epoch": 0.173,
"grad_norm": 23.09670639038086,
"learning_rate": 9.654000000000001e-06,
"loss": 3.2204,
"step": 17300
},
{
"epoch": 0.174,
"grad_norm": 21.916152954101562,
"learning_rate": 9.652e-06,
"loss": 3.1898,
"step": 17400
},
{
"epoch": 0.175,
"grad_norm": 15.10058879852295,
"learning_rate": 9.65e-06,
"loss": 3.3174,
"step": 17500
},
{
"epoch": 0.176,
"grad_norm": 18.47793197631836,
"learning_rate": 9.648000000000001e-06,
"loss": 3.152,
"step": 17600
},
{
"epoch": 0.177,
"grad_norm": 20.482669830322266,
"learning_rate": 9.646e-06,
"loss": 3.2341,
"step": 17700
},
{
"epoch": 0.178,
"grad_norm": 17.341407775878906,
"learning_rate": 9.644000000000001e-06,
"loss": 3.2945,
"step": 17800
},
{
"epoch": 0.179,
"grad_norm": 25.537378311157227,
"learning_rate": 9.642e-06,
"loss": 3.3279,
"step": 17900
},
{
"epoch": 0.18,
"grad_norm": 25.134294509887695,
"learning_rate": 9.640000000000001e-06,
"loss": 3.4005,
"step": 18000
},
{
"epoch": 0.181,
"grad_norm": 14.844265937805176,
"learning_rate": 9.638e-06,
"loss": 3.2125,
"step": 18100
},
{
"epoch": 0.182,
"grad_norm": 12.517401695251465,
"learning_rate": 9.636e-06,
"loss": 3.4222,
"step": 18200
},
{
"epoch": 0.183,
"grad_norm": 13.91508674621582,
"learning_rate": 9.634000000000001e-06,
"loss": 3.2464,
"step": 18300
},
{
"epoch": 0.184,
"grad_norm": 20.34067726135254,
"learning_rate": 9.632e-06,
"loss": 3.1909,
"step": 18400
},
{
"epoch": 0.185,
"grad_norm": 20.126605987548828,
"learning_rate": 9.630000000000001e-06,
"loss": 3.1106,
"step": 18500
},
{
"epoch": 0.186,
"grad_norm": 20.69412612915039,
"learning_rate": 9.628e-06,
"loss": 3.2615,
"step": 18600
},
{
"epoch": 0.187,
"grad_norm": 29.957561492919922,
"learning_rate": 9.626e-06,
"loss": 3.2019,
"step": 18700
},
{
"epoch": 0.188,
"grad_norm": 12.36296558380127,
"learning_rate": 9.624000000000001e-06,
"loss": 3.2337,
"step": 18800
},
{
"epoch": 0.189,
"grad_norm": 13.685921669006348,
"learning_rate": 9.622000000000002e-06,
"loss": 3.282,
"step": 18900
},
{
"epoch": 0.19,
"grad_norm": 20.060331344604492,
"learning_rate": 9.620000000000001e-06,
"loss": 3.1947,
"step": 19000
},
{
"epoch": 0.191,
"grad_norm": 13.936528205871582,
"learning_rate": 9.618e-06,
"loss": 3.174,
"step": 19100
},
{
"epoch": 0.192,
"grad_norm": 21.87002944946289,
"learning_rate": 9.616e-06,
"loss": 3.2355,
"step": 19200
},
{
"epoch": 0.193,
"grad_norm": 24.834264755249023,
"learning_rate": 9.614000000000001e-06,
"loss": 3.1823,
"step": 19300
},
{
"epoch": 0.194,
"grad_norm": 30.83481216430664,
"learning_rate": 9.612000000000002e-06,
"loss": 3.1645,
"step": 19400
},
{
"epoch": 0.195,
"grad_norm": 18.5225830078125,
"learning_rate": 9.610000000000001e-06,
"loss": 3.2636,
"step": 19500
},
{
"epoch": 0.196,
"grad_norm": 27.33846664428711,
"learning_rate": 9.608e-06,
"loss": 3.1772,
"step": 19600
},
{
"epoch": 0.197,
"grad_norm": 21.9919490814209,
"learning_rate": 9.606000000000001e-06,
"loss": 3.229,
"step": 19700
},
{
"epoch": 0.198,
"grad_norm": 19.65387725830078,
"learning_rate": 9.604000000000002e-06,
"loss": 3.1524,
"step": 19800
},
{
"epoch": 0.199,
"grad_norm": 18.683229446411133,
"learning_rate": 9.602e-06,
"loss": 3.4325,
"step": 19900
},
{
"epoch": 0.2,
"grad_norm": 16.26070785522461,
"learning_rate": 9.600000000000001e-06,
"loss": 3.1583,
"step": 20000
},
{
"epoch": 0.201,
"grad_norm": 17.30815887451172,
"learning_rate": 9.598e-06,
"loss": 3.1904,
"step": 20100
},
{
"epoch": 0.202,
"grad_norm": 28.912694931030273,
"learning_rate": 9.596000000000001e-06,
"loss": 3.0794,
"step": 20200
},
{
"epoch": 0.203,
"grad_norm": 20.792774200439453,
"learning_rate": 9.594000000000002e-06,
"loss": 3.2403,
"step": 20300
},
{
"epoch": 0.204,
"grad_norm": 22.178218841552734,
"learning_rate": 9.592e-06,
"loss": 3.1408,
"step": 20400
},
{
"epoch": 0.205,
"grad_norm": 15.090167045593262,
"learning_rate": 9.59e-06,
"loss": 3.1513,
"step": 20500
},
{
"epoch": 0.206,
"grad_norm": 19.66379737854004,
"learning_rate": 9.588e-06,
"loss": 3.1574,
"step": 20600
},
{
"epoch": 0.207,
"grad_norm": 20.961610794067383,
"learning_rate": 9.586000000000001e-06,
"loss": 3.284,
"step": 20700
},
{
"epoch": 0.208,
"grad_norm": 19.434553146362305,
"learning_rate": 9.584000000000002e-06,
"loss": 3.0802,
"step": 20800
},
{
"epoch": 0.209,
"grad_norm": 30.214740753173828,
"learning_rate": 9.582e-06,
"loss": 3.2555,
"step": 20900
},
{
"epoch": 0.21,
"grad_norm": 18.16490364074707,
"learning_rate": 9.58e-06,
"loss": 3.2179,
"step": 21000
},
{
"epoch": 0.211,
"grad_norm": 22.568527221679688,
"learning_rate": 9.578e-06,
"loss": 3.2268,
"step": 21100
},
{
"epoch": 0.212,
"grad_norm": 20.349346160888672,
"learning_rate": 9.576000000000001e-06,
"loss": 3.159,
"step": 21200
},
{
"epoch": 0.213,
"grad_norm": 23.45667266845703,
"learning_rate": 9.574000000000002e-06,
"loss": 3.1728,
"step": 21300
},
{
"epoch": 0.214,
"grad_norm": 20.883718490600586,
"learning_rate": 9.572000000000001e-06,
"loss": 3.1258,
"step": 21400
},
{
"epoch": 0.215,
"grad_norm": 25.16787338256836,
"learning_rate": 9.57e-06,
"loss": 3.0726,
"step": 21500
},
{
"epoch": 0.216,
"grad_norm": 21.36046028137207,
"learning_rate": 9.568e-06,
"loss": 3.1267,
"step": 21600
},
{
"epoch": 0.217,
"grad_norm": 26.431421279907227,
"learning_rate": 9.566000000000001e-06,
"loss": 3.1588,
"step": 21700
},
{
"epoch": 0.218,
"grad_norm": 27.33740997314453,
"learning_rate": 9.564e-06,
"loss": 3.0679,
"step": 21800
},
{
"epoch": 0.219,
"grad_norm": 17.818220138549805,
"learning_rate": 9.562000000000001e-06,
"loss": 3.1104,
"step": 21900
},
{
"epoch": 0.22,
"grad_norm": 25.339937210083008,
"learning_rate": 9.56e-06,
"loss": 3.2342,
"step": 22000
},
{
"epoch": 0.221,
"grad_norm": 19.325305938720703,
"learning_rate": 9.558e-06,
"loss": 3.1085,
"step": 22100
},
{
"epoch": 0.222,
"grad_norm": 19.849441528320312,
"learning_rate": 9.556000000000001e-06,
"loss": 3.2078,
"step": 22200
},
{
"epoch": 0.223,
"grad_norm": 22.334917068481445,
"learning_rate": 9.554e-06,
"loss": 3.1536,
"step": 22300
},
{
"epoch": 0.224,
"grad_norm": 16.38900375366211,
"learning_rate": 9.552000000000001e-06,
"loss": 3.106,
"step": 22400
},
{
"epoch": 0.225,
"grad_norm": 24.00871467590332,
"learning_rate": 9.55e-06,
"loss": 3.1776,
"step": 22500
},
{
"epoch": 0.226,
"grad_norm": 19.4804744720459,
"learning_rate": 9.548e-06,
"loss": 3.164,
"step": 22600
},
{
"epoch": 0.227,
"grad_norm": 17.325008392333984,
"learning_rate": 9.546000000000001e-06,
"loss": 3.2499,
"step": 22700
},
{
"epoch": 0.228,
"grad_norm": 19.2254695892334,
"learning_rate": 9.544e-06,
"loss": 3.1912,
"step": 22800
},
{
"epoch": 0.229,
"grad_norm": 19.877927780151367,
"learning_rate": 9.542000000000001e-06,
"loss": 3.1645,
"step": 22900
},
{
"epoch": 0.23,
"grad_norm": 21.79277992248535,
"learning_rate": 9.54e-06,
"loss": 3.0794,
"step": 23000
}
],
"logging_steps": 100,
"max_steps": 500000,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.4239469785088e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}