{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.23, "eval_steps": 500, "global_step": 23000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 12.033066749572754, "learning_rate": 9.998000000000002e-06, "loss": 5.0328, "step": 100 }, { "epoch": 0.002, "grad_norm": 12.140076637268066, "learning_rate": 9.996e-06, "loss": 4.8188, "step": 200 }, { "epoch": 0.003, "grad_norm": 11.839982032775879, "learning_rate": 9.994000000000001e-06, "loss": 4.6021, "step": 300 }, { "epoch": 0.004, "grad_norm": 13.72381591796875, "learning_rate": 9.992e-06, "loss": 4.4731, "step": 400 }, { "epoch": 0.005, "grad_norm": 11.587430000305176, "learning_rate": 9.990000000000001e-06, "loss": 4.4251, "step": 500 }, { "epoch": 0.006, "grad_norm": 12.022308349609375, "learning_rate": 9.988000000000002e-06, "loss": 4.4416, "step": 600 }, { "epoch": 0.007, "grad_norm": 12.038565635681152, "learning_rate": 9.986e-06, "loss": 4.3319, "step": 700 }, { "epoch": 0.008, "grad_norm": 11.564003944396973, "learning_rate": 9.984e-06, "loss": 4.3267, "step": 800 }, { "epoch": 0.009, "grad_norm": 11.518474578857422, "learning_rate": 9.982e-06, "loss": 4.2818, "step": 900 }, { "epoch": 0.01, "grad_norm": 12.206191062927246, "learning_rate": 9.980000000000001e-06, "loss": 4.3167, "step": 1000 }, { "epoch": 0.011, "grad_norm": 11.940592765808105, "learning_rate": 9.978000000000002e-06, "loss": 4.3204, "step": 1100 }, { "epoch": 0.012, "grad_norm": 12.109590530395508, "learning_rate": 9.976e-06, "loss": 4.3286, "step": 1200 }, { "epoch": 0.013, "grad_norm": 12.218742370605469, "learning_rate": 9.974e-06, "loss": 4.4106, "step": 1300 }, { "epoch": 0.014, "grad_norm": 11.53864860534668, "learning_rate": 9.972e-06, "loss": 4.3091, "step": 1400 }, { "epoch": 0.015, "grad_norm": 13.800053596496582, "learning_rate": 9.970000000000001e-06, "loss": 4.2455, "step": 1500 }, { "epoch": 0.016, "grad_norm": 13.468424797058105, "learning_rate": 9.968000000000002e-06, "loss": 4.305, "step": 1600 }, { "epoch": 0.017, "grad_norm": 11.736213684082031, "learning_rate": 9.966e-06, "loss": 4.3232, "step": 1700 }, { "epoch": 0.018, "grad_norm": 10.893006324768066, "learning_rate": 9.964e-06, "loss": 4.3139, "step": 1800 }, { "epoch": 0.019, "grad_norm": 14.407844543457031, "learning_rate": 9.962e-06, "loss": 4.2437, "step": 1900 }, { "epoch": 0.02, "grad_norm": 11.889874458312988, "learning_rate": 9.960000000000001e-06, "loss": 4.2543, "step": 2000 }, { "epoch": 0.021, "grad_norm": 11.396763801574707, "learning_rate": 9.958e-06, "loss": 4.1477, "step": 2100 }, { "epoch": 0.022, "grad_norm": 13.99250316619873, "learning_rate": 9.956000000000001e-06, "loss": 4.248, "step": 2200 }, { "epoch": 0.023, "grad_norm": 11.850446701049805, "learning_rate": 9.954e-06, "loss": 4.3014, "step": 2300 }, { "epoch": 0.024, "grad_norm": 13.699837684631348, "learning_rate": 9.952e-06, "loss": 4.1937, "step": 2400 }, { "epoch": 0.025, "grad_norm": 11.772696495056152, "learning_rate": 9.950000000000001e-06, "loss": 4.264, "step": 2500 }, { "epoch": 0.026, "grad_norm": 13.460022926330566, "learning_rate": 9.948e-06, "loss": 4.2278, "step": 2600 }, { "epoch": 0.027, "grad_norm": 11.80987548828125, "learning_rate": 9.946000000000001e-06, "loss": 4.2416, "step": 2700 }, { "epoch": 0.028, "grad_norm": 12.391595840454102, "learning_rate": 9.944e-06, "loss": 4.2183, "step": 2800 }, { "epoch": 0.029, "grad_norm": 12.336369514465332, "learning_rate": 9.942e-06, "loss": 4.225, "step": 2900 }, { "epoch": 0.03, "grad_norm": 12.137269020080566, "learning_rate": 9.940000000000001e-06, "loss": 4.2045, "step": 3000 }, { "epoch": 0.031, "grad_norm": 12.397940635681152, "learning_rate": 9.938e-06, "loss": 4.1523, "step": 3100 }, { "epoch": 0.032, "grad_norm": 12.940911293029785, "learning_rate": 9.936000000000001e-06, "loss": 4.0193, "step": 3200 }, { "epoch": 0.033, "grad_norm": 16.68646812438965, "learning_rate": 9.934e-06, "loss": 4.1939, "step": 3300 }, { "epoch": 0.034, "grad_norm": 12.541526794433594, "learning_rate": 9.932e-06, "loss": 4.0829, "step": 3400 }, { "epoch": 0.035, "grad_norm": 11.975446701049805, "learning_rate": 9.930000000000001e-06, "loss": 4.2043, "step": 3500 }, { "epoch": 0.036, "grad_norm": 12.638479232788086, "learning_rate": 9.928e-06, "loss": 4.1315, "step": 3600 }, { "epoch": 0.037, "grad_norm": 13.302968978881836, "learning_rate": 9.926000000000001e-06, "loss": 4.0406, "step": 3700 }, { "epoch": 0.038, "grad_norm": 12.131388664245605, "learning_rate": 9.924e-06, "loss": 4.0571, "step": 3800 }, { "epoch": 0.039, "grad_norm": 13.895309448242188, "learning_rate": 9.922000000000001e-06, "loss": 4.0597, "step": 3900 }, { "epoch": 0.04, "grad_norm": 15.263091087341309, "learning_rate": 9.920000000000002e-06, "loss": 4.1587, "step": 4000 }, { "epoch": 0.041, "grad_norm": 12.314478874206543, "learning_rate": 9.918e-06, "loss": 4.0695, "step": 4100 }, { "epoch": 0.042, "grad_norm": 13.542490005493164, "learning_rate": 9.916000000000001e-06, "loss": 4.088, "step": 4200 }, { "epoch": 0.043, "grad_norm": 14.835192680358887, "learning_rate": 9.914e-06, "loss": 4.1067, "step": 4300 }, { "epoch": 0.044, "grad_norm": 13.269238471984863, "learning_rate": 9.912000000000001e-06, "loss": 4.0273, "step": 4400 }, { "epoch": 0.045, "grad_norm": 12.532042503356934, "learning_rate": 9.91e-06, "loss": 3.9738, "step": 4500 }, { "epoch": 0.046, "grad_norm": 14.506613731384277, "learning_rate": 9.908e-06, "loss": 4.0215, "step": 4600 }, { "epoch": 0.047, "grad_norm": 12.91763973236084, "learning_rate": 9.906000000000001e-06, "loss": 3.9976, "step": 4700 }, { "epoch": 0.048, "grad_norm": 12.20261001586914, "learning_rate": 9.904e-06, "loss": 3.9172, "step": 4800 }, { "epoch": 0.049, "grad_norm": 13.156211853027344, "learning_rate": 9.902000000000001e-06, "loss": 3.9871, "step": 4900 }, { "epoch": 0.05, "grad_norm": 13.59281063079834, "learning_rate": 9.9e-06, "loss": 3.9724, "step": 5000 }, { "epoch": 0.051, "grad_norm": 13.202598571777344, "learning_rate": 9.898e-06, "loss": 3.8869, "step": 5100 }, { "epoch": 0.052, "grad_norm": 16.82631492614746, "learning_rate": 9.896000000000001e-06, "loss": 4.0354, "step": 5200 }, { "epoch": 0.053, "grad_norm": 14.205794334411621, "learning_rate": 9.894e-06, "loss": 4.0322, "step": 5300 }, { "epoch": 0.054, "grad_norm": 14.135138511657715, "learning_rate": 9.892000000000001e-06, "loss": 4.01, "step": 5400 }, { "epoch": 0.055, "grad_norm": 14.365863800048828, "learning_rate": 9.89e-06, "loss": 3.9914, "step": 5500 }, { "epoch": 0.056, "grad_norm": 14.389483451843262, "learning_rate": 9.888000000000001e-06, "loss": 3.8672, "step": 5600 }, { "epoch": 0.057, "grad_norm": 12.534111976623535, "learning_rate": 9.886000000000002e-06, "loss": 3.8888, "step": 5700 }, { "epoch": 0.058, "grad_norm": 12.352408409118652, "learning_rate": 9.884e-06, "loss": 3.9466, "step": 5800 }, { "epoch": 0.059, "grad_norm": 11.810559272766113, "learning_rate": 9.882000000000001e-06, "loss": 3.8322, "step": 5900 }, { "epoch": 0.06, "grad_norm": 15.00880241394043, "learning_rate": 9.88e-06, "loss": 3.9209, "step": 6000 }, { "epoch": 0.061, "grad_norm": 13.07725715637207, "learning_rate": 9.878000000000001e-06, "loss": 3.935, "step": 6100 }, { "epoch": 0.062, "grad_norm": 11.981454849243164, "learning_rate": 9.876000000000002e-06, "loss": 3.808, "step": 6200 }, { "epoch": 0.063, "grad_norm": 14.732353210449219, "learning_rate": 9.874e-06, "loss": 3.9113, "step": 6300 }, { "epoch": 0.064, "grad_norm": 14.65374755859375, "learning_rate": 9.872e-06, "loss": 3.9368, "step": 6400 }, { "epoch": 0.065, "grad_norm": 19.678359985351562, "learning_rate": 9.87e-06, "loss": 3.8806, "step": 6500 }, { "epoch": 0.066, "grad_norm": 13.10519790649414, "learning_rate": 9.868000000000001e-06, "loss": 3.7696, "step": 6600 }, { "epoch": 0.067, "grad_norm": 13.150310516357422, "learning_rate": 9.866000000000002e-06, "loss": 3.7716, "step": 6700 }, { "epoch": 0.068, "grad_norm": 12.890090942382812, "learning_rate": 9.864e-06, "loss": 3.8475, "step": 6800 }, { "epoch": 0.069, "grad_norm": 14.92232894897461, "learning_rate": 9.862e-06, "loss": 3.9018, "step": 6900 }, { "epoch": 0.07, "grad_norm": 17.389636993408203, "learning_rate": 9.86e-06, "loss": 3.8824, "step": 7000 }, { "epoch": 0.071, "grad_norm": 13.9534330368042, "learning_rate": 9.858000000000001e-06, "loss": 3.8621, "step": 7100 }, { "epoch": 0.072, "grad_norm": 13.286906242370605, "learning_rate": 9.856000000000002e-06, "loss": 3.9293, "step": 7200 }, { "epoch": 0.073, "grad_norm": 13.590744972229004, "learning_rate": 9.854000000000001e-06, "loss": 3.8883, "step": 7300 }, { "epoch": 0.074, "grad_norm": 10.841891288757324, "learning_rate": 9.852e-06, "loss": 3.8022, "step": 7400 }, { "epoch": 0.075, "grad_norm": 14.516188621520996, "learning_rate": 9.85e-06, "loss": 3.713, "step": 7500 }, { "epoch": 0.076, "grad_norm": 16.878511428833008, "learning_rate": 9.848000000000001e-06, "loss": 3.8758, "step": 7600 }, { "epoch": 0.077, "grad_norm": 16.681041717529297, "learning_rate": 9.846000000000002e-06, "loss": 3.7602, "step": 7700 }, { "epoch": 0.078, "grad_norm": 14.792035102844238, "learning_rate": 9.844000000000001e-06, "loss": 3.7145, "step": 7800 }, { "epoch": 0.079, "grad_norm": 15.74644660949707, "learning_rate": 9.842e-06, "loss": 3.7551, "step": 7900 }, { "epoch": 0.08, "grad_norm": 16.060367584228516, "learning_rate": 9.84e-06, "loss": 3.7701, "step": 8000 }, { "epoch": 0.081, "grad_norm": 13.487128257751465, "learning_rate": 9.838000000000001e-06, "loss": 3.6881, "step": 8100 }, { "epoch": 0.082, "grad_norm": 15.940110206604004, "learning_rate": 9.836e-06, "loss": 3.9418, "step": 8200 }, { "epoch": 0.083, "grad_norm": 13.603132247924805, "learning_rate": 9.834000000000001e-06, "loss": 3.7518, "step": 8300 }, { "epoch": 0.084, "grad_norm": 17.045244216918945, "learning_rate": 9.832e-06, "loss": 3.7167, "step": 8400 }, { "epoch": 0.085, "grad_norm": 12.39263916015625, "learning_rate": 9.83e-06, "loss": 3.6829, "step": 8500 }, { "epoch": 0.086, "grad_norm": 12.677363395690918, "learning_rate": 9.828000000000001e-06, "loss": 3.7458, "step": 8600 }, { "epoch": 0.087, "grad_norm": 15.024678230285645, "learning_rate": 9.826e-06, "loss": 3.7334, "step": 8700 }, { "epoch": 0.088, "grad_norm": 17.41254997253418, "learning_rate": 9.824000000000001e-06, "loss": 3.7704, "step": 8800 }, { "epoch": 0.089, "grad_norm": 19.782014846801758, "learning_rate": 9.822e-06, "loss": 3.6834, "step": 8900 }, { "epoch": 0.09, "grad_norm": 16.899019241333008, "learning_rate": 9.820000000000001e-06, "loss": 3.7304, "step": 9000 }, { "epoch": 0.091, "grad_norm": 14.481075286865234, "learning_rate": 9.818000000000002e-06, "loss": 3.7307, "step": 9100 }, { "epoch": 0.092, "grad_norm": 18.121864318847656, "learning_rate": 9.816e-06, "loss": 3.6793, "step": 9200 }, { "epoch": 0.093, "grad_norm": 15.916873931884766, "learning_rate": 9.814000000000001e-06, "loss": 3.7664, "step": 9300 }, { "epoch": 0.094, "grad_norm": 18.305234909057617, "learning_rate": 9.812e-06, "loss": 3.6887, "step": 9400 }, { "epoch": 0.095, "grad_norm": 18.262725830078125, "learning_rate": 9.810000000000001e-06, "loss": 3.6865, "step": 9500 }, { "epoch": 0.096, "grad_norm": 21.94981575012207, "learning_rate": 9.808000000000002e-06, "loss": 3.5916, "step": 9600 }, { "epoch": 0.097, "grad_norm": 15.031508445739746, "learning_rate": 9.806e-06, "loss": 3.5732, "step": 9700 }, { "epoch": 0.098, "grad_norm": 13.64002799987793, "learning_rate": 9.804000000000001e-06, "loss": 3.6104, "step": 9800 }, { "epoch": 0.099, "grad_norm": 22.877960205078125, "learning_rate": 9.802e-06, "loss": 3.622, "step": 9900 }, { "epoch": 0.1, "grad_norm": 13.404180526733398, "learning_rate": 9.800000000000001e-06, "loss": 3.6693, "step": 10000 }, { "epoch": 0.101, "grad_norm": 14.348539352416992, "learning_rate": 9.798e-06, "loss": 3.4753, "step": 10100 }, { "epoch": 0.102, "grad_norm": 15.996590614318848, "learning_rate": 9.796e-06, "loss": 3.5059, "step": 10200 }, { "epoch": 0.103, "grad_norm": 16.6004638671875, "learning_rate": 9.794000000000001e-06, "loss": 3.7168, "step": 10300 }, { "epoch": 0.104, "grad_norm": 22.660940170288086, "learning_rate": 9.792e-06, "loss": 3.5521, "step": 10400 }, { "epoch": 0.105, "grad_norm": 16.634521484375, "learning_rate": 9.790000000000001e-06, "loss": 3.5894, "step": 10500 }, { "epoch": 0.106, "grad_norm": 17.89203643798828, "learning_rate": 9.788e-06, "loss": 3.5524, "step": 10600 }, { "epoch": 0.107, "grad_norm": 17.027833938598633, "learning_rate": 9.786e-06, "loss": 3.5953, "step": 10700 }, { "epoch": 0.108, "grad_norm": 19.78436279296875, "learning_rate": 9.784000000000002e-06, "loss": 3.6688, "step": 10800 }, { "epoch": 0.109, "grad_norm": 17.20643424987793, "learning_rate": 9.782e-06, "loss": 3.5023, "step": 10900 }, { "epoch": 0.11, "grad_norm": 14.809402465820312, "learning_rate": 9.780000000000001e-06, "loss": 3.4398, "step": 11000 }, { "epoch": 0.111, "grad_norm": 21.907175064086914, "learning_rate": 9.778e-06, "loss": 3.5254, "step": 11100 }, { "epoch": 0.112, "grad_norm": 23.719179153442383, "learning_rate": 9.776000000000001e-06, "loss": 3.4801, "step": 11200 }, { "epoch": 0.113, "grad_norm": 15.437023162841797, "learning_rate": 9.774000000000002e-06, "loss": 3.5724, "step": 11300 }, { "epoch": 0.114, "grad_norm": 21.82857322692871, "learning_rate": 9.772e-06, "loss": 3.5787, "step": 11400 }, { "epoch": 0.115, "grad_norm": 20.26848602294922, "learning_rate": 9.770000000000001e-06, "loss": 3.6333, "step": 11500 }, { "epoch": 0.116, "grad_norm": 17.0762882232666, "learning_rate": 9.768e-06, "loss": 3.3418, "step": 11600 }, { "epoch": 0.117, "grad_norm": 20.440383911132812, "learning_rate": 9.766000000000001e-06, "loss": 3.5072, "step": 11700 }, { "epoch": 0.118, "grad_norm": 17.19301986694336, "learning_rate": 9.764000000000002e-06, "loss": 3.491, "step": 11800 }, { "epoch": 0.119, "grad_norm": 20.88847541809082, "learning_rate": 9.762e-06, "loss": 3.5482, "step": 11900 }, { "epoch": 0.12, "grad_norm": 17.677921295166016, "learning_rate": 9.760000000000001e-06, "loss": 3.5495, "step": 12000 }, { "epoch": 0.121, "grad_norm": 19.91204833984375, "learning_rate": 9.758e-06, "loss": 3.652, "step": 12100 }, { "epoch": 0.122, "grad_norm": 39.53171920776367, "learning_rate": 9.756000000000001e-06, "loss": 3.5966, "step": 12200 }, { "epoch": 0.123, "grad_norm": 15.958291053771973, "learning_rate": 9.754000000000002e-06, "loss": 3.5364, "step": 12300 }, { "epoch": 0.124, "grad_norm": 15.132060050964355, "learning_rate": 9.752e-06, "loss": 3.468, "step": 12400 }, { "epoch": 0.125, "grad_norm": 14.320738792419434, "learning_rate": 9.75e-06, "loss": 3.3429, "step": 12500 }, { "epoch": 0.126, "grad_norm": 20.6189022064209, "learning_rate": 9.748e-06, "loss": 3.3507, "step": 12600 }, { "epoch": 0.127, "grad_norm": 14.898783683776855, "learning_rate": 9.746000000000001e-06, "loss": 3.3785, "step": 12700 }, { "epoch": 0.128, "grad_norm": 22.72285270690918, "learning_rate": 9.744000000000002e-06, "loss": 3.5126, "step": 12800 }, { "epoch": 0.129, "grad_norm": 22.329116821289062, "learning_rate": 9.742000000000001e-06, "loss": 3.4503, "step": 12900 }, { "epoch": 0.13, "grad_norm": 18.049467086791992, "learning_rate": 9.74e-06, "loss": 3.4891, "step": 13000 }, { "epoch": 0.131, "grad_norm": 14.28784465789795, "learning_rate": 9.738e-06, "loss": 3.33, "step": 13100 }, { "epoch": 0.132, "grad_norm": 19.659822463989258, "learning_rate": 9.736000000000001e-06, "loss": 3.4724, "step": 13200 }, { "epoch": 0.133, "grad_norm": 19.972923278808594, "learning_rate": 9.734000000000002e-06, "loss": 3.3965, "step": 13300 }, { "epoch": 0.134, "grad_norm": 21.733108520507812, "learning_rate": 9.732000000000001e-06, "loss": 3.584, "step": 13400 }, { "epoch": 0.135, "grad_norm": 13.769856452941895, "learning_rate": 9.73e-06, "loss": 3.4789, "step": 13500 }, { "epoch": 0.136, "grad_norm": 15.672243118286133, "learning_rate": 9.728e-06, "loss": 3.5081, "step": 13600 }, { "epoch": 0.137, "grad_norm": 17.671894073486328, "learning_rate": 9.726000000000001e-06, "loss": 3.3696, "step": 13700 }, { "epoch": 0.138, "grad_norm": 19.69550323486328, "learning_rate": 9.724e-06, "loss": 3.3707, "step": 13800 }, { "epoch": 0.139, "grad_norm": 14.621719360351562, "learning_rate": 9.722000000000001e-06, "loss": 3.4709, "step": 13900 }, { "epoch": 0.14, "grad_norm": 17.52949333190918, "learning_rate": 9.72e-06, "loss": 3.4979, "step": 14000 }, { "epoch": 0.141, "grad_norm": 15.679729461669922, "learning_rate": 9.718e-06, "loss": 3.4584, "step": 14100 }, { "epoch": 0.142, "grad_norm": 17.527435302734375, "learning_rate": 9.716000000000002e-06, "loss": 3.3812, "step": 14200 }, { "epoch": 0.143, "grad_norm": 24.084278106689453, "learning_rate": 9.714e-06, "loss": 3.4052, "step": 14300 }, { "epoch": 0.144, "grad_norm": 20.039127349853516, "learning_rate": 9.712e-06, "loss": 3.3659, "step": 14400 }, { "epoch": 0.145, "grad_norm": 18.518939971923828, "learning_rate": 9.71e-06, "loss": 3.4249, "step": 14500 }, { "epoch": 0.146, "grad_norm": 19.596946716308594, "learning_rate": 9.708000000000001e-06, "loss": 3.3909, "step": 14600 }, { "epoch": 0.147, "grad_norm": 14.774336814880371, "learning_rate": 9.706000000000002e-06, "loss": 3.3216, "step": 14700 }, { "epoch": 0.148, "grad_norm": 27.980627059936523, "learning_rate": 9.704e-06, "loss": 3.3794, "step": 14800 }, { "epoch": 0.149, "grad_norm": 16.481491088867188, "learning_rate": 9.702e-06, "loss": 3.4138, "step": 14900 }, { "epoch": 0.15, "grad_norm": 18.48386573791504, "learning_rate": 9.7e-06, "loss": 3.3635, "step": 15000 }, { "epoch": 0.151, "grad_norm": 14.089752197265625, "learning_rate": 9.698000000000001e-06, "loss": 3.363, "step": 15100 }, { "epoch": 0.152, "grad_norm": 15.205988883972168, "learning_rate": 9.696000000000002e-06, "loss": 3.3038, "step": 15200 }, { "epoch": 0.153, "grad_norm": 18.17800521850586, "learning_rate": 9.694e-06, "loss": 3.2748, "step": 15300 }, { "epoch": 0.154, "grad_norm": 15.958276748657227, "learning_rate": 9.692e-06, "loss": 3.2809, "step": 15400 }, { "epoch": 0.155, "grad_norm": 20.20997428894043, "learning_rate": 9.69e-06, "loss": 3.3366, "step": 15500 }, { "epoch": 0.156, "grad_norm": 13.844518661499023, "learning_rate": 9.688000000000001e-06, "loss": 3.424, "step": 15600 }, { "epoch": 0.157, "grad_norm": 28.56269645690918, "learning_rate": 9.686000000000002e-06, "loss": 3.4567, "step": 15700 }, { "epoch": 0.158, "grad_norm": 14.436336517333984, "learning_rate": 9.684e-06, "loss": 3.2089, "step": 15800 }, { "epoch": 0.159, "grad_norm": 28.26078987121582, "learning_rate": 9.682e-06, "loss": 3.439, "step": 15900 }, { "epoch": 0.16, "grad_norm": 19.394569396972656, "learning_rate": 9.68e-06, "loss": 3.3376, "step": 16000 }, { "epoch": 0.161, "grad_norm": 18.12739372253418, "learning_rate": 9.678000000000001e-06, "loss": 3.2295, "step": 16100 }, { "epoch": 0.162, "grad_norm": 20.420162200927734, "learning_rate": 9.676e-06, "loss": 3.307, "step": 16200 }, { "epoch": 0.163, "grad_norm": 15.34536361694336, "learning_rate": 9.674000000000001e-06, "loss": 3.3014, "step": 16300 }, { "epoch": 0.164, "grad_norm": 20.239303588867188, "learning_rate": 9.672e-06, "loss": 3.3339, "step": 16400 }, { "epoch": 0.165, "grad_norm": 15.123329162597656, "learning_rate": 9.67e-06, "loss": 3.3073, "step": 16500 }, { "epoch": 0.166, "grad_norm": 21.282299041748047, "learning_rate": 9.668000000000001e-06, "loss": 3.2809, "step": 16600 }, { "epoch": 0.167, "grad_norm": 18.039140701293945, "learning_rate": 9.666e-06, "loss": 3.3368, "step": 16700 }, { "epoch": 0.168, "grad_norm": 17.391878128051758, "learning_rate": 9.664000000000001e-06, "loss": 3.2135, "step": 16800 }, { "epoch": 0.169, "grad_norm": 29.216005325317383, "learning_rate": 9.662e-06, "loss": 3.2238, "step": 16900 }, { "epoch": 0.17, "grad_norm": 24.716182708740234, "learning_rate": 9.66e-06, "loss": 3.4032, "step": 17000 }, { "epoch": 0.171, "grad_norm": 19.68560791015625, "learning_rate": 9.658000000000001e-06, "loss": 3.2144, "step": 17100 }, { "epoch": 0.172, "grad_norm": 20.55443572998047, "learning_rate": 9.656e-06, "loss": 3.347, "step": 17200 }, { "epoch": 0.173, "grad_norm": 23.09670639038086, "learning_rate": 9.654000000000001e-06, "loss": 3.2204, "step": 17300 }, { "epoch": 0.174, "grad_norm": 21.916152954101562, "learning_rate": 9.652e-06, "loss": 3.1898, "step": 17400 }, { "epoch": 0.175, "grad_norm": 15.10058879852295, "learning_rate": 9.65e-06, "loss": 3.3174, "step": 17500 }, { "epoch": 0.176, "grad_norm": 18.47793197631836, "learning_rate": 9.648000000000001e-06, "loss": 3.152, "step": 17600 }, { "epoch": 0.177, "grad_norm": 20.482669830322266, "learning_rate": 9.646e-06, "loss": 3.2341, "step": 17700 }, { "epoch": 0.178, "grad_norm": 17.341407775878906, "learning_rate": 9.644000000000001e-06, "loss": 3.2945, "step": 17800 }, { "epoch": 0.179, "grad_norm": 25.537378311157227, "learning_rate": 9.642e-06, "loss": 3.3279, "step": 17900 }, { "epoch": 0.18, "grad_norm": 25.134294509887695, "learning_rate": 9.640000000000001e-06, "loss": 3.4005, "step": 18000 }, { "epoch": 0.181, "grad_norm": 14.844265937805176, "learning_rate": 9.638e-06, "loss": 3.2125, "step": 18100 }, { "epoch": 0.182, "grad_norm": 12.517401695251465, "learning_rate": 9.636e-06, "loss": 3.4222, "step": 18200 }, { "epoch": 0.183, "grad_norm": 13.91508674621582, "learning_rate": 9.634000000000001e-06, "loss": 3.2464, "step": 18300 }, { "epoch": 0.184, "grad_norm": 20.34067726135254, "learning_rate": 9.632e-06, "loss": 3.1909, "step": 18400 }, { "epoch": 0.185, "grad_norm": 20.126605987548828, "learning_rate": 9.630000000000001e-06, "loss": 3.1106, "step": 18500 }, { "epoch": 0.186, "grad_norm": 20.69412612915039, "learning_rate": 9.628e-06, "loss": 3.2615, "step": 18600 }, { "epoch": 0.187, "grad_norm": 29.957561492919922, "learning_rate": 9.626e-06, "loss": 3.2019, "step": 18700 }, { "epoch": 0.188, "grad_norm": 12.36296558380127, "learning_rate": 9.624000000000001e-06, "loss": 3.2337, "step": 18800 }, { "epoch": 0.189, "grad_norm": 13.685921669006348, "learning_rate": 9.622000000000002e-06, "loss": 3.282, "step": 18900 }, { "epoch": 0.19, "grad_norm": 20.060331344604492, "learning_rate": 9.620000000000001e-06, "loss": 3.1947, "step": 19000 }, { "epoch": 0.191, "grad_norm": 13.936528205871582, "learning_rate": 9.618e-06, "loss": 3.174, "step": 19100 }, { "epoch": 0.192, "grad_norm": 21.87002944946289, "learning_rate": 9.616e-06, "loss": 3.2355, "step": 19200 }, { "epoch": 0.193, "grad_norm": 24.834264755249023, "learning_rate": 9.614000000000001e-06, "loss": 3.1823, "step": 19300 }, { "epoch": 0.194, "grad_norm": 30.83481216430664, "learning_rate": 9.612000000000002e-06, "loss": 3.1645, "step": 19400 }, { "epoch": 0.195, "grad_norm": 18.5225830078125, "learning_rate": 9.610000000000001e-06, "loss": 3.2636, "step": 19500 }, { "epoch": 0.196, "grad_norm": 27.33846664428711, "learning_rate": 9.608e-06, "loss": 3.1772, "step": 19600 }, { "epoch": 0.197, "grad_norm": 21.9919490814209, "learning_rate": 9.606000000000001e-06, "loss": 3.229, "step": 19700 }, { "epoch": 0.198, "grad_norm": 19.65387725830078, "learning_rate": 9.604000000000002e-06, "loss": 3.1524, "step": 19800 }, { "epoch": 0.199, "grad_norm": 18.683229446411133, "learning_rate": 9.602e-06, "loss": 3.4325, "step": 19900 }, { "epoch": 0.2, "grad_norm": 16.26070785522461, "learning_rate": 9.600000000000001e-06, "loss": 3.1583, "step": 20000 }, { "epoch": 0.201, "grad_norm": 17.30815887451172, "learning_rate": 9.598e-06, "loss": 3.1904, "step": 20100 }, { "epoch": 0.202, "grad_norm": 28.912694931030273, "learning_rate": 9.596000000000001e-06, "loss": 3.0794, "step": 20200 }, { "epoch": 0.203, "grad_norm": 20.792774200439453, "learning_rate": 9.594000000000002e-06, "loss": 3.2403, "step": 20300 }, { "epoch": 0.204, "grad_norm": 22.178218841552734, "learning_rate": 9.592e-06, "loss": 3.1408, "step": 20400 }, { "epoch": 0.205, "grad_norm": 15.090167045593262, "learning_rate": 9.59e-06, "loss": 3.1513, "step": 20500 }, { "epoch": 0.206, "grad_norm": 19.66379737854004, "learning_rate": 9.588e-06, "loss": 3.1574, "step": 20600 }, { "epoch": 0.207, "grad_norm": 20.961610794067383, "learning_rate": 9.586000000000001e-06, "loss": 3.284, "step": 20700 }, { "epoch": 0.208, "grad_norm": 19.434553146362305, "learning_rate": 9.584000000000002e-06, "loss": 3.0802, "step": 20800 }, { "epoch": 0.209, "grad_norm": 30.214740753173828, "learning_rate": 9.582e-06, "loss": 3.2555, "step": 20900 }, { "epoch": 0.21, "grad_norm": 18.16490364074707, "learning_rate": 9.58e-06, "loss": 3.2179, "step": 21000 }, { "epoch": 0.211, "grad_norm": 22.568527221679688, "learning_rate": 9.578e-06, "loss": 3.2268, "step": 21100 }, { "epoch": 0.212, "grad_norm": 20.349346160888672, "learning_rate": 9.576000000000001e-06, "loss": 3.159, "step": 21200 }, { "epoch": 0.213, "grad_norm": 23.45667266845703, "learning_rate": 9.574000000000002e-06, "loss": 3.1728, "step": 21300 }, { "epoch": 0.214, "grad_norm": 20.883718490600586, "learning_rate": 9.572000000000001e-06, "loss": 3.1258, "step": 21400 }, { "epoch": 0.215, "grad_norm": 25.16787338256836, "learning_rate": 9.57e-06, "loss": 3.0726, "step": 21500 }, { "epoch": 0.216, "grad_norm": 21.36046028137207, "learning_rate": 9.568e-06, "loss": 3.1267, "step": 21600 }, { "epoch": 0.217, "grad_norm": 26.431421279907227, "learning_rate": 9.566000000000001e-06, "loss": 3.1588, "step": 21700 }, { "epoch": 0.218, "grad_norm": 27.33740997314453, "learning_rate": 9.564e-06, "loss": 3.0679, "step": 21800 }, { "epoch": 0.219, "grad_norm": 17.818220138549805, "learning_rate": 9.562000000000001e-06, "loss": 3.1104, "step": 21900 }, { "epoch": 0.22, "grad_norm": 25.339937210083008, "learning_rate": 9.56e-06, "loss": 3.2342, "step": 22000 }, { "epoch": 0.221, "grad_norm": 19.325305938720703, "learning_rate": 9.558e-06, "loss": 3.1085, "step": 22100 }, { "epoch": 0.222, "grad_norm": 19.849441528320312, "learning_rate": 9.556000000000001e-06, "loss": 3.2078, "step": 22200 }, { "epoch": 0.223, "grad_norm": 22.334917068481445, "learning_rate": 9.554e-06, "loss": 3.1536, "step": 22300 }, { "epoch": 0.224, "grad_norm": 16.38900375366211, "learning_rate": 9.552000000000001e-06, "loss": 3.106, "step": 22400 }, { "epoch": 0.225, "grad_norm": 24.00871467590332, "learning_rate": 9.55e-06, "loss": 3.1776, "step": 22500 }, { "epoch": 0.226, "grad_norm": 19.4804744720459, "learning_rate": 9.548e-06, "loss": 3.164, "step": 22600 }, { "epoch": 0.227, "grad_norm": 17.325008392333984, "learning_rate": 9.546000000000001e-06, "loss": 3.2499, "step": 22700 }, { "epoch": 0.228, "grad_norm": 19.2254695892334, "learning_rate": 9.544e-06, "loss": 3.1912, "step": 22800 }, { "epoch": 0.229, "grad_norm": 19.877927780151367, "learning_rate": 9.542000000000001e-06, "loss": 3.1645, "step": 22900 }, { "epoch": 0.23, "grad_norm": 21.79277992248535, "learning_rate": 9.54e-06, "loss": 3.0794, "step": 23000 } ], "logging_steps": 100, "max_steps": 500000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.4239469785088e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }