{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 9280,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.021551724137931036,
"grad_norm": 20.742843627929688,
"learning_rate": 1.002155172413793e-06,
"loss": 2.152,
"step": 100
},
{
"epoch": 0.04310344827586207,
"grad_norm": 7.82145357131958,
"learning_rate": 2.079741379310345e-06,
"loss": 0.677,
"step": 200
},
{
"epoch": 0.06465517241379311,
"grad_norm": 51.326385498046875,
"learning_rate": 3.157327586206897e-06,
"loss": 0.5383,
"step": 300
},
{
"epoch": 0.08620689655172414,
"grad_norm": 22.487550735473633,
"learning_rate": 4.234913793103448e-06,
"loss": 0.481,
"step": 400
},
{
"epoch": 0.10775862068965517,
"grad_norm": 37.75484085083008,
"learning_rate": 5.3125e-06,
"loss": 0.4856,
"step": 500
},
{
"epoch": 0.12931034482758622,
"grad_norm": 27.280712127685547,
"learning_rate": 6.3900862068965515e-06,
"loss": 0.4785,
"step": 600
},
{
"epoch": 0.15086206896551724,
"grad_norm": 38.151546478271484,
"learning_rate": 7.467672413793104e-06,
"loss": 0.4623,
"step": 700
},
{
"epoch": 0.1724137931034483,
"grad_norm": 25.77330780029297,
"learning_rate": 8.545258620689656e-06,
"loss": 0.4584,
"step": 800
},
{
"epoch": 0.1939655172413793,
"grad_norm": 22.47431755065918,
"learning_rate": 9.622844827586207e-06,
"loss": 0.4572,
"step": 900
},
{
"epoch": 0.21551724137931033,
"grad_norm": 9.425881385803223,
"learning_rate": 9.922174329501917e-06,
"loss": 0.4247,
"step": 1000
},
{
"epoch": 0.23706896551724138,
"grad_norm": 19.910789489746094,
"learning_rate": 9.802442528735633e-06,
"loss": 0.4208,
"step": 1100
},
{
"epoch": 0.25862068965517243,
"grad_norm": 16.591716766357422,
"learning_rate": 9.68271072796935e-06,
"loss": 0.4366,
"step": 1200
},
{
"epoch": 0.2801724137931034,
"grad_norm": 20.817670822143555,
"learning_rate": 9.562978927203065e-06,
"loss": 0.3789,
"step": 1300
},
{
"epoch": 0.3017241379310345,
"grad_norm": 18.30855369567871,
"learning_rate": 9.443247126436782e-06,
"loss": 0.4443,
"step": 1400
},
{
"epoch": 0.3232758620689655,
"grad_norm": 26.79413414001465,
"learning_rate": 9.3235153256705e-06,
"loss": 0.4019,
"step": 1500
},
{
"epoch": 0.3448275862068966,
"grad_norm": 23.13565444946289,
"learning_rate": 9.203783524904215e-06,
"loss": 0.3181,
"step": 1600
},
{
"epoch": 0.36637931034482757,
"grad_norm": 29.596229553222656,
"learning_rate": 9.084051724137932e-06,
"loss": 0.3621,
"step": 1700
},
{
"epoch": 0.3879310344827586,
"grad_norm": 20.81353759765625,
"learning_rate": 8.964319923371648e-06,
"loss": 0.3861,
"step": 1800
},
{
"epoch": 0.40948275862068967,
"grad_norm": 46.100711822509766,
"learning_rate": 8.844588122605365e-06,
"loss": 0.407,
"step": 1900
},
{
"epoch": 0.43103448275862066,
"grad_norm": 5.473584175109863,
"learning_rate": 8.724856321839082e-06,
"loss": 0.4148,
"step": 2000
},
{
"epoch": 0.4525862068965517,
"grad_norm": 23.936071395874023,
"learning_rate": 8.605124521072798e-06,
"loss": 0.3606,
"step": 2100
},
{
"epoch": 0.47413793103448276,
"grad_norm": 12.36728572845459,
"learning_rate": 8.485392720306515e-06,
"loss": 0.3567,
"step": 2200
},
{
"epoch": 0.4956896551724138,
"grad_norm": 17.580957412719727,
"learning_rate": 8.36566091954023e-06,
"loss": 0.3277,
"step": 2300
},
{
"epoch": 0.5172413793103449,
"grad_norm": 11.993826866149902,
"learning_rate": 8.245929118773946e-06,
"loss": 0.3921,
"step": 2400
},
{
"epoch": 0.5387931034482759,
"grad_norm": 20.464040756225586,
"learning_rate": 8.126197318007663e-06,
"loss": 0.3395,
"step": 2500
},
{
"epoch": 0.5603448275862069,
"grad_norm": 14.831501960754395,
"learning_rate": 8.00646551724138e-06,
"loss": 0.3643,
"step": 2600
},
{
"epoch": 0.5818965517241379,
"grad_norm": 10.325621604919434,
"learning_rate": 7.886733716475098e-06,
"loss": 0.3484,
"step": 2700
},
{
"epoch": 0.603448275862069,
"grad_norm": 33.5569953918457,
"learning_rate": 7.767001915708813e-06,
"loss": 0.3196,
"step": 2800
},
{
"epoch": 0.625,
"grad_norm": 17.34543228149414,
"learning_rate": 7.64727011494253e-06,
"loss": 0.3094,
"step": 2900
},
{
"epoch": 0.646551724137931,
"grad_norm": 15.79505443572998,
"learning_rate": 7.527538314176246e-06,
"loss": 0.3489,
"step": 3000
},
{
"epoch": 0.6681034482758621,
"grad_norm": 15.39349365234375,
"learning_rate": 7.407806513409962e-06,
"loss": 0.3054,
"step": 3100
},
{
"epoch": 0.6896551724137931,
"grad_norm": 20.706342697143555,
"learning_rate": 7.288074712643679e-06,
"loss": 0.3013,
"step": 3200
},
{
"epoch": 0.7112068965517241,
"grad_norm": 19.69810676574707,
"learning_rate": 7.168342911877395e-06,
"loss": 0.3408,
"step": 3300
},
{
"epoch": 0.7327586206896551,
"grad_norm": 12.575122833251953,
"learning_rate": 7.048611111111112e-06,
"loss": 0.3463,
"step": 3400
},
{
"epoch": 0.7543103448275862,
"grad_norm": 13.906023979187012,
"learning_rate": 6.928879310344828e-06,
"loss": 0.3005,
"step": 3500
},
{
"epoch": 0.7758620689655172,
"grad_norm": 23.61775016784668,
"learning_rate": 6.809147509578544e-06,
"loss": 0.3072,
"step": 3600
},
{
"epoch": 0.7974137931034483,
"grad_norm": 6.779231548309326,
"learning_rate": 6.689415708812261e-06,
"loss": 0.3368,
"step": 3700
},
{
"epoch": 0.8189655172413793,
"grad_norm": 30.881776809692383,
"learning_rate": 6.569683908045977e-06,
"loss": 0.2608,
"step": 3800
},
{
"epoch": 0.8405172413793104,
"grad_norm": 20.54507064819336,
"learning_rate": 6.449952107279695e-06,
"loss": 0.3227,
"step": 3900
},
{
"epoch": 0.8620689655172413,
"grad_norm": 28.014013290405273,
"learning_rate": 6.33022030651341e-06,
"loss": 0.2993,
"step": 4000
},
{
"epoch": 0.8836206896551724,
"grad_norm": 10.269726753234863,
"learning_rate": 6.2104885057471265e-06,
"loss": 0.298,
"step": 4100
},
{
"epoch": 0.9051724137931034,
"grad_norm": 25.653640747070312,
"learning_rate": 6.090756704980844e-06,
"loss": 0.2789,
"step": 4200
},
{
"epoch": 0.9267241379310345,
"grad_norm": 10.718826293945312,
"learning_rate": 5.97102490421456e-06,
"loss": 0.3076,
"step": 4300
},
{
"epoch": 0.9482758620689655,
"grad_norm": 19.0034236907959,
"learning_rate": 5.851293103448276e-06,
"loss": 0.298,
"step": 4400
},
{
"epoch": 0.9698275862068966,
"grad_norm": 7.18696928024292,
"learning_rate": 5.731561302681993e-06,
"loss": 0.2953,
"step": 4500
},
{
"epoch": 0.9913793103448276,
"grad_norm": 15.798299789428711,
"learning_rate": 5.613026819923372e-06,
"loss": 0.2845,
"step": 4600
},
{
"epoch": 1.0129310344827587,
"grad_norm": 25.310443878173828,
"learning_rate": 5.4932950191570884e-06,
"loss": 0.2819,
"step": 4700
},
{
"epoch": 1.0344827586206897,
"grad_norm": 1.2570937871932983,
"learning_rate": 5.373563218390805e-06,
"loss": 0.2247,
"step": 4800
},
{
"epoch": 1.0560344827586208,
"grad_norm": 17.156612396240234,
"learning_rate": 5.253831417624522e-06,
"loss": 0.2716,
"step": 4900
},
{
"epoch": 1.0775862068965518,
"grad_norm": 27.063459396362305,
"learning_rate": 5.134099616858238e-06,
"loss": 0.2198,
"step": 5000
},
{
"epoch": 1.0991379310344827,
"grad_norm": 16.36822509765625,
"learning_rate": 5.014367816091954e-06,
"loss": 0.2677,
"step": 5100
},
{
"epoch": 1.1206896551724137,
"grad_norm": 23.680177688598633,
"learning_rate": 4.894636015325671e-06,
"loss": 0.2522,
"step": 5200
},
{
"epoch": 1.1422413793103448,
"grad_norm": 4.396277904510498,
"learning_rate": 4.774904214559387e-06,
"loss": 0.2101,
"step": 5300
},
{
"epoch": 1.1637931034482758,
"grad_norm": 3.8707330226898193,
"learning_rate": 4.655172413793104e-06,
"loss": 0.2532,
"step": 5400
},
{
"epoch": 1.1853448275862069,
"grad_norm": 22.42334747314453,
"learning_rate": 4.53544061302682e-06,
"loss": 0.2386,
"step": 5500
},
{
"epoch": 1.206896551724138,
"grad_norm": 1.4298897981643677,
"learning_rate": 4.4157088122605364e-06,
"loss": 0.2426,
"step": 5600
},
{
"epoch": 1.228448275862069,
"grad_norm": 4.422305107116699,
"learning_rate": 4.295977011494254e-06,
"loss": 0.2399,
"step": 5700
},
{
"epoch": 1.25,
"grad_norm": 30.610410690307617,
"learning_rate": 4.17624521072797e-06,
"loss": 0.2517,
"step": 5800
},
{
"epoch": 1.271551724137931,
"grad_norm": 24.947818756103516,
"learning_rate": 4.056513409961686e-06,
"loss": 0.2367,
"step": 5900
},
{
"epoch": 1.293103448275862,
"grad_norm": 15.14891529083252,
"learning_rate": 3.936781609195403e-06,
"loss": 0.2224,
"step": 6000
},
{
"epoch": 1.3146551724137931,
"grad_norm": 6.019184112548828,
"learning_rate": 3.817049808429119e-06,
"loss": 0.246,
"step": 6100
},
{
"epoch": 1.3362068965517242,
"grad_norm": 5.349549293518066,
"learning_rate": 3.697318007662836e-06,
"loss": 0.2203,
"step": 6200
},
{
"epoch": 1.3577586206896552,
"grad_norm": 19.479036331176758,
"learning_rate": 3.577586206896552e-06,
"loss": 0.2262,
"step": 6300
},
{
"epoch": 1.3793103448275863,
"grad_norm": 13.838797569274902,
"learning_rate": 3.457854406130268e-06,
"loss": 0.2064,
"step": 6400
},
{
"epoch": 1.4008620689655173,
"grad_norm": 23.992177963256836,
"learning_rate": 3.338122605363985e-06,
"loss": 0.2557,
"step": 6500
},
{
"epoch": 1.4224137931034484,
"grad_norm": 17.90095329284668,
"learning_rate": 3.2183908045977012e-06,
"loss": 0.2091,
"step": 6600
},
{
"epoch": 1.4439655172413794,
"grad_norm": 6.963798999786377,
"learning_rate": 3.098659003831418e-06,
"loss": 0.1997,
"step": 6700
},
{
"epoch": 1.4655172413793103,
"grad_norm": 17.81186866760254,
"learning_rate": 2.9789272030651344e-06,
"loss": 0.2302,
"step": 6800
},
{
"epoch": 1.4870689655172413,
"grad_norm": 11.2035551071167,
"learning_rate": 2.859195402298851e-06,
"loss": 0.2031,
"step": 6900
},
{
"epoch": 1.5086206896551724,
"grad_norm": 10.019887924194336,
"learning_rate": 2.739463601532567e-06,
"loss": 0.2459,
"step": 7000
},
{
"epoch": 1.5301724137931034,
"grad_norm": 15.275938987731934,
"learning_rate": 2.6197318007662834e-06,
"loss": 0.2386,
"step": 7100
},
{
"epoch": 1.5517241379310345,
"grad_norm": 16.73408317565918,
"learning_rate": 2.5e-06,
"loss": 0.2315,
"step": 7200
},
{
"epoch": 1.5732758620689655,
"grad_norm": 7.654747486114502,
"learning_rate": 2.380268199233717e-06,
"loss": 0.2596,
"step": 7300
},
{
"epoch": 1.5948275862068966,
"grad_norm": 18.679759979248047,
"learning_rate": 2.260536398467433e-06,
"loss": 0.2366,
"step": 7400
},
{
"epoch": 1.6163793103448276,
"grad_norm": 9.820782661437988,
"learning_rate": 2.1408045977011497e-06,
"loss": 0.2145,
"step": 7500
},
{
"epoch": 1.6379310344827587,
"grad_norm": 7.739869594573975,
"learning_rate": 2.021072796934866e-06,
"loss": 0.2167,
"step": 7600
},
{
"epoch": 1.6594827586206895,
"grad_norm": 30.86256217956543,
"learning_rate": 1.9013409961685824e-06,
"loss": 0.2322,
"step": 7700
},
{
"epoch": 1.6810344827586206,
"grad_norm": 13.007984161376953,
"learning_rate": 1.781609195402299e-06,
"loss": 0.308,
"step": 7800
},
{
"epoch": 1.7025862068965516,
"grad_norm": 13.239990234375,
"learning_rate": 1.6618773946360153e-06,
"loss": 0.2166,
"step": 7900
},
{
"epoch": 1.7241379310344827,
"grad_norm": 11.643211364746094,
"learning_rate": 1.5421455938697319e-06,
"loss": 0.2197,
"step": 8000
},
{
"epoch": 1.7456896551724137,
"grad_norm": 23.515066146850586,
"learning_rate": 1.4224137931034484e-06,
"loss": 0.2014,
"step": 8100
},
{
"epoch": 1.7672413793103448,
"grad_norm": 8.773682594299316,
"learning_rate": 1.3026819923371648e-06,
"loss": 0.2411,
"step": 8200
},
{
"epoch": 1.7887931034482758,
"grad_norm": 16.361536026000977,
"learning_rate": 1.1829501915708814e-06,
"loss": 0.2072,
"step": 8300
},
{
"epoch": 1.8103448275862069,
"grad_norm": 6.065272331237793,
"learning_rate": 1.0632183908045977e-06,
"loss": 0.2527,
"step": 8400
},
{
"epoch": 1.831896551724138,
"grad_norm": 2.8060665130615234,
"learning_rate": 9.434865900383143e-07,
"loss": 0.2209,
"step": 8500
},
{
"epoch": 1.853448275862069,
"grad_norm": 5.463712215423584,
"learning_rate": 8.237547892720307e-07,
"loss": 0.2367,
"step": 8600
},
{
"epoch": 1.875,
"grad_norm": 6.7926836013793945,
"learning_rate": 7.040229885057472e-07,
"loss": 0.2142,
"step": 8700
},
{
"epoch": 1.896551724137931,
"grad_norm": 8.841383934020996,
"learning_rate": 5.842911877394636e-07,
"loss": 0.2026,
"step": 8800
},
{
"epoch": 1.918103448275862,
"grad_norm": 5.810873985290527,
"learning_rate": 4.6455938697318016e-07,
"loss": 0.2433,
"step": 8900
},
{
"epoch": 1.9396551724137931,
"grad_norm": 21.808469772338867,
"learning_rate": 3.4482758620689656e-07,
"loss": 0.1935,
"step": 9000
},
{
"epoch": 1.9612068965517242,
"grad_norm": 26.461162567138672,
"learning_rate": 2.2509578544061305e-07,
"loss": 0.2274,
"step": 9100
},
{
"epoch": 1.9827586206896552,
"grad_norm": 13.109670639038086,
"learning_rate": 1.0536398467432952e-07,
"loss": 0.2177,
"step": 9200
}
],
"logging_steps": 100,
"max_steps": 9280,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}