{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 67,
"global_step": 67,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014925373134328358,
"grad_norm": 4.677140235900879,
"learning_rate": 1e-05,
"loss": 2.7846,
"step": 1
},
{
"epoch": 0.029850746268656716,
"grad_norm": 4.6858649253845215,
"learning_rate": 9.850746268656717e-06,
"loss": 2.496,
"step": 2
},
{
"epoch": 0.04477611940298507,
"grad_norm": 3.9940388202667236,
"learning_rate": 9.701492537313434e-06,
"loss": 2.582,
"step": 3
},
{
"epoch": 0.05970149253731343,
"grad_norm": 3.8767054080963135,
"learning_rate": 9.552238805970149e-06,
"loss": 2.215,
"step": 4
},
{
"epoch": 0.07462686567164178,
"grad_norm": 2.339372396469116,
"learning_rate": 9.402985074626867e-06,
"loss": 1.9616,
"step": 5
},
{
"epoch": 0.08955223880597014,
"grad_norm": 2.0336525440216064,
"learning_rate": 9.253731343283582e-06,
"loss": 1.9218,
"step": 6
},
{
"epoch": 0.1044776119402985,
"grad_norm": 2.4649195671081543,
"learning_rate": 9.104477611940299e-06,
"loss": 2.5598,
"step": 7
},
{
"epoch": 0.11940298507462686,
"grad_norm": 1.7558526992797852,
"learning_rate": 8.955223880597016e-06,
"loss": 2.1746,
"step": 8
},
{
"epoch": 0.13432835820895522,
"grad_norm": 1.5009433031082153,
"learning_rate": 8.805970149253732e-06,
"loss": 2.1801,
"step": 9
},
{
"epoch": 0.14925373134328357,
"grad_norm": 1.7307460308074951,
"learning_rate": 8.656716417910447e-06,
"loss": 2.2179,
"step": 10
},
{
"epoch": 0.16417910447761194,
"grad_norm": 1.31997811794281,
"learning_rate": 8.507462686567165e-06,
"loss": 1.9017,
"step": 11
},
{
"epoch": 0.1791044776119403,
"grad_norm": 1.0102890729904175,
"learning_rate": 8.35820895522388e-06,
"loss": 1.6226,
"step": 12
},
{
"epoch": 0.19402985074626866,
"grad_norm": 1.1635653972625732,
"learning_rate": 8.208955223880599e-06,
"loss": 2.0264,
"step": 13
},
{
"epoch": 0.208955223880597,
"grad_norm": 0.9142751097679138,
"learning_rate": 8.059701492537314e-06,
"loss": 1.9922,
"step": 14
},
{
"epoch": 0.22388059701492538,
"grad_norm": 1.1939051151275635,
"learning_rate": 7.91044776119403e-06,
"loss": 2.205,
"step": 15
},
{
"epoch": 0.23880597014925373,
"grad_norm": 0.7413591742515564,
"learning_rate": 7.761194029850747e-06,
"loss": 1.9766,
"step": 16
},
{
"epoch": 0.2537313432835821,
"grad_norm": 1.400302767753601,
"learning_rate": 7.611940298507463e-06,
"loss": 2.3456,
"step": 17
},
{
"epoch": 0.26865671641791045,
"grad_norm": 0.9068132042884827,
"learning_rate": 7.46268656716418e-06,
"loss": 2.2168,
"step": 18
},
{
"epoch": 0.2835820895522388,
"grad_norm": 0.865298867225647,
"learning_rate": 7.313432835820896e-06,
"loss": 1.9891,
"step": 19
},
{
"epoch": 0.29850746268656714,
"grad_norm": 1.2902920246124268,
"learning_rate": 7.164179104477612e-06,
"loss": 2.1995,
"step": 20
},
{
"epoch": 0.31343283582089554,
"grad_norm": 0.7621744871139526,
"learning_rate": 7.014925373134329e-06,
"loss": 2.1563,
"step": 21
},
{
"epoch": 0.3283582089552239,
"grad_norm": 0.7494510412216187,
"learning_rate": 6.865671641791045e-06,
"loss": 1.9338,
"step": 22
},
{
"epoch": 0.34328358208955223,
"grad_norm": 0.6722490191459656,
"learning_rate": 6.7164179104477625e-06,
"loss": 1.9195,
"step": 23
},
{
"epoch": 0.3582089552238806,
"grad_norm": 0.5672370195388794,
"learning_rate": 6.567164179104478e-06,
"loss": 2.1004,
"step": 24
},
{
"epoch": 0.373134328358209,
"grad_norm": 0.5260008573532104,
"learning_rate": 6.417910447761194e-06,
"loss": 2.0207,
"step": 25
},
{
"epoch": 0.3880597014925373,
"grad_norm": 0.6754623651504517,
"learning_rate": 6.2686567164179116e-06,
"loss": 2.0735,
"step": 26
},
{
"epoch": 0.40298507462686567,
"grad_norm": 0.5878338813781738,
"learning_rate": 6.119402985074627e-06,
"loss": 1.9148,
"step": 27
},
{
"epoch": 0.417910447761194,
"grad_norm": 0.531120240688324,
"learning_rate": 5.970149253731343e-06,
"loss": 1.9395,
"step": 28
},
{
"epoch": 0.43283582089552236,
"grad_norm": 0.667667806148529,
"learning_rate": 5.820895522388061e-06,
"loss": 2.038,
"step": 29
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.7480222582817078,
"learning_rate": 5.671641791044776e-06,
"loss": 2.0045,
"step": 30
},
{
"epoch": 0.4626865671641791,
"grad_norm": 0.9849134683609009,
"learning_rate": 5.522388059701493e-06,
"loss": 2.1881,
"step": 31
},
{
"epoch": 0.47761194029850745,
"grad_norm": 0.6533071398735046,
"learning_rate": 5.37313432835821e-06,
"loss": 1.849,
"step": 32
},
{
"epoch": 0.4925373134328358,
"grad_norm": 0.5160700082778931,
"learning_rate": 5.2238805970149255e-06,
"loss": 2.0241,
"step": 33
},
{
"epoch": 0.5074626865671642,
"grad_norm": 0.5143930912017822,
"learning_rate": 5.074626865671642e-06,
"loss": 1.9619,
"step": 34
},
{
"epoch": 0.5223880597014925,
"grad_norm": 0.6003533005714417,
"learning_rate": 4.925373134328359e-06,
"loss": 1.9579,
"step": 35
},
{
"epoch": 0.5373134328358209,
"grad_norm": 0.49033546447753906,
"learning_rate": 4.7761194029850745e-06,
"loss": 2.1126,
"step": 36
},
{
"epoch": 0.5522388059701493,
"grad_norm": 0.5822514891624451,
"learning_rate": 4.626865671641791e-06,
"loss": 1.8489,
"step": 37
},
{
"epoch": 0.5671641791044776,
"grad_norm": 0.5790143013000488,
"learning_rate": 4.477611940298508e-06,
"loss": 2.0096,
"step": 38
},
{
"epoch": 0.582089552238806,
"grad_norm": 0.41039204597473145,
"learning_rate": 4.3283582089552236e-06,
"loss": 1.775,
"step": 39
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.820061206817627,
"learning_rate": 4.17910447761194e-06,
"loss": 2.1653,
"step": 40
},
{
"epoch": 0.6119402985074627,
"grad_norm": 0.6350656151771545,
"learning_rate": 4.029850746268657e-06,
"loss": 1.982,
"step": 41
},
{
"epoch": 0.6268656716417911,
"grad_norm": 0.41239652037620544,
"learning_rate": 3.8805970149253735e-06,
"loss": 1.8828,
"step": 42
},
{
"epoch": 0.6417910447761194,
"grad_norm": 0.5652564764022827,
"learning_rate": 3.73134328358209e-06,
"loss": 2.0692,
"step": 43
},
{
"epoch": 0.6567164179104478,
"grad_norm": 0.6621966361999512,
"learning_rate": 3.582089552238806e-06,
"loss": 2.2952,
"step": 44
},
{
"epoch": 0.6716417910447762,
"grad_norm": 0.4039974510669708,
"learning_rate": 3.4328358208955225e-06,
"loss": 1.7934,
"step": 45
},
{
"epoch": 0.6865671641791045,
"grad_norm": 0.5079028606414795,
"learning_rate": 3.283582089552239e-06,
"loss": 2.0361,
"step": 46
},
{
"epoch": 0.7014925373134329,
"grad_norm": 0.41614365577697754,
"learning_rate": 3.1343283582089558e-06,
"loss": 1.7775,
"step": 47
},
{
"epoch": 0.7164179104477612,
"grad_norm": 0.3995169699192047,
"learning_rate": 2.9850746268656716e-06,
"loss": 1.9167,
"step": 48
},
{
"epoch": 0.7313432835820896,
"grad_norm": 0.5954918265342712,
"learning_rate": 2.835820895522388e-06,
"loss": 2.0694,
"step": 49
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.5778793692588806,
"learning_rate": 2.686567164179105e-06,
"loss": 2.0297,
"step": 50
},
{
"epoch": 0.7611940298507462,
"grad_norm": 0.5707228183746338,
"learning_rate": 2.537313432835821e-06,
"loss": 2.201,
"step": 51
},
{
"epoch": 0.7761194029850746,
"grad_norm": 0.6407202482223511,
"learning_rate": 2.3880597014925373e-06,
"loss": 2.4394,
"step": 52
},
{
"epoch": 0.7910447761194029,
"grad_norm": 0.3686445355415344,
"learning_rate": 2.238805970149254e-06,
"loss": 2.2426,
"step": 53
},
{
"epoch": 0.8059701492537313,
"grad_norm": 0.4780975580215454,
"learning_rate": 2.08955223880597e-06,
"loss": 1.888,
"step": 54
},
{
"epoch": 0.8208955223880597,
"grad_norm": 0.5346677303314209,
"learning_rate": 1.9402985074626867e-06,
"loss": 1.9405,
"step": 55
},
{
"epoch": 0.835820895522388,
"grad_norm": 0.4024548828601837,
"learning_rate": 1.791044776119403e-06,
"loss": 1.6834,
"step": 56
},
{
"epoch": 0.8507462686567164,
"grad_norm": 0.501626193523407,
"learning_rate": 1.6417910447761196e-06,
"loss": 2.2497,
"step": 57
},
{
"epoch": 0.8656716417910447,
"grad_norm": 0.44980672001838684,
"learning_rate": 1.4925373134328358e-06,
"loss": 1.8196,
"step": 58
},
{
"epoch": 0.8805970149253731,
"grad_norm": 0.8795580863952637,
"learning_rate": 1.3432835820895524e-06,
"loss": 2.1529,
"step": 59
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.5534031987190247,
"learning_rate": 1.1940298507462686e-06,
"loss": 2.1861,
"step": 60
},
{
"epoch": 0.9104477611940298,
"grad_norm": 0.3944064676761627,
"learning_rate": 1.044776119402985e-06,
"loss": 1.7992,
"step": 61
},
{
"epoch": 0.9253731343283582,
"grad_norm": 0.3771931231021881,
"learning_rate": 8.955223880597015e-07,
"loss": 1.6866,
"step": 62
},
{
"epoch": 0.9402985074626866,
"grad_norm": 0.5111584067344666,
"learning_rate": 7.462686567164179e-07,
"loss": 1.995,
"step": 63
},
{
"epoch": 0.9552238805970149,
"grad_norm": 0.5018301606178284,
"learning_rate": 5.970149253731343e-07,
"loss": 1.8922,
"step": 64
},
{
"epoch": 0.9701492537313433,
"grad_norm": 0.3669991195201874,
"learning_rate": 4.4776119402985074e-07,
"loss": 2.0702,
"step": 65
},
{
"epoch": 0.9850746268656716,
"grad_norm": 0.45055946707725525,
"learning_rate": 2.9850746268656716e-07,
"loss": 2.0245,
"step": 66
},
{
"epoch": 1.0,
"grad_norm": 0.5369859337806702,
"learning_rate": 1.4925373134328358e-07,
"loss": 2.2499,
"step": 67
},
{
"epoch": 1.0,
"eval_loss": 1.8387645483016968,
"eval_runtime": 0.8427,
"eval_samples_per_second": 43.907,
"eval_steps_per_second": 5.933,
"step": 67
}
],
"logging_steps": 1.0,
"max_steps": 67,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3350387805388800.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}