Training in progress, epoch 9, checkpoint
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:90a3f56c55cef5851e6e24ff2b7b8fa38c8acb2aefbe1748255e5c7947d86a90
 size 201352688
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ff416b710d3e4584a937232df5b16b2a162f2e5b98bee596b744dc41388136a1
 size 402815162
last-checkpoint/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2dfdbe0760a458a4a4179393e5eabb7411f4ee7f6fa21362c3a9a65d716108a8
 size 14244
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2d7c1449d2327688a19dc22c5f7f05a942f96806eecbd37990c97f51073c6b8d
 size 1064
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch":
+  "epoch": 9.0,
   "eval_steps": 1,
-  "global_step":
+  "global_step": 2178,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -15567,6 +15567,1951 @@
       "eval_samples_per_second": 23.971,
       "eval_steps_per_second": 4.794,
       "step": 1936
+    },
+    {
+      "epoch": 8.00413223140496,
+      "grad_norm": 0.05322222039103508,
+      "learning_rate": 1.0501045177070335e-06,
+      "loss": 0.2053,
+      "mean_token_accuracy": 0.9290907979011536,
+      "step": 1937
+    },
+    {
+      "epoch": 8.008264462809917,
+      "grad_norm": 0.07542014122009277,
+      "learning_rate": 1.0459189581655864e-06,
+      "loss": 0.2204,
+      "mean_token_accuracy": 0.9238471388816833,
+      "step": 1938
+    },
+    {
+      "epoch": 8.012396694214877,
+      "grad_norm": 0.07172047346830368,
+      "learning_rate": 1.0417407822095266e-06,
+      "loss": 0.2185,
+      "mean_token_accuracy": 0.9246459007263184,
+      "step": 1939
+    },
+    {
+      "epoch": 8.016528925619834,
+      "grad_norm": 0.07066302001476288,
+      "learning_rate": 1.037569997640896e-06,
+      "loss": 0.1789,
+      "mean_token_accuracy": 0.9409568905830383,
+      "step": 1940
+    },
+    {
+      "epoch": 8.020661157024794,
+      "grad_norm": 0.07975345849990845,
+      "learning_rate": 1.0334066122479403e-06,
+      "loss": 0.2232,
+      "mean_token_accuracy": 0.9241645336151123,
+      "step": 1941
+    },
+    {
+      "epoch": 8.024793388429751,
+      "grad_norm": 0.06827311962842941,
+      "learning_rate": 1.0292506338050834e-06,
+      "loss": 0.1683,
+      "mean_token_accuracy": 0.9441187381744385,
+      "step": 1942
+    },
+    {
+      "epoch": 8.02892561983471,
+      "grad_norm": 0.08944051712751389,
+      "learning_rate": 1.0251020700729209e-06,
+      "loss": 0.1975,
+      "mean_token_accuracy": 0.9320717453956604,
+      "step": 1943
+    },
+    {
+      "epoch": 8.03305785123967,
+      "grad_norm": 0.09352165460586548,
+      "learning_rate": 1.0209609287982047e-06,
+      "loss": 0.1943,
+      "mean_token_accuracy": 0.9347447752952576,
+      "step": 1944
+    },
+    {
+      "epoch": 8.037190082644628,
+      "grad_norm": 0.07383626699447632,
+      "learning_rate": 1.01682721771382e-06,
+      "loss": 0.1209,
+      "mean_token_accuracy": 0.9616514444351196,
+      "step": 1945
+    },
+    {
+      "epoch": 8.041322314049587,
+      "grad_norm": 0.08939805626869202,
+      "learning_rate": 1.0127009445387836e-06,
+      "loss": 0.1748,
+      "mean_token_accuracy": 0.9405485391616821,
+      "step": 1946
+    },
+    {
+      "epoch": 8.045454545454545,
+      "grad_norm": 0.0925152450799942,
+      "learning_rate": 1.00858211697822e-06,
+      "loss": 0.1767,
+      "mean_token_accuracy": 0.9380128979682922,
+      "step": 1947
+    },
+    {
+      "epoch": 8.049586776859504,
+      "grad_norm": 0.07704450935125351,
+      "learning_rate": 1.004470742723353e-06,
+      "loss": 0.1219,
+      "mean_token_accuracy": 0.965753436088562,
+      "step": 1948
+    },
+    {
+      "epoch": 8.053719008264462,
+      "grad_norm": 0.08180603384971619,
+      "learning_rate": 1.0003668294514845e-06,
+      "loss": 0.1327,
+      "mean_token_accuracy": 0.9580827355384827,
+      "step": 1949
+    },
+    {
+      "epoch": 8.057851239669422,
+      "grad_norm": 0.08901241421699524,
+      "learning_rate": 9.962703848259887e-07,
+      "loss": 0.1494,
+      "mean_token_accuracy": 0.9514312148094177,
+      "step": 1950
+    },
+    {
+      "epoch": 8.061983471074381,
+      "grad_norm": 0.07941275835037231,
+      "learning_rate": 9.921814164962878e-07,
+      "loss": 0.103,
+      "mean_token_accuracy": 0.9707224369049072,
+      "step": 1951
+    },
+    {
+      "epoch": 8.066115702479339,
+      "grad_norm": 0.09194760769605637,
+      "learning_rate": 9.880999320978495e-07,
+      "loss": 0.1495,
+      "mean_token_accuracy": 0.9521530866622925,
+      "step": 1952
+    },
+    {
+      "epoch": 8.070247933884298,
+      "grad_norm": 0.09634792059659958,
+      "learning_rate": 9.84025939252164e-07,
+      "loss": 0.1544,
+      "mean_token_accuracy": 0.9550842046737671,
+      "step": 1953
+    },
+    {
+      "epoch": 8.074380165289256,
+      "grad_norm": 0.09481213241815567,
+      "learning_rate": 9.799594455667293e-07,
+      "loss": 0.1367,
+      "mean_token_accuracy": 0.9575821757316589,
+      "step": 1954
+    },
+    {
+      "epoch": 8.078512396694215,
+      "grad_norm": 0.09061330556869507,
+      "learning_rate": 9.759004586350456e-07,
+      "loss": 0.0852,
+      "mean_token_accuracy": 0.9757155179977417,
+      "step": 1955
+    },
+    {
+      "epoch": 8.082644628099173,
+      "grad_norm": 0.09262175112962723,
+      "learning_rate": 9.718489860365882e-07,
+      "loss": 0.0997,
+      "mean_token_accuracy": 0.9744042158126831,
+      "step": 1956
+    },
+    {
+      "epoch": 8.086776859504132,
+      "grad_norm": 0.07662644982337952,
+      "learning_rate": 9.678050353368106e-07,
+      "loss": 0.093,
+      "mean_token_accuracy": 0.9729189872741699,
+      "step": 1957
+    },
+    {
+      "epoch": 8.090909090909092,
+      "grad_norm": 0.1026345044374466,
+      "learning_rate": 9.637686140871121e-07,
+      "loss": 0.1111,
+      "mean_token_accuracy": 0.9670698642730713,
+      "step": 1958
+    },
+    {
+      "epoch": 8.09504132231405,
+      "grad_norm": 0.09648612886667252,
+      "learning_rate": 9.59739729824833e-07,
+      "loss": 0.1283,
+      "mean_token_accuracy": 0.9622212052345276,
+      "step": 1959
+    },
+    {
+      "epoch": 8.099173553719009,
+      "grad_norm": 0.09148698300123215,
+      "learning_rate": 9.557183900732425e-07,
+      "loss": 0.0953,
+      "mean_token_accuracy": 0.9743001461029053,
+      "step": 1960
+    },
+    {
+      "epoch": 8.103305785123966,
+      "grad_norm": 0.07736257463693619,
+      "learning_rate": 9.517046023415205e-07,
+      "loss": 0.079,
+      "mean_token_accuracy": 0.9799261689186096,
+      "step": 1961
+    },
+    {
+      "epoch": 8.107438016528926,
+      "grad_norm": 0.0846625566482544,
+      "learning_rate": 9.476983741247464e-07,
+      "loss": 0.0875,
+      "mean_token_accuracy": 0.9742388725280762,
+      "step": 1962
+    },
+    {
+      "epoch": 8.111570247933884,
+      "grad_norm": 0.10327938944101334,
+      "learning_rate": 9.436997129038783e-07,
+      "loss": 0.1394,
+      "mean_token_accuracy": 0.957582950592041,
+      "step": 1963
+    },
+    {
+      "epoch": 8.115702479338843,
+      "grad_norm": 0.0965140238404274,
+      "learning_rate": 9.397086261457511e-07,
+      "loss": 0.112,
+      "mean_token_accuracy": 0.9647870659828186,
+      "step": 1964
+    },
+    {
+      "epoch": 8.119834710743802,
+      "grad_norm": 0.09479817748069763,
+      "learning_rate": 9.357251213030489e-07,
+      "loss": 0.0908,
+      "mean_token_accuracy": 0.973259449005127,
+      "step": 1965
+    },
+    {
+      "epoch": 8.12396694214876,
+      "grad_norm": 0.08082997798919678,
+      "learning_rate": 9.317492058143024e-07,
+      "loss": 0.0831,
+      "mean_token_accuracy": 0.9776373505592346,
+      "step": 1966
+    },
+    {
+      "epoch": 8.12809917355372,
+      "grad_norm": 0.0902785211801529,
+      "learning_rate": 9.277808871038713e-07,
+      "loss": 0.0966,
+      "mean_token_accuracy": 0.9756577610969543,
+      "step": 1967
+    },
+    {
+      "epoch": 8.132231404958677,
+      "grad_norm": 0.0910555049777031,
+      "learning_rate": 9.238201725819235e-07,
+      "loss": 0.1005,
+      "mean_token_accuracy": 0.9727723002433777,
+      "step": 1968
+    },
+    {
+      "epoch": 8.136363636363637,
+      "grad_norm": 0.08586708456277847,
+      "learning_rate": 9.198670696444339e-07,
+      "loss": 0.0843,
+      "mean_token_accuracy": 0.9773631691932678,
+      "step": 1969
+    },
+    {
+      "epoch": 8.140495867768594,
+      "grad_norm": 0.11421328037977219,
+      "learning_rate": 9.159215856731607e-07,
+      "loss": 0.0998,
+      "mean_token_accuracy": 0.9735649824142456,
+      "step": 1970
+    },
+    {
+      "epoch": 8.144628099173554,
+      "grad_norm": 0.10374422371387482,
+      "learning_rate": 9.11983728035637e-07,
+      "loss": 0.0972,
+      "mean_token_accuracy": 0.9747347235679626,
+      "step": 1971
+    },
+    {
+      "epoch": 8.148760330578513,
+      "grad_norm": 0.13511402904987335,
+      "learning_rate": 9.080535040851518e-07,
+      "loss": 0.2081,
+      "mean_token_accuracy": 0.9311926364898682,
+      "step": 1972
+    },
+    {
+      "epoch": 8.152892561983471,
+      "grad_norm": 0.10286667943000793,
+      "learning_rate": 9.04130921160743e-07,
+      "loss": 0.1673,
+      "mean_token_accuracy": 0.9480319023132324,
+      "step": 1973
+    },
+    {
+      "epoch": 8.15702479338843,
+      "grad_norm": 0.08978980779647827,
+      "learning_rate": 9.002159865871762e-07,
+      "loss": 0.0977,
+      "mean_token_accuracy": 0.9703608155250549,
+      "step": 1974
+    },
+    {
+      "epoch": 8.161157024793388,
+      "grad_norm": 0.12502841651439667,
+      "learning_rate": 8.963087076749389e-07,
+      "loss": 0.145,
+      "mean_token_accuracy": 0.9607588648796082,
+      "step": 1975
+    },
+    {
+      "epoch": 8.165289256198347,
+      "grad_norm": 0.10160111635923386,
+      "learning_rate": 8.924090917202228e-07,
+      "loss": 0.0783,
+      "mean_token_accuracy": 0.9801255464553833,
+      "step": 1976
+    },
+    {
+      "epoch": 8.169421487603305,
+      "grad_norm": 0.1070442870259285,
+      "learning_rate": 8.885171460049058e-07,
+      "loss": 0.0906,
+      "mean_token_accuracy": 0.973698079586029,
+      "step": 1977
+    },
+    {
+      "epoch": 8.173553719008265,
+      "grad_norm": 0.09609609842300415,
+      "learning_rate": 8.846328777965468e-07,
+      "loss": 0.0893,
+      "mean_token_accuracy": 0.9760192036628723,
+      "step": 1978
+    },
+    {
+      "epoch": 8.177685950413224,
+      "grad_norm": 0.10213906317949295,
+      "learning_rate": 8.807562943483683e-07,
+      "loss": 0.0904,
+      "mean_token_accuracy": 0.9757412672042847,
+      "step": 1979
+    },
+    {
+      "epoch": 8.181818181818182,
+      "grad_norm": 0.09828820079565048,
+      "learning_rate": 8.768874028992431e-07,
+      "loss": 0.0897,
+      "mean_token_accuracy": 0.9777717590332031,
+      "step": 1980
+    },
+    {
+      "epoch": 8.185950413223141,
+      "grad_norm": 0.09995172917842865,
+      "learning_rate": 8.730262106736775e-07,
+      "loss": 0.0848,
+      "mean_token_accuracy": 0.9790863990783691,
+      "step": 1981
+    },
+    {
+      "epoch": 8.190082644628099,
+      "grad_norm": 0.09710147231817245,
+      "learning_rate": 8.691727248818016e-07,
+      "loss": 0.0926,
+      "mean_token_accuracy": 0.9745739102363586,
+      "step": 1982
+    },
+    {
+      "epoch": 8.194214876033058,
+      "grad_norm": 0.10639967769384384,
+      "learning_rate": 8.65326952719357e-07,
+      "loss": 0.0934,
+      "mean_token_accuracy": 0.9727582335472107,
+      "step": 1983
+    },
+    {
+      "epoch": 8.198347107438016,
+      "grad_norm": 0.10266918689012527,
+      "learning_rate": 8.614889013676803e-07,
+      "loss": 0.0922,
+      "mean_token_accuracy": 0.9762585759162903,
+      "step": 1984
+    },
+    {
+      "epoch": 8.202479338842975,
+      "grad_norm": 0.10627970099449158,
+      "learning_rate": 8.576585779936924e-07,
+      "loss": 0.0983,
+      "mean_token_accuracy": 0.9713375568389893,
+      "step": 1985
+    },
+    {
+      "epoch": 8.206611570247935,
+      "grad_norm": 0.12172595411539078,
+      "learning_rate": 8.538359897498793e-07,
+      "loss": 0.1657,
+      "mean_token_accuracy": 0.9538551568984985,
+      "step": 1986
+    },
+    {
+      "epoch": 8.210743801652892,
+      "grad_norm": 0.06621988117694855,
+      "learning_rate": 8.500211437742878e-07,
+      "loss": 0.2023,
+      "mean_token_accuracy": 0.9292741417884827,
+      "step": 1987
+    },
+    {
+      "epoch": 8.214876033057852,
+      "grad_norm": 0.06165366619825363,
+      "learning_rate": 8.462140471905034e-07,
+      "loss": 0.1687,
+      "mean_token_accuracy": 0.9438784718513489,
+      "step": 1988
+    },
+    {
+      "epoch": 8.21900826446281,
+      "grad_norm": 0.0728682428598404,
+      "learning_rate": 8.424147071076427e-07,
+      "loss": 0.1858,
+      "mean_token_accuracy": 0.9376370906829834,
+      "step": 1989
+    },
+    {
+      "epoch": 8.223140495867769,
+      "grad_norm": 0.0772644579410553,
+      "learning_rate": 8.386231306203402e-07,
+      "loss": 0.2246,
+      "mean_token_accuracy": 0.924739420413971,
+      "step": 1990
+    },
+    {
+      "epoch": 8.227272727272727,
+      "grad_norm": 0.09445520490407944,
+      "learning_rate": 8.348393248087289e-07,
+      "loss": 0.2536,
+      "mean_token_accuracy": 0.9166355133056641,
+      "step": 1991
+    },
+    {
+      "epoch": 8.231404958677686,
+      "grad_norm": 0.08564960211515427,
+      "learning_rate": 8.310632967384341e-07,
+      "loss": 0.2014,
+      "mean_token_accuracy": 0.9372698664665222,
+      "step": 1992
+    },
+    {
+      "epoch": 8.235537190082646,
+      "grad_norm": 0.09126199036836624,
+      "learning_rate": 8.272950534605573e-07,
+      "loss": 0.2703,
+      "mean_token_accuracy": 0.9054905772209167,
+      "step": 1993
+    },
+    {
+      "epoch": 8.239669421487603,
+      "grad_norm": 0.07200663536787033,
+      "learning_rate": 8.235346020116647e-07,
+      "loss": 0.1584,
+      "mean_token_accuracy": 0.9474515914916992,
+      "step": 1994
+    },
+    {
+      "epoch": 8.243801652892563,
+      "grad_norm": 0.07638365030288696,
+      "learning_rate": 8.197819494137677e-07,
+      "loss": 0.1827,
+      "mean_token_accuracy": 0.9400560259819031,
+      "step": 1995
+    },
+    {
+      "epoch": 8.24793388429752,
+      "grad_norm": 0.07526237517595291,
+      "learning_rate": 8.160371026743202e-07,
+      "loss": 0.1255,
+      "mean_token_accuracy": 0.9592936038970947,
+      "step": 1996
+    },
+    {
+      "epoch": 8.25206611570248,
+      "grad_norm": 0.09531582146883011,
+      "learning_rate": 8.123000687861959e-07,
+      "loss": 0.2096,
+      "mean_token_accuracy": 0.932683527469635,
+      "step": 1997
+    },
+    {
+      "epoch": 8.256198347107437,
+      "grad_norm": 0.09303406625986099,
+      "learning_rate": 8.08570854727681e-07,
+      "loss": 0.2018,
+      "mean_token_accuracy": 0.9328662157058716,
+      "step": 1998
+    },
+    {
+      "epoch": 8.260330578512397,
+      "grad_norm": 0.08916998654603958,
+      "learning_rate": 8.048494674624613e-07,
+      "loss": 0.1273,
+      "mean_token_accuracy": 0.9591379165649414,
+      "step": 1999
+    },
+    {
+      "epoch": 8.264462809917354,
+      "grad_norm": 0.08668152987957001,
+      "learning_rate": 8.01135913939603e-07,
+      "loss": 0.137,
+      "mean_token_accuracy": 0.955795168876648,
+      "step": 2000
+    },
+    {
+      "epoch": 8.268595041322314,
+      "grad_norm": 0.08069667220115662,
+      "learning_rate": 7.97430201093547e-07,
+      "loss": 0.1229,
+      "mean_token_accuracy": 0.9618644118309021,
+      "step": 2001
+    },
+    {
+      "epoch": 8.272727272727273,
+      "grad_norm": 0.09162264317274094,
+      "learning_rate": 7.937323358440935e-07,
+      "loss": 0.1378,
+      "mean_token_accuracy": 0.9560089707374573,
+      "step": 2002
+    },
+    {
+      "epoch": 8.276859504132231,
+      "grad_norm": 0.07887725532054901,
+      "learning_rate": 7.90042325096389e-07,
+      "loss": 0.0963,
+      "mean_token_accuracy": 0.9715953469276428,
+      "step": 2003
+    },
+    {
+      "epoch": 8.28099173553719,
+      "grad_norm": 0.10365016013383865,
+      "learning_rate": 7.863601757409095e-07,
+      "loss": 0.166,
+      "mean_token_accuracy": 0.9484246373176575,
+      "step": 2004
+    },
+    {
+      "epoch": 8.285123966942148,
+      "grad_norm": 0.10023301839828491,
+      "learning_rate": 7.826858946534532e-07,
+      "loss": 0.1623,
+      "mean_token_accuracy": 0.9505438208580017,
+      "step": 2005
+    },
+    {
+      "epoch": 8.289256198347108,
+      "grad_norm": 0.08399416506290436,
+      "learning_rate": 7.790194886951268e-07,
+      "loss": 0.0924,
+      "mean_token_accuracy": 0.9722627997398376,
+      "step": 2006
+    },
+    {
+      "epoch": 8.293388429752067,
+      "grad_norm": 0.09842690825462341,
+      "learning_rate": 7.753609647123305e-07,
+      "loss": 0.0925,
+      "mean_token_accuracy": 0.9722222089767456,
+      "step": 2007
+    },
+    {
+      "epoch": 8.297520661157025,
+      "grad_norm": 0.09228594601154327,
+      "learning_rate": 7.717103295367473e-07,
+      "loss": 0.1104,
+      "mean_token_accuracy": 0.9669243693351746,
+      "step": 2008
+    },
+    {
+      "epoch": 8.301652892561984,
+      "grad_norm": 0.09433568269014359,
+      "learning_rate": 7.680675899853258e-07,
+      "loss": 0.1252,
+      "mean_token_accuracy": 0.9608188271522522,
+      "step": 2009
+    },
+    {
+      "epoch": 8.305785123966942,
+      "grad_norm": 0.10079663246870041,
+      "learning_rate": 7.644327528602757e-07,
+      "loss": 0.1536,
+      "mean_token_accuracy": 0.949020504951477,
+      "step": 2010
+    },
+    {
+      "epoch": 8.309917355371901,
+      "grad_norm": 0.09348037093877792,
+      "learning_rate": 7.608058249490457e-07,
+      "loss": 0.1049,
+      "mean_token_accuracy": 0.9676030874252319,
+      "step": 2011
+    },
+    {
+      "epoch": 8.314049586776859,
+      "grad_norm": 0.09227565675973892,
+      "learning_rate": 7.571868130243176e-07,
+      "loss": 0.1086,
+      "mean_token_accuracy": 0.96882164478302,
+      "step": 2012
+    },
+    {
+      "epoch": 8.318181818181818,
+      "grad_norm": 0.10619546473026276,
+      "learning_rate": 7.535757238439939e-07,
+      "loss": 0.1186,
+      "mean_token_accuracy": 0.9638972282409668,
+      "step": 2013
+    },
+    {
+      "epoch": 8.322314049586776,
+      "grad_norm": 0.09314385056495667,
+      "learning_rate": 7.499725641511762e-07,
+      "loss": 0.0847,
+      "mean_token_accuracy": 0.9747040867805481,
+      "step": 2014
+    },
+    {
+      "epoch": 8.326446280991735,
+      "grad_norm": 0.09095818549394608,
+      "learning_rate": 7.463773406741648e-07,
+      "loss": 0.0946,
+      "mean_token_accuracy": 0.9727653861045837,
+      "step": 2015
+    },
+    {
+      "epoch": 8.330578512396695,
+      "grad_norm": 0.08989793807268143,
+      "learning_rate": 7.427900601264388e-07,
+      "loss": 0.1041,
+      "mean_token_accuracy": 0.970187783241272,
+      "step": 2016
+    },
+    {
+      "epoch": 8.334710743801653,
+      "grad_norm": 0.1408630758523941,
+      "learning_rate": 7.392107292066452e-07,
+      "loss": 0.2269,
+      "mean_token_accuracy": 0.9238230586051941,
+      "step": 2017
+    },
+    {
+      "epoch": 8.338842975206612,
+      "grad_norm": 0.07676363736391068,
+      "learning_rate": 7.356393545985862e-07,
+      "loss": 0.0831,
+      "mean_token_accuracy": 0.977846622467041,
+      "step": 2018
+    },
+    {
+      "epoch": 8.34297520661157,
+      "grad_norm": 0.11591339856386185,
+      "learning_rate": 7.320759429712048e-07,
+      "loss": 0.1068,
+      "mean_token_accuracy": 0.9673469662666321,
+      "step": 2019
+    },
+    {
+      "epoch": 8.347107438016529,
+      "grad_norm": 0.10131556540727615,
+      "learning_rate": 7.285205009785784e-07,
+      "loss": 0.0927,
+      "mean_token_accuracy": 0.9739193320274353,
+      "step": 2020
+    },
+    {
+      "epoch": 8.351239669421487,
+      "grad_norm": 0.09023724496364594,
+      "learning_rate": 7.249730352599e-07,
+      "loss": 0.0889,
+      "mean_token_accuracy": 0.9725528359413147,
+      "step": 2021
+    },
+    {
+      "epoch": 8.355371900826446,
+      "grad_norm": 0.0897325649857521,
+      "learning_rate": 7.214335524394706e-07,
+      "loss": 0.0785,
+      "mean_token_accuracy": 0.9799548983573914,
+      "step": 2022
+    },
+    {
+      "epoch": 8.359504132231406,
+      "grad_norm": 0.09022372215986252,
+      "learning_rate": 7.179020591266794e-07,
+      "loss": 0.1125,
+      "mean_token_accuracy": 0.9685359001159668,
+      "step": 2023
+    },
+    {
+      "epoch": 8.363636363636363,
+      "grad_norm": 0.08698549121618271,
+      "learning_rate": 7.143785619160026e-07,
+      "loss": 0.0951,
+      "mean_token_accuracy": 0.9732397794723511,
+      "step": 2024
+    },
+    {
+      "epoch": 8.367768595041323,
+      "grad_norm": 0.0980365052819252,
+      "learning_rate": 7.108630673869805e-07,
+      "loss": 0.1058,
+      "mean_token_accuracy": 0.9667887091636658,
+      "step": 2025
+    },
+    {
+      "epoch": 8.37190082644628,
+      "grad_norm": 0.10169877111911774,
+      "learning_rate": 7.073555821042139e-07,
+      "loss": 0.1002,
+      "mean_token_accuracy": 0.9713459610939026,
+      "step": 2026
+    },
+    {
+      "epoch": 8.37603305785124,
+      "grad_norm": 0.10198129713535309,
+      "learning_rate": 7.038561126173437e-07,
+      "loss": 0.1045,
+      "mean_token_accuracy": 0.9714058637619019,
+      "step": 2027
+    },
+    {
+      "epoch": 8.380165289256198,
+      "grad_norm": 0.10014763474464417,
+      "learning_rate": 7.003646654610424e-07,
+      "loss": 0.0886,
+      "mean_token_accuracy": 0.9751999974250793,
+      "step": 2028
+    },
+    {
+      "epoch": 8.384297520661157,
+      "grad_norm": 0.09548249840736389,
+      "learning_rate": 6.968812471550063e-07,
+      "loss": 0.0837,
+      "mean_token_accuracy": 0.9789416790008545,
+      "step": 2029
+    },
+    {
+      "epoch": 8.388429752066116,
+      "grad_norm": 0.10720735043287277,
+      "learning_rate": 6.93405864203936e-07,
+      "loss": 0.0906,
+      "mean_token_accuracy": 0.9760934710502625,
+      "step": 2030
+    },
+    {
+      "epoch": 8.392561983471074,
+      "grad_norm": 0.09425859898328781,
+      "learning_rate": 6.899385230975297e-07,
+      "loss": 0.0918,
+      "mean_token_accuracy": 0.9751161932945251,
+      "step": 2031
+    },
+    {
+      "epoch": 8.396694214876034,
+      "grad_norm": 0.11633366346359253,
+      "learning_rate": 6.864792303104651e-07,
+      "loss": 0.0996,
+      "mean_token_accuracy": 0.9741970896720886,
+      "step": 2032
+    },
+    {
+      "epoch": 8.400826446280991,
+      "grad_norm": 0.10742107778787613,
+      "learning_rate": 6.830279923023946e-07,
+      "loss": 0.0864,
+      "mean_token_accuracy": 0.9759535789489746,
+      "step": 2033
+    },
+    {
+      "epoch": 8.40495867768595,
+      "grad_norm": 0.10091706365346909,
+      "learning_rate": 6.795848155179274e-07,
+      "loss": 0.0884,
+      "mean_token_accuracy": 0.9769123792648315,
+      "step": 2034
+    },
+    {
+      "epoch": 8.409090909090908,
+      "grad_norm": 0.1497029811143875,
+      "learning_rate": 6.761497063866207e-07,
+      "loss": 0.1978,
+      "mean_token_accuracy": 0.9377990365028381,
+      "step": 2035
+    },
+    {
+      "epoch": 8.413223140495868,
+      "grad_norm": 0.10804083943367004,
+      "learning_rate": 6.727226713229684e-07,
+      "loss": 0.1319,
+      "mean_token_accuracy": 0.9585747122764587,
+      "step": 2036
+    },
+    {
+      "epoch": 8.417355371900827,
+      "grad_norm": 0.05786946043372154,
+      "learning_rate": 6.693037167263828e-07,
+      "loss": 0.2503,
+      "mean_token_accuracy": 0.9130831360816956,
+      "step": 2037
+    },
+    {
+      "epoch": 8.421487603305785,
+      "grad_norm": 0.07212464511394501,
+      "learning_rate": 6.658928489811912e-07,
+      "loss": 0.2339,
+      "mean_token_accuracy": 0.9204217195510864,
+      "step": 2038
+    },
+    {
+      "epoch": 8.425619834710744,
+      "grad_norm": 0.06576870381832123,
+      "learning_rate": 6.624900744566193e-07,
+      "loss": 0.2209,
+      "mean_token_accuracy": 0.9230567812919617,
+      "step": 2039
+    },
+    {
+      "epoch": 8.429752066115702,
+      "grad_norm": 0.07833580672740936,
+      "learning_rate": 6.590953995067812e-07,
+      "loss": 0.2051,
+      "mean_token_accuracy": 0.9266378879547119,
+      "step": 2040
+    },
+    {
+      "epoch": 8.433884297520661,
+      "grad_norm": 0.06369439512491226,
+      "learning_rate": 6.557088304706627e-07,
+      "loss": 0.1654,
+      "mean_token_accuracy": 0.9431931376457214,
+      "step": 2041
+    },
+    {
+      "epoch": 8.438016528925619,
+      "grad_norm": 0.08336784690618515,
+      "learning_rate": 6.523303736721154e-07,
+      "loss": 0.2379,
+      "mean_token_accuracy": 0.9195821285247803,
+      "step": 2042
+    },
+    {
+      "epoch": 8.442148760330578,
+      "grad_norm": 0.07352302968502045,
+      "learning_rate": 6.489600354198433e-07,
+      "loss": 0.1511,
+      "mean_token_accuracy": 0.950443685054779,
+      "step": 2043
+    },
+    {
+      "epoch": 8.446280991735538,
+      "grad_norm": 0.09255903214216232,
+      "learning_rate": 6.455978220073895e-07,
+      "loss": 0.2586,
+      "mean_token_accuracy": 0.9118536710739136,
+      "step": 2044
+    },
+    {
+      "epoch": 8.450413223140496,
+      "grad_norm": 0.07923895865678787,
+      "learning_rate": 6.422437397131265e-07,
+      "loss": 0.1312,
+      "mean_token_accuracy": 0.9583396911621094,
+      "step": 2045
+    },
+    {
+      "epoch": 8.454545454545455,
+      "grad_norm": 0.08579988032579422,
+      "learning_rate": 6.388977948002406e-07,
+      "loss": 0.1845,
+      "mean_token_accuracy": 0.9370260238647461,
+      "step": 2046
+    },
+    {
+      "epoch": 8.458677685950413,
+      "grad_norm": 0.08608614653348923,
+      "learning_rate": 6.355599935167256e-07,
+      "loss": 0.1863,
+      "mean_token_accuracy": 0.9352179169654846,
+      "step": 2047
+    },
+    {
+      "epoch": 8.462809917355372,
+      "grad_norm": 0.08685126155614853,
+      "learning_rate": 6.322303420953673e-07,
+      "loss": 0.1521,
+      "mean_token_accuracy": 0.9517453908920288,
+      "step": 2048
+    },
+    {
+      "epoch": 8.46694214876033,
+      "grad_norm": 0.09084443747997284,
+      "learning_rate": 6.289088467537341e-07,
+      "loss": 0.1344,
+      "mean_token_accuracy": 0.955075740814209,
+      "step": 2049
+    },
+    {
+      "epoch": 8.47107438016529,
+      "grad_norm": 0.10212317854166031,
+      "learning_rate": 6.255955136941627e-07,
+      "loss": 0.1827,
+      "mean_token_accuracy": 0.9430245161056519,
+      "step": 2050
+    },
+    {
+      "epoch": 8.475206611570249,
+      "grad_norm": 0.09634598344564438,
+      "learning_rate": 6.222903491037474e-07,
+      "loss": 0.1735,
+      "mean_token_accuracy": 0.9435832500457764,
+      "step": 2051
+    },
+    {
+      "epoch": 8.479338842975206,
+      "grad_norm": 0.07654455304145813,
+      "learning_rate": 6.189933591543318e-07,
+      "loss": 0.1052,
+      "mean_token_accuracy": 0.9696394801139832,
+      "step": 2052
+    },
+    {
+      "epoch": 8.483471074380166,
+      "grad_norm": 0.08662577718496323,
+      "learning_rate": 6.157045500024933e-07,
+      "loss": 0.1196,
+      "mean_token_accuracy": 0.9640316367149353,
+      "step": 2053
+    },
+    {
+      "epoch": 8.487603305785123,
+      "grad_norm": 0.0935806855559349,
+      "learning_rate": 6.124239277895344e-07,
+      "loss": 0.1112,
+      "mean_token_accuracy": 0.9642053842544556,
+      "step": 2054
+    },
+    {
+      "epoch": 8.491735537190083,
+      "grad_norm": 0.0817096158862114,
+      "learning_rate": 6.091514986414665e-07,
+      "loss": 0.0977,
+      "mean_token_accuracy": 0.9727157354354858,
+      "step": 2055
+    },
+    {
+      "epoch": 8.49586776859504,
+      "grad_norm": 0.08830783516168594,
+      "learning_rate": 6.058872686690048e-07,
+      "loss": 0.0979,
+      "mean_token_accuracy": 0.9704757928848267,
+      "step": 2056
+    },
+    {
+      "epoch": 8.5,
+      "grad_norm": 0.09076707810163498,
+      "learning_rate": 6.026312439675553e-07,
+      "loss": 0.0942,
+      "mean_token_accuracy": 0.971440851688385,
+      "step": 2057
+    },
+    {
+      "epoch": 8.50413223140496,
+      "grad_norm": 0.09585954248905182,
+      "learning_rate": 5.993834306171964e-07,
+      "loss": 0.1453,
+      "mean_token_accuracy": 0.9532176852226257,
+      "step": 2058
+    },
+    {
+      "epoch": 8.508264462809917,
+      "grad_norm": 0.10207119584083557,
+      "learning_rate": 5.961438346826792e-07,
+      "loss": 0.1159,
+      "mean_token_accuracy": 0.9636322855949402,
+      "step": 2059
+    },
+    {
+      "epoch": 8.512396694214877,
+      "grad_norm": 0.10537750273942947,
+      "learning_rate": 5.929124622134058e-07,
+      "loss": 0.1239,
+      "mean_token_accuracy": 0.9623029232025146,
+      "step": 2060
+    },
+    {
+      "epoch": 8.516528925619834,
+      "grad_norm": 0.08574347198009491,
+      "learning_rate": 5.896893192434249e-07,
+      "loss": 0.0818,
+      "mean_token_accuracy": 0.97648686170578,
+      "step": 2061
+    },
+    {
+      "epoch": 8.520661157024794,
+      "grad_norm": 0.1016509085893631,
+      "learning_rate": 5.864744117914179e-07,
+      "loss": 0.1009,
+      "mean_token_accuracy": 0.9712011218070984,
+      "step": 2062
+    },
+    {
+      "epoch": 8.524793388429751,
+      "grad_norm": 0.09164122492074966,
+      "learning_rate": 5.832677458606867e-07,
+      "loss": 0.0942,
+      "mean_token_accuracy": 0.9732291102409363,
+      "step": 2063
+    },
+    {
+      "epoch": 8.52892561983471,
+      "grad_norm": 0.08601871877908707,
+      "learning_rate": 5.800693274391439e-07,
+      "loss": 0.0799,
+      "mean_token_accuracy": 0.9785696864128113,
+      "step": 2064
+    },
+    {
+      "epoch": 8.53305785123967,
+      "grad_norm": 0.11451072990894318,
+      "learning_rate": 5.768791624993003e-07,
+      "loss": 0.159,
+      "mean_token_accuracy": 0.9517607092857361,
+      "step": 2065
+    },
+    {
+      "epoch": 8.537190082644628,
+      "grad_norm": 0.10741297900676727,
+      "learning_rate": 5.736972569982558e-07,
+      "loss": 0.1089,
+      "mean_token_accuracy": 0.9691147804260254,
+      "step": 2066
+    },
+    {
+      "epoch": 8.541322314049587,
+      "grad_norm": 0.11344994604587555,
+      "learning_rate": 5.705236168776879e-07,
+      "loss": 0.1463,
+      "mean_token_accuracy": 0.9595220685005188,
+      "step": 2067
+    },
+    {
+      "epoch": 8.545454545454545,
+      "grad_norm": 0.10220891237258911,
+      "learning_rate": 5.673582480638395e-07,
+      "loss": 0.1063,
+      "mean_token_accuracy": 0.9685812592506409,
+      "step": 2068
+    },
+    {
+      "epoch": 8.549586776859504,
+      "grad_norm": 0.08758968859910965,
+      "learning_rate": 5.642011564675065e-07,
+      "loss": 0.0872,
+      "mean_token_accuracy": 0.9768015742301941,
+      "step": 2069
+    },
+    {
+      "epoch": 8.553719008264462,
+      "grad_norm": 0.10929395258426666,
+      "learning_rate": 5.610523479840297e-07,
+      "loss": 0.0947,
+      "mean_token_accuracy": 0.9717923402786255,
+      "step": 2070
+    },
+    {
+      "epoch": 8.557851239669422,
+      "grad_norm": 0.11096024513244629,
+      "learning_rate": 5.579118284932844e-07,
+      "loss": 0.1365,
+      "mean_token_accuracy": 0.9577394127845764,
+      "step": 2071
+    },
+    {
+      "epoch": 8.561983471074381,
+      "grad_norm": 0.0956474244594574,
+      "learning_rate": 5.547796038596637e-07,
+      "loss": 0.1036,
+      "mean_token_accuracy": 0.9721804261207581,
+      "step": 2072
+    },
+    {
+      "epoch": 8.566115702479339,
+      "grad_norm": 0.09017419070005417,
+      "learning_rate": 5.51655679932075e-07,
+      "loss": 0.0806,
+      "mean_token_accuracy": 0.9797005653381348,
+      "step": 2073
+    },
+    {
+      "epoch": 8.570247933884298,
+      "grad_norm": 0.10327083617448807,
+      "learning_rate": 5.485400625439219e-07,
+      "loss": 0.1038,
+      "mean_token_accuracy": 0.9685261249542236,
+      "step": 2074
+    },
+    {
+      "epoch": 8.574380165289256,
+      "grad_norm": 0.10319899022579193,
+      "learning_rate": 5.454327575131007e-07,
+      "loss": 0.0907,
+      "mean_token_accuracy": 0.9753340482711792,
+      "step": 2075
+    },
+    {
+      "epoch": 8.578512396694215,
+      "grad_norm": 0.10876299440860748,
+      "learning_rate": 5.423337706419846e-07,
+      "loss": 0.1003,
+      "mean_token_accuracy": 0.9718273282051086,
+      "step": 2076
+    },
+    {
+      "epoch": 8.582644628099173,
+      "grad_norm": 0.10647837072610855,
+      "learning_rate": 5.392431077174131e-07,
+      "loss": 0.1686,
+      "mean_token_accuracy": 0.948123037815094,
+      "step": 2077
+    },
+    {
+      "epoch": 8.586776859504132,
+      "grad_norm": 0.09515678137540817,
+      "learning_rate": 5.361607745106817e-07,
+      "loss": 0.0908,
+      "mean_token_accuracy": 0.9741514325141907,
+      "step": 2078
+    },
+    {
+      "epoch": 8.590909090909092,
+      "grad_norm": 0.10034073889255524,
+      "learning_rate": 5.330867767775333e-07,
+      "loss": 0.0898,
+      "mean_token_accuracy": 0.9729089736938477,
+      "step": 2079
+    },
+    {
+      "epoch": 8.59504132231405,
+      "grad_norm": 0.09543359279632568,
+      "learning_rate": 5.300211202581451e-07,
+      "loss": 0.0814,
+      "mean_token_accuracy": 0.9785924553871155,
+      "step": 2080
+    },
+    {
+      "epoch": 8.599173553719009,
+      "grad_norm": 0.11278136074542999,
+      "learning_rate": 5.269638106771174e-07,
+      "loss": 0.1497,
+      "mean_token_accuracy": 0.9543736577033997,
+      "step": 2081
+    },
+    {
+      "epoch": 8.603305785123966,
+      "grad_norm": 0.08995888382196426,
+      "learning_rate": 5.239148537434658e-07,
+      "loss": 0.094,
+      "mean_token_accuracy": 0.9726177453994751,
+      "step": 2082
+    },
+    {
+      "epoch": 8.607438016528926,
+      "grad_norm": 0.10781515389680862,
+      "learning_rate": 5.208742551506057e-07,
+      "loss": 0.0955,
+      "mean_token_accuracy": 0.9749103784561157,
+      "step": 2083
+    },
+    {
+      "epoch": 8.611570247933884,
+      "grad_norm": 0.1258586049079895,
+      "learning_rate": 5.178420205763484e-07,
+      "loss": 0.1392,
+      "mean_token_accuracy": 0.958977222442627,
+      "step": 2084
+    },
+    {
+      "epoch": 8.615702479338843,
+      "grad_norm": 0.11668509989976883,
+      "learning_rate": 5.148181556828847e-07,
+      "loss": 0.0891,
+      "mean_token_accuracy": 0.977047324180603,
+      "step": 2085
+    },
+    {
+      "epoch": 8.619834710743802,
+      "grad_norm": 0.12788750231266022,
+      "learning_rate": 5.118026661167774e-07,
+      "loss": 0.1437,
+      "mean_token_accuracy": 0.9538551568984985,
+      "step": 2086
+    },
+    {
+      "epoch": 8.62396694214876,
+      "grad_norm": 0.05820296332240105,
+      "learning_rate": 5.087955575089493e-07,
+      "loss": 0.2225,
+      "mean_token_accuracy": 0.9198437333106995,
+      "step": 2087
+    },
+    {
+      "epoch": 8.62809917355372,
+      "grad_norm": 0.05990159511566162,
+      "learning_rate": 5.057968354746706e-07,
+      "loss": 0.2175,
+      "mean_token_accuracy": 0.9251121282577515,
+      "step": 2088
+    },
+    {
+      "epoch": 8.632231404958677,
+      "grad_norm": 0.06292181462049484,
+      "learning_rate": 5.028065056135561e-07,
+      "loss": 0.2055,
+      "mean_token_accuracy": 0.9290311932563782,
+      "step": 2089
+    },
+    {
+      "epoch": 8.636363636363637,
+      "grad_norm": 0.0732082948088646,
+      "learning_rate": 4.998245735095459e-07,
+      "loss": 0.2348,
+      "mean_token_accuracy": 0.9177881479263306,
+      "step": 2090
+    },
+    {
+      "epoch": 8.640495867768594,
+      "grad_norm": 0.06980929523706436,
+      "learning_rate": 4.968510447309005e-07,
+      "loss": 0.1921,
+      "mean_token_accuracy": 0.9333738684654236,
+      "step": 2091
+    },
+    {
+      "epoch": 8.644628099173554,
+      "grad_norm": 0.07125243544578552,
+      "learning_rate": 4.938859248301863e-07,
+      "loss": 0.1748,
+      "mean_token_accuracy": 0.9390982389450073,
+      "step": 2092
+    },
+    {
+      "epoch": 8.648760330578511,
+      "grad_norm": 0.07631165534257889,
+      "learning_rate": 4.909292193442705e-07,
+      "loss": 0.1969,
+      "mean_token_accuracy": 0.9327918887138367,
+      "step": 2093
+    },
+    {
+      "epoch": 8.652892561983471,
+      "grad_norm": 0.0754714235663414,
+      "learning_rate": 4.87980933794307e-07,
+      "loss": 0.179,
+      "mean_token_accuracy": 0.942600429058075,
+      "step": 2094
+    },
+    {
+      "epoch": 8.65702479338843,
+      "grad_norm": 0.10651316493749619,
+      "learning_rate": 4.850410736857236e-07,
+      "loss": 0.2142,
+      "mean_token_accuracy": 0.9307475090026855,
+      "step": 2095
+    },
+    {
+      "epoch": 8.661157024793388,
+      "grad_norm": 0.08369658142328262,
+      "learning_rate": 4.821096445082208e-07,
+      "loss": 0.1839,
+      "mean_token_accuracy": 0.9419768452644348,
+      "step": 2096
+    },
+    {
+      "epoch": 8.665289256198347,
+      "grad_norm": 0.08058687299489975,
+      "learning_rate": 4.791866517357491e-07,
+      "loss": 0.1606,
+      "mean_token_accuracy": 0.950334906578064,
+      "step": 2097
+    },
+    {
+      "epoch": 8.669421487603305,
+      "grad_norm": 0.08795657008886337,
+      "learning_rate": 4.762721008265114e-07,
+      "loss": 0.1745,
+      "mean_token_accuracy": 0.9430282115936279,
+      "step": 2098
+    },
+    {
+      "epoch": 8.673553719008265,
+      "grad_norm": 0.08912398666143417,
+      "learning_rate": 4.733659972229437e-07,
+      "loss": 0.1724,
+      "mean_token_accuracy": 0.9450215101242065,
+      "step": 2099
+    },
+    {
+      "epoch": 8.677685950413224,
+      "grad_norm": 0.08674637228250504,
+      "learning_rate": 4.7046834635170956e-07,
+      "loss": 0.1258,
+      "mean_token_accuracy": 0.9621280431747437,
+      "step": 2100
+    },
+    {
+      "epoch": 8.681818181818182,
+      "grad_norm": 0.0879029706120491,
+      "learning_rate": 4.6757915362368567e-07,
+      "loss": 0.1673,
+      "mean_token_accuracy": 0.9448676109313965,
+      "step": 2101
+    },
+    {
+      "epoch": 8.685950413223141,
+      "grad_norm": 0.07600904256105423,
+      "learning_rate": 4.646984244339575e-07,
+      "loss": 0.0938,
+      "mean_token_accuracy": 0.9702988266944885,
+      "step": 2102
+    },
+    {
+      "epoch": 8.690082644628099,
+      "grad_norm": 0.07806258648633957,
+      "learning_rate": 4.61826164161806e-07,
+      "loss": 0.1091,
+      "mean_token_accuracy": 0.9627501964569092,
+      "step": 2103
+    },
+    {
+      "epoch": 8.694214876033058,
+      "grad_norm": 0.09047354757785797,
+      "learning_rate": 4.589623781706959e-07,
+      "loss": 0.1213,
+      "mean_token_accuracy": 0.9610835313796997,
+      "step": 2104
+    },
+    {
+      "epoch": 8.698347107438016,
+      "grad_norm": 0.0882289707660675,
+      "learning_rate": 4.5610707180826996e-07,
+      "loss": 0.0973,
+      "mean_token_accuracy": 0.9684313535690308,
+      "step": 2105
+    },
+    {
+      "epoch": 8.702479338842975,
+      "grad_norm": 0.07399041950702667,
+      "learning_rate": 4.532602504063344e-07,
+      "loss": 0.1023,
+      "mean_token_accuracy": 0.9700278043746948,
+      "step": 2106
+    },
+    {
+      "epoch": 8.706611570247933,
+      "grad_norm": 0.0878264531493187,
+      "learning_rate": 4.504219192808529e-07,
+      "loss": 0.0977,
+      "mean_token_accuracy": 0.970322847366333,
+      "step": 2107
+    },
+    {
+      "epoch": 8.710743801652892,
+      "grad_norm": 0.0884372815489769,
+      "learning_rate": 4.4759208373193365e-07,
+      "loss": 0.1047,
+      "mean_token_accuracy": 0.9682474136352539,
+      "step": 2108
+    },
+    {
+      "epoch": 8.714876033057852,
+      "grad_norm": 0.09372899681329727,
+      "learning_rate": 4.447707490438236e-07,
+      "loss": 0.1364,
+      "mean_token_accuracy": 0.957731306552887,
+      "step": 2109
+    },
+    {
+      "epoch": 8.71900826446281,
+      "grad_norm": 0.09550356864929199,
+      "learning_rate": 4.4195792048489226e-07,
+      "loss": 0.1414,
+      "mean_token_accuracy": 0.9544153213500977,
+      "step": 2110
+    },
+    {
+      "epoch": 8.723140495867769,
+      "grad_norm": 0.07899662107229233,
+      "learning_rate": 4.39153603307626e-07,
+      "loss": 0.0792,
+      "mean_token_accuracy": 0.9783337116241455,
+      "step": 2111
+    },
+    {
+      "epoch": 8.727272727272727,
+      "grad_norm": 0.09830790758132935,
+      "learning_rate": 4.363578027486187e-07,
+      "loss": 0.1541,
+      "mean_token_accuracy": 0.9491906762123108,
+      "step": 2112
+    },
+    {
+      "epoch": 8.731404958677686,
+      "grad_norm": 0.09043899923563004,
+      "learning_rate": 4.335705240285609e-07,
+      "loss": 0.102,
+      "mean_token_accuracy": 0.9689905047416687,
+      "step": 2113
+    },
+    {
+      "epoch": 8.735537190082646,
+      "grad_norm": 0.09777707606554031,
+      "learning_rate": 4.307917723522315e-07,
+      "loss": 0.1352,
+      "mean_token_accuracy": 0.9594070911407471,
+      "step": 2114
+    },
+    {
+      "epoch": 8.739669421487603,
+      "grad_norm": 0.10187830775976181,
+      "learning_rate": 4.2802155290848133e-07,
+      "loss": 0.0885,
+      "mean_token_accuracy": 0.9753796458244324,
+      "step": 2115
+    },
+    {
+      "epoch": 8.743801652892563,
+      "grad_norm": 0.09098262339830399,
+      "learning_rate": 4.2525987087023433e-07,
+      "loss": 0.0822,
+      "mean_token_accuracy": 0.9769821166992188,
+      "step": 2116
+    },
+    {
+      "epoch": 8.74793388429752,
+      "grad_norm": 0.08996855467557907,
+      "learning_rate": 4.225067313944703e-07,
+      "loss": 0.097,
+      "mean_token_accuracy": 0.9690431356430054,
+      "step": 2117
+    },
+    {
+      "epoch": 8.75206611570248,
+      "grad_norm": 0.09314204007387161,
+      "learning_rate": 4.1976213962221513e-07,
+      "loss": 0.0865,
+      "mean_token_accuracy": 0.9770414233207703,
+      "step": 2118
+    },
+    {
+      "epoch": 8.756198347107437,
+      "grad_norm": 0.10382431000471115,
+      "learning_rate": 4.1702610067853756e-07,
+      "loss": 0.1087,
+      "mean_token_accuracy": 0.9670014381408691,
+      "step": 2119
+    },
+    {
+      "epoch": 8.760330578512397,
+      "grad_norm": 0.08995066583156586,
     }
   ],
   "logging_steps": 1,
@@ -15586,7 +17531,7 @@
       "attributes": {}
     }
   },
-  "total_flos":
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
|
17038 |
+
"learning_rate": 4.1429861967253073e-07,
|
17039 |
+
"loss": 0.102,
|
17040 |
+
"mean_token_accuracy": 0.9701564311981201,
|
17041 |
+
"step": 2120
|
17042 |
+
},
|
17043 |
+
{
|
17044 |
+
"epoch": 8.764462809917354,
|
17045 |
+
"grad_norm": 0.11203364282846451,
|
17046 |
+
"learning_rate": 4.1157970169731057e-07,
|
17047 |
+
"loss": 0.1025,
|
17048 |
+
"mean_token_accuracy": 0.9696673154830933,
|
17049 |
+
"step": 2121
|
17050 |
+
},
|
17051 |
+
{
|
17052 |
+
"epoch": 8.768595041322314,
|
17053 |
+
"grad_norm": 0.111559197306633,
|
17054 |
+
"learning_rate": 4.088693518300019e-07,
|
17055 |
+
"loss": 0.1204,
|
17056 |
+
"mean_token_accuracy": 0.9662195444107056,
|
17057 |
+
"step": 2122
|
17058 |
+
},
|
17059 |
+
{
|
17060 |
+
"epoch": 8.772727272727273,
|
17061 |
+
"grad_norm": 0.10789839178323746,
|
17062 |
+
"learning_rate": 4.0616757513173123e-07,
|
17063 |
+
"loss": 0.1253,
|
17064 |
+
"mean_token_accuracy": 0.9634451866149902,
|
17065 |
+
"step": 2123
|
17066 |
+
},
|
17067 |
+
{
|
17068 |
+
"epoch": 8.776859504132231,
|
17069 |
+
"grad_norm": 0.0962676927447319,
|
17070 |
+
"learning_rate": 4.0347437664761237e-07,
|
17071 |
+
"loss": 0.1044,
|
17072 |
+
"mean_token_accuracy": 0.9675620794296265,
|
17073 |
+
"step": 2124
|
17074 |
+
},
|
17075 |
+
{
|
17076 |
+
"epoch": 8.78099173553719,
|
17077 |
+
"grad_norm": 0.09367375820875168,
|
17078 |
+
"learning_rate": 4.0078976140674465e-07,
|
17079 |
+
"loss": 0.0944,
|
17080 |
+
"mean_token_accuracy": 0.9761354923248291,
|
17081 |
+
"step": 2125
|
17082 |
+
},
|
17083 |
+
{
|
17084 |
+
"epoch": 8.785123966942148,
|
17085 |
+
"grad_norm": 0.09414532035589218,
|
17086 |
+
"learning_rate": 3.981137344221986e-07,
|
17087 |
+
"loss": 0.0882,
|
17088 |
+
"mean_token_accuracy": 0.9753566980361938,
|
17089 |
+
"step": 2126
|
17090 |
+
},
|
17091 |
+
{
|
17092 |
+
"epoch": 8.789256198347108,
|
17093 |
+
"grad_norm": 0.10240423679351807,
|
17094 |
+
"learning_rate": 3.9544630069100644e-07,
|
17095 |
+
"loss": 0.0886,
|
17096 |
+
"mean_token_accuracy": 0.9755526781082153,
|
17097 |
+
"step": 2127
|
17098 |
+
},
|
17099 |
+
{
|
17100 |
+
"epoch": 8.793388429752067,
|
17101 |
+
"grad_norm": 0.10124680399894714,
|
17102 |
+
"learning_rate": 3.9278746519415655e-07,
|
17103 |
+
"loss": 0.0851,
|
17104 |
+
"mean_token_accuracy": 0.9779295921325684,
|
17105 |
+
"step": 2128
|
17106 |
+
},
|
17107 |
+
{
|
17108 |
+
"epoch": 8.797520661157025,
|
17109 |
+
"grad_norm": 0.11509440094232559,
|
17110 |
+
"learning_rate": 3.901372328965769e-07,
|
17111 |
+
"loss": 0.0877,
|
17112 |
+
"mean_token_accuracy": 0.9781274795532227,
|
17113 |
+
"step": 2129
|
17114 |
+
},
|
17115 |
+
{
|
17116 |
+
"epoch": 8.801652892561984,
|
17117 |
+
"grad_norm": 0.12082704156637192,
|
17118 |
+
"learning_rate": 3.874956087471354e-07,
|
17119 |
+
"loss": 0.108,
|
17120 |
+
"mean_token_accuracy": 0.9729946255683899,
|
17121 |
+
"step": 2130
|
17122 |
+
},
|
17123 |
+
{
|
17124 |
+
"epoch": 8.805785123966942,
|
17125 |
+
"grad_norm": 0.0965445339679718,
|
17126 |
+
"learning_rate": 3.8486259767862243e-07,
|
17127 |
+
"loss": 0.0809,
|
17128 |
+
"mean_token_accuracy": 0.9787408113479614,
|
17129 |
+
"step": 2131
|
17130 |
+
},
|
17131 |
+
{
|
17132 |
+
"epoch": 8.809917355371901,
|
17133 |
+
"grad_norm": 0.11878959834575653,
|
17134 |
+
"learning_rate": 3.822382046077483e-07,
|
17135 |
+
"loss": 0.1459,
|
17136 |
+
"mean_token_accuracy": 0.9578744769096375,
|
17137 |
+
"step": 2132
|
17138 |
+
},
|
17139 |
+
{
|
17140 |
+
"epoch": 8.814049586776859,
|
17141 |
+
"grad_norm": 0.10381490737199783,
|
17142 |
+
"learning_rate": 3.7962243443512627e-07,
|
17143 |
+
"loss": 0.0811,
|
17144 |
+
"mean_token_accuracy": 0.9788123965263367,
|
17145 |
+
"step": 2133
|
17146 |
+
},
|
17147 |
+
{
|
17148 |
+
"epoch": 8.818181818181818,
|
17149 |
+
"grad_norm": 0.11626556515693665,
|
17150 |
+
"learning_rate": 3.7701529204526856e-07,
|
17151 |
+
"loss": 0.1372,
|
17152 |
+
"mean_token_accuracy": 0.9591605067253113,
|
17153 |
+
"step": 2134
|
17154 |
+
},
|
17155 |
+
{
|
17156 |
+
"epoch": 8.822314049586776,
|
17157 |
+
"grad_norm": 0.10921085625886917,
|
17158 |
+
"learning_rate": 3.744167823065814e-07,
|
17159 |
+
"loss": 0.0944,
|
17160 |
+
"mean_token_accuracy": 0.9741052985191345,
|
17161 |
+
"step": 2135
|
17162 |
+
},
|
17163 |
+
{
|
17164 |
+
"epoch": 8.826446280991735,
|
17165 |
+
"grad_norm": 0.1696995347738266,
|
17166 |
+
"learning_rate": 3.718269100713445e-07,
|
17167 |
+
"loss": 0.1855,
|
17168 |
+
"mean_token_accuracy": 0.9409846663475037,
|
17169 |
+
"step": 2136
|
17170 |
+
},
|
17171 |
+
{
|
17172 |
+
"epoch": 8.830578512396695,
|
17173 |
+
"grad_norm": 0.06852439790964127,
|
17174 |
+
"learning_rate": 3.692456801757133e-07,
|
17175 |
+
"loss": 0.2206,
|
17176 |
+
"mean_token_accuracy": 0.9229573011398315,
|
17177 |
+
"step": 2137
|
17178 |
+
},
|
17179 |
+
{
|
17180 |
+
"epoch": 8.834710743801653,
|
17181 |
+
"grad_norm": 0.07205780595541,
|
17182 |
+
"learning_rate": 3.6667309743970147e-07,
|
17183 |
+
"loss": 0.2139,
|
17184 |
+
"mean_token_accuracy": 0.9254477024078369,
|
17185 |
+
"step": 2138
|
17186 |
+
},
|
17187 |
+
{
|
17188 |
+
"epoch": 8.838842975206612,
|
17189 |
+
"grad_norm": 0.07377646863460541,
|
17190 |
+
"learning_rate": 3.641091666671781e-07,
|
17191 |
+
"loss": 0.1921,
|
17192 |
+
"mean_token_accuracy": 0.9347940683364868,
|
17193 |
+
"step": 2139
|
17194 |
+
},
|
17195 |
+
{
|
17196 |
+
"epoch": 8.84297520661157,
|
17197 |
+
"grad_norm": 0.08118410408496857,
|
17198 |
+
"learning_rate": 3.615538926458556e-07,
|
17199 |
+
"loss": 0.2315,
|
17200 |
+
"mean_token_accuracy": 0.9196543097496033,
|
17201 |
+
"step": 2140
|
17202 |
+
},
|
17203 |
+
{
|
17204 |
+
"epoch": 8.847107438016529,
|
17205 |
+
"grad_norm": 0.0802064910531044,
|
17206 |
+
"learning_rate": 3.5900728014728046e-07,
|
17207 |
+
"loss": 0.2082,
|
17208 |
+
"mean_token_accuracy": 0.9265360832214355,
|
17209 |
+
"step": 2141
|
17210 |
+
},
|
17211 |
+
{
|
17212 |
+
"epoch": 8.851239669421489,
|
17213 |
+
"grad_norm": 0.07559309154748917,
|
17214 |
+
"learning_rate": 3.564693339268266e-07,
|
17215 |
+
"loss": 0.2115,
|
17216 |
+
"mean_token_accuracy": 0.9263385534286499,
|
17217 |
+
"step": 2142
|
17218 |
+
},
|
17219 |
+
{
|
17220 |
+
"epoch": 8.855371900826446,
|
17221 |
+
"grad_norm": 0.07263598591089249,
|
17222 |
+
"learning_rate": 3.539400587236824e-07,
|
17223 |
+
"loss": 0.1836,
|
17224 |
+
"mean_token_accuracy": 0.9360730648040771,
|
17225 |
+
"step": 2143
|
17226 |
+
},
|
17227 |
+
{
|
17228 |
+
"epoch": 8.859504132231406,
|
17229 |
+
"grad_norm": 0.0771942138671875,
|
17230 |
+
"learning_rate": 3.514194592608489e-07,
|
17231 |
+
"loss": 0.1768,
|
17232 |
+
"mean_token_accuracy": 0.9406779408454895,
|
17233 |
+
"step": 2144
|
17234 |
+
},
|
17235 |
+
{
|
17236 |
+
"epoch": 8.863636363636363,
|
17237 |
+
"grad_norm": 0.1021052822470665,
|
17238 |
+
"learning_rate": 3.4890754024512254e-07,
|
17239 |
+
"loss": 0.2382,
|
17240 |
+
"mean_token_accuracy": 0.9189664721488953,
|
17241 |
+
"step": 2145
|
17242 |
+
},
|
17243 |
+
{
|
17244 |
+
"epoch": 8.867768595041323,
|
17245 |
+
"grad_norm": 0.07152920961380005,
|
17246 |
+
"learning_rate": 3.464043063670941e-07,
|
17247 |
+
"loss": 0.176,
|
17248 |
+
"mean_token_accuracy": 0.9418604373931885,
|
17249 |
+
"step": 2146
|
17250 |
+
},
|
17251 |
+
{
|
17252 |
+
"epoch": 8.87190082644628,
|
17253 |
+
"grad_norm": 0.09480854123830795,
|
17254 |
+
"learning_rate": 3.439097623011328e-07,
|
17255 |
+
"loss": 0.227,
|
17256 |
+
"mean_token_accuracy": 0.9247565865516663,
|
17257 |
+
"step": 2147
|
17258 |
+
},
|
17259 |
+
{
|
17260 |
+
"epoch": 8.87603305785124,
|
17261 |
+
"grad_norm": 0.06957484036684036,
|
17262 |
+
"learning_rate": 3.41423912705382e-07,
|
17263 |
+
"loss": 0.1406,
|
17264 |
+
"mean_token_accuracy": 0.9543790221214294,
|
17265 |
+
"step": 2148
|
17266 |
+
},
|
17267 |
+
{
|
17268 |
+
"epoch": 8.880165289256198,
|
17269 |
+
"grad_norm": 0.08442248404026031,
|
17270 |
+
"learning_rate": 3.389467622217524e-07,
|
17271 |
+
"loss": 0.1655,
|
17272 |
+
"mean_token_accuracy": 0.9483348727226257,
|
17273 |
+
"step": 2149
|
17274 |
+
},
|
17275 |
+
{
|
17276 |
+
"epoch": 8.884297520661157,
|
17277 |
+
"grad_norm": 0.08726052194833755,
|
17278 |
+
"learning_rate": 3.3647831547590714e-07,
|
17279 |
+
"loss": 0.171,
|
17280 |
+
"mean_token_accuracy": 0.9444353580474854,
|
17281 |
+
"step": 2150
|
17282 |
+
},
|
17283 |
+
{
|
17284 |
+
"epoch": 8.888429752066116,
|
17285 |
+
"grad_norm": 0.0831826776266098,
|
17286 |
+
"learning_rate": 3.340185770772586e-07,
|
17287 |
+
"loss": 0.1653,
|
17288 |
+
"mean_token_accuracy": 0.9473860859870911,
|
17289 |
+
"step": 2151
|
17290 |
+
},
|
17291 |
+
{
|
17292 |
+
"epoch": 8.892561983471074,
|
17293 |
+
"grad_norm": 0.07340402901172638,
|
17294 |
+
"learning_rate": 3.3156755161895647e-07,
|
17295 |
+
"loss": 0.126,
|
17296 |
+
"mean_token_accuracy": 0.9628297090530396,
|
17297 |
+
"step": 2152
|
17298 |
+
},
|
17299 |
+
{
|
17300 |
+
"epoch": 8.896694214876034,
|
17301 |
+
"grad_norm": 0.08021709322929382,
|
17302 |
+
"learning_rate": 3.2912524367788077e-07,
|
17303 |
+
"loss": 0.1286,
|
17304 |
+
"mean_token_accuracy": 0.9604715704917908,
|
17305 |
+
"step": 2153
|
17306 |
+
},
|
17307 |
+
{
|
17308 |
+
"epoch": 8.900826446280991,
|
17309 |
+
"grad_norm": 0.09082391858100891,
|
17310 |
+
"learning_rate": 3.26691657814634e-07,
|
17311 |
+
"loss": 0.1633,
|
17312 |
+
"mean_token_accuracy": 0.9478123188018799,
|
17313 |
+
"step": 2154
|
17314 |
+
},
|
17315 |
+
{
|
17316 |
+
"epoch": 8.90495867768595,
|
17317 |
+
"grad_norm": 0.07327866554260254,
|
17318 |
+
"learning_rate": 3.2426679857353205e-07,
|
17319 |
+
"loss": 0.1074,
|
17320 |
+
"mean_token_accuracy": 0.9684982895851135,
|
17321 |
+
"step": 2155
|
17322 |
+
},
|
17323 |
+
{
|
17324 |
+
"epoch": 8.909090909090908,
|
17325 |
+
"grad_norm": 0.10244060307741165,
|
17326 |
+
"learning_rate": 3.2185067048259245e-07,
|
17327 |
+
"loss": 0.2105,
|
17328 |
+
"mean_token_accuracy": 0.9306122660636902,
|
17329 |
+
"step": 2156
|
17330 |
+
},
|
17331 |
+
{
|
17332 |
+
"epoch": 8.913223140495868,
|
17333 |
+
"grad_norm": 0.10891727358102798,
|
17334 |
+
"learning_rate": 3.194432780535295e-07,
|
17335 |
+
"loss": 0.1737,
|
17336 |
+
"mean_token_accuracy": 0.9447806477546692,
|
17337 |
+
"step": 2157
|
17338 |
+
},
|
17339 |
+
{
|
17340 |
+
"epoch": 8.917355371900827,
|
17341 |
+
"grad_norm": 0.08316652476787567,
|
17342 |
+
"learning_rate": 3.1704462578174945e-07,
|
17343 |
+
"loss": 0.0987,
|
17344 |
+
"mean_token_accuracy": 0.9734411239624023,
|
17345 |
+
"step": 2158
|
17346 |
+
},
|
17347 |
+
{
|
17348 |
+
"epoch": 8.921487603305785,
|
17349 |
+
"grad_norm": 0.09768752753734589,
|
17350 |
+
"learning_rate": 3.146547181463322e-07,
|
17351 |
+
"loss": 0.1241,
|
17352 |
+
"mean_token_accuracy": 0.9615846276283264,
|
17353 |
+
"step": 2159
|
17354 |
+
},
|
17355 |
+
{
|
17356 |
+
"epoch": 8.925619834710744,
|
17357 |
+
"grad_norm": 0.09764862805604935,
|
17358 |
+
"learning_rate": 3.1227355961003183e-07,
|
17359 |
+
"loss": 0.1175,
|
17360 |
+
"mean_token_accuracy": 0.962368905544281,
|
17361 |
+
"step": 2160
|
17362 |
+
},
|
17363 |
+
{
|
17364 |
+
"epoch": 8.929752066115702,
|
17365 |
+
"grad_norm": 0.09442981332540512,
|
17366 |
+
"learning_rate": 3.099011546192621e-07,
|
17367 |
+
"loss": 0.1053,
|
17368 |
+
"mean_token_accuracy": 0.9669612646102905,
|
17369 |
+
"step": 2161
|
17370 |
+
},
|
17371 |
+
{
|
17372 |
+
"epoch": 8.933884297520661,
|
17373 |
+
"grad_norm": 0.08316774666309357,
|
17374 |
+
"learning_rate": 3.075375076040943e-07,
|
17375 |
+
"loss": 0.092,
|
17376 |
+
"mean_token_accuracy": 0.971761167049408,
|
17377 |
+
"step": 2162
|
17378 |
+
},
|
17379 |
+
{
|
17380 |
+
"epoch": 8.938016528925619,
|
17381 |
+
"grad_norm": 0.09355759620666504,
|
17382 |
+
"learning_rate": 3.051826229782451e-07,
|
17383 |
+
"loss": 0.1264,
|
17384 |
+
"mean_token_accuracy": 0.9610360860824585,
|
17385 |
+
"step": 2163
|
17386 |
+
},
|
17387 |
+
{
|
17388 |
+
"epoch": 8.942148760330578,
|
17389 |
+
"grad_norm": 0.08833235502243042,
|
17390 |
+
"learning_rate": 3.0283650513906524e-07,
|
17391 |
+
"loss": 0.0889,
|
17392 |
+
"mean_token_accuracy": 0.9773091673851013,
|
17393 |
+
"step": 2164
|
17394 |
+
},
|
17395 |
+
{
|
17396 |
+
"epoch": 8.946280991735538,
|
17397 |
+
"grad_norm": 0.1082148551940918,
|
17398 |
+
"learning_rate": 3.0049915846753983e-07,
|
17399 |
+
"loss": 0.0873,
|
17400 |
+
"mean_token_accuracy": 0.9752772450447083,
|
17401 |
+
"step": 2165
|
17402 |
+
},
|
17403 |
+
{
|
17404 |
+
"epoch": 8.950413223140496,
|
17405 |
+
"grad_norm": 0.09726337343454361,
|
17406 |
+
"learning_rate": 2.981705873282714e-07,
|
17407 |
+
"loss": 0.0871,
|
17408 |
+
"mean_token_accuracy": 0.97633957862854,
|
17409 |
+
"step": 2166
|
17410 |
+
},
|
17411 |
+
{
|
17412 |
+
"epoch": 8.954545454545455,
|
17413 |
+
"grad_norm": 0.09049960970878601,
|
17414 |
+
"learning_rate": 2.9585079606947843e-07,
|
17415 |
+
"loss": 0.1021,
|
17416 |
+
"mean_token_accuracy": 0.9693925380706787,
|
17417 |
+
"step": 2167
|
17418 |
+
},
|
17419 |
+
{
|
17420 |
+
"epoch": 8.958677685950413,
|
17421 |
+
"grad_norm": 0.10062138736248016,
|
17422 |
+
"learning_rate": 2.9353978902298296e-07,
|
17423 |
+
"loss": 0.1027,
|
17424 |
+
"mean_token_accuracy": 0.9701105952262878,
|
17425 |
+
"step": 2168
|
17426 |
+
},
|
17427 |
+
{
|
17428 |
+
"epoch": 8.962809917355372,
|
17429 |
+
"grad_norm": 0.09967434406280518,
|
17430 |
+
"learning_rate": 2.9123757050420476e-07,
|
17431 |
+
"loss": 0.106,
|
17432 |
+
"mean_token_accuracy": 0.9688540101051331,
|
17433 |
+
"step": 2169
|
17434 |
+
},
|
17435 |
+
{
|
17436 |
+
"epoch": 8.96694214876033,
|
17437 |
+
"grad_norm": 0.08865738660097122,
|
17438 |
+
"learning_rate": 2.889441448121516e-07,
|
17439 |
+
"loss": 0.1013,
|
17440 |
+
"mean_token_accuracy": 0.970402717590332,
|
17441 |
+
"step": 2170
|
17442 |
+
},
|
17443 |
+
{
|
17444 |
+
"epoch": 8.97107438016529,
|
17445 |
+
"grad_norm": 0.11323466897010803,
|
17446 |
+
"learning_rate": 2.8665951622941225e-07,
|
17447 |
+
"loss": 0.138,
|
17448 |
+
"mean_token_accuracy": 0.9595091938972473,
|
17449 |
+
"step": 2171
|
17450 |
+
},
|
17451 |
+
{
|
17452 |
+
"epoch": 8.975206611570249,
|
17453 |
+
"grad_norm": 0.08644384145736694,
|
17454 |
+
"learning_rate": 2.843836890221502e-07,
|
17455 |
+
"loss": 0.0865,
|
17456 |
+
"mean_token_accuracy": 0.9795562028884888,
|
17457 |
+
"step": 2172
|
17458 |
+
},
|
17459 |
+
{
|
17460 |
+
"epoch": 8.979338842975206,
|
17461 |
+
"grad_norm": 0.10215216130018234,
|
17462 |
+
"learning_rate": 2.821166674400905e-07,
|
17463 |
+
"loss": 0.1145,
|
17464 |
+
"mean_token_accuracy": 0.9649077653884888,
|
17465 |
+
"step": 2173
|
17466 |
+
},
|
17467 |
+
{
|
17468 |
+
"epoch": 8.983471074380166,
|
17469 |
+
"grad_norm": 0.11719004064798355,
|
17470 |
+
"learning_rate": 2.798584557165185e-07,
|
17471 |
+
"loss": 0.1057,
|
17472 |
+
"mean_token_accuracy": 0.969704806804657,
|
17473 |
+
"step": 2174
|
17474 |
+
},
|
17475 |
+
{
|
17476 |
+
"epoch": 8.987603305785123,
|
17477 |
+
"grad_norm": 0.10035425424575806,
|
17478 |
+
"learning_rate": 2.7760905806826554e-07,
|
17479 |
+
"loss": 0.095,
|
17480 |
+
"mean_token_accuracy": 0.9760020971298218,
|
17481 |
+
"step": 2175
|
17482 |
+
},
|
17483 |
+
{
|
17484 |
+
"epoch": 8.991735537190083,
|
17485 |
+
"grad_norm": 0.12130289524793625,
|
17486 |
+
"learning_rate": 2.753684786957067e-07,
|
17487 |
+
"loss": 0.1554,
|
17488 |
+
"mean_token_accuracy": 0.951777994632721,
|
17489 |
+
"step": 2176
|
17490 |
+
},
|
17491 |
+
{
|
17492 |
+
"epoch": 8.99586776859504,
|
17493 |
+
"grad_norm": 0.0855918899178505,
|
17494 |
+
"learning_rate": 2.7313672178274906e-07,
|
17495 |
+
"loss": 0.0821,
|
17496 |
+
"mean_token_accuracy": 0.9829513430595398,
|
17497 |
+
"step": 2177
|
17498 |
+
},
|
17499 |
+
{
|
17500 |
+
"epoch": 9.0,
|
17501 |
+
"grad_norm": 0.12811078131198883,
|
17502 |
+
"learning_rate": 2.7091379149682683e-07,
|
17503 |
+
"loss": 0.1026,
|
17504 |
+
"mean_token_accuracy": 0.973548173904419,
|
17505 |
+
"step": 2178
|
17506 |
+
},
|
17507 |
+
{
|
17508 |
+
"epoch": 9.0,
|
17509 |
+
"eval_loss": 0.1644172966480255,
|
17510 |
+
"eval_mean_token_accuracy": 0.9770992398262024,
|
17511 |
+
"eval_runtime": 0.2104,
|
17512 |
+
"eval_samples_per_second": 23.768,
|
17513 |
+
"eval_steps_per_second": 4.754,
|
17514 |
+
"step": 2178
|
17515 |
}
|
17516 |
],
|
17517 |
"logging_steps": 1,
|
|
|
17531 |
"attributes": {}
|
17532 |
}
|
17533 |
},
|
17534 |
+
"total_flos": 1.0074075399098204e+18,
|
17535 |
"train_batch_size": 2,
|
17536 |
"trial_name": null,
|
17537 |
"trial_params": null
|
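
The hunk above appends one record per optimizer step for epoch 9 (`epoch`, `grad_norm`, `learning_rate`, `loss`, `mean_token_accuracy`, `step` for steps 2094-2178), plus an end-of-epoch evaluation record, and updates the cumulative `total_flos`. As a minimal sketch for inspecting these metrics, assuming the file is a standard Hugging Face `trainer_state.json` whose records live under the `log_history` key and that the checkpoint sits at the path used below:

```python
import json

# Load the checkpoint's trainer state (path assumed from this checkpoint's layout).
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Per-step training records carry "loss"; evaluation records carry "eval_loss".
train_logs = [rec for rec in state["log_history"] if "loss" in rec]
eval_logs = [rec for rec in state["log_history"] if "eval_loss" in rec]

# Average the training loss over the steps added in this hunk (2094-2178).
window = [rec for rec in train_logs if 2094 <= rec["step"] <= 2178]
mean_loss = sum(rec["loss"] for rec in window) / len(window)
print(f"mean loss over steps 2094-2178: {mean_loss:.4f}")
print(f"epoch-9 eval_loss: {eval_logs[-1]['eval_loss']:.4f}")
```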