Upload E2 checkpoint
Browse files

- global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
- global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
- global_step2000/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
- global_step2000/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
- global_step2000/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
- global_step2000/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
- global_step2000/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
- global_step2000/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
- global_step2000/zero_pp_rank_7_mp_rank_00_model_states.pt +3 -0
- latest +1 -1
- model-00001-of-00004.safetensors +1 -1
- model-00002-of-00004.safetensors +1 -1
- model-00003-of-00004.safetensors +1 -1
- model-00004-of-00004.safetensors +1 -1
- rng_state_0.pth +1 -1
- rng_state_1.pth +1 -1
- rng_state_2.pth +1 -1
- rng_state_3.pth +1 -1
- rng_state_4.pth +1 -1
- rng_state_5.pth +1 -1
- rng_state_6.pth +1 -1
- rng_state_7.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +1603 -3
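The large files in this commit are stored as Git LFS objects; the diff below shows only their pointer files. As a reference, a minimal sketch of fetching just this checkpoint with `huggingface_hub` (the repo id is a placeholder, since the commit page does not name it):

```python
# Minimal sketch: download only the step-2000 checkpoint files.
# "user/repo" is a placeholder -- the actual repo id is not shown in this commit.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="user/repo",
    allow_patterns=[
        "global_step2000/*",      # DeepSpeed ZeRO shards
        "latest",                 # tag file pointing at global_step2000
        "model-*.safetensors",    # consolidated bf16 weights
        "trainer_state.json",
    ],
)
print(local_dir)
```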
global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cdb4b556c834ccf83a60d6efd651e4482b75d238db8e36cd111e4936004e067
+size 11423429708
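Each of these entries is a Git LFS pointer rather than the tensor data itself: three lines giving the spec version, the SHA-256 of the actual object, and its size in bytes. A minimal sketch of checking a downloaded object against its pointer (standard library only; paths are illustrative):

```python
# Verify a downloaded LFS object against its pointer file (paths illustrative).
import hashlib

def parse_pointer(path: str) -> dict:
    """Read the 'key value' lines of a Git LFS pointer file into a dict."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

def verify(pointer_path: str, blob_path: str) -> bool:
    fields = parse_pointer(pointer_path)
    expected = fields["oid"].removeprefix("sha256:")
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected
```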
global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:157dd94d9a55fc3c0344dbb3607edb8a3fd75d0a5fcf6ddaa5d86a1ddb6adf39
+size 11423429708
global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3a0e641f7aa992b1213e1cf6fca46e093c11db7c5ec94522b432eed86ab2e89
+size 11423429708
global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8126432d183f33ac27017c30601eb49eb821a3f05e117366f325670178c5aa7a
+size 11423429708
global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d28d6c8ec1e368672960aa4a9efdf3ea0bd755b54ab8e866caad394e0d8291c
+size 11423429708
global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b568717af726d537029642b4f77c9e67419b0a74a5521977b8d486f31fd3db8a
+size 11423429708
global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fd593e3f8f4b5a516dfac8092155dbc1be416c977f159164e1389e9130ffe92
+size 11423429708
global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19e93281714c7d80e0f04d6adea4bb1f33baf4b4f5c5b57cbd963ac688bd48dc
+size 11423429708
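A quick consistency check on the shard sizes: the eight equal optimizer shards total about 91.4 GB. Assuming DeepSpeed's bf16 ZeRO optimizer holds fp32 master weights plus Adam moments (roughly 12 bytes per parameter, partitioned across the 8 ranks), that implies a model of about 7.6B parameters, which matches the ~15.2 GB of bf16 safetensors shards further down:

```python
# Back-of-envelope check (assumes ~12 bytes/param of fp32 optimizer state:
# master weights + Adam m and v; the real layout may differ slightly).
optim_total = 8 * 11_423_429_708                 # eight shards above
params_from_optim = optim_total / 12             # ~7.62e9

weights_total = 4_877_660_776 + 4_932_751_008 + 4_330_865_200 + 1_089_994_880
params_from_bf16 = weights_total / 2             # bf16 = 2 bytes/param, ~7.62e9

print(f"{params_from_optim / 1e9:.2f}B vs {params_from_bf16 / 1e9:.2f}B")
```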
global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:459639211da751f24943c72ae6f76c43cba665eb88ee9a83eaca7fd596e03747
+size 166293
global_step2000/zero_pp_rank_1_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:741e71908eea77bb92b52656fae9fffca00913e381f2fc98e56cbb68ab267b52
+size 166293
global_step2000/zero_pp_rank_2_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37b14cf128b2071636107aad6e5665dc148fc8be3b34c1c7e371603803cfe352
+size 166293
global_step2000/zero_pp_rank_3_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6d9a6a4a5b01db724facd96e84a303c0af818a248906bb0f1b0978465313826
+size 166293
global_step2000/zero_pp_rank_4_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:760d5052d2770143bd31198a769e3739d4bcbda61ee691a9a9ea840e61b98f95
+size 166293
global_step2000/zero_pp_rank_5_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f92194315b137a7902feced8b6b4c08f0c3af63ddc82836520e59d2e49f83a7d
+size 166293
global_step2000/zero_pp_rank_6_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5987c9b0ed132408a1e457b107d52473e77f8755e53cfad66e69501ba13f2e1
+size 166293
global_step2000/zero_pp_rank_7_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc325b7ed520bce8f3b9cf5068eaca86b253acabc55f6d9a45f61d85ad0e13a9
+size 166293
latest
CHANGED
@@ -1 +1 @@
-
+global_step2000
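`latest` is the tag file DeepSpeed consults to locate the newest checkpoint directory, here `global_step2000`. To fold the eight ZeRO shards back into a single fp32 state dict, DeepSpeed ships a converter; a minimal sketch (the checkpoint path is illustrative):

```python
# Consolidate the sharded ZeRO checkpoint into one fp32 state dict.
# DeepSpeed also installs this as a standalone zero_to_fp32.py script.
import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# Directory that contains `latest` and `global_step2000/` (path illustrative).
state_dict = get_fp32_state_dict_from_zero_checkpoint("path/to/checkpoint")
torch.save(state_dict, "consolidated_fp32.pt")
```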
model-00001-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f1c8ebd9244c4146ecf809272a71d6b4de47fc24d81bd932a9b771cb1f41d6c2
 size 4877660776
model-00002-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:90d30071d983d8bac86b3707db92a52bf4d92e0a981a63886aed68b61a85dd05
 size 4932751008
model-00003-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:77bde50d5d797508e74fe17bb3105a5f1a1d3d46dd9db24e584e2a67371cf791
 size 4330865200
model-00004-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4c111ea5d7f44c38d24eaa6ff0aa8d4ace60b5ff82e44f039b1e8e97d7c7372a
 size 1089994880
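The four safetensors shards above are the consolidated bf16 weights (~15.2 GB in total). A minimal loading sketch, assuming the repo also carries the `config.json` and `model.safetensors.index.json` that sharded loading needs (neither appears in this commit):

```python
# Load the sharded safetensors weights (assumes config.json and the
# model.safetensors.index.json shard index sit alongside the shards).
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "path/to/checkpoint",        # illustrative local path or repo id
    torch_dtype=torch.bfloat16,
)
```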
rng_state_0.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85
 size 15984
rng_state_1.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73
 size 15984
rng_state_2.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b
 size 15984
rng_state_3.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc
 size 15984
rng_state_4.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972
 size 15984
rng_state_5.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991
 size 15984
rng_state_6.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa
 size 15984
rng_state_7.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773
 size 15984
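The eight `rng_state_*.pth` files snapshot per-rank random state so a resumed run replays the same data order and dropout masks. A minimal inspection sketch; the key names are what recent `transformers` Trainer versions typically save, so treat them as an assumption:

```python
# Peek inside one per-rank RNG snapshot (key names may vary by version).
import torch

state = torch.load("rng_state_0.pth", weights_only=False)
print(sorted(state))  # typically ['cpu', 'cuda', 'numpy', 'python']
```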
scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0a1f4ee95be23d350caa2c6cf69b932338c4918ddeea5378d2ec4a8922e5be11
 size 1064
trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch":
+  "epoch": 1.86219739292365,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 2000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1608,6 +1608,1606 @@
       "loss": 0.4178,
       "num_tokens": 1045662642.0,
       "step": 1000
1611 |
}
|
1612 |
],
|
1613 |
"logging_steps": 5,
|
@@ -1627,7 +3227,7 @@
|
|
1627 |
"attributes": {}
|
1628 |
}
|
1629 |
},
|
1630 |
-
"total_flos":
|
1631 |
"train_batch_size": 1,
|
1632 |
"trial_name": null,
|
1633 |
"trial_params": null
|
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
+
"epoch": 1.86219739292365,
|
6 |
"eval_steps": 500,
|
7 |
+
"global_step": 2000,
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
|
|
1608 |
"loss": 0.4178,
|
1609 |
"num_tokens": 1045662642.0,
|
1610 |
"step": 1000
|
1611 |
+
},
|
1612 |
+
{
|
1613 |
+
"epoch": 0.9357541899441341,
|
1614 |
+
"grad_norm": 0.5658078705552342,
|
1615 |
+
"learning_rate": 3.8237323214901695e-05,
|
1616 |
+
"loss": 0.4191,
|
1617 |
+
"num_tokens": 1050864562.0,
|
1618 |
+
"step": 1005
|
1619 |
+
},
|
1620 |
+
{
|
1621 |
+
"epoch": 0.9404096834264432,
|
1622 |
+
"grad_norm": 0.5748827219407342,
|
1623 |
+
"learning_rate": 3.815108658157986e-05,
|
1624 |
+
"loss": 0.417,
|
1625 |
+
"num_tokens": 1056107442.0,
|
1626 |
+
"step": 1010
|
1627 |
+
},
|
1628 |
+
{
|
1629 |
+
"epoch": 0.9450651769087524,
|
1630 |
+
"grad_norm": 0.5609460084210117,
|
1631 |
+
"learning_rate": 3.806484994825802e-05,
|
1632 |
+
"loss": 0.4055,
|
1633 |
+
"num_tokens": 1061349356.0,
|
1634 |
+
"step": 1015
|
1635 |
+
},
|
1636 |
+
{
|
1637 |
+
"epoch": 0.9497206703910615,
|
1638 |
+
"grad_norm": 0.48644456614397286,
|
1639 |
+
"learning_rate": 3.7978613314936186e-05,
|
1640 |
+
"loss": 0.4171,
|
1641 |
+
"num_tokens": 1066567410.0,
|
1642 |
+
"step": 1020
|
1643 |
+
},
|
1644 |
+
{
|
1645 |
+
"epoch": 0.9543761638733705,
|
1646 |
+
"grad_norm": 0.49374585932277654,
|
1647 |
+
"learning_rate": 3.789237668161435e-05,
|
1648 |
+
"loss": 0.4086,
|
1649 |
+
"num_tokens": 1071744846.0,
|
1650 |
+
"step": 1025
|
1651 |
+
},
|
1652 |
+
{
|
1653 |
+
"epoch": 0.9590316573556797,
|
1654 |
+
"grad_norm": 0.4590964927032936,
|
1655 |
+
"learning_rate": 3.780614004829251e-05,
|
1656 |
+
"loss": 0.4224,
|
1657 |
+
"num_tokens": 1076987726.0,
|
1658 |
+
"step": 1030
|
1659 |
+
},
|
1660 |
+
{
|
1661 |
+
"epoch": 0.9636871508379888,
|
1662 |
+
"grad_norm": 0.5671382944104515,
|
1663 |
+
"learning_rate": 3.771990341497068e-05,
|
1664 |
+
"loss": 0.416,
|
1665 |
+
"num_tokens": 1082230606.0,
|
1666 |
+
"step": 1035
|
1667 |
+
},
|
1668 |
+
{
|
1669 |
+
"epoch": 0.9683426443202979,
|
1670 |
+
"grad_norm": 0.7029680270448432,
|
1671 |
+
"learning_rate": 3.7633666781648847e-05,
|
1672 |
+
"loss": 0.4103,
|
1673 |
+
"num_tokens": 1087473486.0,
|
1674 |
+
"step": 1040
|
1675 |
+
},
|
1676 |
+
{
|
1677 |
+
"epoch": 0.972998137802607,
|
1678 |
+
"grad_norm": 0.6041713356383028,
|
1679 |
+
"learning_rate": 3.754743014832701e-05,
|
1680 |
+
"loss": 0.4181,
|
1681 |
+
"num_tokens": 1092716366.0,
|
1682 |
+
"step": 1045
|
1683 |
+
},
|
1684 |
+
{
|
1685 |
+
"epoch": 0.9776536312849162,
|
1686 |
+
"grad_norm": 0.5410682956289158,
|
1687 |
+
"learning_rate": 3.746119351500518e-05,
|
1688 |
+
"loss": 0.4141,
|
1689 |
+
"num_tokens": 1097959246.0,
|
1690 |
+
"step": 1050
|
1691 |
+
},
|
1692 |
+
{
|
1693 |
+
"epoch": 0.9823091247672253,
|
1694 |
+
"grad_norm": 0.5964451598240291,
|
1695 |
+
"learning_rate": 3.7374956881683344e-05,
|
1696 |
+
"loss": 0.4107,
|
1697 |
+
"num_tokens": 1103182300.0,
|
1698 |
+
"step": 1055
|
1699 |
+
},
|
1700 |
+
{
|
1701 |
+
"epoch": 0.9869646182495344,
|
1702 |
+
"grad_norm": 0.6398589551828828,
|
1703 |
+
"learning_rate": 3.728872024836151e-05,
|
1704 |
+
"loss": 0.4099,
|
1705 |
+
"num_tokens": 1108425180.0,
|
1706 |
+
"step": 1060
|
1707 |
+
},
|
1708 |
+
{
|
1709 |
+
"epoch": 0.9916201117318436,
|
1710 |
+
"grad_norm": 0.711512694227751,
|
1711 |
+
"learning_rate": 3.720248361503967e-05,
|
1712 |
+
"loss": 0.4159,
|
1713 |
+
"num_tokens": 1113668060.0,
|
1714 |
+
"step": 1065
|
1715 |
+
},
|
1716 |
+
{
|
1717 |
+
"epoch": 0.9962756052141527,
|
1718 |
+
"grad_norm": 0.7462919172813813,
|
1719 |
+
"learning_rate": 3.7116246981717835e-05,
|
1720 |
+
"loss": 0.4158,
|
1721 |
+
"num_tokens": 1118847586.0,
|
1722 |
+
"step": 1070
|
1723 |
+
},
|
1724 |
+
{
|
1725 |
+
"epoch": 1.000931098696462,
|
1726 |
+
"grad_norm": 0.5826662993124105,
|
1727 |
+
"learning_rate": 3.7030010348396e-05,
|
1728 |
+
"loss": 0.399,
|
1729 |
+
"num_tokens": 1123172962.0,
|
1730 |
+
"step": 1075
|
1731 |
+
},
|
1732 |
+
{
|
1733 |
+
"epoch": 1.005586592178771,
|
1734 |
+
"grad_norm": 0.5029709348384636,
|
1735 |
+
"learning_rate": 3.694377371507416e-05,
|
1736 |
+
"loss": 0.355,
|
1737 |
+
"num_tokens": 1128415842.0,
|
1738 |
+
"step": 1080
|
1739 |
+
},
|
1740 |
+
{
|
1741 |
+
"epoch": 1.01024208566108,
|
1742 |
+
"grad_norm": 0.4893880194541248,
|
1743 |
+
"learning_rate": 3.685753708175233e-05,
|
1744 |
+
"loss": 0.3469,
|
1745 |
+
"num_tokens": 1133612886.0,
|
1746 |
+
"step": 1085
|
1747 |
+
},
|
1748 |
+
{
|
1749 |
+
"epoch": 1.0148975791433892,
|
1750 |
+
"grad_norm": 0.514699919148864,
|
1751 |
+
"learning_rate": 3.6771300448430496e-05,
|
1752 |
+
"loss": 0.3532,
|
1753 |
+
"num_tokens": 1138855766.0,
|
1754 |
+
"step": 1090
|
1755 |
+
},
|
1756 |
+
{
|
1757 |
+
"epoch": 1.0195530726256983,
|
1758 |
+
"grad_norm": 0.4849219380515942,
|
1759 |
+
"learning_rate": 3.668506381510866e-05,
|
1760 |
+
"loss": 0.3508,
|
1761 |
+
"num_tokens": 1144098646.0,
|
1762 |
+
"step": 1095
|
1763 |
+
},
|
1764 |
+
{
|
1765 |
+
"epoch": 1.0242085661080074,
|
1766 |
+
"grad_norm": 0.41360144495958895,
|
1767 |
+
"learning_rate": 3.659882718178683e-05,
|
1768 |
+
"loss": 0.3468,
|
1769 |
+
"num_tokens": 1149341526.0,
|
1770 |
+
"step": 1100
|
1771 |
+
},
|
1772 |
+
{
|
1773 |
+
"epoch": 1.0288640595903167,
|
1774 |
+
"grad_norm": 0.5018592330001277,
|
1775 |
+
"learning_rate": 3.651259054846499e-05,
|
1776 |
+
"loss": 0.3565,
|
1777 |
+
"num_tokens": 1154584406.0,
|
1778 |
+
"step": 1105
|
1779 |
+
},
|
1780 |
+
{
|
1781 |
+
"epoch": 1.0335195530726258,
|
1782 |
+
"grad_norm": 0.44206019018889825,
|
1783 |
+
"learning_rate": 3.642635391514315e-05,
|
1784 |
+
"loss": 0.3547,
|
1785 |
+
"num_tokens": 1159806052.0,
|
1786 |
+
"step": 1110
|
1787 |
+
},
|
1788 |
+
{
|
1789 |
+
"epoch": 1.0381750465549349,
|
1790 |
+
"grad_norm": 0.4264076762021219,
|
1791 |
+
"learning_rate": 3.634011728182132e-05,
|
1792 |
+
"loss": 0.3599,
|
1793 |
+
"num_tokens": 1165048932.0,
|
1794 |
+
"step": 1115
|
1795 |
+
},
|
1796 |
+
{
|
1797 |
+
"epoch": 1.042830540037244,
|
1798 |
+
"grad_norm": 0.5567650614214772,
|
1799 |
+
"learning_rate": 3.6253880648499484e-05,
|
1800 |
+
"loss": 0.3564,
|
1801 |
+
"num_tokens": 1170270856.0,
|
1802 |
+
"step": 1120
|
1803 |
+
},
|
1804 |
+
{
|
1805 |
+
"epoch": 1.047486033519553,
|
1806 |
+
"grad_norm": 0.5228694711322092,
|
1807 |
+
"learning_rate": 3.616764401517765e-05,
|
1808 |
+
"loss": 0.3553,
|
1809 |
+
"num_tokens": 1175458142.0,
|
1810 |
+
"step": 1125
|
1811 |
+
},
|
1812 |
+
{
|
1813 |
+
"epoch": 1.0521415270018621,
|
1814 |
+
"grad_norm": 0.4599296157768013,
|
1815 |
+
"learning_rate": 3.608140738185581e-05,
|
1816 |
+
"loss": 0.352,
|
1817 |
+
"num_tokens": 1180603216.0,
|
1818 |
+
"step": 1130
|
1819 |
+
},
|
1820 |
+
{
|
1821 |
+
"epoch": 1.0567970204841712,
|
1822 |
+
"grad_norm": 0.405001210315896,
|
1823 |
+
"learning_rate": 3.599517074853398e-05,
|
1824 |
+
"loss": 0.3466,
|
1825 |
+
"num_tokens": 1185846096.0,
|
1826 |
+
"step": 1135
|
1827 |
+
},
|
1828 |
+
{
|
1829 |
+
"epoch": 1.0614525139664805,
|
1830 |
+
"grad_norm": 0.4688055181909875,
|
1831 |
+
"learning_rate": 3.5908934115212145e-05,
|
1832 |
+
"loss": 0.3472,
|
1833 |
+
"num_tokens": 1191088976.0,
|
1834 |
+
"step": 1140
|
1835 |
+
},
|
1836 |
+
{
|
1837 |
+
"epoch": 1.0661080074487896,
|
1838 |
+
"grad_norm": 0.4347382388506508,
|
1839 |
+
"learning_rate": 3.582269748189031e-05,
|
1840 |
+
"loss": 0.3536,
|
1841 |
+
"num_tokens": 1196289062.0,
|
1842 |
+
"step": 1145
|
1843 |
+
},
|
1844 |
+
{
|
1845 |
+
"epoch": 1.0707635009310987,
|
1846 |
+
"grad_norm": 0.45831883115463856,
|
1847 |
+
"learning_rate": 3.573646084856848e-05,
|
1848 |
+
"loss": 0.3647,
|
1849 |
+
"num_tokens": 1201492904.0,
|
1850 |
+
"step": 1150
|
1851 |
+
},
|
1852 |
+
{
|
1853 |
+
"epoch": 1.0754189944134078,
|
1854 |
+
"grad_norm": 0.4720640762198147,
|
1855 |
+
"learning_rate": 3.5650224215246636e-05,
|
1856 |
+
"loss": 0.3442,
|
1857 |
+
"num_tokens": 1206735784.0,
|
1858 |
+
"step": 1155
|
1859 |
+
},
|
1860 |
+
{
|
1861 |
+
"epoch": 1.080074487895717,
|
1862 |
+
"grad_norm": 0.379073526084197,
|
1863 |
+
"learning_rate": 3.55639875819248e-05,
|
1864 |
+
"loss": 0.3585,
|
1865 |
+
"num_tokens": 1211978664.0,
|
1866 |
+
"step": 1160
|
1867 |
+
},
|
1868 |
+
{
|
1869 |
+
"epoch": 1.084729981378026,
|
1870 |
+
"grad_norm": 0.4669245913480403,
|
1871 |
+
"learning_rate": 3.547775094860297e-05,
|
1872 |
+
"loss": 0.3708,
|
1873 |
+
"num_tokens": 1217221544.0,
|
1874 |
+
"step": 1165
|
1875 |
+
},
|
1876 |
+
{
|
1877 |
+
"epoch": 1.089385474860335,
|
1878 |
+
"grad_norm": 0.8249174245152311,
|
1879 |
+
"learning_rate": 3.539151431528113e-05,
|
1880 |
+
"loss": 0.3597,
|
1881 |
+
"num_tokens": 1222450604.0,
|
1882 |
+
"step": 1170
|
1883 |
+
},
|
1884 |
+
{
|
1885 |
+
"epoch": 1.0940409683426444,
|
1886 |
+
"grad_norm": 0.5231930637239787,
|
1887 |
+
"learning_rate": 3.5305277681959297e-05,
|
1888 |
+
"loss": 0.3526,
|
1889 |
+
"num_tokens": 1227693484.0,
|
1890 |
+
"step": 1175
|
1891 |
+
},
|
1892 |
+
{
|
1893 |
+
"epoch": 1.0986964618249535,
|
1894 |
+
"grad_norm": 0.5270777353874178,
|
1895 |
+
"learning_rate": 3.521904104863746e-05,
|
1896 |
+
"loss": 0.3548,
|
1897 |
+
"num_tokens": 1232936364.0,
|
1898 |
+
"step": 1180
|
1899 |
+
},
|
1900 |
+
{
|
1901 |
+
"epoch": 1.1033519553072626,
|
1902 |
+
"grad_norm": 0.44135156617910987,
|
1903 |
+
"learning_rate": 3.513280441531563e-05,
|
1904 |
+
"loss": 0.3483,
|
1905 |
+
"num_tokens": 1238179244.0,
|
1906 |
+
"step": 1185
|
1907 |
+
},
|
1908 |
+
{
|
1909 |
+
"epoch": 1.1080074487895717,
|
1910 |
+
"grad_norm": 0.4385440230745658,
|
1911 |
+
"learning_rate": 3.5046567781993794e-05,
|
1912 |
+
"loss": 0.3632,
|
1913 |
+
"num_tokens": 1243422124.0,
|
1914 |
+
"step": 1190
|
1915 |
+
},
|
1916 |
+
{
|
1917 |
+
"epoch": 1.1126629422718808,
|
1918 |
+
"grad_norm": 0.4508339201057018,
|
1919 |
+
"learning_rate": 3.496033114867196e-05,
|
1920 |
+
"loss": 0.3598,
|
1921 |
+
"num_tokens": 1248665004.0,
|
1922 |
+
"step": 1195
|
1923 |
+
},
|
1924 |
+
{
|
1925 |
+
"epoch": 1.1173184357541899,
|
1926 |
+
"grad_norm": 0.4779567369574128,
|
1927 |
+
"learning_rate": 3.487409451535012e-05,
|
1928 |
+
"loss": 0.3497,
|
1929 |
+
"num_tokens": 1253907884.0,
|
1930 |
+
"step": 1200
|
1931 |
+
},
|
1932 |
+
{
|
1933 |
+
"epoch": 1.121973929236499,
|
1934 |
+
"grad_norm": 0.5625979479787355,
|
1935 |
+
"learning_rate": 3.4787857882028285e-05,
|
1936 |
+
"loss": 0.3586,
|
1937 |
+
"num_tokens": 1259150764.0,
|
1938 |
+
"step": 1205
|
1939 |
+
},
|
1940 |
+
{
|
1941 |
+
"epoch": 1.1266294227188083,
|
1942 |
+
"grad_norm": 0.6014959273175333,
|
1943 |
+
"learning_rate": 3.470162124870645e-05,
|
1944 |
+
"loss": 0.3514,
|
1945 |
+
"num_tokens": 1264327888.0,
|
1946 |
+
"step": 1210
|
1947 |
+
},
|
1948 |
+
{
|
1949 |
+
"epoch": 1.1312849162011174,
|
1950 |
+
"grad_norm": 0.4906335352032041,
|
1951 |
+
"learning_rate": 3.461538461538462e-05,
|
1952 |
+
"loss": 0.3509,
|
1953 |
+
"num_tokens": 1269570768.0,
|
1954 |
+
"step": 1215
|
1955 |
+
},
|
1956 |
+
{
|
1957 |
+
"epoch": 1.1359404096834265,
|
1958 |
+
"grad_norm": 0.4082724800162844,
|
1959 |
+
"learning_rate": 3.452914798206278e-05,
|
1960 |
+
"loss": 0.3395,
|
1961 |
+
"num_tokens": 1274789340.0,
|
1962 |
+
"step": 1220
|
1963 |
+
},
|
1964 |
+
{
|
1965 |
+
"epoch": 1.1405959031657356,
|
1966 |
+
"grad_norm": 0.4750009264613512,
|
1967 |
+
"learning_rate": 3.4442911348740946e-05,
|
1968 |
+
"loss": 0.3579,
|
1969 |
+
"num_tokens": 1280032220.0,
|
1970 |
+
"step": 1225
|
1971 |
+
},
|
1972 |
+
{
|
1973 |
+
"epoch": 1.1452513966480447,
|
1974 |
+
"grad_norm": 0.48685393486923184,
|
1975 |
+
"learning_rate": 3.435667471541911e-05,
|
1976 |
+
"loss": 0.3549,
|
1977 |
+
"num_tokens": 1285275100.0,
|
1978 |
+
"step": 1230
|
1979 |
+
},
|
1980 |
+
{
|
1981 |
+
"epoch": 1.1499068901303537,
|
1982 |
+
"grad_norm": 0.5834960539136641,
|
1983 |
+
"learning_rate": 3.427043808209728e-05,
|
1984 |
+
"loss": 0.3498,
|
1985 |
+
"num_tokens": 1290517980.0,
|
1986 |
+
"step": 1235
|
1987 |
+
},
|
1988 |
+
{
|
1989 |
+
"epoch": 1.1545623836126628,
|
1990 |
+
"grad_norm": 0.6100955925437853,
|
1991 |
+
"learning_rate": 3.418420144877544e-05,
|
1992 |
+
"loss": 0.3596,
|
1993 |
+
"num_tokens": 1295760860.0,
|
1994 |
+
"step": 1240
|
1995 |
+
},
|
1996 |
+
{
|
1997 |
+
"epoch": 1.1592178770949721,
|
1998 |
+
"grad_norm": 0.522871641456894,
|
1999 |
+
"learning_rate": 3.409796481545361e-05,
|
2000 |
+
"loss": 0.3471,
|
2001 |
+
"num_tokens": 1300940750.0,
|
2002 |
+
"step": 1245
|
2003 |
+
},
|
2004 |
+
{
|
2005 |
+
"epoch": 1.1638733705772812,
|
2006 |
+
"grad_norm": 0.4194412850396169,
|
2007 |
+
"learning_rate": 3.401172818213177e-05,
|
2008 |
+
"loss": 0.3563,
|
2009 |
+
"num_tokens": 1306183630.0,
|
2010 |
+
"step": 1250
|
2011 |
+
},
|
2012 |
+
{
|
2013 |
+
"epoch": 1.1685288640595903,
|
2014 |
+
"grad_norm": 0.44072988623105025,
|
2015 |
+
"learning_rate": 3.3925491548809934e-05,
|
2016 |
+
"loss": 0.3515,
|
2017 |
+
"num_tokens": 1311406684.0,
|
2018 |
+
"step": 1255
|
2019 |
+
},
|
2020 |
+
{
|
2021 |
+
"epoch": 1.1731843575418994,
|
2022 |
+
"grad_norm": 0.5394080245112541,
|
2023 |
+
"learning_rate": 3.38392549154881e-05,
|
2024 |
+
"loss": 0.3683,
|
2025 |
+
"num_tokens": 1316641742.0,
|
2026 |
+
"step": 1260
|
2027 |
+
},
|
2028 |
+
{
|
2029 |
+
"epoch": 1.1778398510242085,
|
2030 |
+
"grad_norm": 0.4449620104225317,
|
2031 |
+
"learning_rate": 3.375301828216627e-05,
|
2032 |
+
"loss": 0.3589,
|
2033 |
+
"num_tokens": 1321884622.0,
|
2034 |
+
"step": 1265
|
2035 |
+
},
|
2036 |
+
{
|
2037 |
+
"epoch": 1.1824953445065176,
|
2038 |
+
"grad_norm": 0.41227147035395784,
|
2039 |
+
"learning_rate": 3.366678164884443e-05,
|
2040 |
+
"loss": 0.353,
|
2041 |
+
"num_tokens": 1327127502.0,
|
2042 |
+
"step": 1270
|
2043 |
+
},
|
2044 |
+
{
|
2045 |
+
"epoch": 1.1871508379888267,
|
2046 |
+
"grad_norm": 0.620493561519156,
|
2047 |
+
"learning_rate": 3.3580545015522595e-05,
|
2048 |
+
"loss": 0.3654,
|
2049 |
+
"num_tokens": 1332370382.0,
|
2050 |
+
"step": 1275
|
2051 |
+
},
|
2052 |
+
{
|
2053 |
+
"epoch": 1.191806331471136,
|
2054 |
+
"grad_norm": 0.4676275047758524,
|
2055 |
+
"learning_rate": 3.3494308382200765e-05,
|
2056 |
+
"loss": 0.3438,
|
2057 |
+
"num_tokens": 1337560874.0,
|
2058 |
+
"step": 1280
|
2059 |
+
},
|
2060 |
+
{
|
2061 |
+
"epoch": 1.196461824953445,
|
2062 |
+
"grad_norm": 0.43665241100017466,
|
2063 |
+
"learning_rate": 3.340807174887893e-05,
|
2064 |
+
"loss": 0.3369,
|
2065 |
+
"num_tokens": 1342803754.0,
|
2066 |
+
"step": 1285
|
2067 |
+
},
|
2068 |
+
{
|
2069 |
+
"epoch": 1.2011173184357542,
|
2070 |
+
"grad_norm": 0.478927000423963,
|
2071 |
+
"learning_rate": 3.332183511555709e-05,
|
2072 |
+
"loss": 0.342,
|
2073 |
+
"num_tokens": 1348046634.0,
|
2074 |
+
"step": 1290
|
2075 |
+
},
|
2076 |
+
{
|
2077 |
+
"epoch": 1.2057728119180633,
|
2078 |
+
"grad_norm": 0.5732412827887052,
|
2079 |
+
"learning_rate": 3.3235598482235256e-05,
|
2080 |
+
"loss": 0.3615,
|
2081 |
+
"num_tokens": 1353226140.0,
|
2082 |
+
"step": 1295
|
2083 |
+
},
|
2084 |
+
{
|
2085 |
+
"epoch": 1.2104283054003724,
|
2086 |
+
"grad_norm": 0.5150714352326667,
|
2087 |
+
"learning_rate": 3.314936184891342e-05,
|
2088 |
+
"loss": 0.3585,
|
2089 |
+
"num_tokens": 1358469020.0,
|
2090 |
+
"step": 1300
|
2091 |
+
},
|
2092 |
+
{
|
2093 |
+
"epoch": 1.2150837988826815,
|
2094 |
+
"grad_norm": 0.4720181041176109,
|
2095 |
+
"learning_rate": 3.306312521559158e-05,
|
2096 |
+
"loss": 0.354,
|
2097 |
+
"num_tokens": 1363711900.0,
|
2098 |
+
"step": 1305
|
2099 |
+
},
|
2100 |
+
{
|
2101 |
+
"epoch": 1.2197392923649906,
|
2102 |
+
"grad_norm": 0.5084117654380252,
|
2103 |
+
"learning_rate": 3.2976888582269747e-05,
|
2104 |
+
"loss": 0.3507,
|
2105 |
+
"num_tokens": 1368954780.0,
|
2106 |
+
"step": 1310
|
2107 |
+
},
|
2108 |
+
{
|
2109 |
+
"epoch": 1.2243947858472999,
|
2110 |
+
"grad_norm": 0.49216184402082547,
|
2111 |
+
"learning_rate": 3.289065194894792e-05,
|
2112 |
+
"loss": 0.3582,
|
2113 |
+
"num_tokens": 1374194182.0,
|
2114 |
+
"step": 1315
|
2115 |
+
},
|
2116 |
+
{
|
2117 |
+
"epoch": 1.229050279329609,
|
2118 |
+
"grad_norm": 0.4310109372956479,
|
2119 |
+
"learning_rate": 3.280441531562608e-05,
|
2120 |
+
"loss": 0.3475,
|
2121 |
+
"num_tokens": 1379397624.0,
|
2122 |
+
"step": 1320
|
2123 |
+
},
|
2124 |
+
{
|
2125 |
+
"epoch": 1.233705772811918,
|
2126 |
+
"grad_norm": 0.34411623806366076,
|
2127 |
+
"learning_rate": 3.2718178682304244e-05,
|
2128 |
+
"loss": 0.3479,
|
2129 |
+
"num_tokens": 1384640504.0,
|
2130 |
+
"step": 1325
|
2131 |
+
},
|
2132 |
+
{
|
2133 |
+
"epoch": 1.2383612662942272,
|
2134 |
+
"grad_norm": 0.3789626797289003,
|
2135 |
+
"learning_rate": 3.2631942048982414e-05,
|
2136 |
+
"loss": 0.3466,
|
2137 |
+
"num_tokens": 1389883384.0,
|
2138 |
+
"step": 1330
|
2139 |
+
},
|
2140 |
+
{
|
2141 |
+
"epoch": 1.2430167597765363,
|
2142 |
+
"grad_norm": 0.39242414869191117,
|
2143 |
+
"learning_rate": 3.254570541566058e-05,
|
2144 |
+
"loss": 0.3484,
|
2145 |
+
"num_tokens": 1395126264.0,
|
2146 |
+
"step": 1335
|
2147 |
+
},
|
2148 |
+
{
|
2149 |
+
"epoch": 1.2476722532588453,
|
2150 |
+
"grad_norm": 0.4280782038705297,
|
2151 |
+
"learning_rate": 3.2459468782338735e-05,
|
2152 |
+
"loss": 0.3513,
|
2153 |
+
"num_tokens": 1400325474.0,
|
2154 |
+
"step": 1340
|
2155 |
+
},
|
2156 |
+
{
|
2157 |
+
"epoch": 1.2523277467411544,
|
2158 |
+
"grad_norm": 0.485326538754542,
|
2159 |
+
"learning_rate": 3.2373232149016905e-05,
|
2160 |
+
"loss": 0.3476,
|
2161 |
+
"num_tokens": 1405510816.0,
|
2162 |
+
"step": 1345
|
2163 |
+
},
|
2164 |
+
{
|
2165 |
+
"epoch": 1.2569832402234637,
|
2166 |
+
"grad_norm": 0.4807785764220693,
|
2167 |
+
"learning_rate": 3.228699551569507e-05,
|
2168 |
+
"loss": 0.363,
|
2169 |
+
"num_tokens": 1410688258.0,
|
2170 |
+
"step": 1350
|
2171 |
+
},
|
2172 |
+
{
|
2173 |
+
"epoch": 1.2616387337057728,
|
2174 |
+
"grad_norm": 0.5077856549055086,
|
2175 |
+
"learning_rate": 3.220075888237323e-05,
|
2176 |
+
"loss": 0.3542,
|
2177 |
+
"num_tokens": 1415931138.0,
|
2178 |
+
"step": 1355
|
2179 |
+
},
|
2180 |
+
{
|
2181 |
+
"epoch": 1.266294227188082,
|
2182 |
+
"grad_norm": 0.41724539394395294,
|
2183 |
+
"learning_rate": 3.2114522249051396e-05,
|
2184 |
+
"loss": 0.3605,
|
2185 |
+
"num_tokens": 1421110144.0,
|
2186 |
+
"step": 1360
|
2187 |
+
},
|
2188 |
+
{
|
2189 |
+
"epoch": 1.270949720670391,
|
2190 |
+
"grad_norm": 0.4560337991563612,
|
2191 |
+
"learning_rate": 3.2028285615729566e-05,
|
2192 |
+
"loss": 0.3406,
|
2193 |
+
"num_tokens": 1426353024.0,
|
2194 |
+
"step": 1365
|
2195 |
+
},
|
2196 |
+
{
|
2197 |
+
"epoch": 1.2756052141527001,
|
2198 |
+
"grad_norm": 0.4756884920927917,
|
2199 |
+
"learning_rate": 3.194204898240773e-05,
|
2200 |
+
"loss": 0.359,
|
2201 |
+
"num_tokens": 1431595904.0,
|
2202 |
+
"step": 1370
|
2203 |
+
},
|
2204 |
+
{
|
2205 |
+
"epoch": 1.2802607076350094,
|
2206 |
+
"grad_norm": 0.5192756462333422,
|
2207 |
+
"learning_rate": 3.185581234908589e-05,
|
2208 |
+
"loss": 0.3508,
|
2209 |
+
"num_tokens": 1436838784.0,
|
2210 |
+
"step": 1375
|
2211 |
+
},
|
2212 |
+
{
|
2213 |
+
"epoch": 1.2849162011173183,
|
2214 |
+
"grad_norm": 0.3560326634114241,
|
2215 |
+
"learning_rate": 3.1769575715764064e-05,
|
2216 |
+
"loss": 0.351,
|
2217 |
+
"num_tokens": 1442040238.0,
|
2218 |
+
"step": 1380
|
2219 |
+
},
|
2220 |
+
{
|
2221 |
+
"epoch": 1.2895716945996276,
|
2222 |
+
"grad_norm": 0.40006619071176525,
|
2223 |
+
"learning_rate": 3.168333908244222e-05,
|
2224 |
+
"loss": 0.3582,
|
2225 |
+
"num_tokens": 1447283118.0,
|
2226 |
+
"step": 1385
|
2227 |
+
},
|
2228 |
+
{
|
2229 |
+
"epoch": 1.2942271880819367,
|
2230 |
+
"grad_norm": 0.49325642461103086,
|
2231 |
+
"learning_rate": 3.1597102449120384e-05,
|
2232 |
+
"loss": 0.3497,
|
2233 |
+
"num_tokens": 1452508926.0,
|
2234 |
+
"step": 1390
|
2235 |
+
},
|
2236 |
+
{
|
2237 |
+
"epoch": 1.2988826815642458,
|
2238 |
+
"grad_norm": 0.4051150117444856,
|
2239 |
+
"learning_rate": 3.1510865815798554e-05,
|
2240 |
+
"loss": 0.3453,
|
2241 |
+
"num_tokens": 1457751806.0,
|
2242 |
+
"step": 1395
|
2243 |
+
},
|
2244 |
+
{
|
2245 |
+
"epoch": 1.303538175046555,
|
2246 |
+
"grad_norm": 0.44239897857904603,
|
2247 |
+
"learning_rate": 3.142462918247672e-05,
|
2248 |
+
"loss": 0.3551,
|
2249 |
+
"num_tokens": 1462994686.0,
|
2250 |
+
"step": 1400
|
2251 |
+
},
|
2252 |
+
{
|
2253 |
+
"epoch": 1.308193668528864,
|
2254 |
+
"grad_norm": 0.4035898982852808,
|
2255 |
+
"learning_rate": 3.133839254915488e-05,
|
2256 |
+
"loss": 0.3565,
|
2257 |
+
"num_tokens": 1468185418.0,
|
2258 |
+
"step": 1405
|
2259 |
+
},
|
2260 |
+
{
|
2261 |
+
"epoch": 1.3128491620111733,
|
2262 |
+
"grad_norm": 0.3795900812088053,
|
2263 |
+
"learning_rate": 3.1252155915833045e-05,
|
2264 |
+
"loss": 0.3508,
|
2265 |
+
"num_tokens": 1473428298.0,
|
2266 |
+
"step": 1410
|
2267 |
+
},
|
2268 |
+
{
|
2269 |
+
"epoch": 1.3175046554934824,
|
2270 |
+
"grad_norm": 0.5917536504763548,
|
2271 |
+
"learning_rate": 3.1165919282511215e-05,
|
2272 |
+
"loss": 0.3581,
|
2273 |
+
"num_tokens": 1478671178.0,
|
2274 |
+
"step": 1415
|
2275 |
+
},
|
2276 |
+
{
|
2277 |
+
"epoch": 1.3221601489757915,
|
2278 |
+
"grad_norm": 0.5746485721379913,
|
2279 |
+
"learning_rate": 3.107968264918938e-05,
|
2280 |
+
"loss": 0.345,
|
2281 |
+
"num_tokens": 1483914058.0,
|
2282 |
+
"step": 1420
|
2283 |
+
},
|
2284 |
+
{
|
2285 |
+
"epoch": 1.3268156424581006,
|
2286 |
+
"grad_norm": 0.5174456748590313,
|
2287 |
+
"learning_rate": 3.099344601586754e-05,
|
2288 |
+
"loss": 0.3469,
|
2289 |
+
"num_tokens": 1489156938.0,
|
2290 |
+
"step": 1425
|
2291 |
+
},
|
2292 |
+
{
|
2293 |
+
"epoch": 1.3314711359404097,
|
2294 |
+
"grad_norm": 0.5266154162574862,
|
2295 |
+
"learning_rate": 3.0907209382545706e-05,
|
2296 |
+
"loss": 0.3499,
|
2297 |
+
"num_tokens": 1494399818.0,
|
2298 |
+
"step": 1430
|
2299 |
+
},
|
2300 |
+
{
|
2301 |
+
"epoch": 1.3361266294227188,
|
2302 |
+
"grad_norm": 0.5381931883646065,
|
2303 |
+
"learning_rate": 3.082097274922387e-05,
|
2304 |
+
"loss": 0.3603,
|
2305 |
+
"num_tokens": 1499642698.0,
|
2306 |
+
"step": 1435
|
2307 |
+
},
|
2308 |
+
{
|
2309 |
+
"epoch": 1.3407821229050279,
|
2310 |
+
"grad_norm": 0.5845199613878549,
|
2311 |
+
"learning_rate": 3.073473611590203e-05,
|
2312 |
+
"loss": 0.3527,
|
2313 |
+
"num_tokens": 1504885578.0,
|
2314 |
+
"step": 1440
|
2315 |
+
},
|
2316 |
+
{
|
2317 |
+
"epoch": 1.3454376163873372,
|
2318 |
+
"grad_norm": 0.4761161751328153,
|
2319 |
+
"learning_rate": 3.06484994825802e-05,
|
2320 |
+
"loss": 0.3495,
|
2321 |
+
"num_tokens": 1510128458.0,
|
2322 |
+
"step": 1445
|
2323 |
+
},
|
2324 |
+
{
|
2325 |
+
"epoch": 1.3500931098696463,
|
2326 |
+
"grad_norm": 0.6099903516270699,
|
2327 |
+
"learning_rate": 3.056226284925837e-05,
|
2328 |
+
"loss": 0.3541,
|
2329 |
+
"num_tokens": 1515311764.0,
|
2330 |
+
"step": 1450
|
2331 |
+
},
|
2332 |
+
{
|
2333 |
+
"epoch": 1.3547486033519553,
|
2334 |
+
"grad_norm": 0.48697775909316104,
|
2335 |
+
"learning_rate": 3.047602621593653e-05,
|
2336 |
+
"loss": 0.3459,
|
2337 |
+
"num_tokens": 1520554644.0,
|
2338 |
+
"step": 1455
|
2339 |
+
},
|
2340 |
+
{
|
2341 |
+
"epoch": 1.3594040968342644,
|
2342 |
+
"grad_norm": 0.5760138459581621,
|
2343 |
+
"learning_rate": 3.0389789582614697e-05,
|
2344 |
+
"loss": 0.341,
|
2345 |
+
"num_tokens": 1525756564.0,
|
2346 |
+
"step": 1460
|
2347 |
+
},
|
2348 |
+
{
|
2349 |
+
"epoch": 1.3640595903165735,
|
2350 |
+
"grad_norm": 0.4382519382413423,
|
2351 |
+
"learning_rate": 3.0303552949292864e-05,
|
2352 |
+
"loss": 0.3511,
|
2353 |
+
"num_tokens": 1530999444.0,
|
2354 |
+
"step": 1465
|
2355 |
+
},
|
2356 |
+
{
|
2357 |
+
"epoch": 1.3687150837988826,
|
2358 |
+
"grad_norm": 0.5060933477322624,
|
2359 |
+
"learning_rate": 3.0217316315971028e-05,
|
2360 |
+
"loss": 0.3561,
|
2361 |
+
"num_tokens": 1536242324.0,
|
2362 |
+
"step": 1470
|
2363 |
+
},
|
2364 |
+
{
|
2365 |
+
"epoch": 1.3733705772811917,
|
2366 |
+
"grad_norm": 0.46671819295156486,
|
2367 |
+
"learning_rate": 3.0131079682649188e-05,
|
2368 |
+
"loss": 0.3575,
|
2369 |
+
"num_tokens": 1541485204.0,
|
2370 |
+
"step": 1475
|
2371 |
+
},
|
2372 |
+
{
|
2373 |
+
"epoch": 1.378026070763501,
|
2374 |
+
"grad_norm": 0.5087368057400945,
|
2375 |
+
"learning_rate": 3.0044843049327355e-05,
|
2376 |
+
"loss": 0.3458,
|
2377 |
+
"num_tokens": 1546728084.0,
|
2378 |
+
"step": 1480
|
2379 |
+
},
|
2380 |
+
{
|
2381 |
+
"epoch": 1.3826815642458101,
|
2382 |
+
"grad_norm": 0.41271390502469796,
|
2383 |
+
"learning_rate": 2.995860641600552e-05,
|
2384 |
+
"loss": 0.3468,
|
2385 |
+
"num_tokens": 1551970964.0,
|
2386 |
+
"step": 1485
|
2387 |
+
},
|
2388 |
+
{
|
2389 |
+
"epoch": 1.3873370577281192,
|
2390 |
+
"grad_norm": 0.46397488100583334,
|
2391 |
+
"learning_rate": 2.9872369782683686e-05,
|
2392 |
+
"loss": 0.3453,
|
2393 |
+
"num_tokens": 1557199712.0,
|
2394 |
+
"step": 1490
|
2395 |
+
},
|
2396 |
+
{
|
2397 |
+
"epoch": 1.3919925512104283,
|
2398 |
+
"grad_norm": 0.606936021926111,
|
2399 |
+
"learning_rate": 2.978613314936185e-05,
|
2400 |
+
"loss": 0.359,
|
2401 |
+
"num_tokens": 1562442592.0,
|
2402 |
+
"step": 1495
|
2403 |
+
},
|
2404 |
+
{
|
2405 |
+
"epoch": 1.3966480446927374,
|
2406 |
+
"grad_norm": 0.5840116139586674,
|
2407 |
+
"learning_rate": 2.9699896516040016e-05,
|
2408 |
+
"loss": 0.3527,
|
2409 |
+
"num_tokens": 1567685472.0,
|
2410 |
+
"step": 1500
|
2411 |
+
},
|
2412 |
+
{
|
2413 |
+
"epoch": 1.4013035381750465,
|
2414 |
+
"grad_norm": 0.41927975243494453,
|
2415 |
+
"learning_rate": 2.961365988271818e-05,
|
2416 |
+
"loss": 0.3476,
|
2417 |
+
"num_tokens": 1572914018.0,
|
2418 |
+
"step": 1505
|
2419 |
+
},
|
2420 |
+
{
|
2421 |
+
"epoch": 1.4059590316573556,
|
2422 |
+
"grad_norm": 0.4719849459010854,
|
2423 |
+
"learning_rate": 2.9527423249396347e-05,
|
2424 |
+
"loss": 0.3435,
|
2425 |
+
"num_tokens": 1578156898.0,
|
2426 |
+
"step": 1510
|
2427 |
+
},
|
2428 |
+
{
|
2429 |
+
"epoch": 1.410614525139665,
|
2430 |
+
"grad_norm": 0.5908747500911715,
|
2431 |
+
"learning_rate": 2.9441186616074514e-05,
|
2432 |
+
"loss": 0.3479,
|
2433 |
+
"num_tokens": 1583399778.0,
|
2434 |
+
"step": 1515
|
2435 |
+
},
|
2436 |
+
{
|
2437 |
+
"epoch": 1.415270018621974,
|
2438 |
+
"grad_norm": 0.4654579138308962,
|
2439 |
+
"learning_rate": 2.9354949982752677e-05,
|
2440 |
+
"loss": 0.356,
|
2441 |
+
"num_tokens": 1588625916.0,
|
2442 |
+
"step": 1520
|
2443 |
+
},
|
2444 |
+
{
|
2445 |
+
"epoch": 1.419925512104283,
|
2446 |
+
"grad_norm": 0.5082441042584376,
|
2447 |
+
"learning_rate": 2.9268713349430837e-05,
|
2448 |
+
"loss": 0.3571,
|
2449 |
+
"num_tokens": 1593853014.0,
|
2450 |
+
"step": 1525
|
2451 |
+
},
|
2452 |
+
{
|
2453 |
+
"epoch": 1.4245810055865922,
|
2454 |
+
"grad_norm": 0.4505932414113476,
|
2455 |
+
"learning_rate": 2.9182476716109004e-05,
|
2456 |
+
"loss": 0.3587,
|
2457 |
+
"num_tokens": 1599054446.0,
|
2458 |
+
"step": 1530
|
2459 |
+
},
|
2460 |
+
{
|
2461 |
+
"epoch": 1.4292364990689013,
|
2462 |
+
"grad_norm": 0.4402651883114836,
|
2463 |
+
"learning_rate": 2.9096240082787168e-05,
|
2464 |
+
"loss": 0.3441,
|
2465 |
+
"num_tokens": 1604297326.0,
|
2466 |
+
"step": 1535
|
2467 |
+
},
|
2468 |
+
{
|
2469 |
+
"epoch": 1.4338919925512104,
|
2470 |
+
"grad_norm": 0.41400290224654057,
|
2471 |
+
"learning_rate": 2.9010003449465335e-05,
|
2472 |
+
"loss": 0.3462,
|
2473 |
+
"num_tokens": 1609497138.0,
|
2474 |
+
"step": 1540
|
2475 |
+
},
|
2476 |
+
{
|
2477 |
+
"epoch": 1.4385474860335195,
|
2478 |
+
"grad_norm": 0.47735272128289663,
|
2479 |
+
"learning_rate": 2.8923766816143498e-05,
|
2480 |
+
"loss": 0.3571,
|
2481 |
+
"num_tokens": 1614724154.0,
|
2482 |
+
"step": 1545
|
2483 |
+
},
|
2484 |
+
{
|
2485 |
+
"epoch": 1.4432029795158288,
|
2486 |
+
"grad_norm": 0.4687209241663635,
|
2487 |
+
"learning_rate": 2.8837530182821665e-05,
|
2488 |
+
"loss": 0.3491,
|
2489 |
+
"num_tokens": 1619955342.0,
|
2490 |
+
"step": 1550
|
2491 |
+
},
|
2492 |
+
{
|
2493 |
+
"epoch": 1.4478584729981379,
|
2494 |
+
"grad_norm": 0.5325094020285824,
|
2495 |
+
"learning_rate": 2.875129354949983e-05,
|
2496 |
+
"loss": 0.3516,
|
2497 |
+
"num_tokens": 1625198222.0,
|
2498 |
+
"step": 1555
|
2499 |
+
},
|
2500 |
+
{
|
2501 |
+
"epoch": 1.452513966480447,
|
2502 |
+
"grad_norm": 0.5380366589721377,
|
2503 |
+
"learning_rate": 2.8665056916177996e-05,
|
2504 |
+
"loss": 0.3575,
|
2505 |
+
"num_tokens": 1630390158.0,
|
2506 |
+
"step": 1560
|
2507 |
+
},
|
2508 |
+
{
|
2509 |
+
"epoch": 1.457169459962756,
|
2510 |
+
"grad_norm": 0.3818882359532099,
|
2511 |
+
"learning_rate": 2.8578820282856163e-05,
|
2512 |
+
"loss": 0.3511,
|
2513 |
+
"num_tokens": 1635633038.0,
|
2514 |
+
"step": 1565
|
2515 |
+
},
|
2516 |
+
{
|
2517 |
+
"epoch": 1.4618249534450651,
|
2518 |
+
"grad_norm": 0.43172836862069464,
|
2519 |
+
"learning_rate": 2.8492583649534323e-05,
|
2520 |
+
"loss": 0.3581,
|
2521 |
+
"num_tokens": 1640875918.0,
|
2522 |
+
"step": 1570
|
2523 |
+
},
|
2524 |
+
{
|
2525 |
+
"epoch": 1.4664804469273742,
|
2526 |
+
"grad_norm": 0.43715719572511025,
|
2527 |
+
"learning_rate": 2.8406347016212486e-05,
|
2528 |
+
"loss": 0.3655,
|
2529 |
+
"num_tokens": 1646118798.0,
|
2530 |
+
"step": 1575
|
2531 |
+
},
|
2532 |
+
{
|
2533 |
+
"epoch": 1.4711359404096833,
|
2534 |
+
"grad_norm": 0.48153517614734775,
|
2535 |
+
"learning_rate": 2.8320110382890653e-05,
|
2536 |
+
"loss": 0.3619,
|
2537 |
+
"num_tokens": 1651329310.0,
|
2538 |
+
"step": 1580
|
2539 |
+
},
|
2540 |
+
{
|
2541 |
+
"epoch": 1.4757914338919926,
|
2542 |
+
"grad_norm": 0.49806621524576733,
|
2543 |
+
"learning_rate": 2.8233873749568817e-05,
|
2544 |
+
"loss": 0.3491,
|
2545 |
+
"num_tokens": 1656572190.0,
|
2546 |
+
"step": 1585
|
2547 |
+
},
|
2548 |
+
{
|
2549 |
+
"epoch": 1.4804469273743017,
|
2550 |
+
"grad_norm": 0.44142126215810823,
|
2551 |
+
"learning_rate": 2.8147637116246984e-05,
|
2552 |
+
"loss": 0.3564,
|
2553 |
+
"num_tokens": 1661779772.0,
|
2554 |
+
"step": 1590
|
2555 |
+
},
|
2556 |
+
{
|
2557 |
+
"epoch": 1.4851024208566108,
|
2558 |
+
"grad_norm": 0.4237230808369376,
|
2559 |
+
"learning_rate": 2.8061400482925147e-05,
|
2560 |
+
"loss": 0.3532,
|
2561 |
+
"num_tokens": 1667018872.0,
|
2562 |
+
"step": 1595
|
2563 |
+
},
|
2564 |
+
{
|
2565 |
+
"epoch": 1.48975791433892,
|
2566 |
+
"grad_norm": 0.5308149652633445,
|
2567 |
+
"learning_rate": 2.7975163849603314e-05,
|
2568 |
+
"loss": 0.3638,
|
2569 |
+
"num_tokens": 1672223952.0,
|
2570 |
+
"step": 1600
|
2571 |
+
},
|
2572 |
+
{
|
2573 |
+
"epoch": 1.494413407821229,
|
2574 |
+
"grad_norm": 0.5017844851185143,
|
2575 |
+
"learning_rate": 2.788892721628148e-05,
|
2576 |
+
"loss": 0.3594,
|
2577 |
+
"num_tokens": 1677466832.0,
|
2578 |
+
"step": 1605
|
2579 |
+
},
|
2580 |
+
{
|
2581 |
+
"epoch": 1.499068901303538,
|
2582 |
+
"grad_norm": 0.4839984310616902,
|
2583 |
+
"learning_rate": 2.7802690582959645e-05,
|
2584 |
+
"loss": 0.3466,
|
2585 |
+
"num_tokens": 1682709712.0,
|
2586 |
+
"step": 1610
|
2587 |
+
},
|
2588 |
+
{
|
2589 |
+
"epoch": 1.5037243947858472,
|
2590 |
+
"grad_norm": 0.41406506477863037,
|
2591 |
+
"learning_rate": 2.7716453949637805e-05,
|
2592 |
+
"loss": 0.3547,
|
2593 |
+
"num_tokens": 1687952592.0,
|
2594 |
+
"step": 1615
|
2595 |
+
},
|
2596 |
+
{
|
2597 |
+
"epoch": 1.5083798882681565,
|
2598 |
+
"grad_norm": 0.42348118656156764,
|
2599 |
+
"learning_rate": 2.7630217316315972e-05,
|
2600 |
+
"loss": 0.3485,
|
2601 |
+
"num_tokens": 1693097166.0,
|
2602 |
+
"step": 1620
|
2603 |
+
},
|
2604 |
+
{
|
2605 |
+
"epoch": 1.5130353817504656,
|
2606 |
+
"grad_norm": 0.4357117827255015,
|
2607 |
+
"learning_rate": 2.7543980682994136e-05,
|
2608 |
+
"loss": 0.352,
|
2609 |
+
"num_tokens": 1698326168.0,
|
2610 |
+
"step": 1625
|
2611 |
+
},
|
2612 |
+
{
|
2613 |
+
"epoch": 1.5176908752327747,
|
2614 |
+
"grad_norm": 0.43945135163916543,
|
2615 |
+
"learning_rate": 2.7457744049672302e-05,
|
2616 |
+
"loss": 0.3512,
|
2617 |
+
"num_tokens": 1703569048.0,
|
2618 |
+
"step": 1630
|
2619 |
+
},
|
2620 |
+
{
|
2621 |
+
"epoch": 1.5223463687150838,
|
2622 |
+
"grad_norm": 0.5192598819819614,
|
2623 |
+
"learning_rate": 2.7371507416350466e-05,
|
2624 |
+
"loss": 0.367,
|
2625 |
+
"num_tokens": 1708766606.0,
|
2626 |
+
"step": 1635
|
2627 |
+
},
|
2628 |
+
{
|
2629 |
+
"epoch": 1.5270018621973929,
|
2630 |
+
"grad_norm": 0.4452998587061349,
|
2631 |
+
"learning_rate": 2.7285270783028633e-05,
|
2632 |
+
"loss": 0.3589,
|
2633 |
+
"num_tokens": 1713944042.0,
|
2634 |
+
"step": 1640
|
2635 |
+
},
|
2636 |
+
{
|
2637 |
+
"epoch": 1.5316573556797022,
|
2638 |
+
"grad_norm": 0.43064181461229223,
|
2639 |
+
"learning_rate": 2.7199034149706797e-05,
|
2640 |
+
"loss": 0.3586,
|
2641 |
+
"num_tokens": 1719186922.0,
|
2642 |
+
"step": 1645
|
2643 |
+
},
|
2644 |
+
{
|
2645 |
+
"epoch": 1.536312849162011,
|
2646 |
+
"grad_norm": 0.44443641092630637,
|
2647 |
+
"learning_rate": 2.7112797516384963e-05,
|
2648 |
+
"loss": 0.3439,
|
2649 |
+
"num_tokens": 1724367282.0,
|
2650 |
+
"step": 1650
|
2651 |
+
},
|
2652 |
+
{
|
2653 |
+
"epoch": 1.5409683426443204,
|
2654 |
+
"grad_norm": 0.4167990111102253,
|
2655 |
+
"learning_rate": 2.702656088306313e-05,
|
2656 |
+
"loss": 0.3407,
|
2657 |
+
"num_tokens": 1729599294.0,
|
2658 |
+
"step": 1655
|
2659 |
+
},
|
2660 |
+
{
|
2661 |
+
"epoch": 1.5456238361266295,
|
2662 |
+
"grad_norm": 0.4462974133274107,
|
2663 |
+
"learning_rate": 2.6940324249741287e-05,
|
2664 |
+
"loss": 0.3529,
|
2665 |
+
"num_tokens": 1734842174.0,
|
2666 |
+
"step": 1660
|
2667 |
+
},
|
2668 |
+
{
|
2669 |
+
"epoch": 1.5502793296089385,
|
2670 |
+
"grad_norm": 0.39854857064913984,
|
2671 |
+
"learning_rate": 2.6854087616419454e-05,
|
2672 |
+
"loss": 0.348,
|
2673 |
+
"num_tokens": 1740064978.0,
|
2674 |
+
"step": 1665
|
2675 |
+
},
|
2676 |
+
{
|
2677 |
+
"epoch": 1.5549348230912476,
|
2678 |
+
"grad_norm": 0.4248070265370859,
|
2679 |
+
"learning_rate": 2.676785098309762e-05,
|
2680 |
+
"loss": 0.3528,
|
2681 |
+
"num_tokens": 1745287654.0,
|
2682 |
+
"step": 1670
|
2683 |
+
},
|
2684 |
+
{
|
2685 |
+
"epoch": 1.5595903165735567,
|
2686 |
+
"grad_norm": 0.5528756147028647,
|
2687 |
+
"learning_rate": 2.6681614349775785e-05,
|
2688 |
+
"loss": 0.3495,
|
2689 |
+
"num_tokens": 1750530534.0,
|
2690 |
+
"step": 1675
|
2691 |
+
},
|
2692 |
+
{
|
2693 |
+
"epoch": 1.564245810055866,
|
2694 |
+
"grad_norm": 0.41949805640470383,
|
2695 |
+
"learning_rate": 2.659537771645395e-05,
|
2696 |
+
"loss": 0.353,
|
2697 |
+
"num_tokens": 1755773414.0,
|
2698 |
+
"step": 1680
|
2699 |
+
},
|
2700 |
+
{
|
2701 |
+
"epoch": 1.568901303538175,
|
2702 |
+
"grad_norm": 0.4182275416284989,
|
2703 |
+
"learning_rate": 2.6509141083132115e-05,
|
2704 |
+
"loss": 0.3545,
|
2705 |
+
"num_tokens": 1760954744.0,
|
2706 |
+
"step": 1685
|
2707 |
+
},
|
2708 |
+
{
|
2709 |
+
"epoch": 1.5735567970204842,
|
2710 |
+
"grad_norm": 0.4424815500442761,
|
2711 |
+
"learning_rate": 2.6422904449810282e-05,
|
2712 |
+
"loss": 0.3608,
|
2713 |
+
"num_tokens": 1766197624.0,
|
2714 |
+
"step": 1690
|
2715 |
+
},
|
2716 |
+
{
|
2717 |
+
"epoch": 1.5782122905027933,
|
2718 |
+
"grad_norm": 0.5037267806139252,
|
2719 |
+
"learning_rate": 2.6336667816488446e-05,
|
2720 |
+
"loss": 0.3618,
|
2721 |
+
"num_tokens": 1771440504.0,
|
2722 |
+
"step": 1695
|
2723 |
+
},
|
2724 |
+
{
|
2725 |
+
"epoch": 1.5828677839851024,
|
2726 |
+
"grad_norm": 0.5757867203405523,
|
2727 |
+
"learning_rate": 2.6250431183166613e-05,
|
2728 |
+
"loss": 0.3537,
|
2729 |
+
"num_tokens": 1776683384.0,
|
2730 |
+
"step": 1700
|
2731 |
+
},
|
2732 |
+
{
|
2733 |
+
"epoch": 1.5875232774674115,
|
2734 |
+
"grad_norm": 0.4418015148391545,
|
2735 |
+
"learning_rate": 2.6164194549844773e-05,
|
2736 |
+
"loss": 0.3503,
|
2737 |
+
"num_tokens": 1781926264.0,
|
2738 |
+
"step": 1705
|
2739 |
+
},
|
2740 |
+
{
|
2741 |
+
"epoch": 1.5921787709497206,
|
2742 |
+
"grad_norm": 0.41320469617664296,
|
2743 |
+
"learning_rate": 2.607795791652294e-05,
|
2744 |
+
"loss": 0.35,
|
2745 |
+
"num_tokens": 1787169144.0,
|
2746 |
+
"step": 1710
|
2747 |
+
},
|
2748 |
+
{
|
2749 |
+
"epoch": 1.59683426443203,
|
2750 |
+
"grad_norm": 0.4654919170528904,
|
2751 |
+
"learning_rate": 2.5991721283201103e-05,
|
2752 |
+
"loss": 0.3492,
|
2753 |
+
"num_tokens": 1792412024.0,
|
2754 |
+
"step": 1715
|
2755 |
+
},
|
2756 |
+
{
|
2757 |
+
"epoch": 1.6014897579143388,
|
2758 |
+
"grad_norm": 0.46789325383997304,
|
2759 |
+
"learning_rate": 2.590548464987927e-05,
|
2760 |
+
"loss": 0.3514,
|
2761 |
+
"num_tokens": 1797634766.0,
|
2762 |
+
"step": 1720
|
2763 |
+
},
|
2764 |
+
{
|
2765 |
+
"epoch": 1.606145251396648,
|
2766 |
+
"grad_norm": 0.383414683196401,
|
2767 |
+
"learning_rate": 2.5819248016557434e-05,
|
2768 |
+
"loss": 0.3429,
|
2769 |
+
"num_tokens": 1802877646.0,
|
2770 |
+
"step": 1725
|
2771 |
+
},
|
2772 |
+
{
|
2773 |
+
"epoch": 1.6108007448789572,
|
2774 |
+
"grad_norm": 0.39464003167522826,
|
2775 |
+
"learning_rate": 2.57330113832356e-05,
|
2776 |
+
"loss": 0.3521,
|
2777 |
+
"num_tokens": 1808104662.0,
|
2778 |
+
"step": 1730
|
2779 |
+
},
|
2780 |
+
{
|
2781 |
+
"epoch": 1.6154562383612663,
|
2782 |
+
"grad_norm": 0.46968645292723943,
|
2783 |
+
"learning_rate": 2.5646774749913764e-05,
|
2784 |
+
"loss": 0.3453,
|
2785 |
+
"num_tokens": 1813321242.0,
|
2786 |
+
"step": 1735
|
2787 |
+
},
|
2788 |
+
{
|
2789 |
+
"epoch": 1.6201117318435754,
|
2790 |
+
"grad_norm": 0.3777881875602588,
|
2791 |
+
"learning_rate": 2.556053811659193e-05,
|
2792 |
+
"loss": 0.3478,
|
2793 |
+
"num_tokens": 1818564122.0,
|
2794 |
+
"step": 1740
|
2795 |
+
},
|
2796 |
+
{
|
2797 |
+
"epoch": 1.6247672253258845,
|
2798 |
+
"grad_norm": 0.3638617662724661,
|
2799 |
+
"learning_rate": 2.5474301483270098e-05,
|
2800 |
+
"loss": 0.35,
|
2801 |
+
"num_tokens": 1823807002.0,
|
2802 |
+
"step": 1745
|
2803 |
+
},
|
2804 |
+
{
|
2805 |
+
"epoch": 1.6294227188081938,
|
2806 |
+
"grad_norm": 0.43356993528545573,
|
2807 |
+
"learning_rate": 2.5388064849948255e-05,
|
2808 |
+
"loss": 0.3522,
|
2809 |
+
"num_tokens": 1828986398.0,
|
2810 |
+
"step": 1750
|
2811 |
+
},
|
2812 |
+
{
|
2813 |
+
"epoch": 1.6340782122905027,
|
2814 |
+
"grad_norm": 0.41013840469091506,
|
2815 |
+
"learning_rate": 2.5301828216626422e-05,
|
2816 |
+
"loss": 0.3463,
|
2817 |
+
"num_tokens": 1834229278.0,
|
2818 |
+
"step": 1755
|
2819 |
+
},
|
2820 |
+
{
|
2821 |
+
"epoch": 1.638733705772812,
|
2822 |
+
"grad_norm": 0.48759437990509724,
|
2823 |
+
"learning_rate": 2.521559158330459e-05,
|
2824 |
+
"loss": 0.3462,
|
2825 |
+
"num_tokens": 1839416094.0,
|
2826 |
+
"step": 1760
|
2827 |
+
},
|
2828 |
+
{
|
2829 |
+
"epoch": 1.643389199255121,
|
2830 |
+
"grad_norm": 0.4004343586595845,
|
2831 |
+
"learning_rate": 2.5129354949982752e-05,
|
2832 |
+
"loss": 0.3583,
|
2833 |
+
"num_tokens": 1844605766.0,
|
2834 |
+
"step": 1765
|
2835 |
+
},
|
2836 |
+
{
|
2837 |
+
"epoch": 1.6480446927374302,
|
2838 |
+
"grad_norm": 0.4132680619461919,
|
2839 |
+
"learning_rate": 2.504311831666092e-05,
|
2840 |
+
"loss": 0.3549,
|
2841 |
+
"num_tokens": 1849848646.0,
|
2842 |
+
"step": 1770
|
2843 |
+
},
|
2844 |
+
{
|
2845 |
+
"epoch": 1.6527001862197392,
|
2846 |
+
"grad_norm": 0.4470980214548608,
|
2847 |
+
"learning_rate": 2.4956881683339083e-05,
|
2848 |
+
"loss": 0.3545,
|
2849 |
+
"num_tokens": 1855091526.0,
|
2850 |
+
"step": 1775
|
2851 |
+
},
|
2852 |
+
{
|
2853 |
+
"epoch": 1.6573556797020483,
|
2854 |
+
"grad_norm": 0.41600413543566717,
|
2855 |
+
"learning_rate": 2.487064505001725e-05,
|
2856 |
+
"loss": 0.3519,
|
2857 |
+
"num_tokens": 1860334406.0,
|
2858 |
+
"step": 1780
|
2859 |
+
},
|
2860 |
+
{
|
2861 |
+
"epoch": 1.6620111731843576,
|
2862 |
+
"grad_norm": 0.4179342529390727,
|
2863 |
+
"learning_rate": 2.4784408416695413e-05,
|
2864 |
+
"loss": 0.3558,
|
2865 |
+
"num_tokens": 1865567454.0,
|
2866 |
+
"step": 1785
|
2867 |
+
},
|
2868 |
+
{
|
2869 |
+
"epoch": 1.6666666666666665,
|
2870 |
+
"grad_norm": 0.4295317513177463,
|
2871 |
+
"learning_rate": 2.4698171783373577e-05,
|
2872 |
+
"loss": 0.3428,
|
2873 |
+
"num_tokens": 1870810334.0,
|
2874 |
+
"step": 1790
|
2875 |
+
},
|
2876 |
+
{
|
2877 |
+
"epoch": 1.6713221601489758,
|
2878 |
+
"grad_norm": 0.4490287015466731,
|
2879 |
+
"learning_rate": 2.4611935150051744e-05,
|
2880 |
+
"loss": 0.3503,
|
2881 |
+
"num_tokens": 1876053214.0,
|
2882 |
+
"step": 1795
|
2883 |
+
},
|
2884 |
+
{
|
2885 |
+
"epoch": 1.675977653631285,
|
2886 |
+
"grad_norm": 0.5271562590781398,
|
2887 |
+
"learning_rate": 2.4525698516729908e-05,
|
2888 |
+
"loss": 0.3499,
|
2889 |
+
"num_tokens": 1881296094.0,
|
2890 |
+
"step": 1800
|
2891 |
+
},
|
2892 |
+
{
|
2893 |
+
"epoch": 1.680633147113594,
|
2894 |
+
"grad_norm": 0.3938709205380673,
|
2895 |
+
"learning_rate": 2.4439461883408075e-05,
|
2896 |
+
"loss": 0.3532,
|
2897 |
+
"num_tokens": 1886538974.0,
|
2898 |
+
"step": 1805
|
2899 |
+
},
|
2900 |
+
{
|
2901 |
+
"epoch": 1.6852886405959033,
|
2902 |
+
"grad_norm": 0.36542236991861243,
|
2903 |
+
"learning_rate": 2.4353225250086238e-05,
|
2904 |
+
"loss": 0.3494,
|
2905 |
+
"num_tokens": 1891781854.0,
|
2906 |
+
"step": 1810
|
2907 |
+
},
|
2908 |
+
{
|
2909 |
+
"epoch": 1.6899441340782122,
|
2910 |
+
"grad_norm": 0.5158674940389951,
|
2911 |
+
"learning_rate": 2.42669886167644e-05,
|
2912 |
+
"loss": 0.3568,
|
2913 |
+
"num_tokens": 1897024734.0,
|
2914 |
+
"step": 1815
|
2915 |
+
},
|
2916 |
+
{
|
2917 |
+
"epoch": 1.6945996275605215,
|
2918 |
+
"grad_norm": 0.5898812452716973,
|
2919 |
+
"learning_rate": 2.418075198344257e-05,
|
2920 |
+
"loss": 0.3427,
|
2921 |
+
"num_tokens": 1902267614.0,
|
2922 |
+
"step": 1820
|
2923 |
+
},
|
2924 |
+
{
|
2925 |
+
"epoch": 1.6992551210428304,
|
2926 |
+
"grad_norm": 0.46830565847427696,
|
2927 |
+
"learning_rate": 2.4094515350120732e-05,
|
2928 |
+
"loss": 0.3479,
|
2929 |
+
"num_tokens": 1907454764.0,
|
2930 |
+
"step": 1825
|
2931 |
+
},
|
2932 |
+
{
|
2933 |
+
"epoch": 1.7039106145251397,
|
2934 |
+
"grad_norm": 0.4720815332727475,
|
2935 |
+
"learning_rate": 2.4008278716798896e-05,
|
2936 |
+
"loss": 0.3538,
|
2937 |
+
"num_tokens": 1912633368.0,
|
2938 |
+
"step": 1830
|
2939 |
+
},
|
2940 |
+
{
|
2941 |
+
"epoch": 1.7085661080074488,
|
2942 |
+
"grad_norm": 0.42858660436141865,
|
2943 |
+
"learning_rate": 2.3922042083477063e-05,
|
2944 |
+
"loss": 0.3498,
|
2945 |
+
"num_tokens": 1917859036.0,
|
2946 |
+
"step": 1835
|
2947 |
+
},
|
2948 |
+
{
|
2949 |
+
"epoch": 1.7132216014897579,
|
2950 |
+
"grad_norm": 0.41197749771239084,
|
2951 |
+
"learning_rate": 2.3835805450155226e-05,
|
2952 |
+
"loss": 0.3415,
|
2953 |
+
"num_tokens": 1923043306.0,
|
2954 |
+
"step": 1840
|
2955 |
+
},
|
2956 |
+
{
|
2957 |
+
"epoch": 1.7178770949720672,
|
2958 |
+
"grad_norm": 0.4115656817654512,
|
2959 |
+
"learning_rate": 2.3749568816833393e-05,
|
2960 |
+
"loss": 0.3551,
|
2961 |
+
"num_tokens": 1928286186.0,
|
2962 |
+
"step": 1845
|
2963 |
+
},
|
2964 |
+
{
|
2965 |
+
"epoch": 1.722532588454376,
|
2966 |
+
"grad_norm": 0.43870051277515076,
|
2967 |
+
"learning_rate": 2.3663332183511557e-05,
|
2968 |
+
"loss": 0.3473,
|
2969 |
+
"num_tokens": 1933529066.0,
|
2970 |
+
"step": 1850
|
2971 |
+
},
|
2972 |
+
{
|
2973 |
+
"epoch": 1.7271880819366854,
|
2974 |
+
"grad_norm": 0.4051372237420238,
|
2975 |
+
"learning_rate": 2.357709555018972e-05,
|
2976 |
+
"loss": 0.3406,
|
2977 |
+
"num_tokens": 1938771946.0,
|
2978 |
+
"step": 1855
|
2979 |
+
},
|
2980 |
+
{
|
2981 |
+
"epoch": 1.7318435754189943,
|
2982 |
+
"grad_norm": 0.36861278944342607,
|
2983 |
+
"learning_rate": 2.3490858916867887e-05,
|
2984 |
+
"loss": 0.3395,
|
2985 |
+
"num_tokens": 1943951362.0,
|
2986 |
+
"step": 1860
|
2987 |
+
},
|
2988 |
+
{
|
2989 |
+
"epoch": 1.7364990689013036,
|
2990 |
+
"grad_norm": 0.37116380184385894,
|
2991 |
+
"learning_rate": 2.340462228354605e-05,
|
2992 |
+
"loss": 0.3554,
|
2993 |
+
"num_tokens": 1949060174.0,
|
2994 |
+
"step": 1865
|
2995 |
+
},
|
2996 |
+
{
|
2997 |
+
"epoch": 1.7411545623836127,
|
2998 |
+
"grad_norm": 0.5205105281665823,
|
2999 |
+
"learning_rate": 2.3318385650224218e-05,
|
3000 |
+
"loss": 0.3537,
|
3001 |
+
"num_tokens": 1954293264.0,
|
3002 |
+
"step": 1870
|
3003 |
+
},
|
3004 |
+
{
|
3005 |
+
"epoch": 1.7458100558659218,
|
3006 |
+
"grad_norm": 0.4456953970069166,
|
3007 |
+
"learning_rate": 2.323214901690238e-05,
|
3008 |
+
"loss": 0.3439,
|
3009 |
+
"num_tokens": 1959495158.0,
|
3010 |
+
"step": 1875
|
3011 |
+
},
|
3012 |
+
{
|
3013 |
+
"epoch": 1.750465549348231,
|
3014 |
+
"grad_norm": 0.47447329743147876,
|
3015 |
+
"learning_rate": 2.3145912383580545e-05,
|
3016 |
+
"loss": 0.3545,
|
3017 |
+
"num_tokens": 1964738038.0,
|
3018 |
+
"step": 1880
|
3019 |
+
},
|
3020 |
+
{
|
3021 |
+
"epoch": 1.75512104283054,
|
3022 |
+
"grad_norm": 0.3557351154983947,
|
3023 |
+
"learning_rate": 2.3059675750258712e-05,
|
3024 |
+
"loss": 0.3567,
|
3025 |
+
"num_tokens": 1969980918.0,
|
3026 |
+
"step": 1885
|
3027 |
+
},
|
3028 |
+
{
|
3029 |
+
"epoch": 1.7597765363128492,
|
3030 |
+
"grad_norm": 0.40245903573677616,
|
3031 |
+
"learning_rate": 2.2973439116936875e-05,
|
3032 |
+
"loss": 0.3553,
|
3033 |
+
"num_tokens": 1975223798.0,
|
3034 |
+
"step": 1890
|
3035 |
+
},
|
3036 |
+
{
|
3037 |
+
"epoch": 1.7644320297951583,
|
3038 |
+
"grad_norm": 0.42631391419004666,
|
3039 |
+
"learning_rate": 2.2887202483615042e-05,
|
3040 |
+
"loss": 0.3524,
|
3041 |
+
"num_tokens": 1980411150.0,
|
3042 |
+
"step": 1895
|
3043 |
+
},
|
3044 |
+
{
|
3045 |
+
"epoch": 1.7690875232774674,
|
3046 |
+
"grad_norm": 0.4036734907222041,
|
3047 |
+
"learning_rate": 2.2800965850293206e-05,
|
3048 |
+
"loss": 0.3489,
|
3049 |
+
"num_tokens": 1985654030.0,
|
3050 |
+
"step": 1900
|
3051 |
+
},
|
3052 |
+
{
|
3053 |
+
"epoch": 1.7737430167597765,
|
3054 |
+
"grad_norm": 0.43266404699630584,
|
3055 |
+
"learning_rate": 2.271472921697137e-05,
|
3056 |
+
"loss": 0.3571,
|
3057 |
+
"num_tokens": 1990896910.0,
|
3058 |
+
"step": 1905
|
3059 |
+
},
|
3060 |
+
{
|
3061 |
+
"epoch": 1.7783985102420856,
|
3062 |
+
"grad_norm": 0.4390506238721815,
|
3063 |
+
"learning_rate": 2.2628492583649536e-05,
|
3064 |
+
"loss": 0.3415,
|
3065 |
+
"num_tokens": 1996077922.0,
|
3066 |
+
"step": 1910
|
3067 |
+
},
|
3068 |
+
{
|
3069 |
+
"epoch": 1.783054003724395,
|
3070 |
+
"grad_norm": 0.45868358663134706,
|
3071 |
+
"learning_rate": 2.25422559503277e-05,
|
3072 |
+
"loss": 0.343,
|
3073 |
+
"num_tokens": 2001320802.0,
|
3074 |
+
"step": 1915
|
3075 |
+
},
|
3076 |
+
{
|
3077 |
+
"epoch": 1.7877094972067038,
|
3078 |
+
"grad_norm": 0.4308135634115946,
|
3079 |
+
"learning_rate": 2.2456019317005867e-05,
|
3080 |
+
"loss": 0.3455,
|
3081 |
+
"num_tokens": 2006537026.0,
|
3082 |
+
"step": 1920
|
3083 |
+
},
|
3084 |
+
{
|
3085 |
+
"epoch": 1.7923649906890131,
|
3086 |
+
"grad_norm": 0.5352962259215817,
|
3087 |
+
"learning_rate": 2.236978268368403e-05,
|
3088 |
+
"loss": 0.3483,
|
3089 |
+
"num_tokens": 2011779524.0,
|
3090 |
+
"step": 1925
|
3091 |
+
},
|
3092 |
+
{
|
3093 |
+
"epoch": 1.7970204841713222,
|
3094 |
+
"grad_norm": 0.45675715635498815,
|
3095 |
+
"learning_rate": 2.2283546050362194e-05,
|
3096 |
+
"loss": 0.3511,
|
3097 |
+
"num_tokens": 2017022404.0,
|
3098 |
+
"step": 1930
|
3099 |
+
},
|
3100 |
+
{
|
3101 |
+
"epoch": 1.8016759776536313,
|
3102 |
+
"grad_norm": 0.37896808667371934,
|
3103 |
+
"learning_rate": 2.219730941704036e-05,
|
3104 |
+
"loss": 0.3394,
|
3105 |
+
"num_tokens": 2022239646.0,
|
3106 |
+
"step": 1935
|
3107 |
+
},
|
3108 |
+
{
|
3109 |
+
"epoch": 1.8063314711359404,
|
3110 |
+
"grad_norm": 0.5097191341558315,
|
3111 |
+
"learning_rate": 2.2111072783718524e-05,
|
3112 |
+
"loss": 0.3473,
|
3113 |
+
"num_tokens": 2027418744.0,
|
3114 |
+
"step": 1940
|
3115 |
+
},
|
3116 |
+
{
|
3117 |
+
"epoch": 1.8109869646182495,
|
3118 |
+
"grad_norm": 0.4478696818866437,
|
3119 |
+
"learning_rate": 2.2024836150396688e-05,
|
3120 |
+
"loss": 0.3506,
|
3121 |
+
"num_tokens": 2032661624.0,
|
3122 |
+
"step": 1945
|
3123 |
+
},
|
3124 |
+
{
|
3125 |
+
"epoch": 1.8156424581005588,
|
3126 |
+
"grad_norm": 0.5348913121148593,
|
3127 |
+
"learning_rate": 2.1938599517074855e-05,
|
3128 |
+
"loss": 0.3431,
|
3129 |
+
"num_tokens": 2037904504.0,
|
3130 |
+
"step": 1950
|
3131 |
+
},
|
3132 |
+
{
|
3133 |
+
"epoch": 1.8202979515828677,
|
3134 |
+
"grad_norm": 0.35782661985763603,
|
3135 |
+
"learning_rate": 2.185236288375302e-05,
|
3136 |
+
"loss": 0.3435,
|
3137 |
+
"num_tokens": 2043147384.0,
|
3138 |
+
"step": 1955
|
3139 |
+
},
|
3140 |
+
{
|
3141 |
+
"epoch": 1.824953445065177,
|
3142 |
+
"grad_norm": 0.4030255067164449,
|
3143 |
+
"learning_rate": 2.1766126250431186e-05,
|
3144 |
+
"loss": 0.3454,
|
3145 |
+
"num_tokens": 2048390264.0,
|
3146 |
+
"step": 1960
|
3147 |
+
},
|
3148 |
+
{
|
3149 |
+
"epoch": 1.829608938547486,
|
3150 |
+
"grad_norm": 0.4856718408524328,
|
3151 |
+
"learning_rate": 2.167988961710935e-05,
|
3152 |
+
"loss": 0.3508,
|
3153 |
+
"num_tokens": 2053565316.0,
|
3154 |
+
"step": 1965
|
3155 |
+
},
|
3156 |
+
{
|
3157 |
+
"epoch": 1.8342644320297952,
|
3158 |
+
"grad_norm": 0.38701100880731554,
|
3159 |
+
"learning_rate": 2.1593652983787513e-05,
|
3160 |
+
"loss": 0.3387,
|
3161 |
+
"num_tokens": 2058808196.0,
|
3162 |
+
"step": 1970
|
3163 |
+
},
|
3164 |
+
{
|
3165 |
+
"epoch": 1.8389199255121043,
|
3166 |
+
"grad_norm": 0.37713447559268454,
|
3167 |
+
"learning_rate": 2.150741635046568e-05,
|
3168 |
+
"loss": 0.3482,
|
3169 |
+
"num_tokens": 2064051076.0,
|
3170 |
+
"step": 1975
|
3171 |
+
},
|
3172 |
+
{
|
3173 |
+
"epoch": 1.8435754189944134,
|
3174 |
+
"grad_norm": 0.4177726543134029,
|
3175 |
+
"learning_rate": 2.1421179717143843e-05,
|
3176 |
+
"loss": 0.339,
|
3177 |
+
"num_tokens": 2069293956.0,
|
3178 |
+
"step": 1980
|
3179 |
+
},
|
3180 |
+
{
|
3181 |
+
"epoch": 1.8482309124767227,
|
3182 |
+
"grad_norm": 0.5101259156674925,
|
3183 |
+
"learning_rate": 2.133494308382201e-05,
|
3184 |
+
"loss": 0.3532,
|
3185 |
+
"num_tokens": 2074536836.0,
|
3186 |
+
"step": 1985
|
3187 |
+
},
|
3188 |
+
{
|
3189 |
+
"epoch": 1.8528864059590315,
|
3190 |
+
"grad_norm": 0.44611679979677504,
|
3191 |
+
"learning_rate": 2.1248706450500174e-05,
|
3192 |
+
"loss": 0.3516,
|
3193 |
+
"num_tokens": 2079779716.0,
|
3194 |
+
"step": 1990
|
3195 |
+
},
|
3196 |
+
{
|
3197 |
+
"epoch": 1.8575418994413408,
|
3198 |
+
"grad_norm": 0.4524211991169788,
|
3199 |
+
"learning_rate": 2.1162469817178337e-05,
|
3200 |
+
"loss": 0.3432,
|
3201 |
+
"num_tokens": 2084952036.0,
|
3202 |
+
"step": 1995
|
3203 |
+
},
|
3204 |
+
{
|
3205 |
+
"epoch": 1.86219739292365,
|
3206 |
+
"grad_norm": 0.3783683026141043,
|
3207 |
+
"learning_rate": 2.1076233183856504e-05,
|
3208 |
+
"loss": 0.3549,
|
3209 |
+
"num_tokens": 2090194916.0,
|
3210 |
+
"step": 2000
|
3211 |
}
|
3212 |
],
|
3213 |
"logging_steps": 5,
|
|
|
3227 |
"attributes": {}
|
3228 |
}
|
3229 |
},
|
3230 |
+
"total_flos": 1.7113228334533181e+18,
|
3231 |
"train_batch_size": 1,
|
3232 |
"trial_name": null,
|
3233 |
"trial_params": null
|
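The log_history entries added above record epoch, grad_norm, learning_rate, loss, and num_tokens every 5 optimizer steps, ending at the step-2000 checkpoint. A minimal sketch for inspecting the log offline (the file path is illustrative; the field layout assumed is exactly the one shown in the diff above):

import json

# Load the trainer state uploaded with this checkpoint
# (adjust the path to wherever trainer_state.json was downloaded).
with open("trainer_state.json") as f:
    state = json.load(f)

# Tabulate the training records; each one mirrors an added block above.
for entry in state["log_history"]:
    if "loss" not in entry:  # skip any records without a training loss
        continue
    print(f"step {entry['step']:>4}  "
          f"epoch {entry['epoch']:.3f}  "
          f"loss {entry['loss']:.4f}  "
          f"lr {entry['learning_rate']:.4e}  "
          f"tokens {int(entry['num_tokens']):,}")

Over the span shown, the logged learning_rate falls by a constant 8.62e-08 every 5 steps (about 1.72e-08 per step), consistent with a linear decay schedule; training loss stays in a roughly 0.34-0.36 band and num_tokens grows by about 5.2M tokens per 5 steps.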