ZMC2019 committed on
Commit
9e6183e
·
verified ·
1 Parent(s): 1e42316

Upload E2 checkpoint

Browse files
global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cdb4b556c834ccf83a60d6efd651e4482b75d238db8e36cd111e4936004e067
3
+ size 11423429708
global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:157dd94d9a55fc3c0344dbb3607edb8a3fd75d0a5fcf6ddaa5d86a1ddb6adf39
3
+ size 11423429708
global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3a0e641f7aa992b1213e1cf6fca46e093c11db7c5ec94522b432eed86ab2e89
3
+ size 11423429708
global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8126432d183f33ac27017c30601eb49eb821a3f05e117366f325670178c5aa7a
3
+ size 11423429708
global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d28d6c8ec1e368672960aa4a9efdf3ea0bd755b54ab8e866caad394e0d8291c
3
+ size 11423429708
global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b568717af726d537029642b4f77c9e67419b0a74a5521977b8d486f31fd3db8a
3
+ size 11423429708
global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fd593e3f8f4b5a516dfac8092155dbc1be416c977f159164e1389e9130ffe92
3
+ size 11423429708
global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19e93281714c7d80e0f04d6adea4bb1f33baf4b4f5c5b57cbd963ac688bd48dc
3
+ size 11423429708
global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459639211da751f24943c72ae6f76c43cba665eb88ee9a83eaca7fd596e03747
3
+ size 166293
global_step2000/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:741e71908eea77bb92b52656fae9fffca00913e381f2fc98e56cbb68ab267b52
3
+ size 166293
global_step2000/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37b14cf128b2071636107aad6e5665dc148fc8be3b34c1c7e371603803cfe352
3
+ size 166293
global_step2000/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6d9a6a4a5b01db724facd96e84a303c0af818a248906bb0f1b0978465313826
3
+ size 166293
global_step2000/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:760d5052d2770143bd31198a769e3739d4bcbda61ee691a9a9ea840e61b98f95
3
+ size 166293
global_step2000/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f92194315b137a7902feced8b6b4c08f0c3af63ddc82836520e59d2e49f83a7d
3
+ size 166293
global_step2000/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5987c9b0ed132408a1e457b107d52473e77f8755e53cfad66e69501ba13f2e1
3
+ size 166293
global_step2000/zero_pp_rank_7_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc325b7ed520bce8f3b9cf5068eaca86b253acabc55f6d9a45f61d85ad0e13a9
3
+ size 166293
latest CHANGED
@@ -1 +1 @@
1
- global_step1000
 
1
+ global_step2000
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86f16ae8576b09cddbe25a2c03e2efe2889e5a02ddd278b044cbb65bcfa9ec81
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1c8ebd9244c4146ecf809272a71d6b4de47fc24d81bd932a9b771cb1f41d6c2
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b3d5a248152de58e62715836521dd89143c14bb07ff790fc5d8fbf43e426994
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90d30071d983d8bac86b3707db92a52bf4d92e0a981a63886aed68b61a85dd05
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7211304c7612e2f448f7778401abd61bb028bc0fe2fb8e8fc6a5e4d10f86a6d
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77bde50d5d797508e74fe17bb3105a5f1a1d3d46dd9db24e584e2a67371cf791
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:138187716c55594e048b80ab226519cb31dbd849efe41dd9f5579ddb1b3c5a76
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c111ea5d7f44c38d24eaa6ff0aa8d4ace60b5ff82e44f039b1e8e97d7c7372a
3
  size 1089994880
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85
3
  size 15984
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0728b56dab7abb5ef8a0d4bae3519c5767c97467bdd886d26bf19cc8599d0312
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73
3
  size 15984
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4e481d4ef1546694da7337f6bb6c658b866dcb79b85deeb477da0d27ebe851e
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b
3
  size 15984
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:353c60be37ea56fc992fca446598ceca5d1fd002aa3bd6dbb9ad740e6f47ebb3
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc
3
  size 15984
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9107fe964ba7205e354084b85210e5a5ea1c98cfd4d38adb9cd3926945dcae4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972
3
  size 15984
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69d1bb1abee38b92e53f3f23549b642ce0f1edcdccf7b6129847ac61636e96d5
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991
3
  size 15984
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afd5516048e20f36959601574e29e40106085a7d3cdc7bf425ce5e84633490e6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa
3
  size 15984
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e2c46927fc06939b4c976a01e4b95dec1f8b98ceaea86d31a5d756fc30ff006
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773
3
  size 15984
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4a1318e6b2a090007a2e160fa226b63a07ef8bed0e3001c5edee1c06ac6d736
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a1f4ee95be23d350caa2c6cf69b932338c4918ddeea5378d2ec4a8922e5be11
3
  size 1064
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.931098696461825,
6
  "eval_steps": 500,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1608,6 +1608,1606 @@
1608
  "loss": 0.4178,
1609
  "num_tokens": 1045662642.0,
1610
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1611
  }
1612
  ],
1613
  "logging_steps": 5,
@@ -1627,7 +3227,7 @@
1627
  "attributes": {}
1628
  }
1629
  },
1630
- "total_flos": 8.555395589750129e+17,
1631
  "train_batch_size": 1,
1632
  "trial_name": null,
1633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.86219739292365,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1608
  "loss": 0.4178,
1609
  "num_tokens": 1045662642.0,
1610
  "step": 1000
1611
+ },
1612
+ {
1613
+ "epoch": 0.9357541899441341,
1614
+ "grad_norm": 0.5658078705552342,
1615
+ "learning_rate": 3.8237323214901695e-05,
1616
+ "loss": 0.4191,
1617
+ "num_tokens": 1050864562.0,
1618
+ "step": 1005
1619
+ },
1620
+ {
1621
+ "epoch": 0.9404096834264432,
1622
+ "grad_norm": 0.5748827219407342,
1623
+ "learning_rate": 3.815108658157986e-05,
1624
+ "loss": 0.417,
1625
+ "num_tokens": 1056107442.0,
1626
+ "step": 1010
1627
+ },
1628
+ {
1629
+ "epoch": 0.9450651769087524,
1630
+ "grad_norm": 0.5609460084210117,
1631
+ "learning_rate": 3.806484994825802e-05,
1632
+ "loss": 0.4055,
1633
+ "num_tokens": 1061349356.0,
1634
+ "step": 1015
1635
+ },
1636
+ {
1637
+ "epoch": 0.9497206703910615,
1638
+ "grad_norm": 0.48644456614397286,
1639
+ "learning_rate": 3.7978613314936186e-05,
1640
+ "loss": 0.4171,
1641
+ "num_tokens": 1066567410.0,
1642
+ "step": 1020
1643
+ },
1644
+ {
1645
+ "epoch": 0.9543761638733705,
1646
+ "grad_norm": 0.49374585932277654,
1647
+ "learning_rate": 3.789237668161435e-05,
1648
+ "loss": 0.4086,
1649
+ "num_tokens": 1071744846.0,
1650
+ "step": 1025
1651
+ },
1652
+ {
1653
+ "epoch": 0.9590316573556797,
1654
+ "grad_norm": 0.4590964927032936,
1655
+ "learning_rate": 3.780614004829251e-05,
1656
+ "loss": 0.4224,
1657
+ "num_tokens": 1076987726.0,
1658
+ "step": 1030
1659
+ },
1660
+ {
1661
+ "epoch": 0.9636871508379888,
1662
+ "grad_norm": 0.5671382944104515,
1663
+ "learning_rate": 3.771990341497068e-05,
1664
+ "loss": 0.416,
1665
+ "num_tokens": 1082230606.0,
1666
+ "step": 1035
1667
+ },
1668
+ {
1669
+ "epoch": 0.9683426443202979,
1670
+ "grad_norm": 0.7029680270448432,
1671
+ "learning_rate": 3.7633666781648847e-05,
1672
+ "loss": 0.4103,
1673
+ "num_tokens": 1087473486.0,
1674
+ "step": 1040
1675
+ },
1676
+ {
1677
+ "epoch": 0.972998137802607,
1678
+ "grad_norm": 0.6041713356383028,
1679
+ "learning_rate": 3.754743014832701e-05,
1680
+ "loss": 0.4181,
1681
+ "num_tokens": 1092716366.0,
1682
+ "step": 1045
1683
+ },
1684
+ {
1685
+ "epoch": 0.9776536312849162,
1686
+ "grad_norm": 0.5410682956289158,
1687
+ "learning_rate": 3.746119351500518e-05,
1688
+ "loss": 0.4141,
1689
+ "num_tokens": 1097959246.0,
1690
+ "step": 1050
1691
+ },
1692
+ {
1693
+ "epoch": 0.9823091247672253,
1694
+ "grad_norm": 0.5964451598240291,
1695
+ "learning_rate": 3.7374956881683344e-05,
1696
+ "loss": 0.4107,
1697
+ "num_tokens": 1103182300.0,
1698
+ "step": 1055
1699
+ },
1700
+ {
1701
+ "epoch": 0.9869646182495344,
1702
+ "grad_norm": 0.6398589551828828,
1703
+ "learning_rate": 3.728872024836151e-05,
1704
+ "loss": 0.4099,
1705
+ "num_tokens": 1108425180.0,
1706
+ "step": 1060
1707
+ },
1708
+ {
1709
+ "epoch": 0.9916201117318436,
1710
+ "grad_norm": 0.711512694227751,
1711
+ "learning_rate": 3.720248361503967e-05,
1712
+ "loss": 0.4159,
1713
+ "num_tokens": 1113668060.0,
1714
+ "step": 1065
1715
+ },
1716
+ {
1717
+ "epoch": 0.9962756052141527,
1718
+ "grad_norm": 0.7462919172813813,
1719
+ "learning_rate": 3.7116246981717835e-05,
1720
+ "loss": 0.4158,
1721
+ "num_tokens": 1118847586.0,
1722
+ "step": 1070
1723
+ },
1724
+ {
1725
+ "epoch": 1.000931098696462,
1726
+ "grad_norm": 0.5826662993124105,
1727
+ "learning_rate": 3.7030010348396e-05,
1728
+ "loss": 0.399,
1729
+ "num_tokens": 1123172962.0,
1730
+ "step": 1075
1731
+ },
1732
+ {
1733
+ "epoch": 1.005586592178771,
1734
+ "grad_norm": 0.5029709348384636,
1735
+ "learning_rate": 3.694377371507416e-05,
1736
+ "loss": 0.355,
1737
+ "num_tokens": 1128415842.0,
1738
+ "step": 1080
1739
+ },
1740
+ {
1741
+ "epoch": 1.01024208566108,
1742
+ "grad_norm": 0.4893880194541248,
1743
+ "learning_rate": 3.685753708175233e-05,
1744
+ "loss": 0.3469,
1745
+ "num_tokens": 1133612886.0,
1746
+ "step": 1085
1747
+ },
1748
+ {
1749
+ "epoch": 1.0148975791433892,
1750
+ "grad_norm": 0.514699919148864,
1751
+ "learning_rate": 3.6771300448430496e-05,
1752
+ "loss": 0.3532,
1753
+ "num_tokens": 1138855766.0,
1754
+ "step": 1090
1755
+ },
1756
+ {
1757
+ "epoch": 1.0195530726256983,
1758
+ "grad_norm": 0.4849219380515942,
1759
+ "learning_rate": 3.668506381510866e-05,
1760
+ "loss": 0.3508,
1761
+ "num_tokens": 1144098646.0,
1762
+ "step": 1095
1763
+ },
1764
+ {
1765
+ "epoch": 1.0242085661080074,
1766
+ "grad_norm": 0.41360144495958895,
1767
+ "learning_rate": 3.659882718178683e-05,
1768
+ "loss": 0.3468,
1769
+ "num_tokens": 1149341526.0,
1770
+ "step": 1100
1771
+ },
1772
+ {
1773
+ "epoch": 1.0288640595903167,
1774
+ "grad_norm": 0.5018592330001277,
1775
+ "learning_rate": 3.651259054846499e-05,
1776
+ "loss": 0.3565,
1777
+ "num_tokens": 1154584406.0,
1778
+ "step": 1105
1779
+ },
1780
+ {
1781
+ "epoch": 1.0335195530726258,
1782
+ "grad_norm": 0.44206019018889825,
1783
+ "learning_rate": 3.642635391514315e-05,
1784
+ "loss": 0.3547,
1785
+ "num_tokens": 1159806052.0,
1786
+ "step": 1110
1787
+ },
1788
+ {
1789
+ "epoch": 1.0381750465549349,
1790
+ "grad_norm": 0.4264076762021219,
1791
+ "learning_rate": 3.634011728182132e-05,
1792
+ "loss": 0.3599,
1793
+ "num_tokens": 1165048932.0,
1794
+ "step": 1115
1795
+ },
1796
+ {
1797
+ "epoch": 1.042830540037244,
1798
+ "grad_norm": 0.5567650614214772,
1799
+ "learning_rate": 3.6253880648499484e-05,
1800
+ "loss": 0.3564,
1801
+ "num_tokens": 1170270856.0,
1802
+ "step": 1120
1803
+ },
1804
+ {
1805
+ "epoch": 1.047486033519553,
1806
+ "grad_norm": 0.5228694711322092,
1807
+ "learning_rate": 3.616764401517765e-05,
1808
+ "loss": 0.3553,
1809
+ "num_tokens": 1175458142.0,
1810
+ "step": 1125
1811
+ },
1812
+ {
1813
+ "epoch": 1.0521415270018621,
1814
+ "grad_norm": 0.4599296157768013,
1815
+ "learning_rate": 3.608140738185581e-05,
1816
+ "loss": 0.352,
1817
+ "num_tokens": 1180603216.0,
1818
+ "step": 1130
1819
+ },
1820
+ {
1821
+ "epoch": 1.0567970204841712,
1822
+ "grad_norm": 0.405001210315896,
1823
+ "learning_rate": 3.599517074853398e-05,
1824
+ "loss": 0.3466,
1825
+ "num_tokens": 1185846096.0,
1826
+ "step": 1135
1827
+ },
1828
+ {
1829
+ "epoch": 1.0614525139664805,
1830
+ "grad_norm": 0.4688055181909875,
1831
+ "learning_rate": 3.5908934115212145e-05,
1832
+ "loss": 0.3472,
1833
+ "num_tokens": 1191088976.0,
1834
+ "step": 1140
1835
+ },
1836
+ {
1837
+ "epoch": 1.0661080074487896,
1838
+ "grad_norm": 0.4347382388506508,
1839
+ "learning_rate": 3.582269748189031e-05,
1840
+ "loss": 0.3536,
1841
+ "num_tokens": 1196289062.0,
1842
+ "step": 1145
1843
+ },
1844
+ {
1845
+ "epoch": 1.0707635009310987,
1846
+ "grad_norm": 0.45831883115463856,
1847
+ "learning_rate": 3.573646084856848e-05,
1848
+ "loss": 0.3647,
1849
+ "num_tokens": 1201492904.0,
1850
+ "step": 1150
1851
+ },
1852
+ {
1853
+ "epoch": 1.0754189944134078,
1854
+ "grad_norm": 0.4720640762198147,
1855
+ "learning_rate": 3.5650224215246636e-05,
1856
+ "loss": 0.3442,
1857
+ "num_tokens": 1206735784.0,
1858
+ "step": 1155
1859
+ },
1860
+ {
1861
+ "epoch": 1.080074487895717,
1862
+ "grad_norm": 0.379073526084197,
1863
+ "learning_rate": 3.55639875819248e-05,
1864
+ "loss": 0.3585,
1865
+ "num_tokens": 1211978664.0,
1866
+ "step": 1160
1867
+ },
1868
+ {
1869
+ "epoch": 1.084729981378026,
1870
+ "grad_norm": 0.4669245913480403,
1871
+ "learning_rate": 3.547775094860297e-05,
1872
+ "loss": 0.3708,
1873
+ "num_tokens": 1217221544.0,
1874
+ "step": 1165
1875
+ },
1876
+ {
1877
+ "epoch": 1.089385474860335,
1878
+ "grad_norm": 0.8249174245152311,
1879
+ "learning_rate": 3.539151431528113e-05,
1880
+ "loss": 0.3597,
1881
+ "num_tokens": 1222450604.0,
1882
+ "step": 1170
1883
+ },
1884
+ {
1885
+ "epoch": 1.0940409683426444,
1886
+ "grad_norm": 0.5231930637239787,
1887
+ "learning_rate": 3.5305277681959297e-05,
1888
+ "loss": 0.3526,
1889
+ "num_tokens": 1227693484.0,
1890
+ "step": 1175
1891
+ },
1892
+ {
1893
+ "epoch": 1.0986964618249535,
1894
+ "grad_norm": 0.5270777353874178,
1895
+ "learning_rate": 3.521904104863746e-05,
1896
+ "loss": 0.3548,
1897
+ "num_tokens": 1232936364.0,
1898
+ "step": 1180
1899
+ },
1900
+ {
1901
+ "epoch": 1.1033519553072626,
1902
+ "grad_norm": 0.44135156617910987,
1903
+ "learning_rate": 3.513280441531563e-05,
1904
+ "loss": 0.3483,
1905
+ "num_tokens": 1238179244.0,
1906
+ "step": 1185
1907
+ },
1908
+ {
1909
+ "epoch": 1.1080074487895717,
1910
+ "grad_norm": 0.4385440230745658,
1911
+ "learning_rate": 3.5046567781993794e-05,
1912
+ "loss": 0.3632,
1913
+ "num_tokens": 1243422124.0,
1914
+ "step": 1190
1915
+ },
1916
+ {
1917
+ "epoch": 1.1126629422718808,
1918
+ "grad_norm": 0.4508339201057018,
1919
+ "learning_rate": 3.496033114867196e-05,
1920
+ "loss": 0.3598,
1921
+ "num_tokens": 1248665004.0,
1922
+ "step": 1195
1923
+ },
1924
+ {
1925
+ "epoch": 1.1173184357541899,
1926
+ "grad_norm": 0.4779567369574128,
1927
+ "learning_rate": 3.487409451535012e-05,
1928
+ "loss": 0.3497,
1929
+ "num_tokens": 1253907884.0,
1930
+ "step": 1200
1931
+ },
1932
+ {
1933
+ "epoch": 1.121973929236499,
1934
+ "grad_norm": 0.5625979479787355,
1935
+ "learning_rate": 3.4787857882028285e-05,
1936
+ "loss": 0.3586,
1937
+ "num_tokens": 1259150764.0,
1938
+ "step": 1205
1939
+ },
1940
+ {
1941
+ "epoch": 1.1266294227188083,
1942
+ "grad_norm": 0.6014959273175333,
1943
+ "learning_rate": 3.470162124870645e-05,
1944
+ "loss": 0.3514,
1945
+ "num_tokens": 1264327888.0,
1946
+ "step": 1210
1947
+ },
1948
+ {
1949
+ "epoch": 1.1312849162011174,
1950
+ "grad_norm": 0.4906335352032041,
1951
+ "learning_rate": 3.461538461538462e-05,
1952
+ "loss": 0.3509,
1953
+ "num_tokens": 1269570768.0,
1954
+ "step": 1215
1955
+ },
1956
+ {
1957
+ "epoch": 1.1359404096834265,
1958
+ "grad_norm": 0.4082724800162844,
1959
+ "learning_rate": 3.452914798206278e-05,
1960
+ "loss": 0.3395,
1961
+ "num_tokens": 1274789340.0,
1962
+ "step": 1220
1963
+ },
1964
+ {
1965
+ "epoch": 1.1405959031657356,
1966
+ "grad_norm": 0.4750009264613512,
1967
+ "learning_rate": 3.4442911348740946e-05,
1968
+ "loss": 0.3579,
1969
+ "num_tokens": 1280032220.0,
1970
+ "step": 1225
1971
+ },
1972
+ {
1973
+ "epoch": 1.1452513966480447,
1974
+ "grad_norm": 0.48685393486923184,
1975
+ "learning_rate": 3.435667471541911e-05,
1976
+ "loss": 0.3549,
1977
+ "num_tokens": 1285275100.0,
1978
+ "step": 1230
1979
+ },
1980
+ {
1981
+ "epoch": 1.1499068901303537,
1982
+ "grad_norm": 0.5834960539136641,
1983
+ "learning_rate": 3.427043808209728e-05,
1984
+ "loss": 0.3498,
1985
+ "num_tokens": 1290517980.0,
1986
+ "step": 1235
1987
+ },
1988
+ {
1989
+ "epoch": 1.1545623836126628,
1990
+ "grad_norm": 0.6100955925437853,
1991
+ "learning_rate": 3.418420144877544e-05,
1992
+ "loss": 0.3596,
1993
+ "num_tokens": 1295760860.0,
1994
+ "step": 1240
1995
+ },
1996
+ {
1997
+ "epoch": 1.1592178770949721,
1998
+ "grad_norm": 0.522871641456894,
1999
+ "learning_rate": 3.409796481545361e-05,
2000
+ "loss": 0.3471,
2001
+ "num_tokens": 1300940750.0,
2002
+ "step": 1245
2003
+ },
2004
+ {
2005
+ "epoch": 1.1638733705772812,
2006
+ "grad_norm": 0.4194412850396169,
2007
+ "learning_rate": 3.401172818213177e-05,
2008
+ "loss": 0.3563,
2009
+ "num_tokens": 1306183630.0,
2010
+ "step": 1250
2011
+ },
2012
+ {
2013
+ "epoch": 1.1685288640595903,
2014
+ "grad_norm": 0.44072988623105025,
2015
+ "learning_rate": 3.3925491548809934e-05,
2016
+ "loss": 0.3515,
2017
+ "num_tokens": 1311406684.0,
2018
+ "step": 1255
2019
+ },
2020
+ {
2021
+ "epoch": 1.1731843575418994,
2022
+ "grad_norm": 0.5394080245112541,
2023
+ "learning_rate": 3.38392549154881e-05,
2024
+ "loss": 0.3683,
2025
+ "num_tokens": 1316641742.0,
2026
+ "step": 1260
2027
+ },
2028
+ {
2029
+ "epoch": 1.1778398510242085,
2030
+ "grad_norm": 0.4449620104225317,
2031
+ "learning_rate": 3.375301828216627e-05,
2032
+ "loss": 0.3589,
2033
+ "num_tokens": 1321884622.0,
2034
+ "step": 1265
2035
+ },
2036
+ {
2037
+ "epoch": 1.1824953445065176,
2038
+ "grad_norm": 0.41227147035395784,
2039
+ "learning_rate": 3.366678164884443e-05,
2040
+ "loss": 0.353,
2041
+ "num_tokens": 1327127502.0,
2042
+ "step": 1270
2043
+ },
2044
+ {
2045
+ "epoch": 1.1871508379888267,
2046
+ "grad_norm": 0.620493561519156,
2047
+ "learning_rate": 3.3580545015522595e-05,
2048
+ "loss": 0.3654,
2049
+ "num_tokens": 1332370382.0,
2050
+ "step": 1275
2051
+ },
2052
+ {
2053
+ "epoch": 1.191806331471136,
2054
+ "grad_norm": 0.4676275047758524,
2055
+ "learning_rate": 3.3494308382200765e-05,
2056
+ "loss": 0.3438,
2057
+ "num_tokens": 1337560874.0,
2058
+ "step": 1280
2059
+ },
2060
+ {
2061
+ "epoch": 1.196461824953445,
2062
+ "grad_norm": 0.43665241100017466,
2063
+ "learning_rate": 3.340807174887893e-05,
2064
+ "loss": 0.3369,
2065
+ "num_tokens": 1342803754.0,
2066
+ "step": 1285
2067
+ },
2068
+ {
2069
+ "epoch": 1.2011173184357542,
2070
+ "grad_norm": 0.478927000423963,
2071
+ "learning_rate": 3.332183511555709e-05,
2072
+ "loss": 0.342,
2073
+ "num_tokens": 1348046634.0,
2074
+ "step": 1290
2075
+ },
2076
+ {
2077
+ "epoch": 1.2057728119180633,
2078
+ "grad_norm": 0.5732412827887052,
2079
+ "learning_rate": 3.3235598482235256e-05,
2080
+ "loss": 0.3615,
2081
+ "num_tokens": 1353226140.0,
2082
+ "step": 1295
2083
+ },
2084
+ {
2085
+ "epoch": 1.2104283054003724,
2086
+ "grad_norm": 0.5150714352326667,
2087
+ "learning_rate": 3.314936184891342e-05,
2088
+ "loss": 0.3585,
2089
+ "num_tokens": 1358469020.0,
2090
+ "step": 1300
2091
+ },
2092
+ {
2093
+ "epoch": 1.2150837988826815,
2094
+ "grad_norm": 0.4720181041176109,
2095
+ "learning_rate": 3.306312521559158e-05,
2096
+ "loss": 0.354,
2097
+ "num_tokens": 1363711900.0,
2098
+ "step": 1305
2099
+ },
2100
+ {
2101
+ "epoch": 1.2197392923649906,
2102
+ "grad_norm": 0.5084117654380252,
2103
+ "learning_rate": 3.2976888582269747e-05,
2104
+ "loss": 0.3507,
2105
+ "num_tokens": 1368954780.0,
2106
+ "step": 1310
2107
+ },
2108
+ {
2109
+ "epoch": 1.2243947858472999,
2110
+ "grad_norm": 0.49216184402082547,
2111
+ "learning_rate": 3.289065194894792e-05,
2112
+ "loss": 0.3582,
2113
+ "num_tokens": 1374194182.0,
2114
+ "step": 1315
2115
+ },
2116
+ {
2117
+ "epoch": 1.229050279329609,
2118
+ "grad_norm": 0.4310109372956479,
2119
+ "learning_rate": 3.280441531562608e-05,
2120
+ "loss": 0.3475,
2121
+ "num_tokens": 1379397624.0,
2122
+ "step": 1320
2123
+ },
2124
+ {
2125
+ "epoch": 1.233705772811918,
2126
+ "grad_norm": 0.34411623806366076,
2127
+ "learning_rate": 3.2718178682304244e-05,
2128
+ "loss": 0.3479,
2129
+ "num_tokens": 1384640504.0,
2130
+ "step": 1325
2131
+ },
2132
+ {
2133
+ "epoch": 1.2383612662942272,
2134
+ "grad_norm": 0.3789626797289003,
2135
+ "learning_rate": 3.2631942048982414e-05,
2136
+ "loss": 0.3466,
2137
+ "num_tokens": 1389883384.0,
2138
+ "step": 1330
2139
+ },
2140
+ {
2141
+ "epoch": 1.2430167597765363,
2142
+ "grad_norm": 0.39242414869191117,
2143
+ "learning_rate": 3.254570541566058e-05,
2144
+ "loss": 0.3484,
2145
+ "num_tokens": 1395126264.0,
2146
+ "step": 1335
2147
+ },
2148
+ {
2149
+ "epoch": 1.2476722532588453,
2150
+ "grad_norm": 0.4280782038705297,
2151
+ "learning_rate": 3.2459468782338735e-05,
2152
+ "loss": 0.3513,
2153
+ "num_tokens": 1400325474.0,
2154
+ "step": 1340
2155
+ },
2156
+ {
2157
+ "epoch": 1.2523277467411544,
2158
+ "grad_norm": 0.485326538754542,
2159
+ "learning_rate": 3.2373232149016905e-05,
2160
+ "loss": 0.3476,
2161
+ "num_tokens": 1405510816.0,
2162
+ "step": 1345
2163
+ },
2164
+ {
2165
+ "epoch": 1.2569832402234637,
2166
+ "grad_norm": 0.4807785764220693,
2167
+ "learning_rate": 3.228699551569507e-05,
2168
+ "loss": 0.363,
2169
+ "num_tokens": 1410688258.0,
2170
+ "step": 1350
2171
+ },
2172
+ {
2173
+ "epoch": 1.2616387337057728,
2174
+ "grad_norm": 0.5077856549055086,
2175
+ "learning_rate": 3.220075888237323e-05,
2176
+ "loss": 0.3542,
2177
+ "num_tokens": 1415931138.0,
2178
+ "step": 1355
2179
+ },
2180
+ {
2181
+ "epoch": 1.266294227188082,
2182
+ "grad_norm": 0.41724539394395294,
2183
+ "learning_rate": 3.2114522249051396e-05,
2184
+ "loss": 0.3605,
2185
+ "num_tokens": 1421110144.0,
2186
+ "step": 1360
2187
+ },
2188
+ {
2189
+ "epoch": 1.270949720670391,
2190
+ "grad_norm": 0.4560337991563612,
2191
+ "learning_rate": 3.2028285615729566e-05,
2192
+ "loss": 0.3406,
2193
+ "num_tokens": 1426353024.0,
2194
+ "step": 1365
2195
+ },
2196
+ {
2197
+ "epoch": 1.2756052141527001,
2198
+ "grad_norm": 0.4756884920927917,
2199
+ "learning_rate": 3.194204898240773e-05,
2200
+ "loss": 0.359,
2201
+ "num_tokens": 1431595904.0,
2202
+ "step": 1370
2203
+ },
2204
+ {
2205
+ "epoch": 1.2802607076350094,
2206
+ "grad_norm": 0.5192756462333422,
2207
+ "learning_rate": 3.185581234908589e-05,
2208
+ "loss": 0.3508,
2209
+ "num_tokens": 1436838784.0,
2210
+ "step": 1375
2211
+ },
2212
+ {
2213
+ "epoch": 1.2849162011173183,
2214
+ "grad_norm": 0.3560326634114241,
2215
+ "learning_rate": 3.1769575715764064e-05,
2216
+ "loss": 0.351,
2217
+ "num_tokens": 1442040238.0,
2218
+ "step": 1380
2219
+ },
2220
+ {
2221
+ "epoch": 1.2895716945996276,
2222
+ "grad_norm": 0.40006619071176525,
2223
+ "learning_rate": 3.168333908244222e-05,
2224
+ "loss": 0.3582,
2225
+ "num_tokens": 1447283118.0,
2226
+ "step": 1385
2227
+ },
2228
+ {
2229
+ "epoch": 1.2942271880819367,
2230
+ "grad_norm": 0.49325642461103086,
2231
+ "learning_rate": 3.1597102449120384e-05,
2232
+ "loss": 0.3497,
2233
+ "num_tokens": 1452508926.0,
2234
+ "step": 1390
2235
+ },
2236
+ {
2237
+ "epoch": 1.2988826815642458,
2238
+ "grad_norm": 0.4051150117444856,
2239
+ "learning_rate": 3.1510865815798554e-05,
2240
+ "loss": 0.3453,
2241
+ "num_tokens": 1457751806.0,
2242
+ "step": 1395
2243
+ },
2244
+ {
2245
+ "epoch": 1.303538175046555,
2246
+ "grad_norm": 0.44239897857904603,
2247
+ "learning_rate": 3.142462918247672e-05,
2248
+ "loss": 0.3551,
2249
+ "num_tokens": 1462994686.0,
2250
+ "step": 1400
2251
+ },
2252
+ {
2253
+ "epoch": 1.308193668528864,
2254
+ "grad_norm": 0.4035898982852808,
2255
+ "learning_rate": 3.133839254915488e-05,
2256
+ "loss": 0.3565,
2257
+ "num_tokens": 1468185418.0,
2258
+ "step": 1405
2259
+ },
2260
+ {
2261
+ "epoch": 1.3128491620111733,
2262
+ "grad_norm": 0.3795900812088053,
2263
+ "learning_rate": 3.1252155915833045e-05,
2264
+ "loss": 0.3508,
2265
+ "num_tokens": 1473428298.0,
2266
+ "step": 1410
2267
+ },
2268
+ {
2269
+ "epoch": 1.3175046554934824,
2270
+ "grad_norm": 0.5917536504763548,
2271
+ "learning_rate": 3.1165919282511215e-05,
2272
+ "loss": 0.3581,
2273
+ "num_tokens": 1478671178.0,
2274
+ "step": 1415
2275
+ },
2276
+ {
2277
+ "epoch": 1.3221601489757915,
2278
+ "grad_norm": 0.5746485721379913,
2279
+ "learning_rate": 3.107968264918938e-05,
2280
+ "loss": 0.345,
2281
+ "num_tokens": 1483914058.0,
2282
+ "step": 1420
2283
+ },
2284
+ {
2285
+ "epoch": 1.3268156424581006,
2286
+ "grad_norm": 0.5174456748590313,
2287
+ "learning_rate": 3.099344601586754e-05,
2288
+ "loss": 0.3469,
2289
+ "num_tokens": 1489156938.0,
2290
+ "step": 1425
2291
+ },
2292
+ {
2293
+ "epoch": 1.3314711359404097,
2294
+ "grad_norm": 0.5266154162574862,
2295
+ "learning_rate": 3.0907209382545706e-05,
2296
+ "loss": 0.3499,
2297
+ "num_tokens": 1494399818.0,
2298
+ "step": 1430
2299
+ },
2300
+ {
2301
+ "epoch": 1.3361266294227188,
2302
+ "grad_norm": 0.5381931883646065,
2303
+ "learning_rate": 3.082097274922387e-05,
2304
+ "loss": 0.3603,
2305
+ "num_tokens": 1499642698.0,
2306
+ "step": 1435
2307
+ },
2308
+ {
2309
+ "epoch": 1.3407821229050279,
2310
+ "grad_norm": 0.5845199613878549,
2311
+ "learning_rate": 3.073473611590203e-05,
2312
+ "loss": 0.3527,
2313
+ "num_tokens": 1504885578.0,
2314
+ "step": 1440
2315
+ },
2316
+ {
2317
+ "epoch": 1.3454376163873372,
2318
+ "grad_norm": 0.4761161751328153,
2319
+ "learning_rate": 3.06484994825802e-05,
2320
+ "loss": 0.3495,
2321
+ "num_tokens": 1510128458.0,
2322
+ "step": 1445
2323
+ },
2324
+ {
2325
+ "epoch": 1.3500931098696463,
2326
+ "grad_norm": 0.6099903516270699,
2327
+ "learning_rate": 3.056226284925837e-05,
2328
+ "loss": 0.3541,
2329
+ "num_tokens": 1515311764.0,
2330
+ "step": 1450
2331
+ },
2332
+ {
2333
+ "epoch": 1.3547486033519553,
2334
+ "grad_norm": 0.48697775909316104,
2335
+ "learning_rate": 3.047602621593653e-05,
2336
+ "loss": 0.3459,
2337
+ "num_tokens": 1520554644.0,
2338
+ "step": 1455
2339
+ },
2340
+ {
2341
+ "epoch": 1.3594040968342644,
2342
+ "grad_norm": 0.5760138459581621,
2343
+ "learning_rate": 3.0389789582614697e-05,
2344
+ "loss": 0.341,
2345
+ "num_tokens": 1525756564.0,
2346
+ "step": 1460
2347
+ },
2348
+ {
2349
+ "epoch": 1.3640595903165735,
2350
+ "grad_norm": 0.4382519382413423,
2351
+ "learning_rate": 3.0303552949292864e-05,
2352
+ "loss": 0.3511,
2353
+ "num_tokens": 1530999444.0,
2354
+ "step": 1465
2355
+ },
2356
+ {
2357
+ "epoch": 1.3687150837988826,
2358
+ "grad_norm": 0.5060933477322624,
2359
+ "learning_rate": 3.0217316315971028e-05,
2360
+ "loss": 0.3561,
2361
+ "num_tokens": 1536242324.0,
2362
+ "step": 1470
2363
+ },
2364
+ {
2365
+ "epoch": 1.3733705772811917,
2366
+ "grad_norm": 0.46671819295156486,
2367
+ "learning_rate": 3.0131079682649188e-05,
2368
+ "loss": 0.3575,
2369
+ "num_tokens": 1541485204.0,
2370
+ "step": 1475
2371
+ },
2372
+ {
2373
+ "epoch": 1.378026070763501,
2374
+ "grad_norm": 0.5087368057400945,
2375
+ "learning_rate": 3.0044843049327355e-05,
2376
+ "loss": 0.3458,
2377
+ "num_tokens": 1546728084.0,
2378
+ "step": 1480
2379
+ },
2380
+ {
2381
+ "epoch": 1.3826815642458101,
2382
+ "grad_norm": 0.41271390502469796,
2383
+ "learning_rate": 2.995860641600552e-05,
2384
+ "loss": 0.3468,
2385
+ "num_tokens": 1551970964.0,
2386
+ "step": 1485
2387
+ },
2388
+ {
2389
+ "epoch": 1.3873370577281192,
2390
+ "grad_norm": 0.46397488100583334,
2391
+ "learning_rate": 2.9872369782683686e-05,
2392
+ "loss": 0.3453,
2393
+ "num_tokens": 1557199712.0,
2394
+ "step": 1490
2395
+ },
2396
+ {
2397
+ "epoch": 1.3919925512104283,
2398
+ "grad_norm": 0.606936021926111,
2399
+ "learning_rate": 2.978613314936185e-05,
2400
+ "loss": 0.359,
2401
+ "num_tokens": 1562442592.0,
2402
+ "step": 1495
2403
+ },
2404
+ {
2405
+ "epoch": 1.3966480446927374,
2406
+ "grad_norm": 0.5840116139586674,
2407
+ "learning_rate": 2.9699896516040016e-05,
2408
+ "loss": 0.3527,
2409
+ "num_tokens": 1567685472.0,
2410
+ "step": 1500
2411
+ },
2412
+ {
2413
+ "epoch": 1.4013035381750465,
2414
+ "grad_norm": 0.41927975243494453,
2415
+ "learning_rate": 2.961365988271818e-05,
2416
+ "loss": 0.3476,
2417
+ "num_tokens": 1572914018.0,
2418
+ "step": 1505
2419
+ },
2420
+ {
2421
+ "epoch": 1.4059590316573556,
2422
+ "grad_norm": 0.4719849459010854,
2423
+ "learning_rate": 2.9527423249396347e-05,
2424
+ "loss": 0.3435,
2425
+ "num_tokens": 1578156898.0,
2426
+ "step": 1510
2427
+ },
2428
+ {
2429
+ "epoch": 1.410614525139665,
2430
+ "grad_norm": 0.5908747500911715,
2431
+ "learning_rate": 2.9441186616074514e-05,
2432
+ "loss": 0.3479,
2433
+ "num_tokens": 1583399778.0,
2434
+ "step": 1515
2435
+ },
2436
+ {
2437
+ "epoch": 1.415270018621974,
2438
+ "grad_norm": 0.4654579138308962,
2439
+ "learning_rate": 2.9354949982752677e-05,
2440
+ "loss": 0.356,
2441
+ "num_tokens": 1588625916.0,
2442
+ "step": 1520
2443
+ },
2444
+ {
2445
+ "epoch": 1.419925512104283,
2446
+ "grad_norm": 0.5082441042584376,
2447
+ "learning_rate": 2.9268713349430837e-05,
2448
+ "loss": 0.3571,
2449
+ "num_tokens": 1593853014.0,
2450
+ "step": 1525
2451
+ },
2452
+ {
2453
+ "epoch": 1.4245810055865922,
2454
+ "grad_norm": 0.4505932414113476,
2455
+ "learning_rate": 2.9182476716109004e-05,
2456
+ "loss": 0.3587,
2457
+ "num_tokens": 1599054446.0,
2458
+ "step": 1530
2459
+ },
2460
+ {
2461
+ "epoch": 1.4292364990689013,
2462
+ "grad_norm": 0.4402651883114836,
2463
+ "learning_rate": 2.9096240082787168e-05,
2464
+ "loss": 0.3441,
2465
+ "num_tokens": 1604297326.0,
2466
+ "step": 1535
2467
+ },
2468
+ {
2469
+ "epoch": 1.4338919925512104,
2470
+ "grad_norm": 0.41400290224654057,
2471
+ "learning_rate": 2.9010003449465335e-05,
2472
+ "loss": 0.3462,
2473
+ "num_tokens": 1609497138.0,
2474
+ "step": 1540
2475
+ },
2476
+ {
2477
+ "epoch": 1.4385474860335195,
2478
+ "grad_norm": 0.47735272128289663,
2479
+ "learning_rate": 2.8923766816143498e-05,
2480
+ "loss": 0.3571,
2481
+ "num_tokens": 1614724154.0,
2482
+ "step": 1545
2483
+ },
2484
+ {
2485
+ "epoch": 1.4432029795158288,
2486
+ "grad_norm": 0.4687209241663635,
2487
+ "learning_rate": 2.8837530182821665e-05,
2488
+ "loss": 0.3491,
2489
+ "num_tokens": 1619955342.0,
2490
+ "step": 1550
2491
+ },
2492
+ {
2493
+ "epoch": 1.4478584729981379,
2494
+ "grad_norm": 0.5325094020285824,
2495
+ "learning_rate": 2.875129354949983e-05,
2496
+ "loss": 0.3516,
2497
+ "num_tokens": 1625198222.0,
2498
+ "step": 1555
2499
+ },
2500
+ {
2501
+ "epoch": 1.452513966480447,
2502
+ "grad_norm": 0.5380366589721377,
2503
+ "learning_rate": 2.8665056916177996e-05,
2504
+ "loss": 0.3575,
2505
+ "num_tokens": 1630390158.0,
2506
+ "step": 1560
2507
+ },
2508
+ {
2509
+ "epoch": 1.457169459962756,
2510
+ "grad_norm": 0.3818882359532099,
2511
+ "learning_rate": 2.8578820282856163e-05,
2512
+ "loss": 0.3511,
2513
+ "num_tokens": 1635633038.0,
2514
+ "step": 1565
2515
+ },
2516
+ {
2517
+ "epoch": 1.4618249534450651,
2518
+ "grad_norm": 0.43172836862069464,
2519
+ "learning_rate": 2.8492583649534323e-05,
2520
+ "loss": 0.3581,
2521
+ "num_tokens": 1640875918.0,
2522
+ "step": 1570
2523
+ },
2524
+ {
2525
+ "epoch": 1.4664804469273742,
2526
+ "grad_norm": 0.43715719572511025,
2527
+ "learning_rate": 2.8406347016212486e-05,
2528
+ "loss": 0.3655,
2529
+ "num_tokens": 1646118798.0,
2530
+ "step": 1575
2531
+ },
2532
+ {
2533
+ "epoch": 1.4711359404096833,
2534
+ "grad_norm": 0.48153517614734775,
2535
+ "learning_rate": 2.8320110382890653e-05,
2536
+ "loss": 0.3619,
2537
+ "num_tokens": 1651329310.0,
2538
+ "step": 1580
2539
+ },
2540
+ {
2541
+ "epoch": 1.4757914338919926,
2542
+ "grad_norm": 0.49806621524576733,
2543
+ "learning_rate": 2.8233873749568817e-05,
2544
+ "loss": 0.3491,
2545
+ "num_tokens": 1656572190.0,
2546
+ "step": 1585
2547
+ },
2548
+ {
2549
+ "epoch": 1.4804469273743017,
2550
+ "grad_norm": 0.44142126215810823,
2551
+ "learning_rate": 2.8147637116246984e-05,
2552
+ "loss": 0.3564,
2553
+ "num_tokens": 1661779772.0,
2554
+ "step": 1590
2555
+ },
2556
+ {
2557
+ "epoch": 1.4851024208566108,
2558
+ "grad_norm": 0.4237230808369376,
2559
+ "learning_rate": 2.8061400482925147e-05,
2560
+ "loss": 0.3532,
2561
+ "num_tokens": 1667018872.0,
2562
+ "step": 1595
2563
+ },
2564
+ {
2565
+ "epoch": 1.48975791433892,
2566
+ "grad_norm": 0.5308149652633445,
2567
+ "learning_rate": 2.7975163849603314e-05,
2568
+ "loss": 0.3638,
2569
+ "num_tokens": 1672223952.0,
2570
+ "step": 1600
2571
+ },
2572
+ {
2573
+ "epoch": 1.494413407821229,
2574
+ "grad_norm": 0.5017844851185143,
2575
+ "learning_rate": 2.788892721628148e-05,
2576
+ "loss": 0.3594,
2577
+ "num_tokens": 1677466832.0,
2578
+ "step": 1605
2579
+ },
2580
+ {
2581
+ "epoch": 1.499068901303538,
2582
+ "grad_norm": 0.4839984310616902,
2583
+ "learning_rate": 2.7802690582959645e-05,
2584
+ "loss": 0.3466,
2585
+ "num_tokens": 1682709712.0,
2586
+ "step": 1610
2587
+ },
2588
+ {
2589
+ "epoch": 1.5037243947858472,
2590
+ "grad_norm": 0.41406506477863037,
2591
+ "learning_rate": 2.7716453949637805e-05,
2592
+ "loss": 0.3547,
2593
+ "num_tokens": 1687952592.0,
2594
+ "step": 1615
2595
+ },
2596
+ {
2597
+ "epoch": 1.5083798882681565,
2598
+ "grad_norm": 0.42348118656156764,
2599
+ "learning_rate": 2.7630217316315972e-05,
2600
+ "loss": 0.3485,
2601
+ "num_tokens": 1693097166.0,
2602
+ "step": 1620
2603
+ },
2604
+ {
2605
+ "epoch": 1.5130353817504656,
2606
+ "grad_norm": 0.4357117827255015,
2607
+ "learning_rate": 2.7543980682994136e-05,
2608
+ "loss": 0.352,
2609
+ "num_tokens": 1698326168.0,
2610
+ "step": 1625
2611
+ },
2612
+ {
2613
+ "epoch": 1.5176908752327747,
2614
+ "grad_norm": 0.43945135163916543,
2615
+ "learning_rate": 2.7457744049672302e-05,
2616
+ "loss": 0.3512,
2617
+ "num_tokens": 1703569048.0,
2618
+ "step": 1630
2619
+ },
2620
+ {
2621
+ "epoch": 1.5223463687150838,
2622
+ "grad_norm": 0.5192598819819614,
2623
+ "learning_rate": 2.7371507416350466e-05,
2624
+ "loss": 0.367,
2625
+ "num_tokens": 1708766606.0,
2626
+ "step": 1635
2627
+ },
2628
+ {
2629
+ "epoch": 1.5270018621973929,
2630
+ "grad_norm": 0.4452998587061349,
2631
+ "learning_rate": 2.7285270783028633e-05,
2632
+ "loss": 0.3589,
2633
+ "num_tokens": 1713944042.0,
2634
+ "step": 1640
2635
+ },
2636
+ {
2637
+ "epoch": 1.5316573556797022,
2638
+ "grad_norm": 0.43064181461229223,
2639
+ "learning_rate": 2.7199034149706797e-05,
2640
+ "loss": 0.3586,
2641
+ "num_tokens": 1719186922.0,
2642
+ "step": 1645
2643
+ },
2644
+ {
2645
+ "epoch": 1.536312849162011,
2646
+ "grad_norm": 0.44443641092630637,
2647
+ "learning_rate": 2.7112797516384963e-05,
2648
+ "loss": 0.3439,
2649
+ "num_tokens": 1724367282.0,
2650
+ "step": 1650
2651
+ },
2652
+ {
2653
+ "epoch": 1.5409683426443204,
2654
+ "grad_norm": 0.4167990111102253,
2655
+ "learning_rate": 2.702656088306313e-05,
2656
+ "loss": 0.3407,
2657
+ "num_tokens": 1729599294.0,
2658
+ "step": 1655
2659
+ },
2660
+ {
2661
+ "epoch": 1.5456238361266295,
2662
+ "grad_norm": 0.4462974133274107,
2663
+ "learning_rate": 2.6940324249741287e-05,
2664
+ "loss": 0.3529,
2665
+ "num_tokens": 1734842174.0,
2666
+ "step": 1660
2667
+ },
2668
+ {
2669
+ "epoch": 1.5502793296089385,
2670
+ "grad_norm": 0.39854857064913984,
2671
+ "learning_rate": 2.6854087616419454e-05,
2672
+ "loss": 0.348,
2673
+ "num_tokens": 1740064978.0,
2674
+ "step": 1665
2675
+ },
2676
+ {
2677
+ "epoch": 1.5549348230912476,
2678
+ "grad_norm": 0.4248070265370859,
2679
+ "learning_rate": 2.676785098309762e-05,
2680
+ "loss": 0.3528,
2681
+ "num_tokens": 1745287654.0,
2682
+ "step": 1670
2683
+ },
2684
+ {
2685
+ "epoch": 1.5595903165735567,
2686
+ "grad_norm": 0.5528756147028647,
2687
+ "learning_rate": 2.6681614349775785e-05,
2688
+ "loss": 0.3495,
2689
+ "num_tokens": 1750530534.0,
2690
+ "step": 1675
2691
+ },
2692
+ {
2693
+ "epoch": 1.564245810055866,
2694
+ "grad_norm": 0.41949805640470383,
2695
+ "learning_rate": 2.659537771645395e-05,
2696
+ "loss": 0.353,
2697
+ "num_tokens": 1755773414.0,
2698
+ "step": 1680
2699
+ },
2700
+ {
2701
+ "epoch": 1.568901303538175,
2702
+ "grad_norm": 0.4182275416284989,
2703
+ "learning_rate": 2.6509141083132115e-05,
2704
+ "loss": 0.3545,
2705
+ "num_tokens": 1760954744.0,
2706
+ "step": 1685
2707
+ },
2708
+ {
2709
+ "epoch": 1.5735567970204842,
2710
+ "grad_norm": 0.4424815500442761,
2711
+ "learning_rate": 2.6422904449810282e-05,
2712
+ "loss": 0.3608,
2713
+ "num_tokens": 1766197624.0,
2714
+ "step": 1690
2715
+ },
2716
+ {
2717
+ "epoch": 1.5782122905027933,
2718
+ "grad_norm": 0.5037267806139252,
2719
+ "learning_rate": 2.6336667816488446e-05,
2720
+ "loss": 0.3618,
2721
+ "num_tokens": 1771440504.0,
2722
+ "step": 1695
2723
+ },
2724
+ {
2725
+ "epoch": 1.5828677839851024,
2726
+ "grad_norm": 0.5757867203405523,
2727
+ "learning_rate": 2.6250431183166613e-05,
2728
+ "loss": 0.3537,
2729
+ "num_tokens": 1776683384.0,
2730
+ "step": 1700
2731
+ },
2732
+ {
2733
+ "epoch": 1.5875232774674115,
2734
+ "grad_norm": 0.4418015148391545,
2735
+ "learning_rate": 2.6164194549844773e-05,
2736
+ "loss": 0.3503,
2737
+ "num_tokens": 1781926264.0,
2738
+ "step": 1705
2739
+ },
2740
+ {
2741
+ "epoch": 1.5921787709497206,
2742
+ "grad_norm": 0.41320469617664296,
2743
+ "learning_rate": 2.607795791652294e-05,
2744
+ "loss": 0.35,
2745
+ "num_tokens": 1787169144.0,
2746
+ "step": 1710
2747
+ },
2748
+ {
2749
+ "epoch": 1.59683426443203,
2750
+ "grad_norm": 0.4654919170528904,
2751
+ "learning_rate": 2.5991721283201103e-05,
2752
+ "loss": 0.3492,
2753
+ "num_tokens": 1792412024.0,
2754
+ "step": 1715
2755
+ },
2756
+ {
2757
+ "epoch": 1.6014897579143388,
2758
+ "grad_norm": 0.46789325383997304,
2759
+ "learning_rate": 2.590548464987927e-05,
2760
+ "loss": 0.3514,
2761
+ "num_tokens": 1797634766.0,
2762
+ "step": 1720
2763
+ },
2764
+ {
2765
+ "epoch": 1.606145251396648,
2766
+ "grad_norm": 0.383414683196401,
2767
+ "learning_rate": 2.5819248016557434e-05,
2768
+ "loss": 0.3429,
2769
+ "num_tokens": 1802877646.0,
2770
+ "step": 1725
2771
+ },
2772
+ {
2773
+ "epoch": 1.6108007448789572,
2774
+ "grad_norm": 0.39464003167522826,
2775
+ "learning_rate": 2.57330113832356e-05,
2776
+ "loss": 0.3521,
2777
+ "num_tokens": 1808104662.0,
2778
+ "step": 1730
2779
+ },
2780
+ {
2781
+ "epoch": 1.6154562383612663,
2782
+ "grad_norm": 0.46968645292723943,
2783
+ "learning_rate": 2.5646774749913764e-05,
2784
+ "loss": 0.3453,
2785
+ "num_tokens": 1813321242.0,
2786
+ "step": 1735
2787
+ },
2788
+ {
2789
+ "epoch": 1.6201117318435754,
2790
+ "grad_norm": 0.3777881875602588,
2791
+ "learning_rate": 2.556053811659193e-05,
2792
+ "loss": 0.3478,
2793
+ "num_tokens": 1818564122.0,
2794
+ "step": 1740
2795
+ },
2796
+ {
2797
+ "epoch": 1.6247672253258845,
2798
+ "grad_norm": 0.3638617662724661,
2799
+ "learning_rate": 2.5474301483270098e-05,
2800
+ "loss": 0.35,
2801
+ "num_tokens": 1823807002.0,
2802
+ "step": 1745
2803
+ },
2804
+ {
2805
+ "epoch": 1.6294227188081938,
2806
+ "grad_norm": 0.43356993528545573,
2807
+ "learning_rate": 2.5388064849948255e-05,
2808
+ "loss": 0.3522,
2809
+ "num_tokens": 1828986398.0,
2810
+ "step": 1750
2811
+ },
2812
+ {
2813
+ "epoch": 1.6340782122905027,
2814
+ "grad_norm": 0.41013840469091506,
2815
+ "learning_rate": 2.5301828216626422e-05,
2816
+ "loss": 0.3463,
2817
+ "num_tokens": 1834229278.0,
2818
+ "step": 1755
2819
+ },
2820
+ {
2821
+ "epoch": 1.638733705772812,
2822
+ "grad_norm": 0.48759437990509724,
2823
+ "learning_rate": 2.521559158330459e-05,
2824
+ "loss": 0.3462,
2825
+ "num_tokens": 1839416094.0,
2826
+ "step": 1760
2827
+ },
2828
+ {
2829
+ "epoch": 1.643389199255121,
2830
+ "grad_norm": 0.4004343586595845,
2831
+ "learning_rate": 2.5129354949982752e-05,
2832
+ "loss": 0.3583,
2833
+ "num_tokens": 1844605766.0,
2834
+ "step": 1765
2835
+ },
2836
+ {
2837
+ "epoch": 1.6480446927374302,
2838
+ "grad_norm": 0.4132680619461919,
2839
+ "learning_rate": 2.504311831666092e-05,
2840
+ "loss": 0.3549,
2841
+ "num_tokens": 1849848646.0,
2842
+ "step": 1770
2843
+ },
2844
+ {
2845
+ "epoch": 1.6527001862197392,
2846
+ "grad_norm": 0.4470980214548608,
2847
+ "learning_rate": 2.4956881683339083e-05,
2848
+ "loss": 0.3545,
2849
+ "num_tokens": 1855091526.0,
2850
+ "step": 1775
2851
+ },
2852
+ {
2853
+ "epoch": 1.6573556797020483,
2854
+ "grad_norm": 0.41600413543566717,
2855
+ "learning_rate": 2.487064505001725e-05,
2856
+ "loss": 0.3519,
2857
+ "num_tokens": 1860334406.0,
2858
+ "step": 1780
2859
+ },
2860
+ {
2861
+ "epoch": 1.6620111731843576,
2862
+ "grad_norm": 0.4179342529390727,
2863
+ "learning_rate": 2.4784408416695413e-05,
2864
+ "loss": 0.3558,
2865
+ "num_tokens": 1865567454.0,
2866
+ "step": 1785
2867
+ },
2868
+ {
2869
+ "epoch": 1.6666666666666665,
2870
+ "grad_norm": 0.4295317513177463,
2871
+ "learning_rate": 2.4698171783373577e-05,
2872
+ "loss": 0.3428,
2873
+ "num_tokens": 1870810334.0,
2874
+ "step": 1790
2875
+ },
2876
+ {
2877
+ "epoch": 1.6713221601489758,
2878
+ "grad_norm": 0.4490287015466731,
2879
+ "learning_rate": 2.4611935150051744e-05,
2880
+ "loss": 0.3503,
2881
+ "num_tokens": 1876053214.0,
2882
+ "step": 1795
2883
+ },
2884
+ {
2885
+ "epoch": 1.675977653631285,
2886
+ "grad_norm": 0.5271562590781398,
2887
+ "learning_rate": 2.4525698516729908e-05,
2888
+ "loss": 0.3499,
2889
+ "num_tokens": 1881296094.0,
2890
+ "step": 1800
2891
+ },
2892
+ {
2893
+ "epoch": 1.680633147113594,
2894
+ "grad_norm": 0.3938709205380673,
2895
+ "learning_rate": 2.4439461883408075e-05,
2896
+ "loss": 0.3532,
2897
+ "num_tokens": 1886538974.0,
2898
+ "step": 1805
2899
+ },
2900
+ {
2901
+ "epoch": 1.6852886405959033,
2902
+ "grad_norm": 0.36542236991861243,
2903
+ "learning_rate": 2.4353225250086238e-05,
2904
+ "loss": 0.3494,
2905
+ "num_tokens": 1891781854.0,
2906
+ "step": 1810
2907
+ },
2908
+ {
2909
+ "epoch": 1.6899441340782122,
2910
+ "grad_norm": 0.5158674940389951,
2911
+ "learning_rate": 2.42669886167644e-05,
2912
+ "loss": 0.3568,
2913
+ "num_tokens": 1897024734.0,
2914
+ "step": 1815
2915
+ },
2916
+ {
2917
+ "epoch": 1.6945996275605215,
2918
+ "grad_norm": 0.5898812452716973,
2919
+ "learning_rate": 2.418075198344257e-05,
2920
+ "loss": 0.3427,
2921
+ "num_tokens": 1902267614.0,
2922
+ "step": 1820
2923
+ },
2924
+ {
2925
+ "epoch": 1.6992551210428304,
2926
+ "grad_norm": 0.46830565847427696,
2927
+ "learning_rate": 2.4094515350120732e-05,
2928
+ "loss": 0.3479,
2929
+ "num_tokens": 1907454764.0,
2930
+ "step": 1825
2931
+ },
2932
+ {
2933
+ "epoch": 1.7039106145251397,
2934
+ "grad_norm": 0.4720815332727475,
2935
+ "learning_rate": 2.4008278716798896e-05,
2936
+ "loss": 0.3538,
2937
+ "num_tokens": 1912633368.0,
2938
+ "step": 1830
2939
+ },
2940
+ {
2941
+ "epoch": 1.7085661080074488,
2942
+ "grad_norm": 0.42858660436141865,
2943
+ "learning_rate": 2.3922042083477063e-05,
2944
+ "loss": 0.3498,
2945
+ "num_tokens": 1917859036.0,
2946
+ "step": 1835
2947
+ },
2948
+ {
2949
+ "epoch": 1.7132216014897579,
2950
+ "grad_norm": 0.41197749771239084,
2951
+ "learning_rate": 2.3835805450155226e-05,
2952
+ "loss": 0.3415,
2953
+ "num_tokens": 1923043306.0,
2954
+ "step": 1840
2955
+ },
2956
+ {
2957
+ "epoch": 1.7178770949720672,
2958
+ "grad_norm": 0.4115656817654512,
2959
+ "learning_rate": 2.3749568816833393e-05,
2960
+ "loss": 0.3551,
2961
+ "num_tokens": 1928286186.0,
2962
+ "step": 1845
2963
+ },
2964
+ {
2965
+ "epoch": 1.722532588454376,
2966
+ "grad_norm": 0.43870051277515076,
2967
+ "learning_rate": 2.3663332183511557e-05,
2968
+ "loss": 0.3473,
2969
+ "num_tokens": 1933529066.0,
2970
+ "step": 1850
2971
+ },
2972
+ {
2973
+ "epoch": 1.7271880819366854,
2974
+ "grad_norm": 0.4051372237420238,
2975
+ "learning_rate": 2.357709555018972e-05,
2976
+ "loss": 0.3406,
2977
+ "num_tokens": 1938771946.0,
2978
+ "step": 1855
2979
+ },
2980
+ {
2981
+ "epoch": 1.7318435754189943,
2982
+ "grad_norm": 0.36861278944342607,
2983
+ "learning_rate": 2.3490858916867887e-05,
2984
+ "loss": 0.3395,
2985
+ "num_tokens": 1943951362.0,
2986
+ "step": 1860
2987
+ },
2988
+ {
2989
+ "epoch": 1.7364990689013036,
2990
+ "grad_norm": 0.37116380184385894,
2991
+ "learning_rate": 2.340462228354605e-05,
2992
+ "loss": 0.3554,
2993
+ "num_tokens": 1949060174.0,
2994
+ "step": 1865
2995
+ },
2996
+ {
2997
+ "epoch": 1.7411545623836127,
2998
+ "grad_norm": 0.5205105281665823,
2999
+ "learning_rate": 2.3318385650224218e-05,
3000
+ "loss": 0.3537,
3001
+ "num_tokens": 1954293264.0,
3002
+ "step": 1870
3003
+ },
3004
+ {
3005
+ "epoch": 1.7458100558659218,
3006
+ "grad_norm": 0.4456953970069166,
3007
+ "learning_rate": 2.323214901690238e-05,
3008
+ "loss": 0.3439,
3009
+ "num_tokens": 1959495158.0,
3010
+ "step": 1875
3011
+ },
3012
+ {
3013
+ "epoch": 1.750465549348231,
3014
+ "grad_norm": 0.47447329743147876,
3015
+ "learning_rate": 2.3145912383580545e-05,
3016
+ "loss": 0.3545,
3017
+ "num_tokens": 1964738038.0,
3018
+ "step": 1880
3019
+ },
3020
+ {
3021
+ "epoch": 1.75512104283054,
3022
+ "grad_norm": 0.3557351154983947,
3023
+ "learning_rate": 2.3059675750258712e-05,
3024
+ "loss": 0.3567,
3025
+ "num_tokens": 1969980918.0,
3026
+ "step": 1885
3027
+ },
3028
+ {
3029
+ "epoch": 1.7597765363128492,
3030
+ "grad_norm": 0.40245903573677616,
3031
+ "learning_rate": 2.2973439116936875e-05,
3032
+ "loss": 0.3553,
3033
+ "num_tokens": 1975223798.0,
3034
+ "step": 1890
3035
+ },
3036
+ {
3037
+ "epoch": 1.7644320297951583,
3038
+ "grad_norm": 0.42631391419004666,
3039
+ "learning_rate": 2.2887202483615042e-05,
3040
+ "loss": 0.3524,
3041
+ "num_tokens": 1980411150.0,
3042
+ "step": 1895
3043
+ },
3044
+ {
3045
+ "epoch": 1.7690875232774674,
3046
+ "grad_norm": 0.4036734907222041,
3047
+ "learning_rate": 2.2800965850293206e-05,
3048
+ "loss": 0.3489,
3049
+ "num_tokens": 1985654030.0,
3050
+ "step": 1900
3051
+ },
3052
+ {
3053
+ "epoch": 1.7737430167597765,
3054
+ "grad_norm": 0.43266404699630584,
3055
+ "learning_rate": 2.271472921697137e-05,
3056
+ "loss": 0.3571,
3057
+ "num_tokens": 1990896910.0,
3058
+ "step": 1905
3059
+ },
3060
+ {
3061
+ "epoch": 1.7783985102420856,
3062
+ "grad_norm": 0.4390506238721815,
3063
+ "learning_rate": 2.2628492583649536e-05,
3064
+ "loss": 0.3415,
3065
+ "num_tokens": 1996077922.0,
3066
+ "step": 1910
3067
+ },
3068
+ {
3069
+ "epoch": 1.783054003724395,
3070
+ "grad_norm": 0.45868358663134706,
3071
+ "learning_rate": 2.25422559503277e-05,
3072
+ "loss": 0.343,
3073
+ "num_tokens": 2001320802.0,
3074
+ "step": 1915
3075
+ },
3076
+ {
3077
+ "epoch": 1.7877094972067038,
3078
+ "grad_norm": 0.4308135634115946,
3079
+ "learning_rate": 2.2456019317005867e-05,
3080
+ "loss": 0.3455,
3081
+ "num_tokens": 2006537026.0,
3082
+ "step": 1920
3083
+ },
3084
+ {
3085
+ "epoch": 1.7923649906890131,
3086
+ "grad_norm": 0.5352962259215817,
3087
+ "learning_rate": 2.236978268368403e-05,
3088
+ "loss": 0.3483,
3089
+ "num_tokens": 2011779524.0,
3090
+ "step": 1925
3091
+ },
3092
+ {
3093
+ "epoch": 1.7970204841713222,
3094
+ "grad_norm": 0.45675715635498815,
3095
+ "learning_rate": 2.2283546050362194e-05,
3096
+ "loss": 0.3511,
3097
+ "num_tokens": 2017022404.0,
3098
+ "step": 1930
3099
+ },
3100
+ {
3101
+ "epoch": 1.8016759776536313,
3102
+ "grad_norm": 0.37896808667371934,
3103
+ "learning_rate": 2.219730941704036e-05,
3104
+ "loss": 0.3394,
3105
+ "num_tokens": 2022239646.0,
3106
+ "step": 1935
3107
+ },
3108
+ {
3109
+ "epoch": 1.8063314711359404,
3110
+ "grad_norm": 0.5097191341558315,
3111
+ "learning_rate": 2.2111072783718524e-05,
3112
+ "loss": 0.3473,
3113
+ "num_tokens": 2027418744.0,
3114
+ "step": 1940
3115
+ },
3116
+ {
3117
+ "epoch": 1.8109869646182495,
3118
+ "grad_norm": 0.4478696818866437,
3119
+ "learning_rate": 2.2024836150396688e-05,
3120
+ "loss": 0.3506,
3121
+ "num_tokens": 2032661624.0,
3122
+ "step": 1945
3123
+ },
3124
+ {
3125
+ "epoch": 1.8156424581005588,
3126
+ "grad_norm": 0.5348913121148593,
3127
+ "learning_rate": 2.1938599517074855e-05,
3128
+ "loss": 0.3431,
3129
+ "num_tokens": 2037904504.0,
3130
+ "step": 1950
3131
+ },
3132
+ {
3133
+ "epoch": 1.8202979515828677,
3134
+ "grad_norm": 0.35782661985763603,
3135
+ "learning_rate": 2.185236288375302e-05,
3136
+ "loss": 0.3435,
3137
+ "num_tokens": 2043147384.0,
3138
+ "step": 1955
3139
+ },
3140
+ {
3141
+ "epoch": 1.824953445065177,
3142
+ "grad_norm": 0.4030255067164449,
3143
+ "learning_rate": 2.1766126250431186e-05,
3144
+ "loss": 0.3454,
3145
+ "num_tokens": 2048390264.0,
3146
+ "step": 1960
3147
+ },
3148
+ {
3149
+ "epoch": 1.829608938547486,
3150
+ "grad_norm": 0.4856718408524328,
3151
+ "learning_rate": 2.167988961710935e-05,
3152
+ "loss": 0.3508,
3153
+ "num_tokens": 2053565316.0,
3154
+ "step": 1965
3155
+ },
3156
+ {
3157
+ "epoch": 1.8342644320297952,
3158
+ "grad_norm": 0.38701100880731554,
3159
+ "learning_rate": 2.1593652983787513e-05,
3160
+ "loss": 0.3387,
3161
+ "num_tokens": 2058808196.0,
3162
+ "step": 1970
3163
+ },
3164
+ {
3165
+ "epoch": 1.8389199255121043,
3166
+ "grad_norm": 0.37713447559268454,
3167
+ "learning_rate": 2.150741635046568e-05,
3168
+ "loss": 0.3482,
3169
+ "num_tokens": 2064051076.0,
3170
+ "step": 1975
3171
+ },
3172
+ {
3173
+ "epoch": 1.8435754189944134,
3174
+ "grad_norm": 0.4177726543134029,
3175
+ "learning_rate": 2.1421179717143843e-05,
3176
+ "loss": 0.339,
3177
+ "num_tokens": 2069293956.0,
3178
+ "step": 1980
3179
+ },
3180
+ {
3181
+ "epoch": 1.8482309124767227,
3182
+ "grad_norm": 0.5101259156674925,
3183
+ "learning_rate": 2.133494308382201e-05,
3184
+ "loss": 0.3532,
3185
+ "num_tokens": 2074536836.0,
3186
+ "step": 1985
3187
+ },
3188
+ {
3189
+ "epoch": 1.8528864059590315,
3190
+ "grad_norm": 0.44611679979677504,
3191
+ "learning_rate": 2.1248706450500174e-05,
3192
+ "loss": 0.3516,
3193
+ "num_tokens": 2079779716.0,
3194
+ "step": 1990
3195
+ },
3196
+ {
3197
+ "epoch": 1.8575418994413408,
3198
+ "grad_norm": 0.4524211991169788,
3199
+ "learning_rate": 2.1162469817178337e-05,
3200
+ "loss": 0.3432,
3201
+ "num_tokens": 2084952036.0,
3202
+ "step": 1995
3203
+ },
3204
+ {
3205
+ "epoch": 1.86219739292365,
3206
+ "grad_norm": 0.3783683026141043,
3207
+ "learning_rate": 2.1076233183856504e-05,
3208
+ "loss": 0.3549,
3209
+ "num_tokens": 2090194916.0,
3210
+ "step": 2000
3211
  }
3212
  ],
3213
  "logging_steps": 5,
 
3227
  "attributes": {}
3228
  }
3229
  },
3230
+ "total_flos": 1.7113228334533181e+18,
3231
  "train_batch_size": 1,
3232
  "trial_name": null,
3233
  "trial_params": null