nileshmalpeddi committed on
Commit 40dfdcb · verified · 1 Parent(s): 8c00c6b

End of training

README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  base_model: LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct
3
- datasets: testing_layer_29_Matrix_attn.attention.v_proj
4
  library_name: transformers
5
  model_name: output_exaone_tucker_rank16_newdata_b1_g2_gradient2_e1_b512
6
  tags:
@@ -12,7 +12,7 @@ licence: license
12
 
13
  # Model Card for output_exaone_tucker_rank16_newdata_b1_g2_gradient2_e1_b512
14
 
15
- This model is a fine-tuned version of [LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct) on the [testing_layer_29_Matrix_attn.attention.v_proj](https://huggingface.co/datasets/testing_layer_29_Matrix_attn.attention.v_proj) dataset.
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
17
 
18
  ## Quick start
 
1
  ---
2
  base_model: LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct
3
+ datasets: testing_layer_29_Matrix_mlp.c_proj
4
  library_name: transformers
5
  model_name: output_exaone_tucker_rank16_newdata_b1_g2_gradient2_e1_b512
6
  tags:
 
12
 
13
  # Model Card for output_exaone_tucker_rank16_newdata_b1_g2_gradient2_e1_b512
14
 
15
+ This model is a fine-tuned version of [LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct) on the [testing_layer_29_Matrix_mlp.c_proj](https://huggingface.co/datasets/testing_layer_29_Matrix_mlp.c_proj) dataset.
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
17
 
18
  ## Quick start
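
The card's "## Quick start" section is cut off in this diff. As a reference only, here is a minimal sketch of how a TRL fine-tune like this is typically loaded with the transformers pipeline; the repository id, the `trust_remote_code` flag, and the prompt are assumptions, not taken from the card itself.

```python
# Minimal, assumed usage sketch (not copied from the model card).
from transformers import pipeline

generator = pipeline(
    "text-generation",
    # Assumed repository id: committer name + model_name from the card.
    model="nileshmalpeddi/output_exaone_tucker_rank16_newdata_b1_g2_gradient2_e1_b512",
    trust_remote_code=True,  # EXAONE checkpoints may ship custom modeling code
    device_map="auto",
)

prompt = "Explain Tucker decomposition in one sentence."  # illustrative prompt
print(generator(prompt, max_new_tokens=64)[0]["generated_text"])
```
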
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "eval_accuracy": 0.7482185646263703,
3
- "eval_loss": 5.08984375,
4
- "eval_runtime": 179.6951,
5
- "eval_samples": 5103,
6
- "eval_samples_per_second": 28.398,
7
- "eval_steps_per_second": 14.202,
8
- "perplexity": 162.3644906197382,
9
- "total_flos": 22519612440576.0,
10
- "train_loss": 5.291015625,
11
- "train_runtime": 0.3705,
12
- "train_samples": 76256,
13
- "train_samples_per_second": 21.592,
14
- "train_steps_per_second": 5.398
15
  }
 
1
  {
2
+ "eval_accuracy": 0.8085339415160145,
3
+ "eval_loss": 1.681640625,
4
+ "eval_runtime": 927.2318,
5
+ "eval_samples": 25971,
6
+ "eval_samples_per_second": 28.009,
7
+ "eval_steps_per_second": 14.005,
8
+ "perplexity": 5.374366061438169,
9
+ "total_flos": 9.3961326985583e+17,
10
+ "train_loss": 1.7179553695028122,
11
+ "train_runtime": 11634.1539,
12
+ "train_samples": 294393,
13
+ "train_samples_per_second": 25.304,
14
+ "train_steps_per_second": 6.326
15
  }
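
Note that the updated metrics are internally consistent: the reported perplexity is exp(eval_loss), as a quick check (assuming the standard definition) confirms.

```python
# Consistency check on all_results.json (standard perplexity = exp(loss) assumption).
import math

print(math.exp(1.681640625))  # ~5.3744  -> matches the new "perplexity" value
print(math.exp(5.08984375))   # ~162.36  -> matches the previous "perplexity" value
```
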
eval_layer_index_28_mat_mlp.c_proj_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "eval_accuracy": 0.7446557896577809,
3
+ "eval_loss": 5.17578125,
4
+ "eval_runtime": 182.4709,
5
+ "eval_samples": 5103,
6
+ "eval_samples_per_second": 27.966,
7
+ "eval_steps_per_second": 13.986,
8
+ "perplexity": 176.93479059943397
9
+ }
eval_layer_index_29_mat_mlp.c_proj_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "eval_accuracy": 0.754675579446587,
3
- "eval_loss": 4.87890625,
4
- "eval_runtime": 178.1261,
5
- "eval_samples": 5103,
6
- "eval_samples_per_second": 28.648,
7
- "eval_steps_per_second": 14.327,
8
- "perplexity": 131.48677155266952
9
  }
 
1
  {
2
+ "eval_accuracy": 0.8085339415160145,
3
+ "eval_loss": 1.681640625,
4
+ "eval_runtime": 927.2318,
5
+ "eval_samples": 25971,
6
+ "eval_samples_per_second": 28.009,
7
+ "eval_steps_per_second": 14.005,
8
+ "perplexity": 5.374366061438169
9
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fbf8023595c7699bdaf3bd34c81d283119a263140ec8d55d3704309e156753c
3
- size 4669288872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8f8f30ae6f98975b62efe02993f798af7fee6c3dd2a00c9213c8e4750246109
3
+ size 4701521288
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 5193544192
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00002-of-00002.safetensors",
@@ -202,18 +202,10 @@
202
  "transformer.h.28.mlp.c_fc_0.weight": "model-00001-of-00002.safetensors",
203
  "transformer.h.28.mlp.c_fc_1.weight": "model-00001-of-00002.safetensors",
204
  "transformer.h.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
205
- "transformer.h.29.attn.attention.k_proj.tucker_core": "model-00001-of-00002.safetensors",
206
- "transformer.h.29.attn.attention.k_proj.tucker_factors.0": "model-00001-of-00002.safetensors",
207
- "transformer.h.29.attn.attention.k_proj.tucker_factors.1": "model-00001-of-00002.safetensors",
208
- "transformer.h.29.attn.attention.out_proj.tucker_core": "model-00001-of-00002.safetensors",
209
- "transformer.h.29.attn.attention.out_proj.tucker_factors.0": "model-00001-of-00002.safetensors",
210
- "transformer.h.29.attn.attention.out_proj.tucker_factors.1": "model-00001-of-00002.safetensors",
211
- "transformer.h.29.attn.attention.q_proj.tucker_core": "model-00001-of-00002.safetensors",
212
- "transformer.h.29.attn.attention.q_proj.tucker_factors.0": "model-00001-of-00002.safetensors",
213
- "transformer.h.29.attn.attention.q_proj.tucker_factors.1": "model-00001-of-00002.safetensors",
214
- "transformer.h.29.attn.attention.v_proj.tucker_core": "model-00001-of-00002.safetensors",
215
- "transformer.h.29.attn.attention.v_proj.tucker_factors.0": "model-00001-of-00002.safetensors",
216
- "transformer.h.29.attn.attention.v_proj.tucker_factors.1": "model-00001-of-00002.safetensors",
217
  "transformer.h.29.ln_1.weight": "model-00001-of-00002.safetensors",
218
  "transformer.h.29.ln_2.weight": "model-00001-of-00002.safetensors",
219
  "transformer.h.29.mlp.c_fc_0.tucker_core": "model-00001-of-00002.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 5225777664
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00002-of-00002.safetensors",
 
202
  "transformer.h.28.mlp.c_fc_0.weight": "model-00001-of-00002.safetensors",
203
  "transformer.h.28.mlp.c_fc_1.weight": "model-00001-of-00002.safetensors",
204
  "transformer.h.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
205
+ "transformer.h.29.attn.attention.k_proj.weight": "model-00001-of-00002.safetensors",
206
+ "transformer.h.29.attn.attention.out_proj.weight": "model-00001-of-00002.safetensors",
207
+ "transformer.h.29.attn.attention.q_proj.weight": "model-00001-of-00002.safetensors",
208
+ "transformer.h.29.attn.attention.v_proj.weight": "model-00001-of-00002.safetensors",
209
  "transformer.h.29.ln_1.weight": "model-00001-of-00002.safetensors",
210
  "transformer.h.29.ln_2.weight": "model-00001-of-00002.safetensors",
211
  "transformer.h.29.mlp.c_fc_0.tucker_core": "model-00001-of-00002.safetensors",
runs/Mar13_14-25-10_demo2/events.out.tfevents.1741903489.demo2.193067.4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98d60275642b11b67444ab2bc0c1d429ffdeb9f2a0f4c8c6a208ca24813230bb
3
+ size 6413
runs/Mar13_14-25-10_demo2/events.out.tfevents.1741903699.demo2.193067.5 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa3219ec1a2cdfd00e35c7b9d6b6e05de2b2107fcce7ba2e0cfb7d11418503ff
3
+ size 467
runs/Mar13_16-43-27_demo2/events.out.tfevents.1741910387.demo2.205134.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7694a82a8f44650110657d9e9daee591aed308ebd41940f7957ef51a36802a9
3
+ size 47468
runs/Mar13_16-43-27_demo2/events.out.tfevents.1741922968.demo2.205134.1 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dbeef7de6b74f1c6c10a1141bd28c7edcf66ccac977d5fd86516e67f984805e
3
+ size 481
train_layer_index_28_mat_mlp.c_proj_results.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "total_flos": 21926516883456.0,
3
+ "train_loss": 5.8623046875,
4
+ "train_runtime": 0.353,
5
+ "train_samples": 76256,
6
+ "train_samples_per_second": 22.664,
7
+ "train_steps_per_second": 5.666
8
+ }
train_layer_index_29_mat_mlp.c_proj_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 22694737215488.0,
3
- "train_loss": 5.046875,
4
- "train_runtime": 1.0639,
5
- "train_samples": 76256,
6
- "train_samples_per_second": 7.52,
7
- "train_steps_per_second": 1.88
8
  }
 
1
  {
2
+ "total_flos": 9.3961326985583e+17,
3
+ "train_loss": 1.7179553695028122,
4
+ "train_runtime": 11634.1539,
5
+ "train_samples": 294393,
6
+ "train_samples_per_second": 25.304,
7
+ "train_steps_per_second": 6.326
8
  }
trainer_state.json CHANGED
@@ -1,26 +1,1202 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.0001049097775912715,
5
  "eval_steps": 500,
6
- "global_step": 2,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0001049097775912715,
13
- "mean_token_accuracy": 0.2768045552074909,
14
- "step": 2,
15
- "total_flos": 22519612440576.0,
16
- "train_loss": 5.291015625,
17
- "train_runtime": 0.3705,
18
- "train_samples_per_second": 21.592,
19
- "train_steps_per_second": 5.398
 
20
  }
21
  ],
22
  "logging_steps": 500,
23
- "max_steps": 2,
24
  "num_input_tokens_seen": 0,
25
  "num_train_epochs": 1,
26
  "save_steps": 0,
@@ -36,7 +1212,7 @@
36
  "attributes": {}
37
  }
38
  },
39
- "total_flos": 22519612440576.0,
40
  "train_batch_size": 1,
41
  "trial_name": null,
42
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9999932063832823,
5
  "eval_steps": 500,
6
+ "global_step": 73598,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.006793616717732019,
13
+ "grad_norm": 2.0208959579467773,
14
+ "learning_rate": 0.0004966371368787195,
15
+ "loss": 2.0427,
16
+ "mean_token_accuracy": 0.5520137556195259,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.013587233435464039,
21
+ "grad_norm": 1.8117527961730957,
22
+ "learning_rate": 0.0004932403054430827,
23
+ "loss": 1.8742,
24
+ "mean_token_accuracy": 0.5727833653092385,
25
+ "step": 1000
26
+ },
27
+ {
28
+ "epoch": 0.020380850153196056,
29
+ "grad_norm": 1.5713497400283813,
30
+ "learning_rate": 0.0004898434740074459,
31
+ "loss": 1.8367,
32
+ "mean_token_accuracy": 0.5783296891450882,
33
+ "step": 1500
34
+ },
35
+ {
36
+ "epoch": 0.027174466870928077,
37
+ "grad_norm": 1.6636950969696045,
38
+ "learning_rate": 0.00048644664257180905,
39
+ "loss": 1.8238,
40
+ "mean_token_accuracy": 0.5810909342765808,
41
+ "step": 2000
42
+ },
43
+ {
44
+ "epoch": 0.033968083588660096,
45
+ "grad_norm": 1.525341272354126,
46
+ "learning_rate": 0.0004830498111361722,
47
+ "loss": 1.8096,
48
+ "mean_token_accuracy": 0.5805237484574318,
49
+ "step": 2500
50
+ },
51
+ {
52
+ "epoch": 0.04076170030639211,
53
+ "grad_norm": 1.7869559526443481,
54
+ "learning_rate": 0.00047965297970053533,
55
+ "loss": 1.8145,
56
+ "mean_token_accuracy": 0.5809020410776138,
57
+ "step": 3000
58
+ },
59
+ {
60
+ "epoch": 0.04755531702412413,
61
+ "grad_norm": 1.389845848083496,
62
+ "learning_rate": 0.0004762561482648985,
63
+ "loss": 1.7937,
64
+ "mean_token_accuracy": 0.5843603150844574,
65
+ "step": 3500
66
+ },
67
+ {
68
+ "epoch": 0.054348933741856155,
69
+ "grad_norm": 1.1970040798187256,
70
+ "learning_rate": 0.00047285931682926167,
71
+ "loss": 1.7926,
72
+ "mean_token_accuracy": 0.5820418976843357,
73
+ "step": 4000
74
+ },
75
+ {
76
+ "epoch": 0.06114255045958817,
77
+ "grad_norm": 1.4389044046401978,
78
+ "learning_rate": 0.00046946248539362484,
79
+ "loss": 1.7622,
80
+ "mean_token_accuracy": 0.5891672801971436,
81
+ "step": 4500
82
+ },
83
+ {
84
+ "epoch": 0.06793616717732019,
85
+ "grad_norm": 1.177388072013855,
86
+ "learning_rate": 0.000466065653957988,
87
+ "loss": 1.7738,
88
+ "mean_token_accuracy": 0.5879195163547992,
89
+ "step": 5000
90
+ },
91
+ {
92
+ "epoch": 0.07472978389505221,
93
+ "grad_norm": 1.3432203531265259,
94
+ "learning_rate": 0.00046268240984809374,
95
+ "loss": 1.7767,
96
+ "mean_token_accuracy": 0.5867721286118031,
97
+ "step": 5500
98
+ },
99
+ {
100
+ "epoch": 0.08152340061278422,
101
+ "grad_norm": 1.360933542251587,
102
+ "learning_rate": 0.0004592855784124569,
103
+ "loss": 1.7767,
104
+ "mean_token_accuracy": 0.5867527797520161,
105
+ "step": 6000
106
+ },
107
+ {
108
+ "epoch": 0.08831701733051625,
109
+ "grad_norm": 1.1908655166625977,
110
+ "learning_rate": 0.0004559023343025626,
111
+ "loss": 1.7606,
112
+ "mean_token_accuracy": 0.5915117806196213,
113
+ "step": 6500
114
+ },
115
+ {
116
+ "epoch": 0.09511063404824827,
117
+ "grad_norm": 1.1951971054077148,
118
+ "learning_rate": 0.00045250550286692574,
119
+ "loss": 1.7629,
120
+ "mean_token_accuracy": 0.5892836402058601,
121
+ "step": 7000
122
+ },
123
+ {
124
+ "epoch": 0.10190425076598028,
125
+ "grad_norm": 1.207234501838684,
126
+ "learning_rate": 0.00044912225875703147,
127
+ "loss": 1.7618,
128
+ "mean_token_accuracy": 0.5882115732729435,
129
+ "step": 7500
130
+ },
131
+ {
132
+ "epoch": 0.10869786748371231,
133
+ "grad_norm": 1.1731985807418823,
134
+ "learning_rate": 0.0004457254273213946,
135
+ "loss": 1.753,
136
+ "mean_token_accuracy": 0.5895786600708961,
137
+ "step": 8000
138
+ },
139
+ {
140
+ "epoch": 0.11549148420144432,
141
+ "grad_norm": 1.3629071712493896,
142
+ "learning_rate": 0.0004423421832115003,
143
+ "loss": 1.7627,
144
+ "mean_token_accuracy": 0.587433840662241,
145
+ "step": 8500
146
+ },
147
+ {
148
+ "epoch": 0.12228510091917634,
149
+ "grad_norm": 1.2118574380874634,
150
+ "learning_rate": 0.0004389453517758635,
151
+ "loss": 1.7529,
152
+ "mean_token_accuracy": 0.5890023435950279,
153
+ "step": 9000
154
+ },
155
+ {
156
+ "epoch": 0.12907871763690837,
157
+ "grad_norm": 1.1447389125823975,
158
+ "learning_rate": 0.0004355621076659692,
159
+ "loss": 1.7433,
160
+ "mean_token_accuracy": 0.5922272650897503,
161
+ "step": 9500
162
+ },
163
+ {
164
+ "epoch": 0.13587233435464038,
165
+ "grad_norm": 1.1409871578216553,
166
+ "learning_rate": 0.00043216527623033237,
167
+ "loss": 1.7479,
168
+ "mean_token_accuracy": 0.5903287023007869,
169
+ "step": 10000
170
+ },
171
+ {
172
+ "epoch": 0.1426659510723724,
173
+ "grad_norm": 1.2514444589614868,
174
+ "learning_rate": 0.00042878203212043804,
175
+ "loss": 1.7471,
176
+ "mean_token_accuracy": 0.5892516044676304,
177
+ "step": 10500
178
+ },
179
+ {
180
+ "epoch": 0.14945956779010441,
181
+ "grad_norm": 1.3417694568634033,
182
+ "learning_rate": 0.00042538520068480126,
183
+ "loss": 1.7324,
184
+ "mean_token_accuracy": 0.5957142559885978,
185
+ "step": 11000
186
+ },
187
+ {
188
+ "epoch": 0.15625318450783643,
189
+ "grad_norm": 1.3369439840316772,
190
+ "learning_rate": 0.00042200195657490693,
191
+ "loss": 1.7428,
192
+ "mean_token_accuracy": 0.5911534672677516,
193
+ "step": 11500
194
+ },
195
+ {
196
+ "epoch": 0.16304680122556844,
197
+ "grad_norm": 1.2490959167480469,
198
+ "learning_rate": 0.00041860512513927005,
199
+ "loss": 1.7318,
200
+ "mean_token_accuracy": 0.5937872271835803,
201
+ "step": 12000
202
+ },
203
+ {
204
+ "epoch": 0.1698404179433005,
205
+ "grad_norm": 1.2229673862457275,
206
+ "learning_rate": 0.0004152218810293758,
207
+ "loss": 1.7429,
208
+ "mean_token_accuracy": 0.5896593699157238,
209
+ "step": 12500
210
+ },
211
+ {
212
+ "epoch": 0.1766340346610325,
213
+ "grad_norm": 1.3295321464538574,
214
+ "learning_rate": 0.00041182504959373894,
215
+ "loss": 1.7633,
216
+ "mean_token_accuracy": 0.5877331503927707,
217
+ "step": 13000
218
+ },
219
+ {
220
+ "epoch": 0.18342765137876452,
221
+ "grad_norm": 0.9519675970077515,
222
+ "learning_rate": 0.00040844180548384466,
223
+ "loss": 1.7573,
224
+ "mean_token_accuracy": 0.5893017112016677,
225
+ "step": 13500
226
+ },
227
+ {
228
+ "epoch": 0.19022126809649653,
229
+ "grad_norm": 1.448428750038147,
230
+ "learning_rate": 0.00040504497404820783,
231
+ "loss": 1.7314,
232
+ "mean_token_accuracy": 0.5936685436069965,
233
+ "step": 14000
234
+ },
235
+ {
236
+ "epoch": 0.19701488481422855,
237
+ "grad_norm": 1.3305081129074097,
238
+ "learning_rate": 0.00040166172993831356,
239
+ "loss": 1.7293,
240
+ "mean_token_accuracy": 0.5929152786135673,
241
+ "step": 14500
242
+ },
243
+ {
244
+ "epoch": 0.20380850153196056,
245
+ "grad_norm": 1.2279210090637207,
246
+ "learning_rate": 0.00039826489850267673,
247
+ "loss": 1.728,
248
+ "mean_token_accuracy": 0.5941076972186565,
249
+ "step": 15000
250
+ },
251
+ {
252
+ "epoch": 0.21060211824969258,
253
+ "grad_norm": 0.9746572375297546,
254
+ "learning_rate": 0.0003948816543927824,
255
+ "loss": 1.7452,
256
+ "mean_token_accuracy": 0.5927870056033134,
257
+ "step": 15500
258
+ },
259
+ {
260
+ "epoch": 0.21739573496742462,
261
+ "grad_norm": 1.3720732927322388,
262
+ "learning_rate": 0.0003914848229571456,
263
+ "loss": 1.7259,
264
+ "mean_token_accuracy": 0.5940872375369072,
265
+ "step": 16000
266
+ },
267
+ {
268
+ "epoch": 0.22418935168515663,
269
+ "grad_norm": 1.2336689233779907,
270
+ "learning_rate": 0.0003881015788472513,
271
+ "loss": 1.7365,
272
+ "mean_token_accuracy": 0.5929380969107151,
273
+ "step": 16500
274
+ },
275
+ {
276
+ "epoch": 0.23098296840288865,
277
+ "grad_norm": 1.0067358016967773,
278
+ "learning_rate": 0.0003847047474116144,
279
+ "loss": 1.7373,
280
+ "mean_token_accuracy": 0.5914620462656021,
281
+ "step": 17000
282
+ },
283
+ {
284
+ "epoch": 0.23777658512062066,
285
+ "grad_norm": 1.2364681959152222,
286
+ "learning_rate": 0.0003813215033017202,
287
+ "loss": 1.7298,
288
+ "mean_token_accuracy": 0.5921443883776665,
289
+ "step": 17500
290
+ },
291
+ {
292
+ "epoch": 0.24457020183835268,
293
+ "grad_norm": 1.186237096786499,
294
+ "learning_rate": 0.0003779246718660833,
295
+ "loss": 1.7336,
296
+ "mean_token_accuracy": 0.5918942322134971,
297
+ "step": 18000
298
+ },
299
+ {
300
+ "epoch": 0.2513638185560847,
301
+ "grad_norm": 1.2172770500183105,
302
+ "learning_rate": 0.000374541427756189,
303
+ "loss": 1.7321,
304
+ "mean_token_accuracy": 0.592568263143301,
305
+ "step": 18500
306
+ },
307
+ {
308
+ "epoch": 0.25815743527381674,
309
+ "grad_norm": 1.074506402015686,
310
+ "learning_rate": 0.0003711445963205522,
311
+ "loss": 1.726,
312
+ "mean_token_accuracy": 0.5930768192112446,
313
+ "step": 19000
314
+ },
315
+ {
316
+ "epoch": 0.2649510519915487,
317
+ "grad_norm": 1.3433698415756226,
318
+ "learning_rate": 0.0003677681458735292,
319
+ "loss": 1.7252,
320
+ "mean_token_accuracy": 0.5934848180115223,
321
+ "step": 19500
322
+ },
323
+ {
324
+ "epoch": 0.27174466870928077,
325
+ "grad_norm": 1.0935838222503662,
326
+ "learning_rate": 0.00036437131443789233,
327
+ "loss": 1.7276,
328
+ "mean_token_accuracy": 0.5916450168192386,
329
+ "step": 20000
330
+ },
331
+ {
332
+ "epoch": 0.27853828542701276,
333
+ "grad_norm": 1.5121397972106934,
334
+ "learning_rate": 0.0003609744830022555,
335
+ "loss": 1.738,
336
+ "mean_token_accuracy": 0.5906197059154511,
337
+ "step": 20500
338
+ },
339
+ {
340
+ "epoch": 0.2853319021447448,
341
+ "grad_norm": 1.2155652046203613,
342
+ "learning_rate": 0.0003575776515666186,
343
+ "loss": 1.7221,
344
+ "mean_token_accuracy": 0.5948817262351513,
345
+ "step": 21000
346
+ },
347
+ {
348
+ "epoch": 0.29212551886247684,
349
+ "grad_norm": 1.0498167276382446,
350
+ "learning_rate": 0.0003541944074567244,
351
+ "loss": 1.7346,
352
+ "mean_token_accuracy": 0.5901701744496822,
353
+ "step": 21500
354
+ },
355
+ {
356
+ "epoch": 0.29891913558020883,
357
+ "grad_norm": 0.9237672686576843,
358
+ "learning_rate": 0.0003507975760210875,
359
+ "loss": 1.7265,
360
+ "mean_token_accuracy": 0.5947049247324466,
361
+ "step": 22000
362
+ },
363
+ {
364
+ "epoch": 0.30571275229794087,
365
+ "grad_norm": 1.235968828201294,
366
+ "learning_rate": 0.00034741433191119324,
367
+ "loss": 1.7359,
368
+ "mean_token_accuracy": 0.5912741076052189,
369
+ "step": 22500
370
+ },
371
+ {
372
+ "epoch": 0.31250636901567286,
373
+ "grad_norm": 1.3003114461898804,
374
+ "learning_rate": 0.0003440175004755564,
375
+ "loss": 1.7165,
376
+ "mean_token_accuracy": 0.596044946283102,
377
+ "step": 23000
378
+ },
379
+ {
380
+ "epoch": 0.3192999857334049,
381
+ "grad_norm": 0.9103087186813354,
382
+ "learning_rate": 0.00034063425636566213,
383
+ "loss": 1.7145,
384
+ "mean_token_accuracy": 0.5962611735165119,
385
+ "step": 23500
386
+ },
387
+ {
388
+ "epoch": 0.3260936024511369,
389
+ "grad_norm": 1.3072879314422607,
390
+ "learning_rate": 0.0003372374249300253,
391
+ "loss": 1.7182,
392
+ "mean_token_accuracy": 0.5958844081163407,
393
+ "step": 24000
394
+ },
395
+ {
396
+ "epoch": 0.33288721916886893,
397
+ "grad_norm": 1.122262716293335,
398
+ "learning_rate": 0.00033385418082013097,
399
+ "loss": 1.7277,
400
+ "mean_token_accuracy": 0.5924356364905834,
401
+ "step": 24500
402
+ },
403
+ {
404
+ "epoch": 0.339680835886601,
405
+ "grad_norm": 1.0892757177352905,
406
+ "learning_rate": 0.0003304573493844942,
407
+ "loss": 1.7168,
408
+ "mean_token_accuracy": 0.5951408152878285,
409
+ "step": 25000
410
+ },
411
+ {
412
+ "epoch": 0.34647445260433296,
413
+ "grad_norm": 1.3095890283584595,
414
+ "learning_rate": 0.00032707410527459986,
415
+ "loss": 1.7103,
416
+ "mean_token_accuracy": 0.5958214187920093,
417
+ "step": 25500
418
+ },
419
+ {
420
+ "epoch": 0.353268069322065,
421
+ "grad_norm": 0.9692500233650208,
422
+ "learning_rate": 0.000323677273838963,
423
+ "loss": 1.7127,
424
+ "mean_token_accuracy": 0.5952938674986362,
425
+ "step": 26000
426
+ },
427
+ {
428
+ "epoch": 0.360061686039797,
429
+ "grad_norm": 1.0970525741577148,
430
+ "learning_rate": 0.00032029402972906876,
431
+ "loss": 1.7251,
432
+ "mean_token_accuracy": 0.5936492686867714,
433
+ "step": 26500
434
+ },
435
+ {
436
+ "epoch": 0.36685530275752903,
437
+ "grad_norm": 1.2200478315353394,
438
+ "learning_rate": 0.00031689719829343187,
439
+ "loss": 1.7082,
440
+ "mean_token_accuracy": 0.5961000478565693,
441
+ "step": 27000
442
+ },
443
+ {
444
+ "epoch": 0.373648919475261,
445
+ "grad_norm": 1.0182169675827026,
446
+ "learning_rate": 0.0003135139541835376,
447
+ "loss": 1.7038,
448
+ "mean_token_accuracy": 0.5973086108565331,
449
+ "step": 27500
450
+ },
451
+ {
452
+ "epoch": 0.38044253619299306,
453
+ "grad_norm": 0.9256690740585327,
454
+ "learning_rate": 0.00031011712274790076,
455
+ "loss": 1.7208,
456
+ "mean_token_accuracy": 0.593766670614481,
457
+ "step": 28000
458
+ },
459
+ {
460
+ "epoch": 0.3872361529107251,
461
+ "grad_norm": 1.1710305213928223,
462
+ "learning_rate": 0.0003067270849751352,
463
+ "loss": 1.73,
464
+ "mean_token_accuracy": 0.594021442502737,
465
+ "step": 28500
466
+ },
467
+ {
468
+ "epoch": 0.3940297696284571,
469
+ "grad_norm": 1.1612814664840698,
470
+ "learning_rate": 0.00030333704720236966,
471
+ "loss": 1.7146,
472
+ "mean_token_accuracy": 0.5956000906527043,
473
+ "step": 29000
474
+ },
475
+ {
476
+ "epoch": 0.40082338634618914,
477
+ "grad_norm": 1.2125431299209595,
478
+ "learning_rate": 0.00029994021576673283,
479
+ "loss": 1.7076,
480
+ "mean_token_accuracy": 0.5957852031290531,
481
+ "step": 29500
482
+ },
483
+ {
484
+ "epoch": 0.4076170030639211,
485
+ "grad_norm": 1.2176785469055176,
486
+ "learning_rate": 0.0002965569716568385,
487
+ "loss": 1.7312,
488
+ "mean_token_accuracy": 0.5945550291240216,
489
+ "step": 30000
490
+ },
491
+ {
492
+ "epoch": 0.41441061978165317,
493
+ "grad_norm": 1.5686943531036377,
494
+ "learning_rate": 0.00029316014022120167,
495
+ "loss": 1.7171,
496
+ "mean_token_accuracy": 0.5942135262489319,
497
+ "step": 30500
498
+ },
499
+ {
500
+ "epoch": 0.42120423649938515,
501
+ "grad_norm": 1.0116132497787476,
502
+ "learning_rate": 0.0002897768961113074,
503
+ "loss": 1.6985,
504
+ "mean_token_accuracy": 0.5982498173713684,
505
+ "step": 31000
506
+ },
507
+ {
508
+ "epoch": 0.4279978532171172,
509
+ "grad_norm": 0.9668023586273193,
510
+ "learning_rate": 0.0002863800646756705,
511
+ "loss": 1.7078,
512
+ "mean_token_accuracy": 0.5950165801644325,
513
+ "step": 31500
514
+ },
515
+ {
516
+ "epoch": 0.43479146993484924,
517
+ "grad_norm": 1.2987300157546997,
518
+ "learning_rate": 0.00028299682056577623,
519
+ "loss": 1.7288,
520
+ "mean_token_accuracy": 0.5917907827198505,
521
+ "step": 32000
522
+ },
523
+ {
524
+ "epoch": 0.4415850866525812,
525
+ "grad_norm": 1.1014894247055054,
526
+ "learning_rate": 0.0002795999891301394,
527
+ "loss": 1.7207,
528
+ "mean_token_accuracy": 0.5922197285294533,
529
+ "step": 32500
530
+ },
531
+ {
532
+ "epoch": 0.44837870337031327,
533
+ "grad_norm": 1.0592535734176636,
534
+ "learning_rate": 0.0002762167450202451,
535
+ "loss": 1.7043,
536
+ "mean_token_accuracy": 0.5975054582953453,
537
+ "step": 33000
538
+ },
539
+ {
540
+ "epoch": 0.45517232008804526,
541
+ "grad_norm": 1.084416151046753,
542
+ "learning_rate": 0.0002728199135846083,
543
+ "loss": 1.7153,
544
+ "mean_token_accuracy": 0.59563816472888,
545
+ "step": 33500
546
+ },
547
+ {
548
+ "epoch": 0.4619659368057773,
549
+ "grad_norm": 1.0490503311157227,
550
+ "learning_rate": 0.000269436669474714,
551
+ "loss": 1.7049,
552
+ "mean_token_accuracy": 0.5960744704604148,
553
+ "step": 34000
554
+ },
555
+ {
556
+ "epoch": 0.4687595535235093,
557
+ "grad_norm": 1.3198046684265137,
558
+ "learning_rate": 0.0002660398380390772,
559
+ "loss": 1.6951,
560
+ "mean_token_accuracy": 0.6000102725625038,
561
+ "step": 34500
562
+ },
563
+ {
564
+ "epoch": 0.47555317024124133,
565
+ "grad_norm": 1.1859066486358643,
566
+ "learning_rate": 0.00026265659392918285,
567
+ "loss": 1.7025,
568
+ "mean_token_accuracy": 0.5961538652777671,
569
+ "step": 35000
570
+ },
571
+ {
572
+ "epoch": 0.4823467869589734,
573
+ "grad_norm": 1.0394221544265747,
574
+ "learning_rate": 0.000259259762493546,
575
+ "loss": 1.695,
576
+ "mean_token_accuracy": 0.599614026337862,
577
+ "step": 35500
578
+ },
579
+ {
580
+ "epoch": 0.48914040367670536,
581
+ "grad_norm": 1.1829257011413574,
582
+ "learning_rate": 0.00025587651838365175,
583
+ "loss": 1.7086,
584
+ "mean_token_accuracy": 0.5964278879761696,
585
+ "step": 36000
586
+ },
587
+ {
588
+ "epoch": 0.4959340203944374,
589
+ "grad_norm": 1.0766243934631348,
590
+ "learning_rate": 0.00025247968694801486,
591
+ "loss": 1.7,
592
+ "mean_token_accuracy": 0.5977904357016086,
593
+ "step": 36500
594
+ },
595
+ {
596
+ "epoch": 0.5027276371121694,
597
+ "grad_norm": 1.1784560680389404,
598
+ "learning_rate": 0.0002490964428381206,
599
+ "loss": 1.715,
600
+ "mean_token_accuracy": 0.5959987764656544,
601
+ "step": 37000
602
+ },
603
+ {
604
+ "epoch": 0.5095212538299014,
605
+ "grad_norm": 1.1208888292312622,
606
+ "learning_rate": 0.00024569961140248376,
607
+ "loss": 1.7046,
608
+ "mean_token_accuracy": 0.5964538981616497,
609
+ "step": 37500
610
+ },
611
+ {
612
+ "epoch": 0.5163148705476335,
613
+ "grad_norm": 1.0677920579910278,
614
+ "learning_rate": 0.00024231636729258948,
615
+ "loss": 1.7001,
616
+ "mean_token_accuracy": 0.5974933596849441,
617
+ "step": 38000
618
+ },
619
+ {
620
+ "epoch": 0.5231084872653654,
621
+ "grad_norm": 0.9215283393859863,
622
+ "learning_rate": 0.00023891953585695265,
623
+ "loss": 1.7105,
624
+ "mean_token_accuracy": 0.596855210095644,
625
+ "step": 38500
626
+ },
627
+ {
628
+ "epoch": 0.5299021039830974,
629
+ "grad_norm": 1.23247492313385,
630
+ "learning_rate": 0.00023553629174705835,
631
+ "loss": 1.711,
632
+ "mean_token_accuracy": 0.5971770713925362,
633
+ "step": 39000
634
+ },
635
+ {
636
+ "epoch": 0.5366957207008295,
637
+ "grad_norm": 1.07834792137146,
638
+ "learning_rate": 0.00023213946031142152,
639
+ "loss": 1.6993,
640
+ "mean_token_accuracy": 0.5984128168821334,
641
+ "step": 39500
642
+ },
643
+ {
644
+ "epoch": 0.5434893374185615,
645
+ "grad_norm": 1.1771501302719116,
646
+ "learning_rate": 0.00022875621620152724,
647
+ "loss": 1.6937,
648
+ "mean_token_accuracy": 0.6004006116986275,
649
+ "step": 40000
650
+ },
651
+ {
652
+ "epoch": 0.5502829541362936,
653
+ "grad_norm": 1.2967567443847656,
654
+ "learning_rate": 0.00022535938476589038,
655
+ "loss": 1.7038,
656
+ "mean_token_accuracy": 0.5961601254045963,
657
+ "step": 40500
658
+ },
659
+ {
660
+ "epoch": 0.5570765708540255,
661
+ "grad_norm": 1.1175864934921265,
662
+ "learning_rate": 0.00022197614065599608,
663
+ "loss": 1.716,
664
+ "mean_token_accuracy": 0.5941783719956875,
665
+ "step": 41000
666
+ },
667
+ {
668
+ "epoch": 0.5638701875717576,
669
+ "grad_norm": 1.380516529083252,
670
+ "learning_rate": 0.00021857930922035927,
671
+ "loss": 1.6936,
672
+ "mean_token_accuracy": 0.5974556840360165,
673
+ "step": 41500
674
+ },
675
+ {
676
+ "epoch": 0.5706638042894896,
677
+ "grad_norm": 1.1645792722702026,
678
+ "learning_rate": 0.00021519606511046497,
679
+ "loss": 1.6964,
680
+ "mean_token_accuracy": 0.5985994503200054,
681
+ "step": 42000
682
+ },
683
+ {
684
+ "epoch": 0.5774574210072216,
685
+ "grad_norm": 1.3061178922653198,
686
+ "learning_rate": 0.0002117992336748281,
687
+ "loss": 1.7065,
688
+ "mean_token_accuracy": 0.5961524350643158,
689
+ "step": 42500
690
+ },
691
+ {
692
+ "epoch": 0.5842510377249537,
693
+ "grad_norm": 1.1523349285125732,
694
+ "learning_rate": 0.0002084159895649338,
695
+ "loss": 1.7001,
696
+ "mean_token_accuracy": 0.5993789280354976,
697
+ "step": 43000
698
+ },
699
+ {
700
+ "epoch": 0.5910446544426856,
701
+ "grad_norm": 1.1238070726394653,
702
+ "learning_rate": 0.000205019158129297,
703
+ "loss": 1.724,
704
+ "mean_token_accuracy": 0.5929664156734943,
705
+ "step": 43500
706
+ },
707
+ {
708
+ "epoch": 0.5978382711604177,
709
+ "grad_norm": 1.1592812538146973,
710
+ "learning_rate": 0.0002016359140194027,
711
+ "loss": 1.7065,
712
+ "mean_token_accuracy": 0.5956415718495846,
713
+ "step": 44000
714
+ },
715
+ {
716
+ "epoch": 0.6046318878781497,
717
+ "grad_norm": 1.080269694328308,
718
+ "learning_rate": 0.00019823908258376585,
719
+ "loss": 1.6983,
720
+ "mean_token_accuracy": 0.5975038340389729,
721
+ "step": 44500
722
+ },
723
+ {
724
+ "epoch": 0.6114255045958817,
725
+ "grad_norm": 1.281002163887024,
726
+ "learning_rate": 0.0001948558384738716,
727
+ "loss": 1.6965,
728
+ "mean_token_accuracy": 0.5972202825844288,
729
+ "step": 45000
730
+ },
731
+ {
732
+ "epoch": 0.6182191213136137,
733
+ "grad_norm": 1.004654049873352,
734
+ "learning_rate": 0.00019145900703823474,
735
+ "loss": 1.7176,
736
+ "mean_token_accuracy": 0.5956167621910572,
737
+ "step": 45500
738
+ },
739
+ {
740
+ "epoch": 0.6250127380313457,
741
+ "grad_norm": 1.0295976400375366,
742
+ "learning_rate": 0.00018807576292834043,
743
+ "loss": 1.6929,
744
+ "mean_token_accuracy": 0.5980769891738892,
745
+ "step": 46000
746
+ },
747
+ {
748
+ "epoch": 0.6318063547490778,
749
+ "grad_norm": 1.0710487365722656,
750
+ "learning_rate": 0.00018467893149270363,
751
+ "loss": 1.7116,
752
+ "mean_token_accuracy": 0.5948944690227509,
753
+ "step": 46500
754
+ },
755
+ {
756
+ "epoch": 0.6385999714668098,
757
+ "grad_norm": 1.1924978494644165,
758
+ "learning_rate": 0.00018129568738280933,
759
+ "loss": 1.6896,
760
+ "mean_token_accuracy": 0.6006084454953671,
761
+ "step": 47000
762
+ },
763
+ {
764
+ "epoch": 0.6453935881845418,
765
+ "grad_norm": 1.1349835395812988,
766
+ "learning_rate": 0.00017789885594717247,
767
+ "loss": 1.706,
768
+ "mean_token_accuracy": 0.5981652895510197,
769
+ "step": 47500
770
+ },
771
+ {
772
+ "epoch": 0.6521872049022738,
773
+ "grad_norm": 1.4327572584152222,
774
+ "learning_rate": 0.0001745156118372782,
775
+ "loss": 1.7034,
776
+ "mean_token_accuracy": 0.5959363037645817,
777
+ "step": 48000
778
+ },
779
+ {
780
+ "epoch": 0.6589808216200058,
781
+ "grad_norm": 1.499800682067871,
782
+ "learning_rate": 0.00017111878040164136,
783
+ "loss": 1.696,
784
+ "mean_token_accuracy": 0.598519076347351,
785
+ "step": 48500
786
+ },
787
+ {
788
+ "epoch": 0.6657744383377379,
789
+ "grad_norm": 1.173693299293518,
790
+ "learning_rate": 0.00016773553629174706,
791
+ "loss": 1.6872,
792
+ "mean_token_accuracy": 0.6003931475877762,
793
+ "step": 49000
794
+ },
795
+ {
796
+ "epoch": 0.6725680550554699,
797
+ "grad_norm": 1.0841670036315918,
798
+ "learning_rate": 0.0001643387048561102,
799
+ "loss": 1.7033,
800
+ "mean_token_accuracy": 0.5981890520751476,
801
+ "step": 49500
802
+ },
803
+ {
804
+ "epoch": 0.679361671773202,
805
+ "grad_norm": 1.2305493354797363,
806
+ "learning_rate": 0.00016095546074621593,
807
+ "loss": 1.7026,
808
+ "mean_token_accuracy": 0.5967764587402343,
809
+ "step": 50000
810
+ },
811
+ {
812
+ "epoch": 0.6861552884909339,
813
+ "grad_norm": 1.1358468532562256,
814
+ "learning_rate": 0.0001575586293105791,
815
+ "loss": 1.7024,
816
+ "mean_token_accuracy": 0.5969898876547813,
817
+ "step": 50500
818
+ },
819
+ {
820
+ "epoch": 0.6929489052086659,
821
+ "grad_norm": 1.3159434795379639,
822
+ "learning_rate": 0.0001541753852006848,
823
+ "loss": 1.6995,
824
+ "mean_token_accuracy": 0.5962181152105331,
825
+ "step": 51000
826
+ },
827
+ {
828
+ "epoch": 0.699742521926398,
829
+ "grad_norm": 1.27256441116333,
830
+ "learning_rate": 0.00015077855376504796,
831
+ "loss": 1.7029,
832
+ "mean_token_accuracy": 0.5967380773425103,
833
+ "step": 51500
834
+ },
835
+ {
836
+ "epoch": 0.70653613864413,
837
+ "grad_norm": 1.5178892612457275,
838
+ "learning_rate": 0.00014739530965515369,
839
+ "loss": 1.6899,
840
+ "mean_token_accuracy": 0.5981109066009521,
841
+ "step": 52000
842
+ },
843
+ {
844
+ "epoch": 0.7133297553618619,
845
+ "grad_norm": 0.9817212820053101,
846
+ "learning_rate": 0.00014399847821951683,
847
+ "loss": 1.6943,
848
+ "mean_token_accuracy": 0.5980751592516899,
849
+ "step": 52500
850
+ },
851
+ {
852
+ "epoch": 0.720123372079594,
853
+ "grad_norm": 1.0862836837768555,
854
+ "learning_rate": 0.00014061523410962255,
855
+ "loss": 1.7085,
856
+ "mean_token_accuracy": 0.5958100288212299,
857
+ "step": 53000
858
+ },
859
+ {
860
+ "epoch": 0.726916988797326,
861
+ "grad_norm": 1.3386940956115723,
862
+ "learning_rate": 0.00013721840267398572,
863
+ "loss": 1.6903,
864
+ "mean_token_accuracy": 0.5972609171569347,
865
+ "step": 53500
866
+ },
867
+ {
868
+ "epoch": 0.7337106055150581,
869
+ "grad_norm": 1.6551252603530884,
870
+ "learning_rate": 0.00013383515856409142,
871
+ "loss": 1.6976,
872
+ "mean_token_accuracy": 0.5984482218325138,
873
+ "step": 54000
874
+ },
875
+ {
876
+ "epoch": 0.7405042222327901,
877
+ "grad_norm": 1.1888887882232666,
878
+ "learning_rate": 0.0001304383271284546,
879
+ "loss": 1.7019,
880
+ "mean_token_accuracy": 0.5954872596561909,
881
+ "step": 54500
882
+ },
883
+ {
884
+ "epoch": 0.747297838950522,
885
+ "grad_norm": 1.0421268939971924,
886
+ "learning_rate": 0.00012705508301856028,
887
+ "loss": 1.6823,
888
+ "mean_token_accuracy": 0.5998311370313167,
889
+ "step": 55000
890
+ },
891
+ {
892
+ "epoch": 0.7540914556682541,
893
+ "grad_norm": 1.0341774225234985,
894
+ "learning_rate": 0.00012365825158292345,
895
+ "loss": 1.7042,
896
+ "mean_token_accuracy": 0.5982704365849495,
897
+ "step": 55500
898
+ },
899
+ {
900
+ "epoch": 0.7608850723859861,
901
+ "grad_norm": 1.222990870475769,
902
+ "learning_rate": 0.00012027500747302916,
903
+ "loss": 1.6795,
904
+ "mean_token_accuracy": 0.6036300667524338,
905
+ "step": 56000
906
+ },
907
+ {
908
+ "epoch": 0.7676786891037182,
909
+ "grad_norm": 1.0490987300872803,
910
+ "learning_rate": 0.00011687817603739233,
911
+ "loss": 1.6875,
912
+ "mean_token_accuracy": 0.597304708212614,
913
+ "step": 56500
914
+ },
915
+ {
916
+ "epoch": 0.7744723058214502,
917
+ "grad_norm": 1.1952784061431885,
918
+ "learning_rate": 0.00011349493192749803,
919
+ "loss": 1.6796,
920
+ "mean_token_accuracy": 0.6028886337280274,
921
+ "step": 57000
922
+ },
923
+ {
924
+ "epoch": 0.7812659225391821,
925
+ "grad_norm": 1.000869870185852,
926
+ "learning_rate": 0.0001100981004918612,
927
+ "loss": 1.7052,
928
+ "mean_token_accuracy": 0.5968053241670132,
929
+ "step": 57500
930
+ },
931
+ {
932
+ "epoch": 0.7880595392569142,
933
+ "grad_norm": 1.133309006690979,
934
+ "learning_rate": 0.0001067148563819669,
935
+ "loss": 1.6965,
936
+ "mean_token_accuracy": 0.5972890597879886,
937
+ "step": 58000
938
+ },
939
+ {
940
+ "epoch": 0.7948531559746462,
941
+ "grad_norm": 1.2667903900146484,
942
+ "learning_rate": 0.00010331802494633007,
943
+ "loss": 1.6786,
944
+ "mean_token_accuracy": 0.601085704267025,
945
+ "step": 58500
946
+ },
947
+ {
948
+ "epoch": 0.8016467726923783,
949
+ "grad_norm": 1.0539146661758423,
950
+ "learning_rate": 9.993478083643577e-05,
951
+ "loss": 1.7029,
952
+ "mean_token_accuracy": 0.5962156045734882,
953
+ "step": 59000
954
+ },
955
+ {
956
+ "epoch": 0.8084403894101102,
957
+ "grad_norm": 1.1023699045181274,
958
+ "learning_rate": 9.653794940079893e-05,
959
+ "loss": 1.6731,
960
+ "mean_token_accuracy": 0.6045135918557644,
961
+ "step": 59500
962
+ },
963
+ {
964
+ "epoch": 0.8152340061278422,
965
+ "grad_norm": 1.227386236190796,
966
+ "learning_rate": 9.315470529090464e-05,
967
+ "loss": 1.6855,
968
+ "mean_token_accuracy": 0.5978653418123722,
969
+ "step": 60000
970
+ },
971
+ {
972
+ "epoch": 0.8220276228455743,
973
+ "grad_norm": 1.2664766311645508,
974
+ "learning_rate": 8.976466751813909e-05,
975
+ "loss": 1.7055,
976
+ "mean_token_accuracy": 0.5958617155849933,
977
+ "step": 60500
978
+ },
979
+ {
980
+ "epoch": 0.8288212395633063,
981
+ "grad_norm": 1.263852834701538,
982
+ "learning_rate": 8.636783608250224e-05,
983
+ "loss": 1.6971,
984
+ "mean_token_accuracy": 0.5953974407315255,
985
+ "step": 61000
986
+ },
987
+ {
988
+ "epoch": 0.8356148562810384,
989
+ "grad_norm": 1.1483241319656372,
990
+ "learning_rate": 8.297100464686541e-05,
991
+ "loss": 1.6798,
992
+ "mean_token_accuracy": 0.5995199010372162,
993
+ "step": 61500
994
+ },
995
+ {
996
+ "epoch": 0.8424084729987703,
997
+ "grad_norm": 0.8838757872581482,
998
+ "learning_rate": 7.957417321122856e-05,
999
+ "loss": 1.6863,
1000
+ "mean_token_accuracy": 0.5994715365469456,
1001
+ "step": 62000
1002
+ },
1003
+ {
1004
+ "epoch": 0.8492020897165024,
1005
+ "grad_norm": 1.0497251749038696,
1006
+ "learning_rate": 7.619092910133428e-05,
1007
+ "loss": 1.6692,
1008
+ "mean_token_accuracy": 0.6024055682122708,
1009
+ "step": 62500
1010
+ },
1011
+ {
1012
+ "epoch": 0.8559957064342344,
1013
+ "grad_norm": 0.9796432852745056,
1014
+ "learning_rate": 7.279409766569745e-05,
1015
+ "loss": 1.6799,
1016
+ "mean_token_accuracy": 0.5993573130667209,
1017
+ "step": 63000
1018
+ },
1019
+ {
1020
+ "epoch": 0.8627893231519664,
1021
+ "grad_norm": 1.2059897184371948,
1022
+ "learning_rate": 6.941085355580315e-05,
1023
+ "loss": 1.6763,
1024
+ "mean_token_accuracy": 0.6016550965607166,
1025
+ "step": 63500
1026
+ },
1027
+ {
1028
+ "epoch": 0.8695829398696985,
1029
+ "grad_norm": 1.184564232826233,
1030
+ "learning_rate": 6.601402212016632e-05,
1031
+ "loss": 1.683,
1032
+ "mean_token_accuracy": 0.5992586967945099,
1033
+ "step": 64000
1034
+ },
1035
+ {
1036
+ "epoch": 0.8763765565874304,
1037
+ "grad_norm": 0.979015588760376,
1038
+ "learning_rate": 6.262398434740074e-05,
1039
+ "loss": 1.6916,
1040
+ "mean_token_accuracy": 0.6002075519263744,
1041
+ "step": 64500
1042
+ },
1043
+ {
1044
+ "epoch": 0.8831701733051625,
1045
+ "grad_norm": 1.4841409921646118,
1046
+ "learning_rate": 5.924074023750645e-05,
1047
+ "loss": 1.6815,
1048
+ "mean_token_accuracy": 0.6010229328274727,
1049
+ "step": 65000
1050
+ },
1051
+ {
1052
+ "epoch": 0.8899637900228945,
1053
+ "grad_norm": 1.0104435682296753,
1054
+ "learning_rate": 5.5843908801869616e-05,
1055
+ "loss": 1.6942,
1056
+ "mean_token_accuracy": 0.5977232045531273,
1057
+ "step": 65500
1058
+ },
1059
+ {
1060
+ "epoch": 0.8967574067406265,
1061
+ "grad_norm": 1.364670991897583,
1062
+ "learning_rate": 5.244707736623278e-05,
1063
+ "loss": 1.6862,
1064
+ "mean_token_accuracy": 0.5978881956636906,
1065
+ "step": 66000
1066
+ },
1067
+ {
1068
+ "epoch": 0.9035510234583585,
1069
+ "grad_norm": 1.085087537765503,
1070
+ "learning_rate": 4.905024593059594e-05,
1071
+ "loss": 1.6812,
1072
+ "mean_token_accuracy": 0.5997030725479126,
1073
+ "step": 66500
1074
+ },
1075
+ {
1076
+ "epoch": 0.9103446401760905,
1077
+ "grad_norm": 0.9562463164329529,
1078
+ "learning_rate": 4.567379548357292e-05,
1079
+ "loss": 1.6848,
1080
+ "mean_token_accuracy": 0.5997031436264515,
1081
+ "step": 67000
1082
+ },
1083
+ {
1084
+ "epoch": 0.9171382568938226,
1085
+ "grad_norm": 1.2848726511001587,
1086
+ "learning_rate": 4.2276964047936085e-05,
1087
+ "loss": 1.6873,
1088
+ "mean_token_accuracy": 0.599891084432602,
1089
+ "step": 67500
1090
+ },
1091
+ {
1092
+ "epoch": 0.9239318736115546,
1093
+ "grad_norm": 1.168323040008545,
1094
+ "learning_rate": 3.888013261229924e-05,
1095
+ "loss": 1.6807,
1096
+ "mean_token_accuracy": 0.599244915753603,
1097
+ "step": 68000
1098
+ },
1099
+ {
1100
+ "epoch": 0.9307254903292866,
1101
+ "grad_norm": 1.1764905452728271,
1102
+ "learning_rate": 3.548330117666241e-05,
1103
+ "loss": 1.6857,
1104
+ "mean_token_accuracy": 0.6015235537290573,
1105
+ "step": 68500
1106
+ },
1107
+ {
1108
+ "epoch": 0.9375191070470186,
1109
+ "grad_norm": 1.2431254386901855,
1110
+ "learning_rate": 3.210005706676812e-05,
1111
+ "loss": 1.6735,
1112
+ "mean_token_accuracy": 0.6007710628509522,
1113
+ "step": 69000
1114
+ },
1115
+ {
1116
+ "epoch": 0.9443127237647506,
1117
+ "grad_norm": 1.4310569763183594,
1118
+ "learning_rate": 2.870322563113128e-05,
1119
+ "loss": 1.6949,
1120
+ "mean_token_accuracy": 0.5974322560429574,
1121
+ "step": 69500
1122
+ },
1123
+ {
1124
+ "epoch": 0.9511063404824827,
1125
+ "grad_norm": 1.0426503419876099,
1126
+ "learning_rate": 2.531998152123699e-05,
1127
+ "loss": 1.6717,
1128
+ "mean_token_accuracy": 0.6013675058782101,
1129
+ "step": 70000
1130
+ },
1131
+ {
1132
+ "epoch": 0.9578999572002147,
1133
+ "grad_norm": 1.1059454679489136,
1134
+ "learning_rate": 2.192315008560015e-05,
1135
+ "loss": 1.6838,
1136
+ "mean_token_accuracy": 0.5987102429866791,
1137
+ "step": 70500
1138
+ },
1139
+ {
1140
+ "epoch": 0.9646935739179467,
1141
+ "grad_norm": 1.03636634349823,
1142
+ "learning_rate": 1.8539905975705863e-05,
1143
+ "loss": 1.6863,
1144
+ "mean_token_accuracy": 0.5992070758640766,
1145
+ "step": 71000
1146
+ },
1147
+ {
1148
+ "epoch": 0.9714871906356787,
1149
+ "grad_norm": 0.9837082028388977,
1150
+ "learning_rate": 1.5143074540069023e-05,
1151
+ "loss": 1.678,
1152
+ "mean_token_accuracy": 0.5995057925581933,
1153
+ "step": 71500
1154
+ },
1155
+ {
1156
+ "epoch": 0.9782808073534107,
1157
+ "grad_norm": 1.1581671237945557,
1158
+ "learning_rate": 1.1759830430174733e-05,
1159
+ "loss": 1.6811,
1160
+ "mean_token_accuracy": 0.6011576275527477,
1161
+ "step": 72000
1162
+ },
1163
+ {
1164
+ "epoch": 0.9850744240711428,
1165
+ "grad_norm": 0.954434335231781,
1166
+ "learning_rate": 8.362998994537894e-06,
1167
+ "loss": 1.6745,
1168
+ "mean_token_accuracy": 0.6010885013043881,
1169
+ "step": 72500
1170
+ },
1171
+ {
1172
+ "epoch": 0.9918680407888748,
1173
+ "grad_norm": 0.984001100063324,
1174
+ "learning_rate": 4.979754884643605e-06,
1175
+ "loss": 1.6786,
1176
+ "mean_token_accuracy": 0.6019023385941982,
1177
+ "step": 73000
1178
+ },
1179
+ {
1180
+ "epoch": 0.9986616575066067,
1181
+ "grad_norm": 1.2359673976898193,
1182
+ "learning_rate": 1.5829234490067665e-06,
1183
+ "loss": 1.6652,
1184
+ "mean_token_accuracy": 0.6032795180380345,
1185
+ "step": 73500
1186
+ },
1187
+ {
1188
+ "epoch": 0.9999932063832823,
1189
+ "mean_token_accuracy": 0.5990850239688036,
1190
+ "step": 73598,
1191
+ "total_flos": 9.3961326985583e+17,
1192
+ "train_loss": 1.7179553695028122,
1193
+ "train_runtime": 11634.1539,
1194
+ "train_samples_per_second": 25.304,
1195
+ "train_steps_per_second": 6.326
1196
  }
1197
  ],
1198
  "logging_steps": 500,
1199
+ "max_steps": 73598,
1200
  "num_input_tokens_seen": 0,
1201
  "num_train_epochs": 1,
1202
  "save_steps": 0,
 
1212
  "attributes": {}
1213
  }
1214
  },
1215
+ "total_flos": 9.3961326985583e+17,
1216
  "train_batch_size": 1,
1217
  "trial_name": null,
1218
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38e1f87563c53804e31a9ee530266c654ed4ae752736d5ffd386f47ff6460f57
3
  size 7032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48860954eaa763919043f93faa94628bd8ab746ef7fc8bbcc2ca2eb33563b89b
3
  size 7032