dpabonc committed on
Commit
4b7f489
·
verified ·
1 Parent(s): 864c851

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,3 +1,58 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
3
+ library_name: transformers
4
+ model_name: TinyLlama-1.1B-Chat-v1.0-sft
5
+ tags:
6
+ - generated_from_trainer
7
+ - sft
8
+ - trl
9
+ license: apache-2.0
10
+ ---
11
+
12
+ # Model Card for TinyLlama-1.1B-Chat-v1.0-sft
13
+
14
+ This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="dpabonc/TinyLlama-1.1B-Chat-v1.0-sft", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+ This model was trained with supervised fine-tuning (SFT).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.19.0
38
+ - Transformers: 4.53.0
39
+ - Pytorch: 2.7.1+cu118
40
+ - Datasets: 3.6.0
41
+ - Tokenizers: 0.21.2
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
checkpoint-1000/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5632,
14
+ "max_position_embeddings": 2048,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 22,
19
+ "num_key_value_heads": 4,
20
+ "pretraining_tp": 1,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.53.0",
27
+ "use_cache": true,
28
+ "vocab_size": 32001
29
+ }
checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "max_length": 2048,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.53.0"
7
+ }
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ce140b02e2c527fa587492251e6f0c3e30b8fb48c0a0ecaf5a32fec46a01bb7
3
+ size 4400232920
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7087ae7747d0e66edbbfb8de9ae584bec24f5cfce7205ab49f4ff0fccc92309
3
+ size 8800594203
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b5a67da83d9c0e696e53b6ca872b7aedf0300a6f1f06e4a2fda8e6850bba188
3
+ size 14645
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cf3f593de075f8621f954f65e72b4201caf184063029598ab21043538178a42
3
+ size 1465
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.1952,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.032,
14
+ "grad_norm": 4.924402713775635,
15
+ "learning_rate": 5.69620253164557e-07,
16
+ "loss": 1.003,
17
+ "mean_token_accuracy": 0.7565429553389549,
18
+ "num_tokens": 108351.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.064,
23
+ "grad_norm": 5.024679183959961,
24
+ "learning_rate": 1.2025316455696204e-06,
25
+ "loss": 0.9631,
26
+ "mean_token_accuracy": 0.7678055994212627,
27
+ "num_tokens": 220746.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.096,
32
+ "grad_norm": 3.090874671936035,
33
+ "learning_rate": 1.8354430379746838e-06,
34
+ "loss": 0.884,
35
+ "mean_token_accuracy": 0.779016162455082,
36
+ "num_tokens": 333736.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.128,
41
+ "grad_norm": 2.65234637260437,
42
+ "learning_rate": 2.4683544303797473e-06,
43
+ "loss": 0.7815,
44
+ "mean_token_accuracy": 0.8002516604959965,
45
+ "num_tokens": 449637.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.16,
50
+ "grad_norm": 2.4638168811798096,
51
+ "learning_rate": 3.10126582278481e-06,
52
+ "loss": 0.75,
53
+ "mean_token_accuracy": 0.8030893772840499,
54
+ "num_tokens": 562989.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.192,
59
+ "grad_norm": 2.374603509902954,
60
+ "learning_rate": 3.7341772151898737e-06,
61
+ "loss": 0.7152,
62
+ "mean_token_accuracy": 0.8119513258337975,
63
+ "num_tokens": 673839.0,
64
+ "step": 60
65
+ },
66
+ {
67
+ "epoch": 0.224,
68
+ "grad_norm": 2.330502510070801,
69
+ "learning_rate": 4.367088607594937e-06,
70
+ "loss": 0.6941,
71
+ "mean_token_accuracy": 0.815937215834856,
72
+ "num_tokens": 781061.0,
73
+ "step": 70
74
+ },
75
+ {
76
+ "epoch": 0.256,
77
+ "grad_norm": 2.267749309539795,
78
+ "learning_rate": 5e-06,
79
+ "loss": 0.696,
80
+ "mean_token_accuracy": 0.8140616729855538,
81
+ "num_tokens": 891312.0,
82
+ "step": 80
83
+ },
84
+ {
85
+ "epoch": 0.288,
86
+ "grad_norm": 2.236624240875244,
87
+ "learning_rate": 4.9994413292115925e-06,
88
+ "loss": 0.6568,
89
+ "mean_token_accuracy": 0.8207014128565788,
90
+ "num_tokens": 1005632.0,
91
+ "step": 90
92
+ },
93
+ {
94
+ "epoch": 0.32,
95
+ "grad_norm": 2.1301517486572266,
96
+ "learning_rate": 4.997765566536809e-06,
97
+ "loss": 0.6286,
98
+ "mean_token_accuracy": 0.8304322719573974,
99
+ "num_tokens": 1119621.0,
100
+ "step": 100
101
+ },
102
+ {
103
+ "epoch": 0.352,
104
+ "grad_norm": 2.2088050842285156,
105
+ "learning_rate": 4.9949734609353746e-06,
106
+ "loss": 0.6173,
107
+ "mean_token_accuracy": 0.8298865057528019,
108
+ "num_tokens": 1228735.0,
109
+ "step": 110
110
+ },
111
+ {
112
+ "epoch": 0.384,
113
+ "grad_norm": 2.109389305114746,
114
+ "learning_rate": 4.991066260301558e-06,
115
+ "loss": 0.6392,
116
+ "mean_token_accuracy": 0.8268058508634567,
117
+ "num_tokens": 1339064.0,
118
+ "step": 120
119
+ },
120
+ {
121
+ "epoch": 0.416,
122
+ "grad_norm": 1.9944440126419067,
123
+ "learning_rate": 4.986045710906447e-06,
124
+ "loss": 0.5951,
125
+ "mean_token_accuracy": 0.8359164826571941,
126
+ "num_tokens": 1452954.0,
127
+ "step": 130
128
+ },
129
+ {
130
+ "epoch": 0.448,
131
+ "grad_norm": 2.0711472034454346,
132
+ "learning_rate": 4.979914056617472e-06,
133
+ "loss": 0.6136,
134
+ "mean_token_accuracy": 0.8322145372629166,
135
+ "num_tokens": 1564718.0,
136
+ "step": 140
137
+ },
138
+ {
139
+ "epoch": 0.48,
140
+ "grad_norm": 2.3020684719085693,
141
+ "learning_rate": 4.972674037895542e-06,
142
+ "loss": 0.593,
143
+ "mean_token_accuracy": 0.8376348964869976,
144
+ "num_tokens": 1673953.0,
145
+ "step": 150
146
+ },
147
+ {
148
+ "epoch": 0.512,
149
+ "grad_norm": 2.002972364425659,
150
+ "learning_rate": 4.964328890570231e-06,
151
+ "loss": 0.6062,
152
+ "mean_token_accuracy": 0.8321727059781552,
153
+ "num_tokens": 1780894.0,
154
+ "step": 160
155
+ },
156
+ {
157
+ "epoch": 0.544,
158
+ "grad_norm": 2.345299482345581,
159
+ "learning_rate": 4.954882344393566e-06,
160
+ "loss": 0.6018,
161
+ "mean_token_accuracy": 0.8306759320199489,
162
+ "num_tokens": 1895107.0,
163
+ "step": 170
164
+ },
165
+ {
166
+ "epoch": 0.576,
167
+ "grad_norm": 2.180422306060791,
168
+ "learning_rate": 4.94433862137307e-06,
169
+ "loss": 0.5945,
170
+ "mean_token_accuracy": 0.834496195614338,
171
+ "num_tokens": 2004705.0,
172
+ "step": 180
173
+ },
174
+ {
175
+ "epoch": 0.608,
176
+ "grad_norm": 2.1533453464508057,
177
+ "learning_rate": 4.9327024338847836e-06,
178
+ "loss": 0.5989,
179
+ "mean_token_accuracy": 0.8349006526172161,
180
+ "num_tokens": 2117436.0,
181
+ "step": 190
182
+ },
183
+ {
184
+ "epoch": 0.64,
185
+ "grad_norm": 2.128485918045044,
186
+ "learning_rate": 4.919978982567138e-06,
187
+ "loss": 0.5582,
188
+ "mean_token_accuracy": 0.8449515514075756,
189
+ "num_tokens": 2229604.0,
190
+ "step": 200
191
+ },
192
+ {
193
+ "epoch": 0.672,
194
+ "grad_norm": 2.323518753051758,
195
+ "learning_rate": 4.906173953996596e-06,
196
+ "loss": 0.5596,
197
+ "mean_token_accuracy": 0.8436533316969872,
198
+ "num_tokens": 2343563.0,
199
+ "step": 210
200
+ },
201
+ {
202
+ "epoch": 0.704,
203
+ "grad_norm": 2.2156243324279785,
204
+ "learning_rate": 4.891293518146112e-06,
205
+ "loss": 0.5855,
206
+ "mean_token_accuracy": 0.8377246461808682,
207
+ "num_tokens": 2453154.0,
208
+ "step": 220
209
+ },
210
+ {
211
+ "epoch": 0.736,
212
+ "grad_norm": 2.2261300086975098,
213
+ "learning_rate": 4.875344325627551e-06,
214
+ "loss": 0.5716,
215
+ "mean_token_accuracy": 0.8415978990495205,
216
+ "num_tokens": 2566369.0,
217
+ "step": 230
218
+ },
219
+ {
220
+ "epoch": 0.768,
221
+ "grad_norm": 2.0974161624908447,
222
+ "learning_rate": 4.85833350471928e-06,
223
+ "loss": 0.5628,
224
+ "mean_token_accuracy": 0.8448635570704937,
225
+ "num_tokens": 2674001.0,
226
+ "step": 240
227
+ },
228
+ {
229
+ "epoch": 0.8,
230
+ "grad_norm": 2.181368112564087,
231
+ "learning_rate": 4.840268658180281e-06,
232
+ "loss": 0.5575,
233
+ "mean_token_accuracy": 0.8437174156308174,
234
+ "num_tokens": 2785131.0,
235
+ "step": 250
236
+ },
237
+ {
238
+ "epoch": 0.832,
239
+ "grad_norm": 1.9916293621063232,
240
+ "learning_rate": 4.821157859852201e-06,
241
+ "loss": 0.5496,
242
+ "mean_token_accuracy": 0.8463803254067898,
243
+ "num_tokens": 2898708.0,
244
+ "step": 260
245
+ },
246
+ {
247
+ "epoch": 0.864,
248
+ "grad_norm": 1.9616676568984985,
249
+ "learning_rate": 4.801009651050856e-06,
250
+ "loss": 0.5599,
251
+ "mean_token_accuracy": 0.8417543187737465,
252
+ "num_tokens": 3008577.0,
253
+ "step": 270
254
+ },
255
+ {
256
+ "epoch": 0.896,
257
+ "grad_norm": 2.2142021656036377,
258
+ "learning_rate": 4.779833036748801e-06,
259
+ "loss": 0.5268,
260
+ "mean_token_accuracy": 0.8514743112027645,
261
+ "num_tokens": 3120102.0,
262
+ "step": 280
263
+ },
264
+ {
265
+ "epoch": 0.928,
266
+ "grad_norm": 1.8409771919250488,
267
+ "learning_rate": 4.757637481550683e-06,
268
+ "loss": 0.5443,
269
+ "mean_token_accuracy": 0.8460176154971123,
270
+ "num_tokens": 3234456.0,
271
+ "step": 290
272
+ },
273
+ {
274
+ "epoch": 0.96,
275
+ "grad_norm": 1.9530022144317627,
276
+ "learning_rate": 4.73443290546316e-06,
277
+ "loss": 0.5487,
278
+ "mean_token_accuracy": 0.8462878614664078,
279
+ "num_tokens": 3348821.0,
280
+ "step": 300
281
+ },
282
+ {
283
+ "epoch": 0.992,
284
+ "grad_norm": 1.9719116687774658,
285
+ "learning_rate": 4.7102296794612865e-06,
286
+ "loss": 0.5229,
287
+ "mean_token_accuracy": 0.8531442761421204,
288
+ "num_tokens": 3455469.0,
289
+ "step": 310
290
+ },
291
+ {
292
+ "epoch": 1.0224,
293
+ "grad_norm": 1.9529277086257935,
294
+ "learning_rate": 4.6850386208533424e-06,
295
+ "loss": 0.4849,
296
+ "mean_token_accuracy": 0.8625573249239671,
297
+ "num_tokens": 3561864.0,
298
+ "step": 320
299
+ },
300
+ {
301
+ "epoch": 1.0544,
302
+ "grad_norm": 2.105419635772705,
303
+ "learning_rate": 4.658870988446189e-06,
304
+ "loss": 0.4819,
305
+ "mean_token_accuracy": 0.864888047426939,
306
+ "num_tokens": 3672405.0,
307
+ "step": 330
308
+ },
309
+ {
310
+ "epoch": 1.0864,
311
+ "grad_norm": 2.035449504852295,
312
+ "learning_rate": 4.631738477513286e-06,
313
+ "loss": 0.4735,
314
+ "mean_token_accuracy": 0.8661263197660446,
315
+ "num_tokens": 3780404.0,
316
+ "step": 340
317
+ },
318
+ {
319
+ "epoch": 1.1184,
320
+ "grad_norm": 2.1748783588409424,
321
+ "learning_rate": 4.603653214567654e-06,
322
+ "loss": 0.4638,
323
+ "mean_token_accuracy": 0.8661996729671955,
324
+ "num_tokens": 3894359.0,
325
+ "step": 350
326
+ },
327
+ {
328
+ "epoch": 1.1504,
329
+ "grad_norm": 2.0794739723205566,
330
+ "learning_rate": 4.574627751942087e-06,
331
+ "loss": 0.4683,
332
+ "mean_token_accuracy": 0.8664901047945023,
333
+ "num_tokens": 4007235.0,
334
+ "step": 360
335
+ },
336
+ {
337
+ "epoch": 1.1824,
338
+ "grad_norm": 2.2591140270233154,
339
+ "learning_rate": 4.544675062179056e-06,
340
+ "loss": 0.4616,
341
+ "mean_token_accuracy": 0.8676725938916207,
342
+ "num_tokens": 4116127.0,
343
+ "step": 370
344
+ },
345
+ {
346
+ "epoch": 1.2144,
347
+ "grad_norm": 2.111358165740967,
348
+ "learning_rate": 4.513808532232804e-06,
349
+ "loss": 0.4742,
350
+ "mean_token_accuracy": 0.8660650610923767,
351
+ "num_tokens": 4228679.0,
352
+ "step": 380
353
+ },
354
+ {
355
+ "epoch": 1.2464,
356
+ "grad_norm": 2.115079402923584,
357
+ "learning_rate": 4.482041957486229e-06,
358
+ "loss": 0.4679,
359
+ "mean_token_accuracy": 0.8653517335653305,
360
+ "num_tokens": 4341814.0,
361
+ "step": 390
362
+ },
363
+ {
364
+ "epoch": 1.2784,
365
+ "grad_norm": 2.15097713470459,
366
+ "learning_rate": 4.4493895355852176e-06,
367
+ "loss": 0.448,
368
+ "mean_token_accuracy": 0.8716798067092896,
369
+ "num_tokens": 4451355.0,
370
+ "step": 400
371
+ },
372
+ {
373
+ "epoch": 1.3104,
374
+ "grad_norm": 2.124791383743286,
375
+ "learning_rate": 4.415865860093199e-06,
376
+ "loss": 0.4576,
377
+ "mean_token_accuracy": 0.8684222124516964,
378
+ "num_tokens": 4566924.0,
379
+ "step": 410
380
+ },
381
+ {
382
+ "epoch": 1.3424,
383
+ "grad_norm": 2.033034563064575,
384
+ "learning_rate": 4.381485913968747e-06,
385
+ "loss": 0.4558,
386
+ "mean_token_accuracy": 0.8672670938074589,
387
+ "num_tokens": 4677684.0,
388
+ "step": 420
389
+ },
390
+ {
391
+ "epoch": 1.3744,
392
+ "grad_norm": 2.179928779602051,
393
+ "learning_rate": 4.346265062869147e-06,
394
+ "loss": 0.468,
395
+ "mean_token_accuracy": 0.8666120313107968,
396
+ "num_tokens": 4787468.0,
397
+ "step": 430
398
+ },
399
+ {
400
+ "epoch": 1.4064,
401
+ "grad_norm": 2.1226675510406494,
402
+ "learning_rate": 4.310219048282921e-06,
403
+ "loss": 0.4514,
404
+ "mean_token_accuracy": 0.8693858951330184,
405
+ "num_tokens": 4899692.0,
406
+ "step": 440
407
+ },
408
+ {
409
+ "epoch": 1.4384000000000001,
410
+ "grad_norm": 2.226494550704956,
411
+ "learning_rate": 4.273363980494379e-06,
412
+ "loss": 0.4761,
413
+ "mean_token_accuracy": 0.8673883900046349,
414
+ "num_tokens": 5008859.0,
415
+ "step": 450
416
+ },
417
+ {
418
+ "epoch": 1.4704,
419
+ "grad_norm": 2.039381980895996,
420
+ "learning_rate": 4.235716331383343e-06,
421
+ "loss": 0.4652,
422
+ "mean_token_accuracy": 0.8683492943644524,
423
+ "num_tokens": 5117719.0,
424
+ "step": 460
425
+ },
426
+ {
427
+ "epoch": 1.5024,
428
+ "grad_norm": 2.1596784591674805,
429
+ "learning_rate": 4.197292927063263e-06,
430
+ "loss": 0.4701,
431
+ "mean_token_accuracy": 0.863686279207468,
432
+ "num_tokens": 5227041.0,
433
+ "step": 470
434
+ },
435
+ {
436
+ "epoch": 1.5344,
437
+ "grad_norm": 2.153256893157959,
438
+ "learning_rate": 4.158110940361007e-06,
439
+ "loss": 0.4559,
440
+ "mean_token_accuracy": 0.8705666847527027,
441
+ "num_tokens": 5334777.0,
442
+ "step": 480
443
+ },
444
+ {
445
+ "epoch": 1.5664,
446
+ "grad_norm": 2.5348920822143555,
447
+ "learning_rate": 4.118187883141694e-06,
448
+ "loss": 0.461,
449
+ "mean_token_accuracy": 0.8692857407033443,
450
+ "num_tokens": 5441237.0,
451
+ "step": 490
452
+ },
453
+ {
454
+ "epoch": 1.5984,
455
+ "grad_norm": 2.2534334659576416,
456
+ "learning_rate": 4.077541598482009e-06,
457
+ "loss": 0.4748,
458
+ "mean_token_accuracy": 0.8672291107475758,
459
+ "num_tokens": 5553620.0,
460
+ "step": 500
461
+ },
462
+ {
463
+ "epoch": 1.6303999999999998,
464
+ "grad_norm": 2.1640467643737793,
465
+ "learning_rate": 4.036190252695467e-06,
466
+ "loss": 0.4874,
467
+ "mean_token_accuracy": 0.8607077576220036,
468
+ "num_tokens": 5663373.0,
469
+ "step": 510
470
+ },
471
+ {
472
+ "epoch": 1.6623999999999999,
473
+ "grad_norm": 2.1866941452026367,
474
+ "learning_rate": 3.994152327213232e-06,
475
+ "loss": 0.4545,
476
+ "mean_token_accuracy": 0.8709075525403023,
477
+ "num_tokens": 5776290.0,
478
+ "step": 520
479
+ },
480
+ {
481
+ "epoch": 1.6944,
482
+ "grad_norm": 2.4210782051086426,
483
+ "learning_rate": 3.95144661032408e-06,
484
+ "loss": 0.4316,
485
+ "mean_token_accuracy": 0.8738773807883262,
486
+ "num_tokens": 5882282.0,
487
+ "step": 530
488
+ },
489
+ {
490
+ "epoch": 1.7264,
491
+ "grad_norm": 2.2260217666625977,
492
+ "learning_rate": 3.9080921887772314e-06,
493
+ "loss": 0.4497,
494
+ "mean_token_accuracy": 0.8696616031229496,
495
+ "num_tokens": 5999645.0,
496
+ "step": 540
497
+ },
498
+ {
499
+ "epoch": 1.7584,
500
+ "grad_norm": 2.1372995376586914,
501
+ "learning_rate": 3.864108439251779e-06,
502
+ "loss": 0.4466,
503
+ "mean_token_accuracy": 0.8693803034722805,
504
+ "num_tokens": 6108677.0,
505
+ "step": 550
506
+ },
507
+ {
508
+ "epoch": 1.7904,
509
+ "grad_norm": 2.0875401496887207,
510
+ "learning_rate": 3.8195150196965414e-06,
511
+ "loss": 0.4618,
512
+ "mean_token_accuracy": 0.866703423857689,
513
+ "num_tokens": 6222393.0,
514
+ "step": 560
515
+ },
516
+ {
517
+ "epoch": 1.8224,
518
+ "grad_norm": 2.06887149810791,
519
+ "learning_rate": 3.7743318605442093e-06,
520
+ "loss": 0.4593,
521
+ "mean_token_accuracy": 0.8678097896277904,
522
+ "num_tokens": 6337550.0,
523
+ "step": 570
524
+ },
525
+ {
526
+ "epoch": 1.8544,
527
+ "grad_norm": 1.9830808639526367,
528
+ "learning_rate": 3.728579155803697e-06,
529
+ "loss": 0.4577,
530
+ "mean_token_accuracy": 0.8675460621714592,
531
+ "num_tokens": 6449551.0,
532
+ "step": 580
533
+ },
534
+ {
535
+ "epoch": 1.8864,
536
+ "grad_norm": 2.165144443511963,
537
+ "learning_rate": 3.6822773540347107e-06,
538
+ "loss": 0.439,
539
+ "mean_token_accuracy": 0.8739401362836361,
540
+ "num_tokens": 6562113.0,
541
+ "step": 590
542
+ },
543
+ {
544
+ "epoch": 1.9184,
545
+ "grad_norm": 2.1738622188568115,
546
+ "learning_rate": 3.635447149208528e-06,
547
+ "loss": 0.4412,
548
+ "mean_token_accuracy": 0.8708230577409267,
549
+ "num_tokens": 6674860.0,
550
+ "step": 600
551
+ },
552
+ {
553
+ "epoch": 1.9504000000000001,
554
+ "grad_norm": 2.4379212856292725,
555
+ "learning_rate": 3.5881094714591102e-06,
556
+ "loss": 0.4522,
557
+ "mean_token_accuracy": 0.8691440485417843,
558
+ "num_tokens": 6789358.0,
559
+ "step": 610
560
+ },
561
+ {
562
+ "epoch": 1.9824000000000002,
563
+ "grad_norm": 2.1988134384155273,
564
+ "learning_rate": 3.540285477728658e-06,
565
+ "loss": 0.4351,
566
+ "mean_token_accuracy": 0.8745189763605594,
567
+ "num_tokens": 6899897.0,
568
+ "step": 620
569
+ },
570
+ {
571
+ "epoch": 2.0128,
572
+ "grad_norm": 2.226576089859009,
573
+ "learning_rate": 3.491996542311795e-06,
574
+ "loss": 0.4196,
575
+ "mean_token_accuracy": 0.8793611534331974,
576
+ "num_tokens": 7005635.0,
577
+ "step": 630
578
+ },
579
+ {
580
+ "epoch": 2.0448,
581
+ "grad_norm": 2.284040927886963,
582
+ "learning_rate": 3.44326424730262e-06,
583
+ "loss": 0.3729,
584
+ "mean_token_accuracy": 0.8900357685983181,
585
+ "num_tokens": 7118882.0,
586
+ "step": 640
587
+ },
588
+ {
589
+ "epoch": 2.0768,
590
+ "grad_norm": 2.3776021003723145,
591
+ "learning_rate": 3.394110372948871e-06,
592
+ "loss": 0.3784,
593
+ "mean_token_accuracy": 0.8912972524762154,
594
+ "num_tokens": 7229223.0,
595
+ "step": 650
596
+ },
597
+ {
598
+ "epoch": 2.1088,
599
+ "grad_norm": 2.3752377033233643,
600
+ "learning_rate": 3.344556887917539e-06,
601
+ "loss": 0.3754,
602
+ "mean_token_accuracy": 0.8933634988963604,
603
+ "num_tokens": 7344352.0,
604
+ "step": 660
605
+ },
606
+ {
607
+ "epoch": 2.1408,
608
+ "grad_norm": 2.2204787731170654,
609
+ "learning_rate": 3.2946259394762645e-06,
610
+ "loss": 0.3603,
611
+ "mean_token_accuracy": 0.8949612870812416,
612
+ "num_tokens": 7452978.0,
613
+ "step": 670
614
+ },
615
+ {
616
+ "epoch": 2.1728,
617
+ "grad_norm": 2.3392176628112793,
618
+ "learning_rate": 3.244339843594913e-06,
619
+ "loss": 0.3633,
620
+ "mean_token_accuracy": 0.8934120312333107,
621
+ "num_tokens": 7562987.0,
622
+ "step": 680
623
+ },
624
+ {
625
+ "epoch": 2.2048,
626
+ "grad_norm": 2.548868417739868,
627
+ "learning_rate": 3.19372107497175e-06,
628
+ "loss": 0.3688,
629
+ "mean_token_accuracy": 0.8925295993685722,
630
+ "num_tokens": 7672018.0,
631
+ "step": 690
632
+ },
633
+ {
634
+ "epoch": 2.2368,
635
+ "grad_norm": 2.524747371673584,
636
+ "learning_rate": 3.1427922569886756e-06,
637
+ "loss": 0.3535,
638
+ "mean_token_accuracy": 0.8959250211715698,
639
+ "num_tokens": 7778694.0,
640
+ "step": 700
641
+ },
642
+ {
643
+ "epoch": 2.2688,
644
+ "grad_norm": 2.2159647941589355,
645
+ "learning_rate": 3.0915761516000053e-06,
646
+ "loss": 0.3645,
647
+ "mean_token_accuracy": 0.8916135348379612,
648
+ "num_tokens": 7889175.0,
649
+ "step": 710
650
+ },
651
+ {
652
+ "epoch": 2.3008,
653
+ "grad_norm": 2.6435368061065674,
654
+ "learning_rate": 3.0400956491593215e-06,
655
+ "loss": 0.3742,
656
+ "mean_token_accuracy": 0.8912425316870213,
657
+ "num_tokens": 7997644.0,
658
+ "step": 720
659
+ },
660
+ {
661
+ "epoch": 2.3327999999999998,
662
+ "grad_norm": 2.5125110149383545,
663
+ "learning_rate": 2.9883737581889315e-06,
664
+ "loss": 0.3767,
665
+ "mean_token_accuracy": 0.8903645075857639,
666
+ "num_tokens": 8109860.0,
667
+ "step": 730
668
+ },
669
+ {
670
+ "epoch": 2.3648,
671
+ "grad_norm": 2.482232093811035,
672
+ "learning_rate": 2.9364335950965227e-06,
673
+ "loss": 0.3531,
674
+ "mean_token_accuracy": 0.8950154200196266,
675
+ "num_tokens": 8219541.0,
676
+ "step": 740
677
+ },
678
+ {
679
+ "epoch": 2.3968,
680
+ "grad_norm": 2.360308885574341,
681
+ "learning_rate": 2.8842983738435855e-06,
682
+ "loss": 0.3602,
683
+ "mean_token_accuracy": 0.892794082313776,
684
+ "num_tokens": 8329765.0,
685
+ "step": 750
686
+ },
687
+ {
688
+ "epoch": 2.4288,
689
+ "grad_norm": 2.5826432704925537,
690
+ "learning_rate": 2.831991395570249e-06,
691
+ "loss": 0.364,
692
+ "mean_token_accuracy": 0.8930977649986744,
693
+ "num_tokens": 8443917.0,
694
+ "step": 760
695
+ },
696
+ {
697
+ "epoch": 2.4608,
698
+ "grad_norm": 2.588911771774292,
699
+ "learning_rate": 2.779536038181146e-06,
700
+ "loss": 0.3771,
701
+ "mean_token_accuracy": 0.8915269665420056,
702
+ "num_tokens": 8550550.0,
703
+ "step": 770
704
+ },
705
+ {
706
+ "epoch": 2.4928,
707
+ "grad_norm": 2.4379560947418213,
708
+ "learning_rate": 2.726955745896972e-06,
709
+ "loss": 0.3726,
710
+ "mean_token_accuracy": 0.8931849867105484,
711
+ "num_tokens": 8659752.0,
712
+ "step": 780
713
+ },
714
+ {
715
+ "epoch": 2.5248,
716
+ "grad_norm": 2.285378932952881,
717
+ "learning_rate": 2.6742740187764026e-06,
718
+ "loss": 0.3644,
719
+ "mean_token_accuracy": 0.8937488354742527,
720
+ "num_tokens": 8775392.0,
721
+ "step": 790
722
+ },
723
+ {
724
+ "epoch": 2.5568,
725
+ "grad_norm": 2.6339035034179688,
726
+ "learning_rate": 2.621514402213059e-06,
727
+ "loss": 0.3528,
728
+ "mean_token_accuracy": 0.8967724144458771,
729
+ "num_tokens": 8888499.0,
730
+ "step": 800
731
+ },
732
+ {
733
+ "epoch": 2.5888,
734
+ "grad_norm": 2.2561960220336914,
735
+ "learning_rate": 2.568700476412204e-06,
736
+ "loss": 0.3846,
737
+ "mean_token_accuracy": 0.8879048563539982,
738
+ "num_tokens": 9001912.0,
739
+ "step": 810
740
+ },
741
+ {
742
+ "epoch": 2.6208,
743
+ "grad_norm": 2.3487298488616943,
744
+ "learning_rate": 2.515855845851894e-06,
745
+ "loss": 0.3602,
746
+ "mean_token_accuracy": 0.8923349373042584,
747
+ "num_tokens": 9113468.0,
748
+ "step": 820
749
+ },
750
+ {
751
+ "epoch": 2.6528,
752
+ "grad_norm": 2.4302189350128174,
753
+ "learning_rate": 2.46300412873326e-06,
754
+ "loss": 0.3613,
755
+ "mean_token_accuracy": 0.8928801603615284,
756
+ "num_tokens": 9229335.0,
757
+ "step": 830
758
+ },
759
+ {
760
+ "epoch": 2.6848,
761
+ "grad_norm": 2.48608136177063,
762
+ "learning_rate": 2.4101689464246816e-06,
763
+ "loss": 0.3642,
764
+ "mean_token_accuracy": 0.8935355007648468,
765
+ "num_tokens": 9344595.0,
766
+ "step": 840
767
+ },
768
+ {
769
+ "epoch": 2.7168,
770
+ "grad_norm": 2.357041120529175,
771
+ "learning_rate": 2.3573739129045227e-06,
772
+ "loss": 0.3579,
773
+ "mean_token_accuracy": 0.8953744530677795,
774
+ "num_tokens": 9455015.0,
775
+ "step": 850
776
+ },
777
+ {
778
+ "epoch": 2.7488,
779
+ "grad_norm": 2.649960517883301,
780
+ "learning_rate": 2.3046426242071834e-06,
781
+ "loss": 0.3689,
782
+ "mean_token_accuracy": 0.8912502139806747,
783
+ "num_tokens": 9560806.0,
784
+ "step": 860
785
+ },
786
+ {
787
+ "epoch": 2.7808,
788
+ "grad_norm": 2.3866405487060547,
789
+ "learning_rate": 2.251998647877169e-06,
790
+ "loss": 0.364,
791
+ "mean_token_accuracy": 0.8922963574528694,
792
+ "num_tokens": 9673310.0,
793
+ "step": 870
794
+ },
795
+ {
796
+ "epoch": 2.8128,
797
+ "grad_norm": 2.3306539058685303,
798
+ "learning_rate": 2.1994655124358875e-06,
799
+ "loss": 0.347,
800
+ "mean_token_accuracy": 0.8968791022896767,
801
+ "num_tokens": 9787134.0,
802
+ "step": 880
803
+ },
804
+ {
805
+ "epoch": 2.8448,
806
+ "grad_norm": 2.3708155155181885,
807
+ "learning_rate": 2.1470666968658956e-06,
808
+ "loss": 0.3576,
809
+ "mean_token_accuracy": 0.8956275820732117,
810
+ "num_tokens": 9901311.0,
811
+ "step": 890
812
+ },
813
+ {
814
+ "epoch": 2.8768000000000002,
815
+ "grad_norm": 2.466033935546875,
816
+ "learning_rate": 2.0948256201172772e-06,
817
+ "loss": 0.3495,
818
+ "mean_token_accuracy": 0.8957146309316159,
819
+ "num_tokens": 10017025.0,
820
+ "step": 900
821
+ },
822
+ {
823
+ "epoch": 2.9088000000000003,
824
+ "grad_norm": 2.630037784576416,
825
+ "learning_rate": 2.0427656306408594e-06,
826
+ "loss": 0.3643,
827
+ "mean_token_accuracy": 0.892961747199297,
828
+ "num_tokens": 10131105.0,
829
+ "step": 910
830
+ },
831
+ {
832
+ "epoch": 2.9408,
833
+ "grad_norm": 2.4972527027130127,
834
+ "learning_rate": 1.990909995952936e-06,
835
+ "loss": 0.3542,
836
+ "mean_token_accuracy": 0.8959409989416599,
837
+ "num_tokens": 10240555.0,
838
+ "step": 920
839
+ },
840
+ {
841
+ "epoch": 2.9728,
842
+ "grad_norm": 2.722083330154419,
843
+ "learning_rate": 1.939281892236157e-06,
844
+ "loss": 0.3531,
845
+ "mean_token_accuracy": 0.8945782586932183,
846
+ "num_tokens": 10347889.0,
847
+ "step": 930
848
+ },
849
+ {
850
+ "epoch": 3.0032,
851
+ "grad_norm": 2.2683236598968506,
852
+ "learning_rate": 1.8879043939812503e-06,
853
+ "loss": 0.3579,
854
+ "mean_token_accuracy": 0.8970388119157992,
855
+ "num_tokens": 10453648.0,
856
+ "step": 940
857
+ },
858
+ {
859
+ "epoch": 3.0352,
860
+ "grad_norm": 2.5510222911834717,
861
+ "learning_rate": 1.8368004636741794e-06,
862
+ "loss": 0.3062,
863
+ "mean_token_accuracy": 0.9120431691408157,
864
+ "num_tokens": 10565156.0,
865
+ "step": 950
866
+ },
867
+ {
868
+ "epoch": 3.0672,
869
+ "grad_norm": 2.730621337890625,
870
+ "learning_rate": 1.7859929415333725e-06,
871
+ "loss": 0.2912,
872
+ "mean_token_accuracy": 0.9148132555186749,
873
+ "num_tokens": 10676677.0,
874
+ "step": 960
875
+ },
876
+ {
877
+ "epoch": 3.0992,
878
+ "grad_norm": 2.5102102756500244,
879
+ "learning_rate": 1.735504535301592e-06,
880
+ "loss": 0.2882,
881
+ "mean_token_accuracy": 0.9159041397273541,
882
+ "num_tokens": 10787468.0,
883
+ "step": 970
884
+ },
885
+ {
886
+ "epoch": 3.1312,
887
+ "grad_norm": 2.3301403522491455,
888
+ "learning_rate": 1.6853578100970086e-06,
889
+ "loss": 0.3016,
890
+ "mean_token_accuracy": 0.911161607503891,
891
+ "num_tokens": 10897279.0,
892
+ "step": 980
893
+ },
894
+ {
895
+ "epoch": 3.1632,
896
+ "grad_norm": 2.4964194297790527,
897
+ "learning_rate": 1.6355751783280283e-06,
898
+ "loss": 0.2927,
899
+ "mean_token_accuracy": 0.9142641559243202,
900
+ "num_tokens": 11008731.0,
901
+ "step": 990
902
+ },
903
+ {
904
+ "epoch": 3.1952,
905
+ "grad_norm": 2.547372341156006,
906
+ "learning_rate": 1.5861788896763625e-06,
907
+ "loss": 0.2991,
908
+ "mean_token_accuracy": 0.9131025090813637,
909
+ "num_tokens": 11123820.0,
910
+ "step": 1000
911
+ }
912
+ ],
913
+ "logging_steps": 10,
914
+ "max_steps": 1565,
915
+ "num_input_tokens_seen": 0,
916
+ "num_train_epochs": 5,
917
+ "save_steps": 500,
918
+ "stateful_callbacks": {
919
+ "TrainerControl": {
920
+ "args": {
921
+ "should_epoch_stop": false,
922
+ "should_evaluate": false,
923
+ "should_log": false,
924
+ "should_save": true,
925
+ "should_training_stop": false
926
+ },
927
+ "attributes": {}
928
+ }
929
+ },
930
+ "total_flos": 1.0242157166877082e+17,
931
+ "train_batch_size": 4,
932
+ "trial_name": null,
933
+ "trial_params": null
934
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:805391ab265a423fa4bf8e6272db7bca4771272624226f8273e5a50efd4ae604
3
+ size 6161
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5632,
14
+ "max_position_embeddings": 2048,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 22,
19
+ "num_key_value_heads": 4,
20
+ "pretraining_tp": 1,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.53.0",
27
+ "use_cache": true,
28
+ "vocab_size": 32001
29
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "max_length": 2048,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.53.0"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ce140b02e2c527fa587492251e6f0c3e30b8fb48c0a0ecaf5a32fec46a01bb7
3
+ size 4400232920
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7087ae7747d0e66edbbfb8de9ae584bec24f5cfce7205ab49f4ff0fccc92309
3
+ size 8800594203
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b5a67da83d9c0e696e53b6ca872b7aedf0300a6f1f06e4a2fda8e6850bba188
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cf3f593de075f8621f954f65e72b4201caf184063029598ab21043538178a42
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
trainer_state.json ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.1952,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.032,
14
+ "grad_norm": 4.924402713775635,
15
+ "learning_rate": 5.69620253164557e-07,
16
+ "loss": 1.003,
17
+ "mean_token_accuracy": 0.7565429553389549,
18
+ "num_tokens": 108351.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.064,
23
+ "grad_norm": 5.024679183959961,
24
+ "learning_rate": 1.2025316455696204e-06,
25
+ "loss": 0.9631,
26
+ "mean_token_accuracy": 0.7678055994212627,
27
+ "num_tokens": 220746.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.096,
32
+ "grad_norm": 3.090874671936035,
33
+ "learning_rate": 1.8354430379746838e-06,
34
+ "loss": 0.884,
35
+ "mean_token_accuracy": 0.779016162455082,
36
+ "num_tokens": 333736.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.128,
41
+ "grad_norm": 2.65234637260437,
42
+ "learning_rate": 2.4683544303797473e-06,
43
+ "loss": 0.7815,
44
+ "mean_token_accuracy": 0.8002516604959965,
45
+ "num_tokens": 449637.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.16,
50
+ "grad_norm": 2.4638168811798096,
51
+ "learning_rate": 3.10126582278481e-06,
52
+ "loss": 0.75,
53
+ "mean_token_accuracy": 0.8030893772840499,
54
+ "num_tokens": 562989.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.192,
59
+ "grad_norm": 2.374603509902954,
60
+ "learning_rate": 3.7341772151898737e-06,
61
+ "loss": 0.7152,
62
+ "mean_token_accuracy": 0.8119513258337975,
63
+ "num_tokens": 673839.0,
64
+ "step": 60
65
+ },
66
+ {
67
+ "epoch": 0.224,
68
+ "grad_norm": 2.330502510070801,
69
+ "learning_rate": 4.367088607594937e-06,
70
+ "loss": 0.6941,
71
+ "mean_token_accuracy": 0.815937215834856,
72
+ "num_tokens": 781061.0,
73
+ "step": 70
74
+ },
75
+ {
76
+ "epoch": 0.256,
77
+ "grad_norm": 2.267749309539795,
78
+ "learning_rate": 5e-06,
79
+ "loss": 0.696,
80
+ "mean_token_accuracy": 0.8140616729855538,
81
+ "num_tokens": 891312.0,
82
+ "step": 80
83
+ },
84
+ {
85
+ "epoch": 0.288,
86
+ "grad_norm": 2.236624240875244,
87
+ "learning_rate": 4.9994413292115925e-06,
88
+ "loss": 0.6568,
89
+ "mean_token_accuracy": 0.8207014128565788,
90
+ "num_tokens": 1005632.0,
91
+ "step": 90
92
+ },
93
+ {
94
+ "epoch": 0.32,
95
+ "grad_norm": 2.1301517486572266,
96
+ "learning_rate": 4.997765566536809e-06,
97
+ "loss": 0.6286,
98
+ "mean_token_accuracy": 0.8304322719573974,
99
+ "num_tokens": 1119621.0,
100
+ "step": 100
101
+ },
102
+ {
103
+ "epoch": 0.352,
104
+ "grad_norm": 2.2088050842285156,
105
+ "learning_rate": 4.9949734609353746e-06,
106
+ "loss": 0.6173,
107
+ "mean_token_accuracy": 0.8298865057528019,
108
+ "num_tokens": 1228735.0,
109
+ "step": 110
110
+ },
111
+ {
112
+ "epoch": 0.384,
113
+ "grad_norm": 2.109389305114746,
114
+ "learning_rate": 4.991066260301558e-06,
115
+ "loss": 0.6392,
116
+ "mean_token_accuracy": 0.8268058508634567,
117
+ "num_tokens": 1339064.0,
118
+ "step": 120
119
+ },
120
+ {
121
+ "epoch": 0.416,
122
+ "grad_norm": 1.9944440126419067,
123
+ "learning_rate": 4.986045710906447e-06,
124
+ "loss": 0.5951,
125
+ "mean_token_accuracy": 0.8359164826571941,
126
+ "num_tokens": 1452954.0,
127
+ "step": 130
128
+ },
129
+ {
130
+ "epoch": 0.448,
131
+ "grad_norm": 2.0711472034454346,
132
+ "learning_rate": 4.979914056617472e-06,
133
+ "loss": 0.6136,
134
+ "mean_token_accuracy": 0.8322145372629166,
135
+ "num_tokens": 1564718.0,
136
+ "step": 140
137
+ },
138
+ {
139
+ "epoch": 0.48,
140
+ "grad_norm": 2.3020684719085693,
141
+ "learning_rate": 4.972674037895542e-06,
142
+ "loss": 0.593,
143
+ "mean_token_accuracy": 0.8376348964869976,
144
+ "num_tokens": 1673953.0,
145
+ "step": 150
146
+ },
147
+ {
148
+ "epoch": 0.512,
149
+ "grad_norm": 2.002972364425659,
150
+ "learning_rate": 4.964328890570231e-06,
151
+ "loss": 0.6062,
152
+ "mean_token_accuracy": 0.8321727059781552,
153
+ "num_tokens": 1780894.0,
154
+ "step": 160
155
+ },
156
+ {
157
+ "epoch": 0.544,
158
+ "grad_norm": 2.345299482345581,
159
+ "learning_rate": 4.954882344393566e-06,
160
+ "loss": 0.6018,
161
+ "mean_token_accuracy": 0.8306759320199489,
162
+ "num_tokens": 1895107.0,
163
+ "step": 170
164
+ },
165
+ {
166
+ "epoch": 0.576,
167
+ "grad_norm": 2.180422306060791,
168
+ "learning_rate": 4.94433862137307e-06,
169
+ "loss": 0.5945,
170
+ "mean_token_accuracy": 0.834496195614338,
171
+ "num_tokens": 2004705.0,
172
+ "step": 180
173
+ },
174
+ {
175
+ "epoch": 0.608,
176
+ "grad_norm": 2.1533453464508057,
177
+ "learning_rate": 4.9327024338847836e-06,
178
+ "loss": 0.5989,
179
+ "mean_token_accuracy": 0.8349006526172161,
180
+ "num_tokens": 2117436.0,
181
+ "step": 190
182
+ },
183
+ {
184
+ "epoch": 0.64,
185
+ "grad_norm": 2.128485918045044,
186
+ "learning_rate": 4.919978982567138e-06,
187
+ "loss": 0.5582,
188
+ "mean_token_accuracy": 0.8449515514075756,
189
+ "num_tokens": 2229604.0,
190
+ "step": 200
191
+ },
192
+ {
193
+ "epoch": 0.672,
194
+ "grad_norm": 2.323518753051758,
195
+ "learning_rate": 4.906173953996596e-06,
196
+ "loss": 0.5596,
197
+ "mean_token_accuracy": 0.8436533316969872,
198
+ "num_tokens": 2343563.0,
199
+ "step": 210
200
+ },
201
+ {
202
+ "epoch": 0.704,
203
+ "grad_norm": 2.2156243324279785,
204
+ "learning_rate": 4.891293518146112e-06,
205
+ "loss": 0.5855,
206
+ "mean_token_accuracy": 0.8377246461808682,
207
+ "num_tokens": 2453154.0,
208
+ "step": 220
209
+ },
210
+ {
211
+ "epoch": 0.736,
212
+ "grad_norm": 2.2261300086975098,
213
+ "learning_rate": 4.875344325627551e-06,
214
+ "loss": 0.5716,
215
+ "mean_token_accuracy": 0.8415978990495205,
216
+ "num_tokens": 2566369.0,
217
+ "step": 230
218
+ },
219
+ {
220
+ "epoch": 0.768,
221
+ "grad_norm": 2.0974161624908447,
222
+ "learning_rate": 4.85833350471928e-06,
223
+ "loss": 0.5628,
224
+ "mean_token_accuracy": 0.8448635570704937,
225
+ "num_tokens": 2674001.0,
226
+ "step": 240
227
+ },
228
+ {
229
+ "epoch": 0.8,
230
+ "grad_norm": 2.181368112564087,
231
+ "learning_rate": 4.840268658180281e-06,
232
+ "loss": 0.5575,
233
+ "mean_token_accuracy": 0.8437174156308174,
234
+ "num_tokens": 2785131.0,
235
+ "step": 250
236
+ },
237
+ {
238
+ "epoch": 0.832,
239
+ "grad_norm": 1.9916293621063232,
240
+ "learning_rate": 4.821157859852201e-06,
241
+ "loss": 0.5496,
242
+ "mean_token_accuracy": 0.8463803254067898,
243
+ "num_tokens": 2898708.0,
244
+ "step": 260
245
+ },
246
+ {
247
+ "epoch": 0.864,
248
+ "grad_norm": 1.9616676568984985,
249
+ "learning_rate": 4.801009651050856e-06,
250
+ "loss": 0.5599,
251
+ "mean_token_accuracy": 0.8417543187737465,
252
+ "num_tokens": 3008577.0,
253
+ "step": 270
254
+ },
255
+ {
256
+ "epoch": 0.896,
257
+ "grad_norm": 2.2142021656036377,
258
+ "learning_rate": 4.779833036748801e-06,
259
+ "loss": 0.5268,
260
+ "mean_token_accuracy": 0.8514743112027645,
261
+ "num_tokens": 3120102.0,
262
+ "step": 280
263
+ },
264
+ {
265
+ "epoch": 0.928,
266
+ "grad_norm": 1.8409771919250488,
267
+ "learning_rate": 4.757637481550683e-06,
268
+ "loss": 0.5443,
269
+ "mean_token_accuracy": 0.8460176154971123,
270
+ "num_tokens": 3234456.0,
271
+ "step": 290
272
+ },
273
+ {
274
+ "epoch": 0.96,
275
+ "grad_norm": 1.9530022144317627,
276
+ "learning_rate": 4.73443290546316e-06,
277
+ "loss": 0.5487,
278
+ "mean_token_accuracy": 0.8462878614664078,
279
+ "num_tokens": 3348821.0,
280
+ "step": 300
281
+ },
282
+ {
283
+ "epoch": 0.992,
284
+ "grad_norm": 1.9719116687774658,
285
+ "learning_rate": 4.7102296794612865e-06,
286
+ "loss": 0.5229,
287
+ "mean_token_accuracy": 0.8531442761421204,
288
+ "num_tokens": 3455469.0,
289
+ "step": 310
290
+ },
291
+ {
292
+ "epoch": 1.0224,
293
+ "grad_norm": 1.9529277086257935,
294
+ "learning_rate": 4.6850386208533424e-06,
295
+ "loss": 0.4849,
296
+ "mean_token_accuracy": 0.8625573249239671,
297
+ "num_tokens": 3561864.0,
298
+ "step": 320
299
+ },
300
+ {
301
+ "epoch": 1.0544,
302
+ "grad_norm": 2.105419635772705,
303
+ "learning_rate": 4.658870988446189e-06,
304
+ "loss": 0.4819,
305
+ "mean_token_accuracy": 0.864888047426939,
306
+ "num_tokens": 3672405.0,
307
+ "step": 330
308
+ },
309
+ {
310
+ "epoch": 1.0864,
311
+ "grad_norm": 2.035449504852295,
312
+ "learning_rate": 4.631738477513286e-06,
313
+ "loss": 0.4735,
314
+ "mean_token_accuracy": 0.8661263197660446,
315
+ "num_tokens": 3780404.0,
316
+ "step": 340
317
+ },
318
+ {
319
+ "epoch": 1.1184,
320
+ "grad_norm": 2.1748783588409424,
321
+ "learning_rate": 4.603653214567654e-06,
322
+ "loss": 0.4638,
323
+ "mean_token_accuracy": 0.8661996729671955,
324
+ "num_tokens": 3894359.0,
325
+ "step": 350
326
+ },
327
+ {
328
+ "epoch": 1.1504,
329
+ "grad_norm": 2.0794739723205566,
330
+ "learning_rate": 4.574627751942087e-06,
331
+ "loss": 0.4683,
332
+ "mean_token_accuracy": 0.8664901047945023,
333
+ "num_tokens": 4007235.0,
334
+ "step": 360
335
+ },
336
+ {
337
+ "epoch": 1.1824,
338
+ "grad_norm": 2.2591140270233154,
339
+ "learning_rate": 4.544675062179056e-06,
340
+ "loss": 0.4616,
341
+ "mean_token_accuracy": 0.8676725938916207,
342
+ "num_tokens": 4116127.0,
343
+ "step": 370
344
+ },
345
+ {
346
+ "epoch": 1.2144,
347
+ "grad_norm": 2.111358165740967,
348
+ "learning_rate": 4.513808532232804e-06,
349
+ "loss": 0.4742,
350
+ "mean_token_accuracy": 0.8660650610923767,
351
+ "num_tokens": 4228679.0,
352
+ "step": 380
353
+ },
354
+ {
355
+ "epoch": 1.2464,
356
+ "grad_norm": 2.115079402923584,
357
+ "learning_rate": 4.482041957486229e-06,
358
+ "loss": 0.4679,
359
+ "mean_token_accuracy": 0.8653517335653305,
360
+ "num_tokens": 4341814.0,
361
+ "step": 390
362
+ },
363
+ {
364
+ "epoch": 1.2784,
365
+ "grad_norm": 2.15097713470459,
366
+ "learning_rate": 4.4493895355852176e-06,
367
+ "loss": 0.448,
368
+ "mean_token_accuracy": 0.8716798067092896,
369
+ "num_tokens": 4451355.0,
370
+ "step": 400
371
+ },
372
+ {
373
+ "epoch": 1.3104,
374
+ "grad_norm": 2.124791383743286,
375
+ "learning_rate": 4.415865860093199e-06,
376
+ "loss": 0.4576,
377
+ "mean_token_accuracy": 0.8684222124516964,
378
+ "num_tokens": 4566924.0,
379
+ "step": 410
380
+ },
381
+ {
382
+ "epoch": 1.3424,
383
+ "grad_norm": 2.033034563064575,
384
+ "learning_rate": 4.381485913968747e-06,
385
+ "loss": 0.4558,
386
+ "mean_token_accuracy": 0.8672670938074589,
387
+ "num_tokens": 4677684.0,
388
+ "step": 420
389
+ },
390
+ {
391
+ "epoch": 1.3744,
392
+ "grad_norm": 2.179928779602051,
393
+ "learning_rate": 4.346265062869147e-06,
394
+ "loss": 0.468,
395
+ "mean_token_accuracy": 0.8666120313107968,
396
+ "num_tokens": 4787468.0,
397
+ "step": 430
398
+ },
399
+ {
400
+ "epoch": 1.4064,
401
+ "grad_norm": 2.1226675510406494,
402
+ "learning_rate": 4.310219048282921e-06,
403
+ "loss": 0.4514,
404
+ "mean_token_accuracy": 0.8693858951330184,
405
+ "num_tokens": 4899692.0,
406
+ "step": 440
407
+ },
408
+ {
409
+ "epoch": 1.4384000000000001,
410
+ "grad_norm": 2.226494550704956,
411
+ "learning_rate": 4.273363980494379e-06,
412
+ "loss": 0.4761,
413
+ "mean_token_accuracy": 0.8673883900046349,
414
+ "num_tokens": 5008859.0,
415
+ "step": 450
416
+ },
417
+ {
418
+ "epoch": 1.4704,
419
+ "grad_norm": 2.039381980895996,
420
+ "learning_rate": 4.235716331383343e-06,
421
+ "loss": 0.4652,
422
+ "mean_token_accuracy": 0.8683492943644524,
423
+ "num_tokens": 5117719.0,
424
+ "step": 460
425
+ },
426
+ {
427
+ "epoch": 1.5024,
428
+ "grad_norm": 2.1596784591674805,
429
+ "learning_rate": 4.197292927063263e-06,
430
+ "loss": 0.4701,
431
+ "mean_token_accuracy": 0.863686279207468,
432
+ "num_tokens": 5227041.0,
433
+ "step": 470
434
+ },
435
+ {
436
+ "epoch": 1.5344,
437
+ "grad_norm": 2.153256893157959,
438
+ "learning_rate": 4.158110940361007e-06,
439
+ "loss": 0.4559,
440
+ "mean_token_accuracy": 0.8705666847527027,
441
+ "num_tokens": 5334777.0,
442
+ "step": 480
443
+ },
444
+ {
445
+ "epoch": 1.5664,
446
+ "grad_norm": 2.5348920822143555,
447
+ "learning_rate": 4.118187883141694e-06,
448
+ "loss": 0.461,
449
+ "mean_token_accuracy": 0.8692857407033443,
450
+ "num_tokens": 5441237.0,
451
+ "step": 490
452
+ },
453
+ {
454
+ "epoch": 1.5984,
455
+ "grad_norm": 2.2534334659576416,
456
+ "learning_rate": 4.077541598482009e-06,
457
+ "loss": 0.4748,
458
+ "mean_token_accuracy": 0.8672291107475758,
459
+ "num_tokens": 5553620.0,
460
+ "step": 500
461
+ },
462
+ {
463
+ "epoch": 1.6303999999999998,
464
+ "grad_norm": 2.1640467643737793,
465
+ "learning_rate": 4.036190252695467e-06,
466
+ "loss": 0.4874,
467
+ "mean_token_accuracy": 0.8607077576220036,
468
+ "num_tokens": 5663373.0,
469
+ "step": 510
470
+ },
471
+ {
472
+ "epoch": 1.6623999999999999,
473
+ "grad_norm": 2.1866941452026367,
474
+ "learning_rate": 3.994152327213232e-06,
475
+ "loss": 0.4545,
476
+ "mean_token_accuracy": 0.8709075525403023,
477
+ "num_tokens": 5776290.0,
478
+ "step": 520
479
+ },
480
+ {
481
+ "epoch": 1.6944,
482
+ "grad_norm": 2.4210782051086426,
483
+ "learning_rate": 3.95144661032408e-06,
484
+ "loss": 0.4316,
485
+ "mean_token_accuracy": 0.8738773807883262,
486
+ "num_tokens": 5882282.0,
487
+ "step": 530
488
+ },
489
+ {
490
+ "epoch": 1.7264,
491
+ "grad_norm": 2.2260217666625977,
492
+ "learning_rate": 3.9080921887772314e-06,
493
+ "loss": 0.4497,
494
+ "mean_token_accuracy": 0.8696616031229496,
495
+ "num_tokens": 5999645.0,
496
+ "step": 540
497
+ },
498
+ {
499
+ "epoch": 1.7584,
500
+ "grad_norm": 2.1372995376586914,
501
+ "learning_rate": 3.864108439251779e-06,
502
+ "loss": 0.4466,
503
+ "mean_token_accuracy": 0.8693803034722805,
504
+ "num_tokens": 6108677.0,
505
+ "step": 550
506
+ },
507
+ {
508
+ "epoch": 1.7904,
509
+ "grad_norm": 2.0875401496887207,
510
+ "learning_rate": 3.8195150196965414e-06,
511
+ "loss": 0.4618,
512
+ "mean_token_accuracy": 0.866703423857689,
513
+ "num_tokens": 6222393.0,
514
+ "step": 560
515
+ },
516
+ {
517
+ "epoch": 1.8224,
518
+ "grad_norm": 2.06887149810791,
519
+ "learning_rate": 3.7743318605442093e-06,
520
+ "loss": 0.4593,
521
+ "mean_token_accuracy": 0.8678097896277904,
522
+ "num_tokens": 6337550.0,
523
+ "step": 570
524
+ },
525
+ {
526
+ "epoch": 1.8544,
527
+ "grad_norm": 1.9830808639526367,
528
+ "learning_rate": 3.728579155803697e-06,
529
+ "loss": 0.4577,
530
+ "mean_token_accuracy": 0.8675460621714592,
531
+ "num_tokens": 6449551.0,
532
+ "step": 580
533
+ },
534
+ {
535
+ "epoch": 1.8864,
536
+ "grad_norm": 2.165144443511963,
537
+ "learning_rate": 3.6822773540347107e-06,
538
+ "loss": 0.439,
539
+ "mean_token_accuracy": 0.8739401362836361,
540
+ "num_tokens": 6562113.0,
541
+ "step": 590
542
+ },
543
+ {
544
+ "epoch": 1.9184,
545
+ "grad_norm": 2.1738622188568115,
546
+ "learning_rate": 3.635447149208528e-06,
547
+ "loss": 0.4412,
548
+ "mean_token_accuracy": 0.8708230577409267,
549
+ "num_tokens": 6674860.0,
550
+ "step": 600
551
+ },
552
+ {
553
+ "epoch": 1.9504000000000001,
554
+ "grad_norm": 2.4379212856292725,
555
+ "learning_rate": 3.5881094714591102e-06,
556
+ "loss": 0.4522,
557
+ "mean_token_accuracy": 0.8691440485417843,
558
+ "num_tokens": 6789358.0,
559
+ "step": 610
560
+ },
561
+ {
562
+ "epoch": 1.9824000000000002,
563
+ "grad_norm": 2.1988134384155273,
564
+ "learning_rate": 3.540285477728658e-06,
565
+ "loss": 0.4351,
566
+ "mean_token_accuracy": 0.8745189763605594,
567
+ "num_tokens": 6899897.0,
568
+ "step": 620
569
+ },
570
+ {
571
+ "epoch": 2.0128,
572
+ "grad_norm": 2.226576089859009,
573
+ "learning_rate": 3.491996542311795e-06,
574
+ "loss": 0.4196,
575
+ "mean_token_accuracy": 0.8793611534331974,
576
+ "num_tokens": 7005635.0,
577
+ "step": 630
578
+ },
579
+ {
580
+ "epoch": 2.0448,
581
+ "grad_norm": 2.284040927886963,
582
+ "learning_rate": 3.44326424730262e-06,
583
+ "loss": 0.3729,
584
+ "mean_token_accuracy": 0.8900357685983181,
585
+ "num_tokens": 7118882.0,
586
+ "step": 640
587
+ },
588
+ {
589
+ "epoch": 2.0768,
590
+ "grad_norm": 2.3776021003723145,
591
+ "learning_rate": 3.394110372948871e-06,
592
+ "loss": 0.3784,
593
+ "mean_token_accuracy": 0.8912972524762154,
594
+ "num_tokens": 7229223.0,
595
+ "step": 650
596
+ },
597
+ {
598
+ "epoch": 2.1088,
599
+ "grad_norm": 2.3752377033233643,
600
+ "learning_rate": 3.344556887917539e-06,
601
+ "loss": 0.3754,
602
+ "mean_token_accuracy": 0.8933634988963604,
603
+ "num_tokens": 7344352.0,
604
+ "step": 660
605
+ },
606
+ {
607
+ "epoch": 2.1408,
608
+ "grad_norm": 2.2204787731170654,
609
+ "learning_rate": 3.2946259394762645e-06,
610
+ "loss": 0.3603,
611
+ "mean_token_accuracy": 0.8949612870812416,
612
+ "num_tokens": 7452978.0,
613
+ "step": 670
614
+ },
615
+ {
616
+ "epoch": 2.1728,
617
+ "grad_norm": 2.3392176628112793,
618
+ "learning_rate": 3.244339843594913e-06,
619
+ "loss": 0.3633,
620
+ "mean_token_accuracy": 0.8934120312333107,
621
+ "num_tokens": 7562987.0,
622
+ "step": 680
623
+ },
624
+ {
625
+ "epoch": 2.2048,
626
+ "grad_norm": 2.548868417739868,
627
+ "learning_rate": 3.19372107497175e-06,
628
+ "loss": 0.3688,
629
+ "mean_token_accuracy": 0.8925295993685722,
630
+ "num_tokens": 7672018.0,
631
+ "step": 690
632
+ },
633
+ {
634
+ "epoch": 2.2368,
635
+ "grad_norm": 2.524747371673584,
636
+ "learning_rate": 3.1427922569886756e-06,
637
+ "loss": 0.3535,
638
+ "mean_token_accuracy": 0.8959250211715698,
639
+ "num_tokens": 7778694.0,
640
+ "step": 700
641
+ },
642
+ {
643
+ "epoch": 2.2688,
644
+ "grad_norm": 2.2159647941589355,
645
+ "learning_rate": 3.0915761516000053e-06,
646
+ "loss": 0.3645,
647
+ "mean_token_accuracy": 0.8916135348379612,
648
+ "num_tokens": 7889175.0,
649
+ "step": 710
650
+ },
651
+ {
652
+ "epoch": 2.3008,
653
+ "grad_norm": 2.6435368061065674,
654
+ "learning_rate": 3.0400956491593215e-06,
655
+ "loss": 0.3742,
656
+ "mean_token_accuracy": 0.8912425316870213,
657
+ "num_tokens": 7997644.0,
658
+ "step": 720
659
+ },
660
+ {
661
+ "epoch": 2.3327999999999998,
662
+ "grad_norm": 2.5125110149383545,
663
+ "learning_rate": 2.9883737581889315e-06,
664
+ "loss": 0.3767,
665
+ "mean_token_accuracy": 0.8903645075857639,
666
+ "num_tokens": 8109860.0,
667
+ "step": 730
668
+ },
669
+ {
670
+ "epoch": 2.3648,
671
+ "grad_norm": 2.482232093811035,
672
+ "learning_rate": 2.9364335950965227e-06,
673
+ "loss": 0.3531,
674
+ "mean_token_accuracy": 0.8950154200196266,
675
+ "num_tokens": 8219541.0,
676
+ "step": 740
677
+ },
678
+ {
679
+ "epoch": 2.3968,
680
+ "grad_norm": 2.360308885574341,
681
+ "learning_rate": 2.8842983738435855e-06,
682
+ "loss": 0.3602,
683
+ "mean_token_accuracy": 0.892794082313776,
684
+ "num_tokens": 8329765.0,
685
+ "step": 750
686
+ },
687
+ {
688
+ "epoch": 2.4288,
689
+ "grad_norm": 2.5826432704925537,
690
+ "learning_rate": 2.831991395570249e-06,
691
+ "loss": 0.364,
692
+ "mean_token_accuracy": 0.8930977649986744,
693
+ "num_tokens": 8443917.0,
694
+ "step": 760
695
+ },
696
+ {
697
+ "epoch": 2.4608,
698
+ "grad_norm": 2.588911771774292,
699
+ "learning_rate": 2.779536038181146e-06,
700
+ "loss": 0.3771,
701
+ "mean_token_accuracy": 0.8915269665420056,
702
+ "num_tokens": 8550550.0,
703
+ "step": 770
704
+ },
705
+ {
706
+ "epoch": 2.4928,
707
+ "grad_norm": 2.4379560947418213,
708
+ "learning_rate": 2.726955745896972e-06,
709
+ "loss": 0.3726,
710
+ "mean_token_accuracy": 0.8931849867105484,
711
+ "num_tokens": 8659752.0,
712
+ "step": 780
713
+ },
714
+ {
715
+ "epoch": 2.5248,
716
+ "grad_norm": 2.285378932952881,
717
+ "learning_rate": 2.6742740187764026e-06,
718
+ "loss": 0.3644,
719
+ "mean_token_accuracy": 0.8937488354742527,
720
+ "num_tokens": 8775392.0,
721
+ "step": 790
722
+ },
723
+ {
724
+ "epoch": 2.5568,
725
+ "grad_norm": 2.6339035034179688,
726
+ "learning_rate": 2.621514402213059e-06,
727
+ "loss": 0.3528,
728
+ "mean_token_accuracy": 0.8967724144458771,
729
+ "num_tokens": 8888499.0,
730
+ "step": 800
731
+ },
732
+ {
733
+ "epoch": 2.5888,
734
+ "grad_norm": 2.2561960220336914,
735
+ "learning_rate": 2.568700476412204e-06,
736
+ "loss": 0.3846,
737
+ "mean_token_accuracy": 0.8879048563539982,
738
+ "num_tokens": 9001912.0,
739
+ "step": 810
740
+ },
741
+ {
742
+ "epoch": 2.6208,
743
+ "grad_norm": 2.3487298488616943,
744
+ "learning_rate": 2.515855845851894e-06,
745
+ "loss": 0.3602,
746
+ "mean_token_accuracy": 0.8923349373042584,
747
+ "num_tokens": 9113468.0,
748
+ "step": 820
749
+ },
750
+ {
751
+ "epoch": 2.6528,
752
+ "grad_norm": 2.4302189350128174,
753
+ "learning_rate": 2.46300412873326e-06,
754
+ "loss": 0.3613,
755
+ "mean_token_accuracy": 0.8928801603615284,
756
+ "num_tokens": 9229335.0,
757
+ "step": 830
758
+ },
759
+ {
760
+ "epoch": 2.6848,
761
+ "grad_norm": 2.48608136177063,
762
+ "learning_rate": 2.4101689464246816e-06,
763
+ "loss": 0.3642,
764
+ "mean_token_accuracy": 0.8935355007648468,
765
+ "num_tokens": 9344595.0,
766
+ "step": 840
767
+ },
768
+ {
769
+ "epoch": 2.7168,
770
+ "grad_norm": 2.357041120529175,
771
+ "learning_rate": 2.3573739129045227e-06,
772
+ "loss": 0.3579,
773
+ "mean_token_accuracy": 0.8953744530677795,
774
+ "num_tokens": 9455015.0,
775
+ "step": 850
776
+ },
777
+ {
778
+ "epoch": 2.7488,
779
+ "grad_norm": 2.649960517883301,
780
+ "learning_rate": 2.3046426242071834e-06,
781
+ "loss": 0.3689,
782
+ "mean_token_accuracy": 0.8912502139806747,
783
+ "num_tokens": 9560806.0,
784
+ "step": 860
785
+ },
786
+ {
787
+ "epoch": 2.7808,
788
+ "grad_norm": 2.3866405487060547,
789
+ "learning_rate": 2.251998647877169e-06,
790
+ "loss": 0.364,
791
+ "mean_token_accuracy": 0.8922963574528694,
792
+ "num_tokens": 9673310.0,
793
+ "step": 870
794
+ },
795
+ {
796
+ "epoch": 2.8128,
797
+ "grad_norm": 2.3306539058685303,
798
+ "learning_rate": 2.1994655124358875e-06,
799
+ "loss": 0.347,
800
+ "mean_token_accuracy": 0.8968791022896767,
801
+ "num_tokens": 9787134.0,
802
+ "step": 880
803
+ },
804
+ {
805
+ "epoch": 2.8448,
806
+ "grad_norm": 2.3708155155181885,
807
+ "learning_rate": 2.1470666968658956e-06,
808
+ "loss": 0.3576,
809
+ "mean_token_accuracy": 0.8956275820732117,
810
+ "num_tokens": 9901311.0,
811
+ "step": 890
812
+ },
813
+ {
814
+ "epoch": 2.8768000000000002,
815
+ "grad_norm": 2.466033935546875,
816
+ "learning_rate": 2.0948256201172772e-06,
817
+ "loss": 0.3495,
818
+ "mean_token_accuracy": 0.8957146309316159,
819
+ "num_tokens": 10017025.0,
820
+ "step": 900
821
+ },
822
+ {
823
+ "epoch": 2.9088000000000003,
824
+ "grad_norm": 2.630037784576416,
825
+ "learning_rate": 2.0427656306408594e-06,
826
+ "loss": 0.3643,
827
+ "mean_token_accuracy": 0.892961747199297,
828
+ "num_tokens": 10131105.0,
829
+ "step": 910
830
+ },
831
+ {
832
+ "epoch": 2.9408,
833
+ "grad_norm": 2.4972527027130127,
834
+ "learning_rate": 1.990909995952936e-06,
835
+ "loss": 0.3542,
836
+ "mean_token_accuracy": 0.8959409989416599,
837
+ "num_tokens": 10240555.0,
838
+ "step": 920
839
+ },
840
+ {
841
+ "epoch": 2.9728,
842
+ "grad_norm": 2.722083330154419,
843
+ "learning_rate": 1.939281892236157e-06,
844
+ "loss": 0.3531,
845
+ "mean_token_accuracy": 0.8945782586932183,
846
+ "num_tokens": 10347889.0,
847
+ "step": 930
848
+ },
849
+ {
850
+ "epoch": 3.0032,
851
+ "grad_norm": 2.2683236598968506,
852
+ "learning_rate": 1.8879043939812503e-06,
853
+ "loss": 0.3579,
854
+ "mean_token_accuracy": 0.8970388119157992,
855
+ "num_tokens": 10453648.0,
856
+ "step": 940
857
+ },
858
+ {
859
+ "epoch": 3.0352,
860
+ "grad_norm": 2.5510222911834717,
861
+ "learning_rate": 1.8368004636741794e-06,
862
+ "loss": 0.3062,
863
+ "mean_token_accuracy": 0.9120431691408157,
864
+ "num_tokens": 10565156.0,
865
+ "step": 950
866
+ },
867
+ {
868
+ "epoch": 3.0672,
869
+ "grad_norm": 2.730621337890625,
870
+ "learning_rate": 1.7859929415333725e-06,
871
+ "loss": 0.2912,
872
+ "mean_token_accuracy": 0.9148132555186749,
873
+ "num_tokens": 10676677.0,
874
+ "step": 960
875
+ },
876
+ {
877
+ "epoch": 3.0992,
878
+ "grad_norm": 2.5102102756500244,
879
+ "learning_rate": 1.735504535301592e-06,
880
+ "loss": 0.2882,
881
+ "mean_token_accuracy": 0.9159041397273541,
882
+ "num_tokens": 10787468.0,
883
+ "step": 970
884
+ },
885
+ {
886
+ "epoch": 3.1312,
887
+ "grad_norm": 2.3301403522491455,
888
+ "learning_rate": 1.6853578100970086e-06,
889
+ "loss": 0.3016,
890
+ "mean_token_accuracy": 0.911161607503891,
891
+ "num_tokens": 10897279.0,
892
+ "step": 980
893
+ },
894
+ {
895
+ "epoch": 3.1632,
896
+ "grad_norm": 2.4964194297790527,
897
+ "learning_rate": 1.6355751783280283e-06,
898
+ "loss": 0.2927,
899
+ "mean_token_accuracy": 0.9142641559243202,
900
+ "num_tokens": 11008731.0,
901
+ "step": 990
902
+ },
903
+ {
904
+ "epoch": 3.1952,
905
+ "grad_norm": 2.547372341156006,
906
+ "learning_rate": 1.5861788896763625e-06,
907
+ "loss": 0.2991,
908
+ "mean_token_accuracy": 0.9131025090813637,
909
+ "num_tokens": 11123820.0,
910
+ "step": 1000
911
+ }
912
+ ],
913
+ "logging_steps": 10,
914
+ "max_steps": 1565,
915
+ "num_input_tokens_seen": 0,
916
+ "num_train_epochs": 5,
917
+ "save_steps": 500,
918
+ "stateful_callbacks": {
919
+ "TrainerControl": {
920
+ "args": {
921
+ "should_epoch_stop": false,
922
+ "should_evaluate": false,
923
+ "should_log": false,
924
+ "should_save": true,
925
+ "should_training_stop": false
926
+ },
927
+ "attributes": {}
928
+ }
929
+ },
930
+ "total_flos": 1.0242157166877082e+17,
931
+ "train_batch_size": 4,
932
+ "trial_name": null,
933
+ "trial_params": null
934
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:805391ab265a423fa4bf8e6272db7bca4771272624226f8273e5a50efd4ae604
3
+ size 6161