Felix-Thin committed
Commit da72bbd · verified · 1 parent: 902cddc

Upload folder using huggingface_hub

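The commit message above refers to the huggingface_hub upload API. A minimal sketch of the kind of call that produces a commit like this one; the repo id and local folder path are assumptions, not taken from the commit itself:

    # Sketch only: repo_id and folder_path are hypothetical.
    from huggingface_hub import HfApi

    api = HfApi()
    api.upload_folder(
        folder_path="gpt-small-c4/checkpoint-4000",   # assumed local checkpoint directory
        repo_id="Felix-Thin/gpt-small-c4",            # hypothetical repo id
        repo_type="model",
        commit_message="Upload folder using huggingface_hub",
    )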
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 0,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 2,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 512,
+ "n_embd": 512,
+ "n_head": 8,
+ "n_inner": null,
+ "n_layer": 6,
+ "n_positions": 512,
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.52.3",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
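config.json describes a small GPT-2 variant: 6 layers, 8 attention heads, 512-dimensional embeddings, and a 512-token context over a 50,257-token vocabulary. A minimal sketch of instantiating that architecture from this file with transformers (the local path is an assumption):

    # Builds the architecture only; weights stay random until model.safetensors is loaded.
    from transformers import GPT2Config, GPT2LMHeadModel

    config = GPT2Config.from_json_file("config.json")
    model = GPT2LMHeadModel(config)
    print(sum(p.numel() for p in model.parameters()))  # roughly 45M parameters

That parameter count is consistent with the ~180 MB float32 model.safetensors file added below.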
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 0,
+ "eos_token_id": 2,
+ "transformers_version": "4.52.3"
+ }
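generation_config.json only pins the bos/eos token ids (0 and 2) used during generation. A short sketch, assuming the files in this commit are available under a local directory of that (hypothetical) name:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt-small-c4/checkpoint-4000")    # assumed local path
    model = AutoModelForCausalLM.from_pretrained("gpt-small-c4/checkpoint-4000")
    ids = tok("The weather today", return_tensors="pt")
    out = model.generate(**ids, max_new_tokens=20)   # stops early at eos_token_id=2
    print(tok.decode(out[0], skip_special_tokens=True))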
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c97967e7892ee92c3125cfb5fcc7973cb4866d5be9d315dce5056853368efbd
+ size 179643784
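The three lines above are a Git LFS pointer, not the weights themselves; the real ~180 MB file lives in LFS storage and is addressed by its sha256. A sketch of fetching it and checking the digest (the repo id is hypothetical):

    import hashlib
    from huggingface_hub import hf_hub_download

    path = hf_hub_download("Felix-Thin/gpt-small-c4", "model.safetensors")  # hypothetical repo id
    digest = hashlib.sha256(open(path, "rb").read()).hexdigest()
    print(digest == "2c97967e7892ee92c3125cfb5fcc7973cb4866d5be9d315dce5056853368efbd")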
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a732cc857dacb756641184afa2add289ad5143a461ed696b8268a23f21a260d
+ size 359336314
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f0905a2ca2884b688f260ec79576f47723236ac25d6f8f69a3c3d85cbc1b71c
+ size 14244
scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59a4acfb0da74c479080613978839dd3cbb4608fd2c07e764b4c844401d8dd5f
+ size 988
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e88b9a7c20d0a7e67be23b50c7564265c79690a0db9c7737ff1c025326cda20a
+ size 1064
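optimizer.pt, scheduler.pt, scaler.pt and rng_state.pth are the extra state that makes this a resumable Trainer checkpoint rather than just exported weights. A sketch of resuming, assuming the original training script's datasets and collator are available (they are not part of this commit):

    # Not runnable as-is: train_ds and eval_ds come from the original (unpublished) training script.
    from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

    model = GPT2LMHeadModel.from_pretrained("gpt-small-c4/checkpoint-4000")   # assumed local path
    args = TrainingArguments(output_dir="gpt-small-c4", per_device_train_batch_size=8,
                             eval_steps=100, save_steps=100, logging_steps=100)  # values from trainer_state.json
    trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=eval_ds)
    trainer.train(resume_from_checkpoint="gpt-small-c4/checkpoint-4000")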
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "4": {
+ "content": "<mask>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "mask_token": "<mask>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "tokenizer_class": "PreTrainedTokenizer",
+ "unk_token": "<unk>"
+ }
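tokenizer_config.json and special_tokens_map.json define five special tokens at ids 0-4 (<s>, <pad>, </s>, <unk>, <mask>). A quick sketch of loading the tokenizer files from this commit and checking those ids (the local path is an assumption):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt-small-c4/checkpoint-4000")   # assumed local path
    print(tok.bos_token_id, tok.pad_token_id, tok.eos_token_id)           # expected: 0 1 2
    print(tok.convert_tokens_to_ids(["<unk>", "<mask>"]))                 # expected: [3, 4]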
trainer_state.json ADDED
@@ -0,0 +1,634 @@
+ {
+ "best_global_step": 4000,
+ "best_metric": 6.2553181648254395,
+ "best_model_checkpoint": "gpt-small-c4/checkpoint-4000",
+ "epoch": 0.20065211938801103,
+ "eval_steps": 100,
+ "global_step": 4000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.005016302984700276,
+ "grad_norm": 1.367209553718567,
+ "learning_rate": 4.975169300225734e-05,
+ "loss": 8.9927,
+ "step": 100
+ },
+ {
+ "epoch": 0.005016302984700276,
+ "eval_loss": 8.038568496704102,
+ "eval_runtime": 106.068,
+ "eval_samples_per_second": 167.619,
+ "eval_steps_per_second": 20.958,
+ "step": 100
+ },
+ {
+ "epoch": 0.010032605969400551,
+ "grad_norm": 1.294573426246643,
+ "learning_rate": 4.950087785302233e-05,
+ "loss": 7.6182,
+ "step": 200
+ },
+ {
+ "epoch": 0.010032605969400551,
+ "eval_loss": 7.498734474182129,
+ "eval_runtime": 102.098,
+ "eval_samples_per_second": 174.137,
+ "eval_steps_per_second": 21.773,
+ "step": 200
+ },
+ {
+ "epoch": 0.015048908954100828,
+ "grad_norm": 1.3138136863708496,
+ "learning_rate": 4.925006270378731e-05,
+ "loss": 7.4073,
+ "step": 300
+ },
+ {
+ "epoch": 0.015048908954100828,
+ "eval_loss": 7.337331295013428,
+ "eval_runtime": 93.9798,
+ "eval_samples_per_second": 189.179,
+ "eval_steps_per_second": 23.654,
+ "step": 300
+ },
+ {
+ "epoch": 0.020065211938801102,
+ "grad_norm": 1.247786283493042,
+ "learning_rate": 4.89992475545523e-05,
+ "loss": 7.2394,
+ "step": 400
+ },
+ {
+ "epoch": 0.020065211938801102,
+ "eval_loss": 7.221430778503418,
+ "eval_runtime": 94.4305,
+ "eval_samples_per_second": 188.276,
+ "eval_steps_per_second": 23.541,
+ "step": 400
+ },
+ {
+ "epoch": 0.02508151492350138,
+ "grad_norm": 1.5004276037216187,
+ "learning_rate": 4.874843240531728e-05,
+ "loss": 7.1744,
+ "step": 500
+ },
+ {
+ "epoch": 0.02508151492350138,
+ "eval_loss": 7.13123893737793,
+ "eval_runtime": 93.9964,
+ "eval_samples_per_second": 189.146,
+ "eval_steps_per_second": 23.65,
+ "step": 500
+ },
+ {
+ "epoch": 0.030097817908201655,
+ "grad_norm": 1.6555237770080566,
+ "learning_rate": 4.8497617256082266e-05,
+ "loss": 7.0923,
+ "step": 600
+ },
+ {
+ "epoch": 0.030097817908201655,
+ "eval_loss": 7.061245918273926,
+ "eval_runtime": 94.3719,
+ "eval_samples_per_second": 188.393,
+ "eval_steps_per_second": 23.556,
+ "step": 600
+ },
+ {
+ "epoch": 0.03511412089290193,
+ "grad_norm": 1.7525192499160767,
+ "learning_rate": 4.8246802106847256e-05,
+ "loss": 7.0225,
+ "step": 700
+ },
+ {
+ "epoch": 0.03511412089290193,
+ "eval_loss": 6.9907097816467285,
+ "eval_runtime": 94.5022,
+ "eval_samples_per_second": 188.133,
+ "eval_steps_per_second": 23.523,
+ "step": 700
+ },
+ {
+ "epoch": 0.040130423877602205,
+ "grad_norm": 1.6555861234664917,
+ "learning_rate": 4.7995986957612246e-05,
+ "loss": 6.9318,
+ "step": 800
+ },
+ {
+ "epoch": 0.040130423877602205,
+ "eval_loss": 6.9372453689575195,
+ "eval_runtime": 94.1339,
+ "eval_samples_per_second": 188.869,
+ "eval_steps_per_second": 23.615,
+ "step": 800
+ },
+ {
+ "epoch": 0.045146726862302484,
+ "grad_norm": 2.0145726203918457,
+ "learning_rate": 4.774517180837723e-05,
+ "loss": 6.8885,
+ "step": 900
+ },
+ {
+ "epoch": 0.045146726862302484,
+ "eval_loss": 6.890288352966309,
+ "eval_runtime": 94.637,
+ "eval_samples_per_second": 187.865,
+ "eval_steps_per_second": 23.49,
+ "step": 900
+ },
+ {
+ "epoch": 0.05016302984700276,
+ "grad_norm": 1.4140468835830688,
+ "learning_rate": 4.749435665914221e-05,
+ "loss": 6.8442,
+ "step": 1000
+ },
+ {
+ "epoch": 0.05016302984700276,
+ "eval_loss": 6.846121788024902,
+ "eval_runtime": 98.5415,
+ "eval_samples_per_second": 180.421,
+ "eval_steps_per_second": 22.559,
+ "step": 1000
+ },
+ {
+ "epoch": 0.05517933283170304,
+ "grad_norm": 2.1040637493133545,
+ "learning_rate": 4.72435415099072e-05,
+ "loss": 6.7646,
+ "step": 1100
+ },
+ {
+ "epoch": 0.05517933283170304,
+ "eval_loss": 6.808917999267578,
+ "eval_runtime": 95.1845,
+ "eval_samples_per_second": 186.785,
+ "eval_steps_per_second": 23.355,
+ "step": 1100
+ },
+ {
+ "epoch": 0.06019563581640331,
+ "grad_norm": 1.9216736555099487,
+ "learning_rate": 4.6992726360672185e-05,
+ "loss": 6.7815,
+ "step": 1200
+ },
+ {
+ "epoch": 0.06019563581640331,
+ "eval_loss": 6.7738037109375,
+ "eval_runtime": 94.6149,
+ "eval_samples_per_second": 187.909,
+ "eval_steps_per_second": 23.495,
+ "step": 1200
+ },
+ {
+ "epoch": 0.06521193880110358,
+ "grad_norm": 1.5636332035064697,
+ "learning_rate": 4.6741911211437175e-05,
+ "loss": 6.7469,
+ "step": 1300
+ },
+ {
+ "epoch": 0.06521193880110358,
+ "eval_loss": 6.739809036254883,
+ "eval_runtime": 94.7876,
+ "eval_samples_per_second": 187.567,
+ "eval_steps_per_second": 23.452,
+ "step": 1300
+ },
+ {
+ "epoch": 0.07022824178580386,
+ "grad_norm": 1.5623961687088013,
+ "learning_rate": 4.649109606220216e-05,
+ "loss": 6.7202,
+ "step": 1400
+ },
+ {
+ "epoch": 0.07022824178580386,
+ "eval_loss": 6.719963550567627,
+ "eval_runtime": 94.3193,
+ "eval_samples_per_second": 188.498,
+ "eval_steps_per_second": 23.569,
+ "step": 1400
+ },
+ {
+ "epoch": 0.07524454477050414,
+ "grad_norm": 1.5444605350494385,
+ "learning_rate": 4.624028091296714e-05,
+ "loss": 6.689,
+ "step": 1500
+ },
+ {
+ "epoch": 0.07524454477050414,
+ "eval_loss": 6.685614109039307,
+ "eval_runtime": 94.1801,
+ "eval_samples_per_second": 188.777,
+ "eval_steps_per_second": 23.604,
+ "step": 1500
+ },
+ {
+ "epoch": 0.08026084775520441,
+ "grad_norm": 2.0542852878570557,
+ "learning_rate": 4.598946576373213e-05,
+ "loss": 6.6448,
+ "step": 1600
+ },
+ {
+ "epoch": 0.08026084775520441,
+ "eval_loss": 6.665727615356445,
+ "eval_runtime": 94.8119,
+ "eval_samples_per_second": 187.519,
+ "eval_steps_per_second": 23.446,
+ "step": 1600
+ },
+ {
+ "epoch": 0.08527715073990469,
+ "grad_norm": 1.7234691381454468,
+ "learning_rate": 4.573865061449712e-05,
+ "loss": 6.6167,
+ "step": 1700
+ },
+ {
+ "epoch": 0.08527715073990469,
+ "eval_loss": 6.640410423278809,
+ "eval_runtime": 94.192,
+ "eval_samples_per_second": 188.753,
+ "eval_steps_per_second": 23.601,
+ "step": 1700
+ },
+ {
+ "epoch": 0.09029345372460497,
+ "grad_norm": 1.8299592733383179,
+ "learning_rate": 4.5487835465262104e-05,
+ "loss": 6.6109,
+ "step": 1800
+ },
+ {
+ "epoch": 0.09029345372460497,
+ "eval_loss": 6.620120525360107,
+ "eval_runtime": 93.7995,
+ "eval_samples_per_second": 189.543,
+ "eval_steps_per_second": 23.699,
+ "step": 1800
+ },
+ {
+ "epoch": 0.09530975670930525,
+ "grad_norm": 3.1380774974823,
+ "learning_rate": 4.523702031602709e-05,
+ "loss": 6.6401,
+ "step": 1900
+ },
+ {
+ "epoch": 0.09530975670930525,
+ "eval_loss": 6.596529006958008,
+ "eval_runtime": 94.29,
+ "eval_samples_per_second": 188.557,
+ "eval_steps_per_second": 23.576,
+ "step": 1900
+ },
+ {
+ "epoch": 0.10032605969400551,
+ "grad_norm": 2.301722526550293,
+ "learning_rate": 4.4986205166792077e-05,
+ "loss": 6.5718,
+ "step": 2000
+ },
+ {
+ "epoch": 0.10032605969400551,
+ "eval_loss": 6.580358982086182,
+ "eval_runtime": 94.8156,
+ "eval_samples_per_second": 187.511,
+ "eval_steps_per_second": 23.445,
+ "step": 2000
+ },
+ {
+ "epoch": 0.1053423626787058,
+ "grad_norm": 2.1571052074432373,
+ "learning_rate": 4.473539001755706e-05,
+ "loss": 6.5723,
+ "step": 2100
+ },
+ {
+ "epoch": 0.1053423626787058,
+ "eval_loss": 6.559490203857422,
+ "eval_runtime": 93.9805,
+ "eval_samples_per_second": 189.177,
+ "eval_steps_per_second": 23.654,
+ "step": 2100
+ },
+ {
+ "epoch": 0.11035866566340607,
+ "grad_norm": 1.8901225328445435,
+ "learning_rate": 4.448457486832205e-05,
+ "loss": 6.5405,
+ "step": 2200
+ },
+ {
+ "epoch": 0.11035866566340607,
+ "eval_loss": 6.536637783050537,
+ "eval_runtime": 95.7913,
+ "eval_samples_per_second": 185.601,
+ "eval_steps_per_second": 23.207,
+ "step": 2200
+ },
+ {
+ "epoch": 0.11537496864810634,
+ "grad_norm": 1.9951658248901367,
+ "learning_rate": 4.423375971908704e-05,
+ "loss": 6.5143,
+ "step": 2300
+ },
+ {
+ "epoch": 0.11537496864810634,
+ "eval_loss": 6.518816947937012,
+ "eval_runtime": 94.7754,
+ "eval_samples_per_second": 187.591,
+ "eval_steps_per_second": 23.455,
+ "step": 2300
+ },
+ {
+ "epoch": 0.12039127163280662,
+ "grad_norm": 1.5648778676986694,
+ "learning_rate": 4.398294456985202e-05,
+ "loss": 6.5145,
+ "step": 2400
+ },
+ {
+ "epoch": 0.12039127163280662,
+ "eval_loss": 6.496397018432617,
+ "eval_runtime": 119.6751,
+ "eval_samples_per_second": 148.561,
+ "eval_steps_per_second": 18.575,
+ "step": 2400
+ },
+ {
+ "epoch": 0.1254075746175069,
+ "grad_norm": 1.6384185552597046,
+ "learning_rate": 4.3732129420617006e-05,
+ "loss": 6.4635,
+ "step": 2500
+ },
+ {
+ "epoch": 0.1254075746175069,
+ "eval_loss": 6.48058557510376,
+ "eval_runtime": 116.6468,
+ "eval_samples_per_second": 152.417,
+ "eval_steps_per_second": 19.058,
+ "step": 2500
+ },
+ {
+ "epoch": 0.13042387760220717,
+ "grad_norm": 2.172386884689331,
+ "learning_rate": 4.3481314271381995e-05,
+ "loss": 6.4973,
+ "step": 2600
+ },
+ {
+ "epoch": 0.13042387760220717,
+ "eval_loss": 6.466433048248291,
+ "eval_runtime": 98.1503,
+ "eval_samples_per_second": 181.141,
+ "eval_steps_per_second": 22.649,
+ "step": 2600
+ },
+ {
+ "epoch": 0.13544018058690746,
+ "grad_norm": 1.9169673919677734,
+ "learning_rate": 4.323049912214698e-05,
+ "loss": 6.475,
+ "step": 2700
+ },
+ {
+ "epoch": 0.13544018058690746,
+ "eval_loss": 6.4444684982299805,
+ "eval_runtime": 94.3642,
+ "eval_samples_per_second": 188.408,
+ "eval_steps_per_second": 23.558,
+ "step": 2700
+ },
+ {
+ "epoch": 0.14045648357160773,
+ "grad_norm": 1.8222382068634033,
+ "learning_rate": 4.297968397291197e-05,
+ "loss": 6.4216,
+ "step": 2800
+ },
+ {
+ "epoch": 0.14045648357160773,
+ "eval_loss": 6.434403896331787,
+ "eval_runtime": 96.3492,
+ "eval_samples_per_second": 184.527,
+ "eval_steps_per_second": 23.072,
+ "step": 2800
+ },
+ {
+ "epoch": 0.145472786556308,
+ "grad_norm": 2.0480902194976807,
+ "learning_rate": 4.272886882367695e-05,
+ "loss": 6.4062,
+ "step": 2900
+ },
+ {
+ "epoch": 0.145472786556308,
+ "eval_loss": 6.415233135223389,
+ "eval_runtime": 94.0684,
+ "eval_samples_per_second": 189.001,
+ "eval_steps_per_second": 23.632,
+ "step": 2900
+ },
+ {
+ "epoch": 0.1504890895410083,
+ "grad_norm": 1.965072512626648,
+ "learning_rate": 4.2478053674441935e-05,
+ "loss": 6.3655,
+ "step": 3000
+ },
+ {
+ "epoch": 0.1504890895410083,
+ "eval_loss": 6.401170253753662,
+ "eval_runtime": 94.0417,
+ "eval_samples_per_second": 189.054,
+ "eval_steps_per_second": 23.638,
+ "step": 3000
+ },
+ {
+ "epoch": 0.15550539252570855,
+ "grad_norm": 2.16786789894104,
+ "learning_rate": 4.2227238525206924e-05,
+ "loss": 6.4038,
+ "step": 3100
+ },
+ {
+ "epoch": 0.15550539252570855,
+ "eval_loss": 6.388797283172607,
+ "eval_runtime": 93.9464,
+ "eval_samples_per_second": 189.246,
+ "eval_steps_per_second": 23.662,
+ "step": 3100
+ },
+ {
+ "epoch": 0.16052169551040882,
+ "grad_norm": 2.5082712173461914,
+ "learning_rate": 4.1976423375971914e-05,
+ "loss": 6.3553,
+ "step": 3200
+ },
+ {
+ "epoch": 0.16052169551040882,
+ "eval_loss": 6.368188858032227,
+ "eval_runtime": 94.1089,
+ "eval_samples_per_second": 188.919,
+ "eval_steps_per_second": 23.622,
+ "step": 3200
+ },
+ {
+ "epoch": 0.1655379984951091,
+ "grad_norm": 2.0116617679595947,
+ "learning_rate": 4.17256082267369e-05,
+ "loss": 6.3573,
+ "step": 3300
+ },
+ {
+ "epoch": 0.1655379984951091,
+ "eval_loss": 6.35645866394043,
+ "eval_runtime": 94.2829,
+ "eval_samples_per_second": 188.571,
+ "eval_steps_per_second": 23.578,
+ "step": 3300
+ },
+ {
+ "epoch": 0.17055430147980938,
+ "grad_norm": 2.309736728668213,
+ "learning_rate": 4.147479307750188e-05,
+ "loss": 6.3458,
+ "step": 3400
+ },
+ {
+ "epoch": 0.17055430147980938,
+ "eval_loss": 6.339991092681885,
+ "eval_runtime": 94.4661,
+ "eval_samples_per_second": 188.205,
+ "eval_steps_per_second": 23.532,
+ "step": 3400
+ },
+ {
+ "epoch": 0.17557060446450964,
+ "grad_norm": 1.963045597076416,
+ "learning_rate": 4.122397792826687e-05,
+ "loss": 6.3157,
+ "step": 3500
+ },
+ {
+ "epoch": 0.17557060446450964,
+ "eval_loss": 6.325737476348877,
+ "eval_runtime": 94.2601,
+ "eval_samples_per_second": 188.616,
+ "eval_steps_per_second": 23.584,
+ "step": 3500
+ },
+ {
+ "epoch": 0.18058690744920994,
+ "grad_norm": 2.3348584175109863,
+ "learning_rate": 4.0973162779031853e-05,
+ "loss": 6.3232,
+ "step": 3600
+ },
+ {
+ "epoch": 0.18058690744920994,
+ "eval_loss": 6.314403533935547,
+ "eval_runtime": 94.6269,
+ "eval_samples_per_second": 187.885,
+ "eval_steps_per_second": 23.492,
+ "step": 3600
+ },
+ {
+ "epoch": 0.1856032104339102,
+ "grad_norm": 1.7809332609176636,
+ "learning_rate": 4.072234762979684e-05,
+ "loss": 6.2809,
+ "step": 3700
+ },
+ {
+ "epoch": 0.1856032104339102,
+ "eval_loss": 6.2983903884887695,
+ "eval_runtime": 94.2727,
+ "eval_samples_per_second": 188.591,
+ "eval_steps_per_second": 23.581,
+ "step": 3700
+ },
+ {
+ "epoch": 0.1906195134186105,
+ "grad_norm": 2.0216691493988037,
+ "learning_rate": 4.047153248056183e-05,
+ "loss": 6.2558,
+ "step": 3800
+ },
+ {
+ "epoch": 0.1906195134186105,
+ "eval_loss": 6.28033971786499,
+ "eval_runtime": 94.2314,
+ "eval_samples_per_second": 188.674,
+ "eval_steps_per_second": 23.591,
+ "step": 3800
+ },
+ {
+ "epoch": 0.19563581640331076,
+ "grad_norm": 2.2930386066436768,
+ "learning_rate": 4.022071733132681e-05,
+ "loss": 6.2869,
+ "step": 3900
+ },
+ {
+ "epoch": 0.19563581640331076,
+ "eval_loss": 6.2675676345825195,
+ "eval_runtime": 94.5425,
+ "eval_samples_per_second": 188.053,
+ "eval_steps_per_second": 23.513,
+ "step": 3900
+ },
+ {
+ "epoch": 0.20065211938801103,
+ "grad_norm": 2.321624755859375,
+ "learning_rate": 3.99699021820918e-05,
+ "loss": 6.2825,
+ "step": 4000
+ },
+ {
+ "epoch": 0.20065211938801103,
+ "eval_loss": 6.2553181648254395,
+ "eval_runtime": 94.2273,
+ "eval_samples_per_second": 188.682,
+ "eval_steps_per_second": 23.592,
+ "step": 4000
+ }
+ ],
+ "logging_steps": 100,
+ "max_steps": 19935,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 929726201856000.0,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+ }
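trainer_state.json records 4,000 of a planned 19,935 steps (about 0.20 epochs), with the eval loss falling from 8.04 at step 100 to a best of 6.2553 at step 4000. Since eval_loss is a per-token cross-entropy in nats, the corresponding perplexity is simply its exponential:

    import math
    print(math.exp(6.2553181648254395))   # ≈ 521 at the best checkpoint (step 4000)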
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:816cd880c2d6abfb50dbc2582892848cac9e4eab6906857c70c52f7f8bdb136c
+ size 5240
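training_args.bin is a pickled TrainingArguments object rather than human-readable JSON. A sketch of inspecting it, assuming transformers is installed so the pickle can be resolved (recent torch versions require disabling weights_only for this):

    import torch

    args = torch.load("training_args.bin", weights_only=False)   # needs transformers importable
    print(args.learning_rate, args.per_device_train_batch_size, args.output_dir)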