xihajun committed on
Commit
996fb52
1 Parent(s): 5ef3013

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete list.
Files changed (50) hide show
  1. README.md +9 -0
  2. adapter_config.json +21 -0
  3. adapter_model.bin +3 -0
  4. checkpoint-164/README.md +34 -0
  5. checkpoint-164/adapter_config.json +21 -0
  6. checkpoint-164/adapter_model.bin +3 -0
  7. checkpoint-164/optimizer.pt +3 -0
  8. checkpoint-164/pytorch_model.bin +3 -0
  9. checkpoint-164/rng_state.pth +3 -0
  10. checkpoint-164/scheduler.pt +3 -0
  11. checkpoint-164/special_tokens_map.json +30 -0
  12. checkpoint-164/tokenizer.model +3 -0
  13. checkpoint-164/tokenizer_config.json +44 -0
  14. checkpoint-164/trainer_state.json +343 -0
  15. checkpoint-164/training_args.bin +3 -0
  16. checkpoint-168/README.md +34 -0
  17. checkpoint-168/adapter_config.json +21 -0
  18. checkpoint-168/adapter_model.bin +3 -0
  19. checkpoint-168/optimizer.pt +3 -0
  20. checkpoint-168/pytorch_model.bin +3 -0
  21. checkpoint-168/rng_state.pth +3 -0
  22. checkpoint-168/scheduler.pt +3 -0
  23. checkpoint-168/special_tokens_map.json +30 -0
  24. checkpoint-168/tokenizer.model +3 -0
  25. checkpoint-168/tokenizer_config.json +44 -0
  26. checkpoint-168/trainer_state.json +355 -0
  27. checkpoint-168/training_args.bin +3 -0
  28. checkpoint-172/README.md +34 -0
  29. checkpoint-172/adapter_config.json +21 -0
  30. checkpoint-172/adapter_model.bin +3 -0
  31. checkpoint-172/optimizer.pt +3 -0
  32. checkpoint-172/pytorch_model.bin +3 -0
  33. checkpoint-172/rng_state.pth +3 -0
  34. checkpoint-172/scheduler.pt +3 -0
  35. checkpoint-172/special_tokens_map.json +30 -0
  36. checkpoint-172/tokenizer.model +3 -0
  37. checkpoint-172/tokenizer_config.json +44 -0
  38. checkpoint-172/trainer_state.json +361 -0
  39. checkpoint-172/training_args.bin +3 -0
  40. checkpoint-176/README.md +34 -0
  41. checkpoint-176/adapter_config.json +21 -0
  42. checkpoint-176/adapter_model.bin +3 -0
  43. checkpoint-176/optimizer.pt +3 -0
  44. checkpoint-176/pytorch_model.bin +3 -0
  45. checkpoint-176/rng_state.pth +3 -0
  46. checkpoint-176/scheduler.pt +3 -0
  47. checkpoint-176/special_tokens_map.json +30 -0
  48. checkpoint-176/tokenizer.model +3 -0
  49. checkpoint-176/tokenizer_config.json +44 -0
  50. checkpoint-176/trainer_state.json +367 -0
README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - autotrain
4
+ - text-generation
5
+ widget:
6
+ - text: "I love AutoTrain because "
7
+ ---
8
+
9
+ # Model Trained Using AutoTrain
adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "01-ai/Yi-34B",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 16,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0d970728c41ee58d9d09394814c1d363562c81ec58d5829c692d1bc6496c627
3
+ size 86594570
checkpoint-164/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: QuantizationMethod.BITS_AND_BYTES
9
+ - load_in_8bit: False
10
+ - load_in_4bit: True
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: nf4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float16
18
+
19
+ The following `bitsandbytes` quantization config was used during training:
20
+ - quant_method: QuantizationMethod.BITS_AND_BYTES
21
+ - load_in_8bit: False
22
+ - load_in_4bit: True
23
+ - llm_int8_threshold: 6.0
24
+ - llm_int8_skip_modules: None
25
+ - llm_int8_enable_fp32_cpu_offload: False
26
+ - llm_int8_has_fp16_weight: False
27
+ - bnb_4bit_quant_type: nf4
28
+ - bnb_4bit_use_double_quant: False
29
+ - bnb_4bit_compute_dtype: float16
30
+ ### Framework versions
31
+
32
+ - PEFT 0.5.0
33
+
34
+ - PEFT 0.5.0
checkpoint-164/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "01-ai/Yi-34B",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 16,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
checkpoint-164/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:681860c8e5ad531989e00690acda26a1e6c0069d0c2b2f6f64e36a88daaf4656
3
+ size 86594570
checkpoint-164/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9af9cc2dedd4ed734091b0260274ce225436851af9b9f1a5cd3873ff292c203c
3
+ size 173217146
checkpoint-164/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:049c26b844b79121ddd8379f7f69194e63f6fbf6aa007eeac0c66f17eebb8893
3
+ size 888
checkpoint-164/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a424f8dd91e7729143e26a0f44429ef2973cd296eeeff4bc6dff14b1264133d8
3
+ size 14244
checkpoint-164/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86e193130b6122442aeae48a6025d2f156693ddff438585e86b0844b100b6147
3
+ size 1064
checkpoint-164/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint-164/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39
3
+ size 1033105
checkpoint-164/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<|startoftext|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "auto_map": {
31
+ "AutoTokenizer": [
32
+ "01-ai/Yi-34B--tokenization_yi.YiTokenizer",
33
+ null
34
+ ]
35
+ },
36
+ "bos_token": "<|startoftext|>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "<|endoftext|>",
39
+ "model_max_length": 1024,
40
+ "pad_token": "<unk>",
41
+ "sp_model_kwargs": {},
42
+ "tokenizer_class": "YiTokenizer",
43
+ "unk_token": "<unk>"
44
+ }
checkpoint-164/trainer_state.json ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 41.0,
5
+ "eval_steps": 500,
6
+ "global_step": 164,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.75,
13
+ "learning_rate": 0.0003,
14
+ "loss": 1.0279,
15
+ "step": 3
16
+ },
17
+ {
18
+ "epoch": 1.5,
19
+ "learning_rate": 0.0006,
20
+ "loss": 0.9556,
21
+ "step": 6
22
+ },
23
+ {
24
+ "epoch": 2.25,
25
+ "learning_rate": 0.0009000000000000001,
26
+ "loss": 0.9999,
27
+ "step": 9
28
+ },
29
+ {
30
+ "epoch": 3.0,
31
+ "learning_rate": 0.0012,
32
+ "loss": 0.7355,
33
+ "step": 12
34
+ },
35
+ {
36
+ "epoch": 3.75,
37
+ "learning_rate": 0.0015,
38
+ "loss": 0.6017,
39
+ "step": 15
40
+ },
41
+ {
42
+ "epoch": 4.5,
43
+ "learning_rate": 0.0018000000000000002,
44
+ "loss": 0.4413,
45
+ "step": 18
46
+ },
47
+ {
48
+ "epoch": 5.25,
49
+ "learning_rate": 0.001988888888888889,
50
+ "loss": 0.3896,
51
+ "step": 21
52
+ },
53
+ {
54
+ "epoch": 6.0,
55
+ "learning_rate": 0.0019555555555555554,
56
+ "loss": 0.2588,
57
+ "step": 24
58
+ },
59
+ {
60
+ "epoch": 6.75,
61
+ "learning_rate": 0.0019222222222222223,
62
+ "loss": 0.1937,
63
+ "step": 27
64
+ },
65
+ {
66
+ "epoch": 7.5,
67
+ "learning_rate": 0.001888888888888889,
68
+ "loss": 0.1074,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 8.25,
73
+ "learning_rate": 0.0018555555555555556,
74
+ "loss": 0.1232,
75
+ "step": 33
76
+ },
77
+ {
78
+ "epoch": 9.0,
79
+ "learning_rate": 0.0018222222222222223,
80
+ "loss": 0.086,
81
+ "step": 36
82
+ },
83
+ {
84
+ "epoch": 9.75,
85
+ "learning_rate": 0.0018111111111111112,
86
+ "loss": 1.6039,
87
+ "step": 39
88
+ },
89
+ {
90
+ "epoch": 10.5,
91
+ "learning_rate": 0.0018000000000000002,
92
+ "loss": 1.4514,
93
+ "step": 42
94
+ },
95
+ {
96
+ "epoch": 11.25,
97
+ "learning_rate": 0.0017666666666666666,
98
+ "loss": 0.158,
99
+ "step": 45
100
+ },
101
+ {
102
+ "epoch": 12.0,
103
+ "learning_rate": 0.0017333333333333335,
104
+ "loss": 0.0712,
105
+ "step": 48
106
+ },
107
+ {
108
+ "epoch": 12.75,
109
+ "learning_rate": 0.0017,
110
+ "loss": 0.052,
111
+ "step": 51
112
+ },
113
+ {
114
+ "epoch": 13.5,
115
+ "learning_rate": 0.0016666666666666668,
116
+ "loss": 0.0412,
117
+ "step": 54
118
+ },
119
+ {
120
+ "epoch": 14.25,
121
+ "learning_rate": 0.0016333333333333334,
122
+ "loss": 0.0452,
123
+ "step": 57
124
+ },
125
+ {
126
+ "epoch": 15.0,
127
+ "learning_rate": 0.0016,
128
+ "loss": 0.0432,
129
+ "step": 60
130
+ },
131
+ {
132
+ "epoch": 15.75,
133
+ "learning_rate": 0.0015666666666666667,
134
+ "loss": 0.025,
135
+ "step": 63
136
+ },
137
+ {
138
+ "epoch": 16.5,
139
+ "learning_rate": 0.0015333333333333334,
140
+ "loss": 0.0245,
141
+ "step": 66
142
+ },
143
+ {
144
+ "epoch": 17.25,
145
+ "learning_rate": 0.0015,
146
+ "loss": 0.0172,
147
+ "step": 69
148
+ },
149
+ {
150
+ "epoch": 18.0,
151
+ "learning_rate": 0.0014666666666666667,
152
+ "loss": 0.0154,
153
+ "step": 72
154
+ },
155
+ {
156
+ "epoch": 18.75,
157
+ "learning_rate": 0.0014333333333333333,
158
+ "loss": 0.0128,
159
+ "step": 75
160
+ },
161
+ {
162
+ "epoch": 19.5,
163
+ "learning_rate": 0.0014,
164
+ "loss": 0.0171,
165
+ "step": 78
166
+ },
167
+ {
168
+ "epoch": 20.25,
169
+ "learning_rate": 0.0013666666666666666,
170
+ "loss": 0.011,
171
+ "step": 81
172
+ },
173
+ {
174
+ "epoch": 21.0,
175
+ "learning_rate": 0.0013333333333333333,
176
+ "loss": 0.0218,
177
+ "step": 84
178
+ },
179
+ {
180
+ "epoch": 21.75,
181
+ "learning_rate": 0.0013000000000000002,
182
+ "loss": 0.0102,
183
+ "step": 87
184
+ },
185
+ {
186
+ "epoch": 22.5,
187
+ "learning_rate": 0.0012666666666666666,
188
+ "loss": 0.0063,
189
+ "step": 90
190
+ },
191
+ {
192
+ "epoch": 23.25,
193
+ "learning_rate": 0.0012333333333333335,
194
+ "loss": 0.0073,
195
+ "step": 93
196
+ },
197
+ {
198
+ "epoch": 24.0,
199
+ "learning_rate": 0.0012,
200
+ "loss": 0.0047,
201
+ "step": 96
202
+ },
203
+ {
204
+ "epoch": 24.75,
205
+ "learning_rate": 0.0011666666666666668,
206
+ "loss": 0.0036,
207
+ "step": 99
208
+ },
209
+ {
210
+ "epoch": 25.5,
211
+ "learning_rate": 0.0011333333333333334,
212
+ "loss": 0.0037,
213
+ "step": 102
214
+ },
215
+ {
216
+ "epoch": 26.25,
217
+ "learning_rate": 0.0011,
218
+ "loss": 0.0033,
219
+ "step": 105
220
+ },
221
+ {
222
+ "epoch": 27.0,
223
+ "learning_rate": 0.0010666666666666667,
224
+ "loss": 0.0039,
225
+ "step": 108
226
+ },
227
+ {
228
+ "epoch": 27.75,
229
+ "learning_rate": 0.0010333333333333334,
230
+ "loss": 0.0025,
231
+ "step": 111
232
+ },
233
+ {
234
+ "epoch": 28.5,
235
+ "learning_rate": 0.001,
236
+ "loss": 0.0033,
237
+ "step": 114
238
+ },
239
+ {
240
+ "epoch": 29.25,
241
+ "learning_rate": 0.0009666666666666667,
242
+ "loss": 0.0021,
243
+ "step": 117
244
+ },
245
+ {
246
+ "epoch": 30.0,
247
+ "learning_rate": 0.0009333333333333333,
248
+ "loss": 0.0027,
249
+ "step": 120
250
+ },
251
+ {
252
+ "epoch": 30.75,
253
+ "learning_rate": 0.0009000000000000001,
254
+ "loss": 0.0017,
255
+ "step": 123
256
+ },
257
+ {
258
+ "epoch": 31.5,
259
+ "learning_rate": 0.0008666666666666667,
260
+ "loss": 0.0016,
261
+ "step": 126
262
+ },
263
+ {
264
+ "epoch": 32.25,
265
+ "learning_rate": 0.0008333333333333334,
266
+ "loss": 0.0015,
267
+ "step": 129
268
+ },
269
+ {
270
+ "epoch": 33.0,
271
+ "learning_rate": 0.0008,
272
+ "loss": 0.0013,
273
+ "step": 132
274
+ },
275
+ {
276
+ "epoch": 33.75,
277
+ "learning_rate": 0.0007666666666666667,
278
+ "loss": 0.0008,
279
+ "step": 135
280
+ },
281
+ {
282
+ "epoch": 34.5,
283
+ "learning_rate": 0.0007333333333333333,
284
+ "loss": 0.0018,
285
+ "step": 138
286
+ },
287
+ {
288
+ "epoch": 35.25,
289
+ "learning_rate": 0.0007,
290
+ "loss": 0.0012,
291
+ "step": 141
292
+ },
293
+ {
294
+ "epoch": 36.0,
295
+ "learning_rate": 0.0006666666666666666,
296
+ "loss": 0.0012,
297
+ "step": 144
298
+ },
299
+ {
300
+ "epoch": 36.75,
301
+ "learning_rate": 0.0006333333333333333,
302
+ "loss": 0.0009,
303
+ "step": 147
304
+ },
305
+ {
306
+ "epoch": 37.5,
307
+ "learning_rate": 0.0006,
308
+ "loss": 0.0009,
309
+ "step": 150
310
+ },
311
+ {
312
+ "epoch": 38.25,
313
+ "learning_rate": 0.0005666666666666667,
314
+ "loss": 0.0011,
315
+ "step": 153
316
+ },
317
+ {
318
+ "epoch": 39.0,
319
+ "learning_rate": 0.0005333333333333334,
320
+ "loss": 0.0008,
321
+ "step": 156
322
+ },
323
+ {
324
+ "epoch": 39.75,
325
+ "learning_rate": 0.0005,
326
+ "loss": 0.0009,
327
+ "step": 159
328
+ },
329
+ {
330
+ "epoch": 40.5,
331
+ "learning_rate": 0.00046666666666666666,
332
+ "loss": 0.0008,
333
+ "step": 162
334
+ }
335
+ ],
336
+ "logging_steps": 3,
337
+ "max_steps": 200,
338
+ "num_train_epochs": 50,
339
+ "save_steps": 500,
340
+ "total_flos": 1.368414759073874e+17,
341
+ "trial_name": null,
342
+ "trial_params": null
343
+ }
checkpoint-164/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bced848f1d7870fe05133227865677a8a42f46b0aeea6d70d51cc48491cfe2d
3
+ size 4536
checkpoint-168/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: QuantizationMethod.BITS_AND_BYTES
9
+ - load_in_8bit: False
10
+ - load_in_4bit: True
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: nf4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float16
18
+
19
+ The following `bitsandbytes` quantization config was used during training:
20
+ - quant_method: QuantizationMethod.BITS_AND_BYTES
21
+ - load_in_8bit: False
22
+ - load_in_4bit: True
23
+ - llm_int8_threshold: 6.0
24
+ - llm_int8_skip_modules: None
25
+ - llm_int8_enable_fp32_cpu_offload: False
26
+ - llm_int8_has_fp16_weight: False
27
+ - bnb_4bit_quant_type: nf4
28
+ - bnb_4bit_use_double_quant: False
29
+ - bnb_4bit_compute_dtype: float16
30
+ ### Framework versions
31
+
32
+ - PEFT 0.5.0
33
+
34
+ - PEFT 0.5.0
checkpoint-168/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "01-ai/Yi-34B",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 16,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
checkpoint-168/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96c358fca7c2dc4ffc3c06c560e64d86508a33020cb22479d0b526dafe45f1d9
3
+ size 86594570
checkpoint-168/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6147288262d352d3b5e17aa94c22ce8b9412a405146e90ebac4ac289aa27bd1a
3
+ size 173217146
checkpoint-168/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:049c26b844b79121ddd8379f7f69194e63f6fbf6aa007eeac0c66f17eebb8893
3
+ size 888
checkpoint-168/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:162326a97017efe6343e5f3d49be7d54462bd3318dd9ed3493b4133f3032e5af
3
+ size 14244
checkpoint-168/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be143a756ee9696cd132039a42b0578be1b35db72f361aa51d7d5d24015ad77f
3
+ size 1064
checkpoint-168/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint-168/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39
3
+ size 1033105
checkpoint-168/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<|startoftext|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "auto_map": {
31
+ "AutoTokenizer": [
32
+ "01-ai/Yi-34B--tokenization_yi.YiTokenizer",
33
+ null
34
+ ]
35
+ },
36
+ "bos_token": "<|startoftext|>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "<|endoftext|>",
39
+ "model_max_length": 1024,
40
+ "pad_token": "<unk>",
41
+ "sp_model_kwargs": {},
42
+ "tokenizer_class": "YiTokenizer",
43
+ "unk_token": "<unk>"
44
+ }
checkpoint-168/trainer_state.json ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 42.0,
5
+ "eval_steps": 500,
6
+ "global_step": 168,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.75,
13
+ "learning_rate": 0.0003,
14
+ "loss": 1.0279,
15
+ "step": 3
16
+ },
17
+ {
18
+ "epoch": 1.5,
19
+ "learning_rate": 0.0006,
20
+ "loss": 0.9556,
21
+ "step": 6
22
+ },
23
+ {
24
+ "epoch": 2.25,
25
+ "learning_rate": 0.0009000000000000001,
26
+ "loss": 0.9999,
27
+ "step": 9
28
+ },
29
+ {
30
+ "epoch": 3.0,
31
+ "learning_rate": 0.0012,
32
+ "loss": 0.7355,
33
+ "step": 12
34
+ },
35
+ {
36
+ "epoch": 3.75,
37
+ "learning_rate": 0.0015,
38
+ "loss": 0.6017,
39
+ "step": 15
40
+ },
41
+ {
42
+ "epoch": 4.5,
43
+ "learning_rate": 0.0018000000000000002,
44
+ "loss": 0.4413,
45
+ "step": 18
46
+ },
47
+ {
48
+ "epoch": 5.25,
49
+ "learning_rate": 0.001988888888888889,
50
+ "loss": 0.3896,
51
+ "step": 21
52
+ },
53
+ {
54
+ "epoch": 6.0,
55
+ "learning_rate": 0.0019555555555555554,
56
+ "loss": 0.2588,
57
+ "step": 24
58
+ },
59
+ {
60
+ "epoch": 6.75,
61
+ "learning_rate": 0.0019222222222222223,
62
+ "loss": 0.1937,
63
+ "step": 27
64
+ },
65
+ {
66
+ "epoch": 7.5,
67
+ "learning_rate": 0.001888888888888889,
68
+ "loss": 0.1074,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 8.25,
73
+ "learning_rate": 0.0018555555555555556,
74
+ "loss": 0.1232,
75
+ "step": 33
76
+ },
77
+ {
78
+ "epoch": 9.0,
79
+ "learning_rate": 0.0018222222222222223,
80
+ "loss": 0.086,
81
+ "step": 36
82
+ },
83
+ {
84
+ "epoch": 9.75,
85
+ "learning_rate": 0.0018111111111111112,
86
+ "loss": 1.6039,
87
+ "step": 39
88
+ },
89
+ {
90
+ "epoch": 10.5,
91
+ "learning_rate": 0.0018000000000000002,
92
+ "loss": 1.4514,
93
+ "step": 42
94
+ },
95
+ {
96
+ "epoch": 11.25,
97
+ "learning_rate": 0.0017666666666666666,
98
+ "loss": 0.158,
99
+ "step": 45
100
+ },
101
+ {
102
+ "epoch": 12.0,
103
+ "learning_rate": 0.0017333333333333335,
104
+ "loss": 0.0712,
105
+ "step": 48
106
+ },
107
+ {
108
+ "epoch": 12.75,
109
+ "learning_rate": 0.0017,
110
+ "loss": 0.052,
111
+ "step": 51
112
+ },
113
+ {
114
+ "epoch": 13.5,
115
+ "learning_rate": 0.0016666666666666668,
116
+ "loss": 0.0412,
117
+ "step": 54
118
+ },
119
+ {
120
+ "epoch": 14.25,
121
+ "learning_rate": 0.0016333333333333334,
122
+ "loss": 0.0452,
123
+ "step": 57
124
+ },
125
+ {
126
+ "epoch": 15.0,
127
+ "learning_rate": 0.0016,
128
+ "loss": 0.0432,
129
+ "step": 60
130
+ },
131
+ {
132
+ "epoch": 15.75,
133
+ "learning_rate": 0.0015666666666666667,
134
+ "loss": 0.025,
135
+ "step": 63
136
+ },
137
+ {
138
+ "epoch": 16.5,
139
+ "learning_rate": 0.0015333333333333334,
140
+ "loss": 0.0245,
141
+ "step": 66
142
+ },
143
+ {
144
+ "epoch": 17.25,
145
+ "learning_rate": 0.0015,
146
+ "loss": 0.0172,
147
+ "step": 69
148
+ },
149
+ {
150
+ "epoch": 18.0,
151
+ "learning_rate": 0.0014666666666666667,
152
+ "loss": 0.0154,
153
+ "step": 72
154
+ },
155
+ {
156
+ "epoch": 18.75,
157
+ "learning_rate": 0.0014333333333333333,
158
+ "loss": 0.0128,
159
+ "step": 75
160
+ },
161
+ {
162
+ "epoch": 19.5,
163
+ "learning_rate": 0.0014,
164
+ "loss": 0.0171,
165
+ "step": 78
166
+ },
167
+ {
168
+ "epoch": 20.25,
169
+ "learning_rate": 0.0013666666666666666,
170
+ "loss": 0.011,
171
+ "step": 81
172
+ },
173
+ {
174
+ "epoch": 21.0,
175
+ "learning_rate": 0.0013333333333333333,
176
+ "loss": 0.0218,
177
+ "step": 84
178
+ },
179
+ {
180
+ "epoch": 21.75,
181
+ "learning_rate": 0.0013000000000000002,
182
+ "loss": 0.0102,
183
+ "step": 87
184
+ },
185
+ {
186
+ "epoch": 22.5,
187
+ "learning_rate": 0.0012666666666666666,
188
+ "loss": 0.0063,
189
+ "step": 90
190
+ },
191
+ {
192
+ "epoch": 23.25,
193
+ "learning_rate": 0.0012333333333333335,
194
+ "loss": 0.0073,
195
+ "step": 93
196
+ },
197
+ {
198
+ "epoch": 24.0,
199
+ "learning_rate": 0.0012,
200
+ "loss": 0.0047,
201
+ "step": 96
202
+ },
203
+ {
204
+ "epoch": 24.75,
205
+ "learning_rate": 0.0011666666666666668,
206
+ "loss": 0.0036,
207
+ "step": 99
208
+ },
209
+ {
210
+ "epoch": 25.5,
211
+ "learning_rate": 0.0011333333333333334,
212
+ "loss": 0.0037,
213
+ "step": 102
214
+ },
215
+ {
216
+ "epoch": 26.25,
217
+ "learning_rate": 0.0011,
218
+ "loss": 0.0033,
219
+ "step": 105
220
+ },
221
+ {
222
+ "epoch": 27.0,
223
+ "learning_rate": 0.0010666666666666667,
224
+ "loss": 0.0039,
225
+ "step": 108
226
+ },
227
+ {
228
+ "epoch": 27.75,
229
+ "learning_rate": 0.0010333333333333334,
230
+ "loss": 0.0025,
231
+ "step": 111
232
+ },
233
+ {
234
+ "epoch": 28.5,
235
+ "learning_rate": 0.001,
236
+ "loss": 0.0033,
237
+ "step": 114
238
+ },
239
+ {
240
+ "epoch": 29.25,
241
+ "learning_rate": 0.0009666666666666667,
242
+ "loss": 0.0021,
243
+ "step": 117
244
+ },
245
+ {
246
+ "epoch": 30.0,
247
+ "learning_rate": 0.0009333333333333333,
248
+ "loss": 0.0027,
249
+ "step": 120
250
+ },
251
+ {
252
+ "epoch": 30.75,
253
+ "learning_rate": 0.0009000000000000001,
254
+ "loss": 0.0017,
255
+ "step": 123
256
+ },
257
+ {
258
+ "epoch": 31.5,
259
+ "learning_rate": 0.0008666666666666667,
260
+ "loss": 0.0016,
261
+ "step": 126
262
+ },
263
+ {
264
+ "epoch": 32.25,
265
+ "learning_rate": 0.0008333333333333334,
266
+ "loss": 0.0015,
267
+ "step": 129
268
+ },
269
+ {
270
+ "epoch": 33.0,
271
+ "learning_rate": 0.0008,
272
+ "loss": 0.0013,
273
+ "step": 132
274
+ },
275
+ {
276
+ "epoch": 33.75,
277
+ "learning_rate": 0.0007666666666666667,
278
+ "loss": 0.0008,
279
+ "step": 135
280
+ },
281
+ {
282
+ "epoch": 34.5,
283
+ "learning_rate": 0.0007333333333333333,
284
+ "loss": 0.0018,
285
+ "step": 138
286
+ },
287
+ {
288
+ "epoch": 35.25,
289
+ "learning_rate": 0.0007,
290
+ "loss": 0.0012,
291
+ "step": 141
292
+ },
293
+ {
294
+ "epoch": 36.0,
295
+ "learning_rate": 0.0006666666666666666,
296
+ "loss": 0.0012,
297
+ "step": 144
298
+ },
299
+ {
300
+ "epoch": 36.75,
301
+ "learning_rate": 0.0006333333333333333,
302
+ "loss": 0.0009,
303
+ "step": 147
304
+ },
305
+ {
306
+ "epoch": 37.5,
307
+ "learning_rate": 0.0006,
308
+ "loss": 0.0009,
309
+ "step": 150
310
+ },
311
+ {
312
+ "epoch": 38.25,
313
+ "learning_rate": 0.0005666666666666667,
314
+ "loss": 0.0011,
315
+ "step": 153
316
+ },
317
+ {
318
+ "epoch": 39.0,
319
+ "learning_rate": 0.0005333333333333334,
320
+ "loss": 0.0008,
321
+ "step": 156
322
+ },
323
+ {
324
+ "epoch": 39.75,
325
+ "learning_rate": 0.0005,
326
+ "loss": 0.0009,
327
+ "step": 159
328
+ },
329
+ {
330
+ "epoch": 40.5,
331
+ "learning_rate": 0.00046666666666666666,
332
+ "loss": 0.0008,
333
+ "step": 162
334
+ },
335
+ {
336
+ "epoch": 41.25,
337
+ "learning_rate": 0.00043333333333333337,
338
+ "loss": 0.0008,
339
+ "step": 165
340
+ },
341
+ {
342
+ "epoch": 42.0,
343
+ "learning_rate": 0.0004,
344
+ "loss": 0.0007,
345
+ "step": 168
346
+ }
347
+ ],
348
+ "logging_steps": 3,
349
+ "max_steps": 200,
350
+ "num_train_epochs": 50,
351
+ "save_steps": 500,
352
+ "total_flos": 1.401790728807383e+17,
353
+ "trial_name": null,
354
+ "trial_params": null
355
+ }
checkpoint-168/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bced848f1d7870fe05133227865677a8a42f46b0aeea6d70d51cc48491cfe2d
3
+ size 4536
checkpoint-172/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: QuantizationMethod.BITS_AND_BYTES
9
+ - load_in_8bit: False
10
+ - load_in_4bit: True
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: nf4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float16
18
+
19
+ The following `bitsandbytes` quantization config was used during training:
20
+ - quant_method: QuantizationMethod.BITS_AND_BYTES
21
+ - load_in_8bit: False
22
+ - load_in_4bit: True
23
+ - llm_int8_threshold: 6.0
24
+ - llm_int8_skip_modules: None
25
+ - llm_int8_enable_fp32_cpu_offload: False
26
+ - llm_int8_has_fp16_weight: False
27
+ - bnb_4bit_quant_type: nf4
28
+ - bnb_4bit_use_double_quant: False
29
+ - bnb_4bit_compute_dtype: float16
30
+ ### Framework versions
31
+
32
+ - PEFT 0.5.0
33
+
34
+ - PEFT 0.5.0
checkpoint-172/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "01-ai/Yi-34B",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 16,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
checkpoint-172/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac2dc6cf888c263478d35a5f26d79f07d8cd3fe703eee2f44638980feb7047c0
3
+ size 86594570
checkpoint-172/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00ed1a44e832632b59cf106b6c79ac0fabcc2d611fc23d8b11cbd83341cce356
3
+ size 173217146
checkpoint-172/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:049c26b844b79121ddd8379f7f69194e63f6fbf6aa007eeac0c66f17eebb8893
3
+ size 888
checkpoint-172/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1c33df53e040c606fe9c5996ae08553653693b25604ec75ba51c5ccce0c2031
3
+ size 14244
checkpoint-172/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70e687cc1f6a99bbdeef59978930e7b34b221f7fc176d331c06d8eb71bda68e4
3
+ size 1064
checkpoint-172/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint-172/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39
3
+ size 1033105
checkpoint-172/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<|startoftext|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "auto_map": {
31
+ "AutoTokenizer": [
32
+ "01-ai/Yi-34B--tokenization_yi.YiTokenizer",
33
+ null
34
+ ]
35
+ },
36
+ "bos_token": "<|startoftext|>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "<|endoftext|>",
39
+ "model_max_length": 1024,
40
+ "pad_token": "<unk>",
41
+ "sp_model_kwargs": {},
42
+ "tokenizer_class": "YiTokenizer",
43
+ "unk_token": "<unk>"
44
+ }
checkpoint-172/trainer_state.json ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 43.0,
5
+ "eval_steps": 500,
6
+ "global_step": 172,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.75,
13
+ "learning_rate": 0.0003,
14
+ "loss": 1.0279,
15
+ "step": 3
16
+ },
17
+ {
18
+ "epoch": 1.5,
19
+ "learning_rate": 0.0006,
20
+ "loss": 0.9556,
21
+ "step": 6
22
+ },
23
+ {
24
+ "epoch": 2.25,
25
+ "learning_rate": 0.0009000000000000001,
26
+ "loss": 0.9999,
27
+ "step": 9
28
+ },
29
+ {
30
+ "epoch": 3.0,
31
+ "learning_rate": 0.0012,
32
+ "loss": 0.7355,
33
+ "step": 12
34
+ },
35
+ {
36
+ "epoch": 3.75,
37
+ "learning_rate": 0.0015,
38
+ "loss": 0.6017,
39
+ "step": 15
40
+ },
41
+ {
42
+ "epoch": 4.5,
43
+ "learning_rate": 0.0018000000000000002,
44
+ "loss": 0.4413,
45
+ "step": 18
46
+ },
47
+ {
48
+ "epoch": 5.25,
49
+ "learning_rate": 0.001988888888888889,
50
+ "loss": 0.3896,
51
+ "step": 21
52
+ },
53
+ {
54
+ "epoch": 6.0,
55
+ "learning_rate": 0.0019555555555555554,
56
+ "loss": 0.2588,
57
+ "step": 24
58
+ },
59
+ {
60
+ "epoch": 6.75,
61
+ "learning_rate": 0.0019222222222222223,
62
+ "loss": 0.1937,
63
+ "step": 27
64
+ },
65
+ {
66
+ "epoch": 7.5,
67
+ "learning_rate": 0.001888888888888889,
68
+ "loss": 0.1074,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 8.25,
73
+ "learning_rate": 0.0018555555555555556,
74
+ "loss": 0.1232,
75
+ "step": 33
76
+ },
77
+ {
78
+ "epoch": 9.0,
79
+ "learning_rate": 0.0018222222222222223,
80
+ "loss": 0.086,
81
+ "step": 36
82
+ },
83
+ {
84
+ "epoch": 9.75,
85
+ "learning_rate": 0.0018111111111111112,
86
+ "loss": 1.6039,
87
+ "step": 39
88
+ },
89
+ {
90
+ "epoch": 10.5,
91
+ "learning_rate": 0.0018000000000000002,
92
+ "loss": 1.4514,
93
+ "step": 42
94
+ },
95
+ {
96
+ "epoch": 11.25,
97
+ "learning_rate": 0.0017666666666666666,
98
+ "loss": 0.158,
99
+ "step": 45
100
+ },
101
+ {
102
+ "epoch": 12.0,
103
+ "learning_rate": 0.0017333333333333335,
104
+ "loss": 0.0712,
105
+ "step": 48
106
+ },
107
+ {
108
+ "epoch": 12.75,
109
+ "learning_rate": 0.0017,
110
+ "loss": 0.052,
111
+ "step": 51
112
+ },
113
+ {
114
+ "epoch": 13.5,
115
+ "learning_rate": 0.0016666666666666668,
116
+ "loss": 0.0412,
117
+ "step": 54
118
+ },
119
+ {
120
+ "epoch": 14.25,
121
+ "learning_rate": 0.0016333333333333334,
122
+ "loss": 0.0452,
123
+ "step": 57
124
+ },
125
+ {
126
+ "epoch": 15.0,
127
+ "learning_rate": 0.0016,
128
+ "loss": 0.0432,
129
+ "step": 60
130
+ },
131
+ {
132
+ "epoch": 15.75,
133
+ "learning_rate": 0.0015666666666666667,
134
+ "loss": 0.025,
135
+ "step": 63
136
+ },
137
+ {
138
+ "epoch": 16.5,
139
+ "learning_rate": 0.0015333333333333334,
140
+ "loss": 0.0245,
141
+ "step": 66
142
+ },
143
+ {
144
+ "epoch": 17.25,
145
+ "learning_rate": 0.0015,
146
+ "loss": 0.0172,
147
+ "step": 69
148
+ },
149
+ {
150
+ "epoch": 18.0,
151
+ "learning_rate": 0.0014666666666666667,
152
+ "loss": 0.0154,
153
+ "step": 72
154
+ },
155
+ {
156
+ "epoch": 18.75,
157
+ "learning_rate": 0.0014333333333333333,
158
+ "loss": 0.0128,
159
+ "step": 75
160
+ },
161
+ {
162
+ "epoch": 19.5,
163
+ "learning_rate": 0.0014,
164
+ "loss": 0.0171,
165
+ "step": 78
166
+ },
167
+ {
168
+ "epoch": 20.25,
169
+ "learning_rate": 0.0013666666666666666,
170
+ "loss": 0.011,
171
+ "step": 81
172
+ },
173
+ {
174
+ "epoch": 21.0,
175
+ "learning_rate": 0.0013333333333333333,
176
+ "loss": 0.0218,
177
+ "step": 84
178
+ },
179
+ {
180
+ "epoch": 21.75,
181
+ "learning_rate": 0.0013000000000000002,
182
+ "loss": 0.0102,
183
+ "step": 87
184
+ },
185
+ {
186
+ "epoch": 22.5,
187
+ "learning_rate": 0.0012666666666666666,
188
+ "loss": 0.0063,
189
+ "step": 90
190
+ },
191
+ {
192
+ "epoch": 23.25,
193
+ "learning_rate": 0.0012333333333333335,
194
+ "loss": 0.0073,
195
+ "step": 93
196
+ },
197
+ {
198
+ "epoch": 24.0,
199
+ "learning_rate": 0.0012,
200
+ "loss": 0.0047,
201
+ "step": 96
202
+ },
203
+ {
204
+ "epoch": 24.75,
205
+ "learning_rate": 0.0011666666666666668,
206
+ "loss": 0.0036,
207
+ "step": 99
208
+ },
209
+ {
210
+ "epoch": 25.5,
211
+ "learning_rate": 0.0011333333333333334,
212
+ "loss": 0.0037,
213
+ "step": 102
214
+ },
215
+ {
216
+ "epoch": 26.25,
217
+ "learning_rate": 0.0011,
218
+ "loss": 0.0033,
219
+ "step": 105
220
+ },
221
+ {
222
+ "epoch": 27.0,
223
+ "learning_rate": 0.0010666666666666667,
224
+ "loss": 0.0039,
225
+ "step": 108
226
+ },
227
+ {
228
+ "epoch": 27.75,
229
+ "learning_rate": 0.0010333333333333334,
230
+ "loss": 0.0025,
231
+ "step": 111
232
+ },
233
+ {
234
+ "epoch": 28.5,
235
+ "learning_rate": 0.001,
236
+ "loss": 0.0033,
237
+ "step": 114
238
+ },
239
+ {
240
+ "epoch": 29.25,
241
+ "learning_rate": 0.0009666666666666667,
242
+ "loss": 0.0021,
243
+ "step": 117
244
+ },
245
+ {
246
+ "epoch": 30.0,
247
+ "learning_rate": 0.0009333333333333333,
248
+ "loss": 0.0027,
249
+ "step": 120
250
+ },
251
+ {
252
+ "epoch": 30.75,
253
+ "learning_rate": 0.0009000000000000001,
254
+ "loss": 0.0017,
255
+ "step": 123
256
+ },
257
+ {
258
+ "epoch": 31.5,
259
+ "learning_rate": 0.0008666666666666667,
260
+ "loss": 0.0016,
261
+ "step": 126
262
+ },
263
+ {
264
+ "epoch": 32.25,
265
+ "learning_rate": 0.0008333333333333334,
266
+ "loss": 0.0015,
267
+ "step": 129
268
+ },
269
+ {
270
+ "epoch": 33.0,
271
+ "learning_rate": 0.0008,
272
+ "loss": 0.0013,
273
+ "step": 132
274
+ },
275
+ {
276
+ "epoch": 33.75,
277
+ "learning_rate": 0.0007666666666666667,
278
+ "loss": 0.0008,
279
+ "step": 135
280
+ },
281
+ {
282
+ "epoch": 34.5,
283
+ "learning_rate": 0.0007333333333333333,
284
+ "loss": 0.0018,
285
+ "step": 138
286
+ },
287
+ {
288
+ "epoch": 35.25,
289
+ "learning_rate": 0.0007,
290
+ "loss": 0.0012,
291
+ "step": 141
292
+ },
293
+ {
294
+ "epoch": 36.0,
295
+ "learning_rate": 0.0006666666666666666,
296
+ "loss": 0.0012,
297
+ "step": 144
298
+ },
299
+ {
300
+ "epoch": 36.75,
301
+ "learning_rate": 0.0006333333333333333,
302
+ "loss": 0.0009,
303
+ "step": 147
304
+ },
305
+ {
306
+ "epoch": 37.5,
307
+ "learning_rate": 0.0006,
308
+ "loss": 0.0009,
309
+ "step": 150
310
+ },
311
+ {
312
+ "epoch": 38.25,
313
+ "learning_rate": 0.0005666666666666667,
314
+ "loss": 0.0011,
315
+ "step": 153
316
+ },
317
+ {
318
+ "epoch": 39.0,
319
+ "learning_rate": 0.0005333333333333334,
320
+ "loss": 0.0008,
321
+ "step": 156
322
+ },
323
+ {
324
+ "epoch": 39.75,
325
+ "learning_rate": 0.0005,
326
+ "loss": 0.0009,
327
+ "step": 159
328
+ },
329
+ {
330
+ "epoch": 40.5,
331
+ "learning_rate": 0.00046666666666666666,
332
+ "loss": 0.0008,
333
+ "step": 162
334
+ },
335
+ {
336
+ "epoch": 41.25,
337
+ "learning_rate": 0.00043333333333333337,
338
+ "loss": 0.0008,
339
+ "step": 165
340
+ },
341
+ {
342
+ "epoch": 42.0,
343
+ "learning_rate": 0.0004,
344
+ "loss": 0.0007,
345
+ "step": 168
346
+ },
347
+ {
348
+ "epoch": 42.75,
349
+ "learning_rate": 0.00036666666666666667,
350
+ "loss": 0.0009,
351
+ "step": 171
352
+ }
353
+ ],
354
+ "logging_steps": 3,
355
+ "max_steps": 200,
356
+ "num_train_epochs": 50,
357
+ "save_steps": 500,
358
+ "total_flos": 1.4351666985408922e+17,
359
+ "trial_name": null,
360
+ "trial_params": null
361
+ }
checkpoint-172/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bced848f1d7870fe05133227865677a8a42f46b0aeea6d70d51cc48491cfe2d
3
+ size 4536
checkpoint-176/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: QuantizationMethod.BITS_AND_BYTES
9
+ - load_in_8bit: False
10
+ - load_in_4bit: True
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: nf4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float16
18
+
19
+ The following `bitsandbytes` quantization config was used during training:
20
+ - quant_method: QuantizationMethod.BITS_AND_BYTES
21
+ - load_in_8bit: False
22
+ - load_in_4bit: True
23
+ - llm_int8_threshold: 6.0
24
+ - llm_int8_skip_modules: None
25
+ - llm_int8_enable_fp32_cpu_offload: False
26
+ - llm_int8_has_fp16_weight: False
27
+ - bnb_4bit_quant_type: nf4
28
+ - bnb_4bit_use_double_quant: False
29
+ - bnb_4bit_compute_dtype: float16
30
+ ### Framework versions
31
+
32
+ - PEFT 0.5.0
33
+
34
+ - PEFT 0.5.0
checkpoint-176/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "01-ai/Yi-34B",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 16,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
checkpoint-176/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bff9fc6e61825784dde82455107f3ebe20aa182b8abc74c30fdfeef8ba44c27
3
+ size 86594570
checkpoint-176/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fd03d215f0c6b24a9bea93739d50dc511050da8b16f2a5e908292885436b00c
3
+ size 173217146
checkpoint-176/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:049c26b844b79121ddd8379f7f69194e63f6fbf6aa007eeac0c66f17eebb8893
3
+ size 888
checkpoint-176/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c09bfc96fb814943c5ec064965dfa2176d723725c25d91d0d23e152cba9aee41
3
+ size 14244
checkpoint-176/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2001da3b376aba9f87cce497aff1695cd5a251e4f194567847110ab31723c6fb
3
+ size 1064
checkpoint-176/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint-176/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39
3
+ size 1033105
checkpoint-176/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<|startoftext|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "auto_map": {
31
+ "AutoTokenizer": [
32
+ "01-ai/Yi-34B--tokenization_yi.YiTokenizer",
33
+ null
34
+ ]
35
+ },
36
+ "bos_token": "<|startoftext|>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "<|endoftext|>",
39
+ "model_max_length": 1024,
40
+ "pad_token": "<unk>",
41
+ "sp_model_kwargs": {},
42
+ "tokenizer_class": "YiTokenizer",
43
+ "unk_token": "<unk>"
44
+ }
checkpoint-176/trainer_state.json ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 44.0,
5
+ "eval_steps": 500,
6
+ "global_step": 176,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.75,
13
+ "learning_rate": 0.0003,
14
+ "loss": 1.0279,
15
+ "step": 3
16
+ },
17
+ {
18
+ "epoch": 1.5,
19
+ "learning_rate": 0.0006,
20
+ "loss": 0.9556,
21
+ "step": 6
22
+ },
23
+ {
24
+ "epoch": 2.25,
25
+ "learning_rate": 0.0009000000000000001,
26
+ "loss": 0.9999,
27
+ "step": 9
28
+ },
29
+ {
30
+ "epoch": 3.0,
31
+ "learning_rate": 0.0012,
32
+ "loss": 0.7355,
33
+ "step": 12
34
+ },
35
+ {
36
+ "epoch": 3.75,
37
+ "learning_rate": 0.0015,
38
+ "loss": 0.6017,
39
+ "step": 15
40
+ },
41
+ {
42
+ "epoch": 4.5,
43
+ "learning_rate": 0.0018000000000000002,
44
+ "loss": 0.4413,
45
+ "step": 18
46
+ },
47
+ {
48
+ "epoch": 5.25,
49
+ "learning_rate": 0.001988888888888889,
50
+ "loss": 0.3896,
51
+ "step": 21
52
+ },
53
+ {
54
+ "epoch": 6.0,
55
+ "learning_rate": 0.0019555555555555554,
56
+ "loss": 0.2588,
57
+ "step": 24
58
+ },
59
+ {
60
+ "epoch": 6.75,
61
+ "learning_rate": 0.0019222222222222223,
62
+ "loss": 0.1937,
63
+ "step": 27
64
+ },
65
+ {
66
+ "epoch": 7.5,
67
+ "learning_rate": 0.001888888888888889,
68
+ "loss": 0.1074,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 8.25,
73
+ "learning_rate": 0.0018555555555555556,
74
+ "loss": 0.1232,
75
+ "step": 33
76
+ },
77
+ {
78
+ "epoch": 9.0,
79
+ "learning_rate": 0.0018222222222222223,
80
+ "loss": 0.086,
81
+ "step": 36
82
+ },
83
+ {
84
+ "epoch": 9.75,
85
+ "learning_rate": 0.0018111111111111112,
86
+ "loss": 1.6039,
87
+ "step": 39
88
+ },
89
+ {
90
+ "epoch": 10.5,
91
+ "learning_rate": 0.0018000000000000002,
92
+ "loss": 1.4514,
93
+ "step": 42
94
+ },
95
+ {
96
+ "epoch": 11.25,
97
+ "learning_rate": 0.0017666666666666666,
98
+ "loss": 0.158,
99
+ "step": 45
100
+ },
101
+ {
102
+ "epoch": 12.0,
103
+ "learning_rate": 0.0017333333333333335,
104
+ "loss": 0.0712,
105
+ "step": 48
106
+ },
107
+ {
108
+ "epoch": 12.75,
109
+ "learning_rate": 0.0017,
110
+ "loss": 0.052,
111
+ "step": 51
112
+ },
113
+ {
114
+ "epoch": 13.5,
115
+ "learning_rate": 0.0016666666666666668,
116
+ "loss": 0.0412,
117
+ "step": 54
118
+ },
119
+ {
120
+ "epoch": 14.25,
121
+ "learning_rate": 0.0016333333333333334,
122
+ "loss": 0.0452,
123
+ "step": 57
124
+ },
125
+ {
126
+ "epoch": 15.0,
127
+ "learning_rate": 0.0016,
128
+ "loss": 0.0432,
129
+ "step": 60
130
+ },
131
+ {
132
+ "epoch": 15.75,
133
+ "learning_rate": 0.0015666666666666667,
134
+ "loss": 0.025,
135
+ "step": 63
136
+ },
137
+ {
138
+ "epoch": 16.5,
139
+ "learning_rate": 0.0015333333333333334,
140
+ "loss": 0.0245,
141
+ "step": 66
142
+ },
143
+ {
144
+ "epoch": 17.25,
145
+ "learning_rate": 0.0015,
146
+ "loss": 0.0172,
147
+ "step": 69
148
+ },
149
+ {
150
+ "epoch": 18.0,
151
+ "learning_rate": 0.0014666666666666667,
152
+ "loss": 0.0154,
153
+ "step": 72
154
+ },
155
+ {
156
+ "epoch": 18.75,
157
+ "learning_rate": 0.0014333333333333333,
158
+ "loss": 0.0128,
159
+ "step": 75
160
+ },
161
+ {
162
+ "epoch": 19.5,
163
+ "learning_rate": 0.0014,
164
+ "loss": 0.0171,
165
+ "step": 78
166
+ },
167
+ {
168
+ "epoch": 20.25,
169
+ "learning_rate": 0.0013666666666666666,
170
+ "loss": 0.011,
171
+ "step": 81
172
+ },
173
+ {
174
+ "epoch": 21.0,
175
+ "learning_rate": 0.0013333333333333333,
176
+ "loss": 0.0218,
177
+ "step": 84
178
+ },
179
+ {
180
+ "epoch": 21.75,
181
+ "learning_rate": 0.0013000000000000002,
182
+ "loss": 0.0102,
183
+ "step": 87
184
+ },
185
+ {
186
+ "epoch": 22.5,
187
+ "learning_rate": 0.0012666666666666666,
188
+ "loss": 0.0063,
189
+ "step": 90
190
+ },
191
+ {
192
+ "epoch": 23.25,
193
+ "learning_rate": 0.0012333333333333335,
194
+ "loss": 0.0073,
195
+ "step": 93
196
+ },
197
+ {
198
+ "epoch": 24.0,
199
+ "learning_rate": 0.0012,
200
+ "loss": 0.0047,
201
+ "step": 96
202
+ },
203
+ {
204
+ "epoch": 24.75,
205
+ "learning_rate": 0.0011666666666666668,
206
+ "loss": 0.0036,
207
+ "step": 99
208
+ },
209
+ {
210
+ "epoch": 25.5,
211
+ "learning_rate": 0.0011333333333333334,
212
+ "loss": 0.0037,
213
+ "step": 102
214
+ },
215
+ {
216
+ "epoch": 26.25,
217
+ "learning_rate": 0.0011,
218
+ "loss": 0.0033,
219
+ "step": 105
220
+ },
221
+ {
222
+ "epoch": 27.0,
223
+ "learning_rate": 0.0010666666666666667,
224
+ "loss": 0.0039,
225
+ "step": 108
226
+ },
227
+ {
228
+ "epoch": 27.75,
229
+ "learning_rate": 0.0010333333333333334,
230
+ "loss": 0.0025,
231
+ "step": 111
232
+ },
233
+ {
234
+ "epoch": 28.5,
235
+ "learning_rate": 0.001,
236
+ "loss": 0.0033,
237
+ "step": 114
238
+ },
239
+ {
240
+ "epoch": 29.25,
241
+ "learning_rate": 0.0009666666666666667,
242
+ "loss": 0.0021,
243
+ "step": 117
244
+ },
245
+ {
246
+ "epoch": 30.0,
247
+ "learning_rate": 0.0009333333333333333,
248
+ "loss": 0.0027,
249
+ "step": 120
250
+ },
251
+ {
252
+ "epoch": 30.75,
253
+ "learning_rate": 0.0009000000000000001,
254
+ "loss": 0.0017,
255
+ "step": 123
256
+ },
257
+ {
258
+ "epoch": 31.5,
259
+ "learning_rate": 0.0008666666666666667,
260
+ "loss": 0.0016,
261
+ "step": 126
262
+ },
263
+ {
264
+ "epoch": 32.25,
265
+ "learning_rate": 0.0008333333333333334,
266
+ "loss": 0.0015,
267
+ "step": 129
268
+ },
269
+ {
270
+ "epoch": 33.0,
271
+ "learning_rate": 0.0008,
272
+ "loss": 0.0013,
273
+ "step": 132
274
+ },
275
+ {
276
+ "epoch": 33.75,
277
+ "learning_rate": 0.0007666666666666667,
278
+ "loss": 0.0008,
279
+ "step": 135
280
+ },
281
+ {
282
+ "epoch": 34.5,
283
+ "learning_rate": 0.0007333333333333333,
284
+ "loss": 0.0018,
285
+ "step": 138
286
+ },
287
+ {
288
+ "epoch": 35.25,
289
+ "learning_rate": 0.0007,
290
+ "loss": 0.0012,
291
+ "step": 141
292
+ },
293
+ {
294
+ "epoch": 36.0,
295
+ "learning_rate": 0.0006666666666666666,
296
+ "loss": 0.0012,
297
+ "step": 144
298
+ },
299
+ {
300
+ "epoch": 36.75,
301
+ "learning_rate": 0.0006333333333333333,
302
+ "loss": 0.0009,
303
+ "step": 147
304
+ },
305
+ {
306
+ "epoch": 37.5,
307
+ "learning_rate": 0.0006,
308
+ "loss": 0.0009,
309
+ "step": 150
310
+ },
311
+ {
312
+ "epoch": 38.25,
313
+ "learning_rate": 0.0005666666666666667,
314
+ "loss": 0.0011,
315
+ "step": 153
316
+ },
317
+ {
318
+ "epoch": 39.0,
319
+ "learning_rate": 0.0005333333333333334,
320
+ "loss": 0.0008,
321
+ "step": 156
322
+ },
323
+ {
324
+ "epoch": 39.75,
325
+ "learning_rate": 0.0005,
326
+ "loss": 0.0009,
327
+ "step": 159
328
+ },
329
+ {
330
+ "epoch": 40.5,
331
+ "learning_rate": 0.00046666666666666666,
332
+ "loss": 0.0008,
333
+ "step": 162
334
+ },
335
+ {
336
+ "epoch": 41.25,
337
+ "learning_rate": 0.00043333333333333337,
338
+ "loss": 0.0008,
339
+ "step": 165
340
+ },
341
+ {
342
+ "epoch": 42.0,
343
+ "learning_rate": 0.0004,
344
+ "loss": 0.0007,
345
+ "step": 168
346
+ },
347
+ {
348
+ "epoch": 42.75,
349
+ "learning_rate": 0.00036666666666666667,
350
+ "loss": 0.0009,
351
+ "step": 171
352
+ },
353
+ {
354
+ "epoch": 43.5,
355
+ "learning_rate": 0.0003333333333333333,
356
+ "loss": 0.0006,
357
+ "step": 174
358
+ }
359
+ ],
360
+ "logging_steps": 3,
361
+ "max_steps": 200,
362
+ "num_train_epochs": 50,
363
+ "save_steps": 500,
364
+ "total_flos": 1.4685426682744013e+17,
365
+ "trial_name": null,
366
+ "trial_params": null
367
+ }