terry69 committed on
Commit
fe4b622
·
verified ·
1 Parent(s): a030b12

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,11 @@
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
 
8
  - generated_from_trainer
9
  base_model: mistralai/Mistral-7B-v0.1
10
- datasets:
11
- - HuggingFaceH4/ultrachat_200k
12
  model-index:
13
  - name: mistral5p
14
  results: []
@@ -19,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # mistral5p
21
 
22
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the HuggingFaceH4/ultrachat_200k dataset.
23
  It achieves the following results on the evaluation set:
24
  - Loss: nan
25
 
@@ -46,8 +44,8 @@ The following hyperparameters were used during training:
46
  - seed: 42
47
  - distributed_type: multi-GPU
48
  - num_devices: 4
49
- - gradient_accumulation_steps: 4
50
- - total_train_batch_size: 128
51
  - total_eval_batch_size: 4
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
@@ -56,9 +54,9 @@ The following hyperparameters were used during training:
56
 
57
  ### Training results
58
 
59
- | Training Loss | Epoch | Step | Validation Loss |
60
- |:-------------:|:------:|:----:|:---------------:|
61
- | 0.7092 | 0.9969 | 243 | nan |
62
 
63
 
64
  ### Framework versions
 
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
+ - alignment-handbook
8
  - generated_from_trainer
9
  base_model: mistralai/Mistral-7B-v0.1
 
 
10
  model-index:
11
  - name: mistral5p
12
  results: []
 
17
 
18
  # mistral5p
19
 
20
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
21
  It achieves the following results on the evaluation set:
22
  - Loss: nan
23
 
 
44
  - seed: 42
45
  - distributed_type: multi-GPU
46
  - num_devices: 4
47
+ - gradient_accumulation_steps: 8
48
+ - total_train_batch_size: 256
49
  - total_eval_batch_size: 4
50
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
  - lr_scheduler_type: cosine
 
54
 
55
  ### Training results
56
 
57
+ | Training Loss | Epoch | Step | Validation Loss |
58
+ |:-------------:|:-----:|:----:|:---------------:|
59
+ | 0.6894 | 1.0 | 406 | nan |
60
 
61
 
62
  ### Framework versions
adapter_config.json CHANGED
@@ -22,11 +22,11 @@
22
  "target_modules": [
23
  "k_proj",
24
  "gate_proj",
25
- "o_proj",
26
- "q_proj",
27
  "down_proj",
28
- "up_proj",
29
- "v_proj"
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
22
  "target_modules": [
23
  "k_proj",
24
  "gate_proj",
 
 
25
  "down_proj",
26
+ "q_proj",
27
+ "o_proj",
28
+ "v_proj",
29
+ "up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fe90f2f98208e74edd203b571c5549da46239bd0283d4b96ce66fefe4c7662e
3
- size 31516744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9915b7e1547d3d9bed1440525979f707b4f386020b4013df18918578132e931f
3
+ size 62973728
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9969230769230769,
3
- "total_flos": 5623477606285312.0,
4
- "train_loss": 0.7391852758548878,
5
- "train_runtime": 18302.1626,
6
- "train_samples": 31180,
7
- "train_samples_per_second": 1.704,
8
- "train_steps_per_second": 0.013
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 2.1219407349982167e+18,
4
+ "train_loss": 0.3640357888684484,
5
+ "train_runtime": 42241.1922,
6
+ "train_samples": 103932,
7
+ "train_samples_per_second": 2.46,
8
+ "train_steps_per_second": 0.01
9
  }
runs/Jun07_23-14-40_ip-172-31-69-60.ec2.internal/events.out.tfevents.1717802114.ip-172-31-69-60.ec2.internal.32152.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b85799135ab367db15a1795367cf3873efe2ee1e377c5075a8ce83e03c97e46
3
- size 13528
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3aa58153670c2e6ce55b2a7cf6665a2aa2ed380a166f90c5e8e077be316a481
3
+ size 14364
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9969230769230769,
3
- "total_flos": 5623477606285312.0,
4
- "train_loss": 0.7391852758548878,
5
- "train_runtime": 18302.1626,
6
- "train_samples": 31180,
7
- "train_samples_per_second": 1.704,
8
- "train_steps_per_second": 0.013
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 2.1219407349982167e+18,
4
+ "train_loss": 0.3640357888684484,
5
+ "train_runtime": 42241.1922,
6
+ "train_samples": 103932,
7
+ "train_samples_per_second": 2.46,
8
+ "train_steps_per_second": 0.01
9
  }
trainer_state.json CHANGED
@@ -1,392 +1,623 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9969230769230769,
5
  "eval_steps": 500,
6
- "global_step": 243,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0041025641025641026,
13
- "grad_norm": 0.26542961092188894,
14
- "learning_rate": 8.000000000000001e-06,
15
- "loss": 0.8534,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.020512820512820513,
20
- "grad_norm": 0.2656899378598073,
21
- "learning_rate": 4e-05,
22
- "loss": 0.8667,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.041025641025641026,
27
- "grad_norm": 0.2572181527992512,
28
- "learning_rate": 8e-05,
29
- "loss": 0.8031,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.06153846153846154,
34
- "grad_norm": 0.18710816410634198,
35
- "learning_rate": 0.00012,
36
- "loss": 0.7552,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.08205128205128205,
41
- "grad_norm": 0.17366628242950644,
42
- "learning_rate": 0.00016,
43
- "loss": 0.7434,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.10256410256410256,
48
- "grad_norm": 0.18209587440813843,
49
- "learning_rate": 0.0002,
50
- "loss": 0.7613,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.12307692307692308,
55
- "grad_norm": 0.17584855523843826,
56
- "learning_rate": 0.00019974051702905277,
57
- "loss": 0.7686,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.14358974358974358,
62
- "grad_norm": 0.1876526868957483,
63
- "learning_rate": 0.00019896341474445525,
64
- "loss": 0.7408,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.1641025641025641,
69
- "grad_norm": 0.14962305137903897,
70
- "learning_rate": 0.00019767272604239824,
71
- "loss": 0.7422,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.18461538461538463,
76
- "grad_norm": 0.16336671883236106,
77
- "learning_rate": 0.00019587514915766124,
78
- "loss": 0.7565,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.20512820512820512,
83
- "grad_norm": 0.15152018240587575,
84
- "learning_rate": 0.00019358001290205543,
85
- "loss": 0.7493,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.22564102564102564,
90
- "grad_norm": 0.14810534949892065,
91
- "learning_rate": 0.0001907992282510675,
92
- "loss": 0.7539,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.24615384615384617,
97
- "grad_norm": 0.16847734140964582,
98
- "learning_rate": 0.00018754722652995347,
99
- "loss": 0.7395,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.26666666666666666,
104
- "grad_norm": 0.15436628692835286,
105
- "learning_rate": 0.00018384088452007578,
106
- "loss": 0.747,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.28717948717948716,
111
- "grad_norm": 0.15652291403007046,
112
- "learning_rate": 0.00017969943687415576,
113
- "loss": 0.7506,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.3076923076923077,
118
- "grad_norm": 0.16413347022113,
119
- "learning_rate": 0.0001751443762949772,
120
- "loss": 0.7611,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.3282051282051282,
125
- "grad_norm": 0.16182771329972745,
126
- "learning_rate": 0.00017019934199557867,
127
- "loss": 0.7576,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.3487179487179487,
132
- "grad_norm": 0.137569364566755,
133
- "learning_rate": 0.00016488999701978903,
134
- "loss": 0.7451,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.36923076923076925,
139
- "grad_norm": 0.14900540538806303,
140
- "learning_rate": 0.00015924389505977038,
141
- "loss": 0.7197,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.38974358974358975,
146
- "grad_norm": 0.14655990407738764,
147
- "learning_rate": 0.00015329033746173975,
148
- "loss": 0.7149,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.41025641025641024,
153
- "grad_norm": 0.1402768270199422,
154
- "learning_rate": 0.00014706022116196208,
155
- "loss": 0.7018,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.4307692307692308,
160
- "grad_norm": 0.14134928021057855,
161
- "learning_rate": 0.00014058587834217355,
162
- "loss": 0.7324,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.4512820512820513,
167
- "grad_norm": 0.15600188868890955,
168
- "learning_rate": 0.00013390090863657047,
169
- "loss": 0.748,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.4717948717948718,
174
- "grad_norm": 0.13996117144988574,
175
- "learning_rate": 0.0001270400047611508,
176
- "loss": 0.7703,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.49230769230769234,
181
- "grad_norm": 0.1510982087519504,
182
- "learning_rate": 0.00012003877247033411,
183
- "loss": 0.7515,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.5128205128205128,
188
- "grad_norm": 0.14250885612041378,
189
- "learning_rate": 0.00011293354577522263,
190
- "loss": 0.7196,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.5333333333333333,
195
- "grad_norm": 0.14795873562666287,
196
- "learning_rate": 0.00010576119838245844,
197
- "loss": 0.731,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.5538461538461539,
202
- "grad_norm": 0.13063022182064904,
203
- "learning_rate": 9.85589523322443e-05,
204
- "loss": 0.7301,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 0.5743589743589743,
209
- "grad_norm": 0.13557487318693218,
210
- "learning_rate": 9.136418482863229e-05,
211
- "loss": 0.718,
212
  "step": 140
213
  },
214
  {
215
- "epoch": 0.5948717948717949,
216
- "grad_norm": 0.15507641481392034,
217
- "learning_rate": 8.42142342645646e-05,
218
- "loss": 0.7193,
219
  "step": 145
220
  },
221
  {
222
- "epoch": 0.6153846153846154,
223
- "grad_norm": 0.15494002592225475,
224
- "learning_rate": 7.714620644833111e-05,
225
- "loss": 0.731,
226
  "step": 150
227
  },
228
  {
229
- "epoch": 0.6358974358974359,
230
- "grad_norm": 0.1601203804358568,
231
- "learning_rate": 7.019678203706163e-05,
232
- "loss": 0.75,
233
  "step": 155
234
  },
235
  {
236
- "epoch": 0.6564102564102564,
237
- "grad_norm": 0.15685275896075176,
238
- "learning_rate": 6.340202617660842e-05,
239
- "loss": 0.7505,
240
  "step": 160
241
  },
242
  {
243
- "epoch": 0.676923076923077,
244
- "grad_norm": 0.1416704035001306,
245
- "learning_rate": 5.679720133572206e-05,
246
- "loss": 0.7311,
247
  "step": 165
248
  },
249
  {
250
- "epoch": 0.6974358974358974,
251
- "grad_norm": 0.14126335283382643,
252
- "learning_rate": 5.0416584305848524e-05,
253
- "loss": 0.755,
254
  "step": 170
255
  },
256
  {
257
- "epoch": 0.717948717948718,
258
- "grad_norm": 0.13481365291511518,
259
- "learning_rate": 4.4293288316255653e-05,
260
- "loss": 0.695,
261
  "step": 175
262
  },
263
  {
264
- "epoch": 0.7384615384615385,
265
- "grad_norm": 0.14785228000969083,
266
- "learning_rate": 3.845909118765073e-05,
267
- "loss": 0.7209,
268
  "step": 180
269
  },
270
  {
271
- "epoch": 0.7589743589743589,
272
- "grad_norm": 0.1503268472656009,
273
- "learning_rate": 3.294427041611425e-05,
274
- "loss": 0.7307,
275
  "step": 185
276
  },
277
  {
278
- "epoch": 0.7794871794871795,
279
- "grad_norm": 0.15299701883959732,
280
- "learning_rate": 2.7777446043207058e-05,
281
- "loss": 0.7351,
282
  "step": 190
283
  },
284
  {
285
- "epoch": 0.8,
286
- "grad_norm": 0.1373494159366909,
287
- "learning_rate": 2.2985432127701946e-05,
288
- "loss": 0.7304,
289
  "step": 195
290
  },
291
  {
292
- "epoch": 0.8205128205128205,
293
- "grad_norm": 0.16245499908137687,
294
- "learning_rate": 1.859309758975132e-05,
295
- "loss": 0.7385,
296
  "step": 200
297
  },
298
  {
299
- "epoch": 0.841025641025641,
300
- "grad_norm": 0.13807085431435603,
301
- "learning_rate": 1.462323714966114e-05,
302
- "loss": 0.714,
303
  "step": 205
304
  },
305
  {
306
- "epoch": 0.8615384615384616,
307
- "grad_norm": 0.12926916684005163,
308
- "learning_rate": 1.1096453031056264e-05,
309
- "loss": 0.7078,
310
  "step": 210
311
  },
312
  {
313
- "epoch": 0.882051282051282,
314
- "grad_norm": 0.12918592796308734,
315
- "learning_rate": 8.031048042356392e-06,
316
- "loss": 0.7208,
317
  "step": 215
318
  },
319
  {
320
- "epoch": 0.9025641025641026,
321
- "grad_norm": 0.14837431396022016,
322
- "learning_rate": 5.442930591433992e-06,
323
- "loss": 0.7435,
324
  "step": 220
325
  },
326
  {
327
- "epoch": 0.9230769230769231,
328
- "grad_norm": 0.14838485907528307,
329
- "learning_rate": 3.3455321263955786e-06,
330
- "loss": 0.7255,
331
  "step": 225
332
  },
333
  {
334
- "epoch": 0.9435897435897436,
335
- "grad_norm": 0.1480931290003826,
336
- "learning_rate": 1.7497374309405346e-06,
337
- "loss": 0.695,
338
  "step": 230
339
  },
340
  {
341
- "epoch": 0.9641025641025641,
342
- "grad_norm": 0.14733343614150893,
343
- "learning_rate": 6.638281360408339e-07,
344
- "loss": 0.7012,
345
  "step": 235
346
  },
347
  {
348
- "epoch": 0.9846153846153847,
349
- "grad_norm": 0.1368626907349573,
350
- "learning_rate": 9.343974109685682e-08,
351
- "loss": 0.7092,
352
  "step": 240
353
  },
354
  {
355
- "epoch": 0.9969230769230769,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  "eval_loss": NaN,
357
- "eval_runtime": 748.4122,
358
- "eval_samples_per_second": 1.545,
359
- "eval_steps_per_second": 0.386,
360
- "step": 243
361
- },
362
- {
363
- "epoch": 0.9969230769230769,
364
- "step": 243,
365
- "total_flos": 5623477606285312.0,
366
- "train_loss": 0.7391852758548878,
367
- "train_runtime": 18302.1626,
368
- "train_samples_per_second": 1.704,
369
- "train_steps_per_second": 0.013
370
  }
371
  ],
372
  "logging_steps": 5,
373
- "max_steps": 243,
374
  "num_input_tokens_seen": 0,
375
  "num_train_epochs": 1,
376
- "save_steps": 1000,
377
  "stateful_callbacks": {
378
  "TrainerControl": {
379
  "args": {
380
  "should_epoch_stop": false,
381
  "should_evaluate": false,
382
  "should_log": false,
383
- "should_save": false,
384
  "should_training_stop": false
385
  },
386
  "attributes": {}
387
  }
388
  },
389
- "total_flos": 5623477606285312.0,
390
  "train_batch_size": 8,
391
  "trial_name": null,
392
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 406,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0024630541871921183,
13
+ "grad_norm": 0.13191281259059906,
14
+ "learning_rate": 4.8780487804878055e-06,
15
+ "loss": 0.7592,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.012315270935960592,
20
+ "grad_norm": 0.1377675086259842,
21
+ "learning_rate": 2.4390243902439026e-05,
22
+ "loss": 0.7841,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.024630541871921183,
27
+ "grad_norm": 0.11912436038255692,
28
+ "learning_rate": 4.878048780487805e-05,
29
+ "loss": 0.7896,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.03694581280788178,
34
+ "grad_norm": 0.12670652568340302,
35
+ "learning_rate": 7.317073170731707e-05,
36
+ "loss": 0.7835,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.04926108374384237,
41
+ "grad_norm": 0.10752439498901367,
42
+ "learning_rate": 9.75609756097561e-05,
43
+ "loss": 0.7806,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.06157635467980296,
48
+ "grad_norm": 0.10097604990005493,
49
+ "learning_rate": 0.00012195121951219512,
50
+ "loss": 0.7567,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.07389162561576355,
55
+ "grad_norm": 0.10570556670427322,
56
+ "learning_rate": 0.00014634146341463414,
57
+ "loss": 0.7527,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.08620689655172414,
62
+ "grad_norm": 0.11552459746599197,
63
+ "learning_rate": 0.0001707317073170732,
64
+ "loss": 0.7514,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.09852216748768473,
69
+ "grad_norm": 0.11246493458747864,
70
+ "learning_rate": 0.0001951219512195122,
71
+ "loss": 0.75,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.11083743842364532,
76
+ "grad_norm": 0.10585317760705948,
77
+ "learning_rate": 0.0001999407400739705,
78
+ "loss": 0.738,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.12315270935960591,
83
+ "grad_norm": 0.10629229247570038,
84
+ "learning_rate": 0.00019970011699250152,
85
+ "loss": 0.7748,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.1354679802955665,
90
+ "grad_norm": 0.09721837192773819,
91
+ "learning_rate": 0.00019927487224577402,
92
+ "loss": 0.7469,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.1477832512315271,
97
+ "grad_norm": 0.10196585208177567,
98
+ "learning_rate": 0.0001986657932891657,
99
+ "loss": 0.7435,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.16009852216748768,
104
+ "grad_norm": 0.09946288913488388,
105
+ "learning_rate": 0.00019787400799669154,
106
+ "loss": 0.7482,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.1724137931034483,
111
+ "grad_norm": 0.10071098804473877,
112
+ "learning_rate": 0.00019690098257244064,
113
+ "loss": 0.7551,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.18472906403940886,
118
+ "grad_norm": 0.10068488121032715,
119
+ "learning_rate": 0.00019574851883550395,
120
+ "loss": 0.7502,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.19704433497536947,
125
+ "grad_norm": 0.10105381160974503,
126
+ "learning_rate": 0.00019441875088341997,
127
+ "loss": 0.7438,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.20935960591133004,
132
+ "grad_norm": 0.09201391041278839,
133
+ "learning_rate": 0.00019291414114031743,
134
+ "loss": 0.7374,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.22167487684729065,
139
+ "grad_norm": 0.09724140912294388,
140
+ "learning_rate": 0.00019123747579707275,
141
+ "loss": 0.7588,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.23399014778325122,
146
+ "grad_norm": 0.09425447881221771,
147
+ "learning_rate": 0.0001893918596519257,
148
+ "loss": 0.7631,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.24630541871921183,
153
+ "grad_norm": 0.10638237744569778,
154
+ "learning_rate": 0.00018738071036110808,
155
+ "loss": 0.7507,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.25862068965517243,
160
+ "grad_norm": 0.10152100026607513,
161
+ "learning_rate": 0.00018520775211013093,
162
+ "loss": 0.7503,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.270935960591133,
167
+ "grad_norm": 0.09959056228399277,
168
+ "learning_rate": 0.00018287700871745036,
169
+ "loss": 0.7644,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 0.2832512315270936,
174
+ "grad_norm": 0.10374708473682404,
175
+ "learning_rate": 0.00018039279618328212,
176
+ "loss": 0.7601,
177
  "step": 115
178
  },
179
  {
180
+ "epoch": 0.2955665024630542,
181
+ "grad_norm": 0.10066307336091995,
182
+ "learning_rate": 0.0001777597146973627,
183
+ "loss": 0.7392,
184
  "step": 120
185
  },
186
  {
187
+ "epoch": 0.3078817733990148,
188
+ "grad_norm": 0.1052442416548729,
189
+ "learning_rate": 0.00017498264012045687,
190
+ "loss": 0.7443,
191
  "step": 125
192
  },
193
  {
194
+ "epoch": 0.32019704433497537,
195
+ "grad_norm": 0.09332836419343948,
196
+ "learning_rate": 0.00017206671495538612,
197
+ "loss": 0.7407,
198
  "step": 130
199
  },
200
  {
201
+ "epoch": 0.33251231527093594,
202
+ "grad_norm": 0.10167726129293442,
203
+ "learning_rate": 0.0001690173388242972,
204
+ "loss": 0.737,
205
  "step": 135
206
  },
207
  {
208
+ "epoch": 0.3448275862068966,
209
+ "grad_norm": 0.09576103836297989,
210
+ "learning_rate": 0.0001658401584698049,
211
+ "loss": 0.7467,
212
  "step": 140
213
  },
214
  {
215
+ "epoch": 0.35714285714285715,
216
+ "grad_norm": 0.09853356331586838,
217
+ "learning_rate": 0.00016254105729852464,
218
+ "loss": 0.7413,
219
  "step": 145
220
  },
221
  {
222
+ "epoch": 0.3694581280788177,
223
+ "grad_norm": 0.0903814435005188,
224
+ "learning_rate": 0.00015912614448635782,
225
+ "loss": 0.7529,
226
  "step": 150
227
  },
228
  {
229
+ "epoch": 0.3817733990147783,
230
+ "grad_norm": 0.10001372545957565,
231
+ "learning_rate": 0.00015560174366570446,
232
+ "loss": 0.7396,
233
  "step": 155
234
  },
235
  {
236
+ "epoch": 0.39408866995073893,
237
+ "grad_norm": 0.10121896117925644,
238
+ "learning_rate": 0.0001519743812155516,
239
+ "loss": 0.7536,
240
  "step": 160
241
  },
242
  {
243
+ "epoch": 0.4064039408866995,
244
+ "grad_norm": 0.09317266196012497,
245
+ "learning_rate": 0.00014825077417612186,
246
+ "loss": 0.754,
247
  "step": 165
248
  },
249
  {
250
+ "epoch": 0.4187192118226601,
251
+ "grad_norm": 0.09699834138154984,
252
+ "learning_rate": 0.00014443781781046136,
253
+ "loss": 0.7585,
254
  "step": 170
255
  },
256
  {
257
+ "epoch": 0.43103448275862066,
258
+ "grad_norm": 0.09634856134653091,
259
+ "learning_rate": 0.00014054257283599973,
260
+ "loss": 0.7193,
261
  "step": 175
262
  },
263
  {
264
+ "epoch": 0.4433497536945813,
265
+ "grad_norm": 0.09830117970705032,
266
+ "learning_rate": 0.00013657225234972695,
267
+ "loss": 0.7346,
268
  "step": 180
269
  },
270
  {
271
+ "epoch": 0.45566502463054187,
272
+ "grad_norm": 0.09632200747728348,
273
+ "learning_rate": 0.00013253420847119803,
274
+ "loss": 0.7356,
275
  "step": 185
276
  },
277
  {
278
+ "epoch": 0.46798029556650245,
279
+ "grad_norm": 0.09794441610574722,
280
+ "learning_rate": 0.0001284359187281004,
281
+ "loss": 0.7404,
282
  "step": 190
283
  },
284
  {
285
+ "epoch": 0.4802955665024631,
286
+ "grad_norm": 0.09384100884199142,
287
+ "learning_rate": 0.0001242849722095936,
288
+ "loss": 0.7282,
289
  "step": 195
290
  },
291
  {
292
+ "epoch": 0.49261083743842365,
293
+ "grad_norm": 0.09335623681545258,
294
+ "learning_rate": 0.00012008905551306356,
295
+ "loss": 0.7511,
296
  "step": 200
297
  },
298
  {
299
+ "epoch": 0.5049261083743842,
300
+ "grad_norm": 0.11448359489440918,
301
+ "learning_rate": 0.00011585593851031347,
302
+ "loss": 0.7173,
303
  "step": 205
304
  },
305
  {
306
+ "epoch": 0.5172413793103449,
307
+ "grad_norm": 0.11399874091148376,
308
+ "learning_rate": 0.00011159345995955006,
309
+ "loss": 0.7261,
310
  "step": 210
311
  },
312
  {
313
+ "epoch": 0.5295566502463054,
314
+ "grad_norm": 0.10711462795734406,
315
+ "learning_rate": 0.00010730951298980776,
316
+ "loss": 0.7293,
317
  "step": 215
318
  },
319
  {
320
+ "epoch": 0.541871921182266,
321
+ "grad_norm": 0.11034953594207764,
322
+ "learning_rate": 0.00010301203048469083,
323
+ "loss": 0.7284,
324
  "step": 220
325
  },
326
  {
327
+ "epoch": 0.5541871921182266,
328
+ "grad_norm": 0.11478842049837112,
329
+ "learning_rate": 9.870897039249911e-05,
330
+ "loss": 0.7146,
331
  "step": 225
332
  },
333
  {
334
+ "epoch": 0.5665024630541872,
335
+ "grad_norm": 0.1125415787100792,
336
+ "learning_rate": 9.440830098993969e-05,
337
+ "loss": 0.7218,
338
  "step": 230
339
  },
340
  {
341
+ "epoch": 0.5788177339901478,
342
+ "grad_norm": 0.11621884256601334,
343
+ "learning_rate": 9.011798612671286e-05,
344
+ "loss": 0.7242,
345
  "step": 235
346
  },
347
  {
348
+ "epoch": 0.5911330049261084,
349
+ "grad_norm": 0.11078532040119171,
350
+ "learning_rate": 8.58459704782957e-05,
351
+ "loss": 0.7268,
352
  "step": 240
353
  },
354
  {
355
+ "epoch": 0.603448275862069,
356
+ "grad_norm": 0.11298695206642151,
357
+ "learning_rate": 8.160016483423199e-05,
358
+ "loss": 0.7317,
359
+ "step": 245
360
+ },
361
+ {
362
+ "epoch": 0.6157635467980296,
363
+ "grad_norm": 0.1098036915063858,
364
+ "learning_rate": 7.738843144917119e-05,
365
+ "loss": 0.7174,
366
+ "step": 250
367
+ },
368
+ {
369
+ "epoch": 0.6280788177339901,
370
+ "grad_norm": 0.11521229147911072,
371
+ "learning_rate": 7.321856948378259e-05,
372
+ "loss": 0.7123,
373
+ "step": 255
374
+ },
375
+ {
376
+ "epoch": 0.6403940886699507,
377
+ "grad_norm": 0.11004281044006348,
378
+ "learning_rate": 6.909830056250527e-05,
379
+ "loss": 0.7247,
380
+ "step": 260
381
+ },
382
+ {
383
+ "epoch": 0.6527093596059114,
384
+ "grad_norm": 0.11443013697862625,
385
+ "learning_rate": 6.503525447487715e-05,
386
+ "loss": 0.7016,
387
+ "step": 265
388
+ },
389
+ {
390
+ "epoch": 0.6650246305418719,
391
+ "grad_norm": 0.12297528237104416,
392
+ "learning_rate": 6.103695504692122e-05,
393
+ "loss": 0.7014,
394
+ "step": 270
395
+ },
396
+ {
397
+ "epoch": 0.6773399014778325,
398
+ "grad_norm": 0.11536238342523575,
399
+ "learning_rate": 5.7110806208751655e-05,
400
+ "loss": 0.7248,
401
+ "step": 275
402
+ },
403
+ {
404
+ "epoch": 0.6896551724137931,
405
+ "grad_norm": 0.1148216500878334,
406
+ "learning_rate": 5.326407828419979e-05,
407
+ "loss": 0.7145,
408
+ "step": 280
409
+ },
410
+ {
411
+ "epoch": 0.7019704433497537,
412
+ "grad_norm": 0.11440951377153397,
413
+ "learning_rate": 4.9503894527847964e-05,
414
+ "loss": 0.7127,
415
+ "step": 285
416
+ },
417
+ {
418
+ "epoch": 0.7142857142857143,
419
+ "grad_norm": 0.11594696342945099,
420
+ "learning_rate": 4.583721793440188e-05,
421
+ "loss": 0.7491,
422
+ "step": 290
423
+ },
424
+ {
425
+ "epoch": 0.7266009852216748,
426
+ "grad_norm": 0.1071944460272789,
427
+ "learning_rate": 4.227083834482728e-05,
428
+ "loss": 0.7173,
429
+ "step": 295
430
+ },
431
+ {
432
+ "epoch": 0.7389162561576355,
433
+ "grad_norm": 0.11470213532447815,
434
+ "learning_rate": 3.881135987312757e-05,
435
+ "loss": 0.72,
436
+ "step": 300
437
+ },
438
+ {
439
+ "epoch": 0.7512315270935961,
440
+ "grad_norm": 0.11714080721139908,
441
+ "learning_rate": 3.546518867704499e-05,
442
+ "loss": 0.7289,
443
+ "step": 305
444
+ },
445
+ {
446
+ "epoch": 0.7635467980295566,
447
+ "grad_norm": 0.11828279495239258,
448
+ "learning_rate": 3.223852109533112e-05,
449
+ "loss": 0.7209,
450
+ "step": 310
451
+ },
452
+ {
453
+ "epoch": 0.7758620689655172,
454
+ "grad_norm": 0.1184413954615593,
455
+ "learning_rate": 2.9137332173554043e-05,
456
+ "loss": 0.7071,
457
+ "step": 315
458
+ },
459
+ {
460
+ "epoch": 0.7881773399014779,
461
+ "grad_norm": 0.12409494072198868,
462
+ "learning_rate": 2.616736459968936e-05,
463
+ "loss": 0.7104,
464
+ "step": 320
465
+ },
466
+ {
467
+ "epoch": 0.8004926108374384,
468
+ "grad_norm": 0.1128469705581665,
469
+ "learning_rate": 2.33341180699841e-05,
470
+ "loss": 0.7128,
471
+ "step": 325
472
+ },
473
+ {
474
+ "epoch": 0.812807881773399,
475
+ "grad_norm": 0.11089266836643219,
476
+ "learning_rate": 2.0642839104785272e-05,
477
+ "loss": 0.7166,
478
+ "step": 330
479
+ },
480
+ {
481
+ "epoch": 0.8251231527093597,
482
+ "grad_norm": 0.11719653755426407,
483
+ "learning_rate": 1.8098511333192024e-05,
484
+ "loss": 0.7333,
485
+ "step": 335
486
+ },
487
+ {
488
+ "epoch": 0.8374384236453202,
489
+ "grad_norm": 0.11491881310939789,
490
+ "learning_rate": 1.570584626452173e-05,
491
+ "loss": 0.7208,
492
+ "step": 340
493
+ },
494
+ {
495
+ "epoch": 0.8497536945812808,
496
+ "grad_norm": 0.10957372933626175,
497
+ "learning_rate": 1.3469274563679402e-05,
498
+ "loss": 0.7046,
499
+ "step": 345
500
+ },
501
+ {
502
+ "epoch": 0.8620689655172413,
503
+ "grad_norm": 0.1188158467411995,
504
+ "learning_rate": 1.1392937846586215e-05,
505
+ "loss": 0.7178,
506
+ "step": 350
507
+ },
508
+ {
509
+ "epoch": 0.874384236453202,
510
+ "grad_norm": 0.11511045694351196,
511
+ "learning_rate": 9.48068101086026e-06,
512
+ "loss": 0.7161,
513
+ "step": 355
514
+ },
515
+ {
516
+ "epoch": 0.8866995073891626,
517
+ "grad_norm": 0.11417835205793381,
518
+ "learning_rate": 7.736045115951251e-06,
519
+ "loss": 0.7274,
520
+ "step": 360
521
+ },
522
+ {
523
+ "epoch": 0.8990147783251231,
524
+ "grad_norm": 0.11594397574663162,
525
+ "learning_rate": 6.16226082591359e-06,
526
+ "loss": 0.7081,
527
+ "step": 365
528
+ },
529
+ {
530
+ "epoch": 0.9113300492610837,
531
+ "grad_norm": 0.11524856835603714,
532
+ "learning_rate": 4.762242426960262e-06,
533
+ "loss": 0.7209,
534
+ "step": 370
535
+ },
536
+ {
537
+ "epoch": 0.9236453201970444,
538
+ "grad_norm": 0.12005619704723358,
539
+ "learning_rate": 3.5385824308756587e-06,
540
+ "loss": 0.6989,
541
+ "step": 375
542
+ },
543
+ {
544
+ "epoch": 0.9359605911330049,
545
+ "grad_norm": 0.12374743819236755,
546
+ "learning_rate": 2.493546774280531e-06,
547
+ "loss": 0.7276,
548
+ "step": 380
549
+ },
550
+ {
551
+ "epoch": 0.9482758620689655,
552
+ "grad_norm": 0.11894236505031586,
553
+ "learning_rate": 1.6290706226390285e-06,
554
+ "loss": 0.7144,
555
+ "step": 385
556
+ },
557
+ {
558
+ "epoch": 0.9605911330049262,
559
+ "grad_norm": 0.11776220053434372,
560
+ "learning_rate": 9.46754786777726e-07,
561
+ "loss": 0.7015,
562
+ "step": 390
563
+ },
564
+ {
565
+ "epoch": 0.9729064039408867,
566
+ "grad_norm": 0.109645314514637,
567
+ "learning_rate": 4.4786275855247527e-07,
568
+ "loss": 0.7109,
569
+ "step": 395
570
+ },
571
+ {
572
+ "epoch": 0.9852216748768473,
573
+ "grad_norm": 0.11453017592430115,
574
+ "learning_rate": 1.333183711524133e-07,
575
+ "loss": 0.7094,
576
+ "step": 400
577
+ },
578
+ {
579
+ "epoch": 0.9975369458128078,
580
+ "grad_norm": 0.1103915199637413,
581
+ "learning_rate": 3.7040883734462683e-09,
582
+ "loss": 0.6894,
583
+ "step": 405
584
+ },
585
+ {
586
+ "epoch": 1.0,
587
  "eval_loss": NaN,
588
+ "eval_runtime": 944.7015,
589
+ "eval_samples_per_second": 1.223,
590
+ "eval_steps_per_second": 0.306,
591
+ "step": 406
592
+ },
593
+ {
594
+ "epoch": 1.0,
595
+ "step": 406,
596
+ "total_flos": 2.1219407349982167e+18,
597
+ "train_loss": 0.3640357888684484,
598
+ "train_runtime": 42241.1922,
599
+ "train_samples_per_second": 2.46,
600
+ "train_steps_per_second": 0.01
601
  }
602
  ],
603
  "logging_steps": 5,
604
+ "max_steps": 406,
605
  "num_input_tokens_seen": 0,
606
  "num_train_epochs": 1,
607
+ "save_steps": 100,
608
  "stateful_callbacks": {
609
  "TrainerControl": {
610
  "args": {
611
  "should_epoch_stop": false,
612
  "should_evaluate": false,
613
  "should_log": false,
614
+ "should_save": true,
615
  "should_training_stop": false
616
  },
617
  "attributes": {}
618
  }
619
  },
620
+ "total_flos": 2.1219407349982167e+18,
621
  "train_batch_size": 8,
622
  "trial_name": null,
623
  "trial_params": null